In [None]:
# alteration attack
# alteration

import os
import torch
import argparse
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Alteration attack: for each attack proportion, add uniform noise to the MEDV
# value of a random subset of rows and write the attacked table to a CSV file.
seed = 10000
# This seeding is overridden by np.random.seed(123) below before any random draw is made.
np.random.seed(seed)
attack_proportion = [0.2, 0.4, 0.6, 0.8, 1.0]

dataset = "housing"
# The .npy file stores a pickled mapping; .item() unwraps it from the 0-d object array.
# NOTE(review): allow_pickle=True executes pickle on load - only use with trusted files.
loaded_results = np.load(f"{dataset}-{seed}.npy", allow_pickle=True).item()
# Presumably a pandas DataFrame (it is used via .copy(), .loc and .to_csv below) - TODO confirm.
watermarked_data = loaded_results['watermarked_data']

# Half-width of the perturbation: noise is drawn from uniform(-p, p).
p = 20

# Seed actually in effect for the draws below.
np.random.seed(123)
for proportion in attack_proportion:
    # Work on a copy so every proportion starts from the same watermarked table.
    temp = watermarked_data.copy()
    # Pick int(proportion * n_rows) distinct row numbers.
    indices = np.random.choice(len(temp), size=int(proportion * len(temp)), replace=False)
    perturb_values = np.random.uniform(-p, p, size=len(indices)) 
    # NOTE(review): .loc is label-based while `indices` are positions; this assumes
    # the frame has a default 0..n-1 index - confirm against the watermarking code.
    temp.loc[indices, 'MEDV'] += perturb_values
    temp.to_csv(f"alteration_{dataset}-{seed}-{proportion}.csv", index=False)



In [None]:
# evaluate alteration attack

# z_score
import os
import torch
import argparse
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math
import hashlib


# Detection parameters for the z-score evaluation.
# gamma is declared as the green/red ratio but is not read by the detection code
# in this cell, which hard-codes an even k // 2 split.
gamma = 1/2 # ratio between the length of green domain and red domain
k = 10  # number of equal-width MEDV intervals

seed = 10000
dataset='housing'
# NOTE(review): the watermark key is hard-coded; consider reading it from configuration.
secret_key = "114514"

# Load the original dataset in this cell: `origin` is otherwise only defined by a
# later cell, so a fresh "Restart & Run All" would fail here with a NameError.
origin = pd.read_csv('../dataset/HousingData.csv')

# MEDV range of the original data; it bounds the interval grid used for detection.
medv_max = origin['MEDV'].max()
medv_min = origin['MEDV'].min()

def hash_mod(key, mod_value):
    """Return the keyed SHA-256 hash of `key`, reduced modulo `mod_value`.

    The global `secret_key` is prepended to `key` before hashing, and the
    digest is interpreted as one big-endian unsigned integer.
    """
    payload = f"{secret_key}{key}".encode()
    digest = hashlib.sha256(payload).digest()
    # Big-endian bytes -> int gives the same integer as parsing the hex digest.
    return int.from_bytes(digest, "big") % mod_value

def first_two_digits(x):
    """Return a two-character key built from the leading characters of `x`.

    Zero maps to "00". Otherwise leading '0' and '.' characters of ``str(x)``
    are stripped, remaining decimal points are removed, and the first two
    characters are returned; a single remaining character is padded with "0".
    Other characters (a sign, an exponent marker) are not filtered out.
    """
    if x == 0:
        return "00"
    stripped = str(x).lstrip('0.')
    digits = ''.join(ch for ch in stripped if ch != '.')
    # Pad only the exactly-one-character case; anything else is truncated to two.
    return digits + "0" if len(digits) == 1 else digits[:2]

# Columns whose leading digits form the per-row key that is hashed to decide
# whether a row takes part in watermark detection.
columns_of_interest = ['RM', 'AGE']

proportions = [0.2, 0.4, 0.6, 0.8, 1.0]

# The green/red partition depends only on (seed, k, medv_min, medv_max). The
# original code reseeded and reshuffled for every row, which always produced the
# same partition, so it is built once here with identical results.
np.random.seed(seed)
intervals = np.linspace(medv_min, medv_max, k + 1)
segments = [(intervals[i], intervals[i + 1]) for i in range(k)]
np.random.shuffle(segments)

half_k = k // 2
green_domains = segments[:half_k]
red_domains = segments[half_k:]

for proportion in proportions:
    # Attacked table written by the alteration-attack cell.
    watermarked_data = pd.read_csv(f"alteration_{dataset}-{seed}-{proportion}.csv")
    # Missing key values are treated as 0 so first_two_digits yields "00" for them.
    watermarked_data[columns_of_interest] = watermarked_data[columns_of_interest].fillna(0)

    green_cell = 0  # selected rows whose MEDV falls in a green interval
    n_cell = 0      # selected rows

    for idx in range(len(watermarked_data)):
        selected_data = watermarked_data.loc[idx, columns_of_interest]
        first_two_digits_data = selected_data.apply(first_two_digits)
        composite_numbers = ''.join(first_two_digits_data.values)

        # Only rows whose keyed hash is 0 modulo 3 are examined.
        if hash_mod(composite_numbers, 3) != 0:
            continue

        n_cell += 1

        medv = watermarked_data.loc[idx, 'MEDV']
        # Intervals are half-open [low, high), as in the original check.
        if any(low <= medv < high for low, high in green_domains):
            green_cell += 1

    # With no selected rows the statistic is undefined; report it instead of
    # raising ZeroDivisionError.
    if n_cell == 0:
        print(f"{proportion}: no key rows selected; z-score is undefined")
        continue

    # Standardised green count against an expected green fraction of 1/2.
    z_score = (green_cell - n_cell/2) / math.sqrt(n_cell/4)

    print(f"{proportion}: The average z-score is ", z_score)





In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
import numpy as np


# Utility check: train a linear regression on each altered dataset and measure
# its MSE on a held-out split of the ORIGINAL data.
seed = 10000
dataset='housing'

original_file = '../dataset/HousingData.csv'
origin = pd.read_csv(original_file)

X = origin.drop(columns=['MEDV'])
y = origin['MEDV']

# Only the test part of the original data is kept; the training part comes from
# the attacked data below, split with the same test_size and random_state.
_, X_test, _, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# LinearRegression cannot handle NaN features: drop incomplete rows and keep the
# targets aligned by index label.
X_test = X_test.dropna()
y_test = y_test.loc[X_test.index]

proportions = [0.2, 0.4, 0.6, 0.8, 1.0]

for proportion in proportions:
    # Read the attacked table using the declared dataset/seed instead of a
    # hard-coded file name. The unused per-iteration pickle load was removed.
    watermarked_data = pd.read_csv(f"alteration_{dataset}-{seed}-{proportion}.csv")

    X = watermarked_data.drop(columns=['MEDV'])
    y = watermarked_data['MEDV']

    # NOTE(review): this assumes the attacked table has the same number of rows
    # and row order as the original, so the two splits are complementary - confirm.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.3, random_state=42)

    X_train = X_train.dropna()
    y_train = y_train.loc[X_train.index]

    model = LinearRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    print(f"{proportion}: MSE is {mse:.4f}")







In [None]:
# insertion attack
#insertion

import os
import numpy as np
import pandas as pd

# Setup for the insertion attack: seed the global NumPy RNG and load the
# watermarked table.
seed = 10000
# Seeds the global RNG used by insert_tuples (np.random.uniform / np.random.choice).
np.random.seed(seed)
attack_proportion = [0.2, 0.4, 0.6, 0.8, 1.0]

dataset = "housing"
# The .npy file stores a pickled mapping; .item() unwraps it from the 0-d object array.
# NOTE(review): allow_pickle=True executes pickle on load - only use with trusted files.
loaded_results = np.load(f"{dataset}-{seed}.npy", allow_pickle=True).item()
# Presumably a pandas DataFrame (it is later used via .copy() and .columns) - TODO confirm.
watermarked_data = loaded_results['watermarked_data']

def insert_tuples(temp, proportion, medv_min, medv_max):
    """Return a copy of `temp` with synthetic rows inserted at random positions.

    int(proportion * len(temp)) new rows are created. Their 'MEDV' values are
    drawn uniformly from [medv_min, medv_max); every other column is resampled
    independently, with replacement, from that column of `temp`. The original
    rows keep their relative order in the result.
    """
    n_original = len(temp)
    num_new = int(proportion * n_original)
    total_rows = n_original + num_new

    synthetic = pd.DataFrame(columns=temp.columns)
    for col in temp.columns:
        if col != 'MEDV':
            # Bootstrap the column on its own; the index is reset so the values
            # line up positionally with the other synthetic columns.
            synthetic[col] = temp[col].sample(n=num_new, replace=True).reset_index(drop=True)
        else:
            # MEDV is handled separately: uniform over the supplied range.
            synthetic[col] = np.random.uniform(medv_min, medv_max, size=num_new)

    # Positions of the new rows in the enlarged table; the rest hold the originals.
    new_positions = np.random.choice(total_rows, size=num_new, replace=False)
    kept_positions = np.setdiff1d(np.arange(total_rows), new_positions)

    result = pd.DataFrame(index=range(total_rows), columns=temp.columns)
    result.iloc[kept_positions] = temp.values
    result.iloc[new_positions] = synthetic.values

    return result

# Insertion attack driver: for each proportion, insert synthetic rows into a
# copy of the watermarked table and write the result to a CSV file.
original_file = '../dataset/HousingData.csv'
origin = pd.read_csv(original_file)

# MEDV range of the original data; inserted rows draw their MEDV from this range.
medv_max = origin['MEDV'].max()
medv_min = origin['MEDV'].min()

for proportion in attack_proportion:
    # Work on a copy so every proportion starts from the same watermarked table.
    temp = watermarked_data.copy()
    temp = insert_tuples(temp, proportion, medv_min, medv_max)
    temp.to_csv(f"insertion_{dataset}-{seed}-{proportion}.csv", index=False)


In [None]:
# evaluate insertion attack

# z_score
import os
import torch
import argparse
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math
import hashlib


# Detection parameters for the z-score evaluation.
# gamma is declared as the green/red ratio but is not read by the detection code
# in this cell, which hard-codes an even k // 2 split.
gamma = 1/2 # ratio between the length of green domain and red domain
k = 10  # number of equal-width MEDV intervals

seed = 10000
dataset='housing'
# NOTE(review): the watermark key is hard-coded; consider reading it from configuration.
secret_key = "114514"

# Load the original dataset in this cell so it does not depend on an `origin`
# variable left behind by another cell.
origin = pd.read_csv('../dataset/HousingData.csv')

# MEDV range of the original data; it bounds the interval grid used for detection.
medv_max = origin['MEDV'].max()
medv_min = origin['MEDV'].min()

def hash_mod(key, mod_value):
    """Return the keyed SHA-256 hash of `key`, reduced modulo `mod_value`.

    The global `secret_key` is prepended to `key` before hashing, and the
    digest is interpreted as one big-endian unsigned integer.
    """
    payload = f"{secret_key}{key}".encode()
    digest = hashlib.sha256(payload).digest()
    # Big-endian bytes -> int gives the same integer as parsing the hex digest.
    return int.from_bytes(digest, "big") % mod_value

# Build a two-character key from the leading characters of a value.
def first_two_digits(x):
    """Return a two-character key built from the leading characters of `x`.

    Zero maps to "00". Otherwise leading '0' and '.' characters of ``str(x)``
    are stripped, remaining decimal points are removed, and the first two
    characters are returned; a single remaining character is padded with "0".
    Other characters (a sign, an exponent marker) are not filtered out.
    """
    if x == 0:
        return "00"
    stripped = str(x).lstrip('0.')
    digits = ''.join(ch for ch in stripped if ch != '.')
    # Pad only the exactly-one-character case; anything else is truncated to two.
    return digits + "0" if len(digits) == 1 else digits[:2]

# Columns whose leading digits form the per-row key that is hashed to decide
# whether a row takes part in watermark detection (adjust as needed).
columns_of_interest = ['RM', 'AGE']

proportions = [0.2, 0.4, 0.6, 0.8, 1.0]

# The green/red partition depends only on (seed, k, medv_min, medv_max). The
# original code reseeded and reshuffled for every row, which always produced the
# same partition, so it is built once here with identical results.
np.random.seed(seed)
intervals = np.linspace(medv_min, medv_max, k + 1)
segments = [(intervals[i], intervals[i + 1]) for i in range(k)]
np.random.shuffle(segments)

half_k = k // 2
green_domains = segments[:half_k]
red_domains = segments[half_k:]

for proportion in proportions:
    # Attacked table written by the insertion-attack cell.
    watermarked_data = pd.read_csv(f"insertion_{dataset}-{seed}-{proportion}.csv")
    # Missing key values are treated as 0 so first_two_digits yields "00" for them.
    watermarked_data[columns_of_interest] = watermarked_data[columns_of_interest].fillna(0)

    green_cell = 0  # selected rows whose MEDV falls in a green interval
    n_cell = 0      # selected rows

    for idx in range(len(watermarked_data)):
        selected_data = watermarked_data.loc[idx, columns_of_interest]
        first_two_digits_data = selected_data.apply(first_two_digits)
        composite_numbers = ''.join(first_two_digits_data.values)

        # Only rows whose keyed hash is 0 modulo 3 are examined.
        if hash_mod(composite_numbers, 3) != 0:
            continue

        n_cell += 1

        medv = watermarked_data.loc[idx, 'MEDV']
        # Intervals are half-open [low, high), as in the original check.
        if any(low <= medv < high for low, high in green_domains):
            green_cell += 1

    # With no selected rows the statistic is undefined; report it instead of
    # raising ZeroDivisionError.
    if n_cell == 0:
        print(f"{proportion}: no key rows selected; z-score is undefined")
        continue

    # Standardised green count against an expected green fraction of 1/2.
    z_score = (green_cell - n_cell/2) / math.sqrt(n_cell/4)

    print(f"{proportion}: The average z-score is ", z_score)


In [None]:
# deletion attack
import os
import torch
import argparse
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Deletion attack: for each attack proportion, drop a random subset of rows from
# the watermarked table and write the remainder to a CSV file.
seed = 10000
# Seeds the global RNG that np.random.choice draws from below.
np.random.seed(seed)
attack_proportion = [0.2, 0.4, 0.6, 0.8, 1.0]

dataset = "housing"
# The .npy file stores a pickled mapping; .item() unwraps it from the 0-d object array.
# NOTE(review): allow_pickle=True executes pickle on load - only use with trusted files.
loaded_results = np.load(f"{dataset}-{seed}.npy", allow_pickle=True).item()
# Presumably a pandas DataFrame (it is used via .copy(), .drop and .to_csv below) - TODO confirm.
watermarked_data = loaded_results['watermarked_data']

for proportion in attack_proportion:
    # Work on a copy so every proportion starts from the same watermarked table.
    temp = watermarked_data.copy()
    # Pick int(proportion * n_rows) distinct row numbers; at proportion 1.0 this is every row.
    indices = np.random.choice(len(temp), size=int(proportion * len(temp)), replace=False)
    # NOTE(review): drop() is label-based while `indices` are positions; this assumes
    # the frame has a default 0..n-1 index - confirm against the watermarking code.
    temp = temp.drop(indices)
    temp.to_csv(f"deletion_{dataset}-{seed}-{proportion}.csv", index=False)


In [None]:
# evaluate deletion attack

# z_score
import os
import torch
import argparse
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math
import hashlib


# Detection parameters for the z-score evaluation.
# gamma is declared as the green/red ratio but is not read by the detection code
# in this cell, which hard-codes an even k // 2 split.
gamma = 1/2 # ratio between the length of green domain and red domain
k = 10  # number of equal-width MEDV intervals

seed = 10000
dataset='housing'
# NOTE(review): the watermark key is hard-coded; consider reading it from configuration.
secret_key = "114514"

# Load the original dataset in this cell so it does not depend on an `origin`
# variable left behind by another cell.
origin = pd.read_csv('../dataset/HousingData.csv')

# MEDV range of the original data; it bounds the interval grid used for detection.
medv_max = origin['MEDV'].max()
medv_min = origin['MEDV'].min()

def hash_mod(key, mod_value):
    """Return the keyed SHA-256 hash of `key`, reduced modulo `mod_value`.

    The global `secret_key` is prepended to `key` before hashing, and the
    digest is interpreted as one big-endian unsigned integer.
    """
    payload = f"{secret_key}{key}".encode()
    digest = hashlib.sha256(payload).digest()
    # Big-endian bytes -> int gives the same integer as parsing the hex digest.
    return int.from_bytes(digest, "big") % mod_value

# Build a two-character key from the leading characters of a value.
def first_two_digits(x):
    """Return a two-character key built from the leading characters of `x`.

    Zero maps to "00". Otherwise leading '0' and '.' characters of ``str(x)``
    are stripped, remaining decimal points are removed, and the first two
    characters are returned; a single remaining character is padded with "0".
    Other characters (a sign, an exponent marker) are not filtered out.
    """
    if x == 0:
        return "00"
    stripped = str(x).lstrip('0.')
    digits = ''.join(ch for ch in stripped if ch != '.')
    # Pad only the exactly-one-character case; anything else is truncated to two.
    return digits + "0" if len(digits) == 1 else digits[:2]

# Columns whose leading digits form the per-row key that is hashed to decide
# whether a row takes part in watermark detection (adjust as needed).
columns_of_interest = ['RM', 'AGE']

# Proportion 1.0 is not evaluated here: that attack output has no rows left.
proportions = [0.2, 0.4, 0.6, 0.8]

# The green/red partition depends only on (seed, k, medv_min, medv_max). The
# original code reseeded and reshuffled for every row, which always produced the
# same partition, so it is built once here with identical results.
np.random.seed(seed)
intervals = np.linspace(medv_min, medv_max, k + 1)
segments = [(intervals[i], intervals[i + 1]) for i in range(k)]
np.random.shuffle(segments)

half_k = k // 2
green_domains = segments[:half_k]
red_domains = segments[half_k:]

for proportion in proportions:
    # Attacked table written by the deletion-attack cell.
    watermarked_data = pd.read_csv(f"deletion_{dataset}-{seed}-{proportion}.csv")
    # Missing key values are treated as 0 so first_two_digits yields "00" for them.
    watermarked_data[columns_of_interest] = watermarked_data[columns_of_interest].fillna(0)

    green_cell = 0  # selected rows whose MEDV falls in a green interval
    n_cell = 0      # selected rows

    for idx in range(len(watermarked_data)):
        selected_data = watermarked_data.loc[idx, columns_of_interest]
        first_two_digits_data = selected_data.apply(first_two_digits)
        composite_numbers = ''.join(first_two_digits_data.values)

        # Only rows whose keyed hash is 0 modulo 3 are examined.
        if hash_mod(composite_numbers, 3) != 0:
            continue

        n_cell += 1

        medv = watermarked_data.loc[idx, 'MEDV']
        # Intervals are half-open [low, high), as in the original check.
        if any(low <= medv < high for low, high in green_domains):
            green_cell += 1

    # With no selected rows the statistic is undefined; report it instead of
    # raising ZeroDivisionError.
    if n_cell == 0:
        print(f"{proportion}: no key rows selected; z-score is undefined")
        continue

    # Standardised green count against an expected green fraction of 1/2.
    z_score = (green_cell - n_cell/2) / math.sqrt(n_cell/4)

    print(f"{proportion}: The average z-score is ", z_score)
