In [45]:
import os
import torch
import argparse
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import hashlib


# --- Watermark parameters ---------------------------------------------------
gamma = 0.5          # assigned here; not read by the embedding code below
k = 10               # number of equal-width segments the MEDV range is cut into
g = 3                # modulus passed to hash_mod; rows hashing to 0 are watermarked

secret_key = "123"   # prefix prepended to every key before hashing

seed = 10000         # seeds NumPy's global RNG and is part of the output file name

dataset = 'housing'  # prefix of the output .npy file name


# --- Load the data ------------------------------------------------------------
original_file = '../dataset/HousingData.csv'
origin = pd.read_csv(original_file)

np.random.seed(seed)

# Range of the target column, taken before any value is modified.
medv_min, medv_max = origin['MEDV'].min(), origin['MEDV'].max()

def hash_mod(key, mod_value, secret=None):
    """Map ``key`` to an integer in ``range(mod_value)`` with a keyed SHA-256 hash.

    The secret and the key are concatenated as text, hashed with SHA-256,
    and the digest (read as one big integer) is reduced modulo ``mod_value``.

    Args:
        key: Value to hash; it is formatted with ``str()`` semantics.
        mod_value: Positive integer modulus.
        secret: Optional secret prefix. When omitted, the module-level
            ``secret_key`` is used, which is what existing callers rely on.

    Returns:
        int: ``sha256(f"{secret}{key}") % mod_value``.

    Raises:
        ZeroDivisionError: If ``mod_value`` is 0.
    """
    if secret is None:
        # Fall back to the notebook-wide secret so existing calls are unchanged.
        secret = secret_key
    combined = f"{secret}{key}"
    digest_hex = hashlib.sha256(combined.encode()).hexdigest()
    return int(digest_hex, 16) % mod_value

# Columns whose leading digits form the per-row hash key.
columns_of_interest = ['RM', 'AGE']
# Replace missing values in the key columns with 0 so every row yields a key.
filled_key_columns = origin[columns_of_interest].fillna(0)
origin[columns_of_interest] = filled_key_columns

def first_two_digits(x):
    """Return a two-character string taken from the start of ``x``'s text form.

    ``str(x)`` has its leading ``'0'`` and ``'.'`` characters stripped and all
    remaining decimal points removed; the first two characters of what is left
    are returned. A single remaining character is padded with a trailing
    ``"0"``, and an input equal to zero yields ``"00"``.

    Examples: ``6.575 -> "65"``, ``0.05 -> "50"``, ``100.0 -> "10"``.

    No other characters are filtered, so any sign or exponent characters in
    ``str(x)`` pass through unchanged.
    """
    if x == 0:
        return "00"
    leading = str(x).lstrip('0.').replace('.', '')
    return (leading + "0") if len(leading) == 1 else leading[:2]


# --- Watermark embedding --------------------------------------------------------
# Re-seed so the green/red partition and every resampled value are reproducible.
np.random.seed(seed)

# Cut [medv_min, medv_max] into k equal-width segments, then shuffle them; the
# first half of the shuffled list becomes the "green" domains, the rest "red".
intervals = np.linspace(medv_min, medv_max, k + 1)
segments = [(intervals[i], intervals[i + 1]) for i in range(k)]
np.random.shuffle(segments)
half_k = k // 2
green_domains = segments[:half_k]
red_domains = segments[half_k:]
# np.nextafter(high, low) is the float just below `high`, so each pair below is
# the closed form of the half-open interval [low, high).
green_domain_values = [(low, np.nextafter(high, low)) for low, high in green_domains]

# Loop-invariant: midpoint of every green domain, used to find the nearest one.
green_mid_values = [(seg[0] + seg[1]) / 2 for seg in green_domain_values]

for idx in range(len(origin)):
    # Build the row's key from the leading digits of the key columns.
    selected_data = origin.loc[idx, columns_of_interest]
    first_two_digits_data = selected_data.apply(first_two_digits)
    composite_numbers = ''.join(first_two_digits_data.values)

    # Only rows whose keyed hash is 0 modulo g carry the watermark.
    if hash_mod(composite_numbers, g) != 0:
        continue

    original_medv = origin.loc[idx, 'MEDV']

    # Green domain whose midpoint is nearest to the current value
    # (first one wins on ties, as min() returns the first minimum).
    closest_idx = min(
        range(len(green_mid_values)),
        key=lambda i: abs(green_mid_values[i] - original_medv),
    )
    green_low, green_high = green_domain_values[closest_idx]

    if green_low <= original_medv <= green_high:
        # Already inside a green domain: keep the value as is.
        closest_value = original_medv
    else:
        # Otherwise resample uniformly inside the nearest green domain.
        closest_value = np.random.uniform(green_low, green_high)

    origin.loc[idx, 'MEDV'] = closest_value

# Persist once, after every row has been processed. Saving inside the loop
# rewrote the whole DataFrame to disk for every watermarked row; the final file
# content is the same. (Unlike before, the file is also written when no row was
# selected.) The dict is stored as a pickled object array, which is why the
# loader has to pass allow_pickle=True.
results = {
    'watermarked_data': origin
}

np.save(f"{dataset}-{seed}.npy", results)






In [47]:
# Detect the watermark: rebuild each seed's green domains and z-test the keyed rows.
import os
import torch
import argparse
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math
import hashlib


# --- Detection parameters (must equal the values used when embedding) -----------
gamma = 1/2          # assigned here; the z-score below hard-codes 1/2
k = 10               # number of equal-width MEDV segments
g = 3                # modulus passed to hash_mod; rows hashing to 0 are tested

seed = 10000
dataset='housing'    # prefix of the .npy files to load
secret_key = "123"   # prefix prepended to every key before hashing

# The segment boundaries must be rebuilt from the same MEDV range that was used
# for embedding. Reading it from the original CSV (same path as the embedding
# cell) keeps this cell runnable on a fresh kernel and avoids depending on the
# `origin` DataFrame left behind -- and already modified -- by the embedding cell.
# NOTE(review): if detection is meant to work without the original file, pass
# the range in as a stored parameter instead -- confirm intended setup.
original_file = '../dataset/HousingData.csv'
_original_medv_column = pd.read_csv(original_file)['MEDV']
medv_max = _original_medv_column.max()
medv_min = _original_medv_column.min()

def hash_mod(key, mod_value, secret=None):
    """Map ``key`` to an integer in ``range(mod_value)`` with a keyed SHA-256 hash.

    The secret and the key are concatenated as text, hashed with SHA-256,
    and the digest (read as one big integer) is reduced modulo ``mod_value``.

    Args:
        key: Value to hash; it is formatted with ``str()`` semantics.
        mod_value: Positive integer modulus.
        secret: Optional secret prefix. When omitted, the module-level
            ``secret_key`` is used, which is what existing callers rely on.

    Returns:
        int: ``sha256(f"{secret}{key}") % mod_value``.

    Raises:
        ZeroDivisionError: If ``mod_value`` is 0.
    """
    if secret is None:
        # Fall back to the notebook-wide secret so existing calls are unchanged.
        secret = secret_key
    combined = f"{secret}{key}"
    digest_hex = hashlib.sha256(combined.encode()).hexdigest()
    return int(digest_hex, 16) % mod_value

def first_two_digits(x):
    """Return a two-character string taken from the start of ``x``'s text form.

    Leading ``'0'`` and ``'.'`` characters of ``str(x)`` are stripped and the
    remaining decimal points dropped; the result is cut to two characters.
    One remaining character is right-padded with ``"0"``; an input equal to
    zero gives ``"00"``.

    Examples: ``6.575 -> "65"``, ``0.05 -> "50"``, ``100.0 -> "10"``.

    No other characters are filtered, so any sign or exponent characters in
    ``str(x)`` pass through unchanged.
    """
    if x == 0:
        return "00"
    text = "".join(str(x).lstrip('0.').split('.'))
    if len(text) != 1:
        return text[:2]
    return text + "0"

# Columns whose leading digits form the per-row hash key (same as when embedding).
columns_of_interest = ['RM', 'AGE']


# One z-score per seed; each seed has its own saved file and green/red partition.
z_scores = []
for seed in range(10000, 10050):
    # SECURITY: allow_pickle=True unpickles the file, which can execute arbitrary
    # code. Only load .npy files that were produced by a trusted embedding run.
    loaded_results = np.load(f"{dataset}-{seed}.npy", allow_pickle=True).item()
    watermarked_data = loaded_results['watermarked_data']
    # For a baseline z-score, assign the unwatermarked CSV to `watermarked_data`
    # here instead of the loaded frame.
    watermarked_data[columns_of_interest] = watermarked_data[columns_of_interest].fillna(0)

    green_cell = 0   # keyed rows whose MEDV lies in a green domain
    n_cell = 0       # keyed rows in total

    # Rebuild the partition: same seed, same segments, same shuffle as embedding.
    np.random.seed(seed)
    intervals = np.linspace(medv_min, medv_max, k + 1)
    segments = [(intervals[i], intervals[i + 1]) for i in range(k)]
    np.random.shuffle(segments)

    half_k = k // 2
    green_domains = segments[:half_k]
    red_domains = segments[half_k:]

    for idx in range(len(watermarked_data)):
        selected_data = watermarked_data.loc[idx, columns_of_interest]
        first_two_digits_data = selected_data.apply(first_two_digits)
        composite_numbers = ''.join(first_two_digits_data.values)
        # Only rows whose keyed hash is 0 modulo g were candidates for embedding.
        if hash_mod(composite_numbers, g) != 0:
            continue
        n_cell += 1

        medv_value = watermarked_data.loc[idx, 'MEDV']
        # Green domains are half-open: [low, high).
        if any(low <= medv_value < high for low, high in green_domains):
            green_cell += 1

    if n_cell == 0:
        # No keyed rows means no evidence either way; this also avoids the
        # division by zero that sqrt(0) would otherwise cause.
        z_score = 0.0
    else:
        # Binomial z-statistic for success probability 1/2:
        # (observed - n/2) / sqrt(n/4).
        z_score = (green_cell - n_cell/2) / math.sqrt(n_cell/4)
    z_scores.append(z_score)

print("The average z-score is ",np.mean(z_scores))







The average z-score is  12.206555615733702
