In [26]:
import os
import torch
import argparse
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# ---------------------------------------------------------------------------
# Watermark-embedding configuration
# ---------------------------------------------------------------------------
# sigma scales the perturbation ranges in p_s; mu is not read in this cell.
# NOTE(review): presumably the mean / standard deviation of the synthetic
# feature distribution -- confirm against the data generator.
mu, sigma = 0, 20

# n: number of key cells (rows whose 'dimension_0' value gets perturbed).
# k: number of equal-width segments the range [-p, p] is divided into.
n, k = 300, 500

# Ratio between the length of the green domain and the red domain.
# NOTE: the embedding loop does not read gamma; it splits segments with k // 2.
gamma = 0.5

# Candidate half-widths p of the perturbation range [-p, p], expressed as
# multiples of sigma. Integer multipliers keep 20 and 40 as ints, which matters
# because p is interpolated verbatim into the output file names.
p_s = [scale * sigma for scale in (0.5, 1, 1.5, 2, 2.5)]

# Identifiers that end up in the output file names.
dataset = 'synthetic'
seed = 10000

# Load the un-watermarked data set.
original_file = '/home/zhengyihao/TabularMark/datasets/synthetic_dataset/synthetic_data.csv'
origin = pd.read_csv(original_file)

# Seed NumPy's global RNG so the key-cell selection below is reproducible.
np.random.seed(seed)

# Embed a watermark into 'dimension_0' for every perturbation range p and save
# the watermarked frame together with the secrets needed to detect it.

# The row count never changes between runs, so validate it once, before any
# random numbers are consumed.
if len(origin) < n:
    raise ValueError("data中的记录数小于所请求的n个记录")

# np.save does not create missing directories; make sure the target exists.
output_dir = "/home/zhengyihao/TabularMark/datasets/synthetic_dataset/p_tradeoffs"
os.makedirs(output_dir, exist_ok=True)

# Half of the k segments are treated as green, the rest as red.
half_k = k // 2

for p in p_s:
    # Cut [-p, p] into k equal-width (low, high) segments. This depends only on
    # p and k, so it is built once per p rather than once per key cell.
    intervals = np.linspace(-p, p, k + 1)
    base_segments = [(intervals[j], intervals[j + 1]) for j in range(k)]

    for i in range(1):
        temp = origin.copy()

        # One private seed per key cell, plus the distinct rows to perturb.
        # Both arrays have exactly n entries by construction.
        # NOTE: the inner loop re-seeds the global RNG, so from the second p
        # onwards these draws continue from the state left by the last
        # per-cell seed. The results are still deterministic given `seed`.
        divide_seeds = np.random.randint(0, 2**32 - 1, size=n)
        indices = np.random.choice(len(temp), size=n, replace=False)

        # Perturb the selected cells.
        for idx, divide_seed in zip(indices, divide_seeds):
            # Re-seeding makes the green/red partition a pure function of
            # divide_seed, which is what lets the detection cell rebuild it.
            np.random.seed(divide_seed)

            # Shuffle a fresh copy so base_segments stays in its original order.
            segments = list(base_segments)
            np.random.shuffle(segments)

            # First half of the shuffled segments is green, the rest is red.
            green_domains = segments[:half_k]
            red_domains = segments[half_k:]  # not used during embedding

            # Draw one candidate offset from every green segment. The upper
            # bound is nudged one float step towards `low`, keeping it strictly
            # below the segment's upper edge. One candidate is then picked.
            green_domain_values = [
                np.random.uniform(low, np.nextafter(high, low))
                for low, high in green_domains
            ]
            perturb_value = np.random.choice(green_domain_values)
            temp.loc[idx, 'dimension_0'] += perturb_value

        results = {
            'watermarked_data': temp,
            'divide_seeds': divide_seeds,
            'indices': indices
        }

        # The dict is stored as a pickled object array; readers need
        # np.load(..., allow_pickle=True).item() to get it back.
        np.save(f"{output_dir}/{dataset}-{seed}-{p}-{i}.npy", results)
        

In [28]:
# detect z_scores
import os
import torch
import argparse
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math

# ---------------------------------------------------------------------------
# Watermark-detection configuration (mirrors the values used for embedding)
# ---------------------------------------------------------------------------
# sigma scales the perturbation ranges in p_s; mu is not read in this cell.
# NOTE(review): presumably the mean / standard deviation of the synthetic
# feature distribution -- confirm against the data generator.
mu, sigma = 0, 20

# n: number of key cells; k: number of equal-width segments of [-p, p].
n, k = 300, 500

# Ratio between the length of the green domain and the red domain.
# NOTE: the detection loop does not read gamma; it hard-codes an even split.
gamma = 0.5

# Half-widths p of the perturbation range, as multiples of sigma. Integer
# multipliers keep 20 and 40 as ints so the file names built from p still
# match the ones written at embedding time.
p_s = [scale * sigma for scale in (0.5, 1, 1.5, 2, 2.5)]

# NOTE: the parser is built but parse_args() is never called in this cell, so
# the -seed option currently has no effect.
parser = argparse.ArgumentParser()
parser.add_argument("-seed", type=int, default=10000)

# Identifiers used to locate the saved watermark files.
dataset = 'synthetic'
seed = 10000

# Load the un-watermarked data set that serves as the detection reference.
original_file = '/home/zhengyihao/TabularMark/datasets/synthetic_dataset/synthetic_data.csv'
origin = pd.read_csv(original_file)

np.random.seed(seed)

# Detect the watermark for every perturbation range p: rebuild each key cell's
# green segments from its stored seed, count how many perturbations fall in a
# green segment, and report the resulting z-score.

# Half of the k segments are treated as green, the rest as red.
half_k = k // 2

for p in p_s:
    # Cut [-p, p] into k equal-width (low, high) segments. This depends only on
    # p and k, so it is built once per p rather than once per key cell.
    intervals = np.linspace(-p, p, k + 1)
    base_segments = [(intervals[j], intervals[j + 1]) for j in range(k)]

    # A dedicated loop variable keeps the module-level `seed` from being rebound.
    for run_seed in range(10000, 10001):
        # NOTE(security): allow_pickle=True unpickles arbitrary objects. Only
        # load files this notebook produced itself.
        loaded_results = np.load(f"/home/zhengyihao/TabularMark/datasets/synthetic_dataset/p_tradeoffs/{dataset}-{run_seed}-{p}-0.npy", allow_pickle=True).item()
        watermarked_data = loaded_results['watermarked_data']
        divide_seeds = loaded_results['divide_seeds']
        indices = loaded_results['indices']

        green_cell = 0
        for idx, divide_seed in zip(indices, divide_seeds):
            # Same seed and same shuffle as at embedding time reproduce the
            # identical green/red partition for this cell.
            np.random.seed(divide_seed)
            segments = list(base_segments)
            np.random.shuffle(segments)

            green_domains = segments[:half_k]
            red_domains = segments[half_k:]  # not used during detection

            # Recover the perturbation by subtracting the original value.
            # `origin` is only read here, so no defensive copy is needed.
            difference = watermarked_data.loc[idx, 'dimension_0'] - origin.loc[idx, 'dimension_0']

            # Count the cell if the perturbation lies in any green segment
            # (half-open interval: low <= difference < high).
            if any(low <= difference < high for low, high in green_domains):
                green_cell += 1

        # Standardise the green count against a Binomial(n, 1/2) baseline
        # (mean n/2, variance n/4), matching the even green/red split.
        z_score = (green_cell - n/2) / math.sqrt(n/4)

        # Report inside the seed loop so every run is printed, not just the last.
        print(f"{p} z-score is ",z_score)

10.0 z-score is  17.32050807568877
20 z-score is  17.32050807568877
30.0 z-score is  17.32050807568877
40 z-score is  17.32050807568877
50.0 z-score is  17.32050807568877


In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import numpy as np

# ---------------------------------------------------------------------------
# Configuration for the model-utility experiment
# ---------------------------------------------------------------------------
# This cell's import list does not include argparse, so import it here. Without
# this the cell only runs if an earlier cell already imported it.
import argparse

# sigma scales the perturbation ranges in p_s; mu is not read in this cell.
# NOTE(review): presumably the mean / standard deviation of the synthetic
# feature distribution -- confirm against the data generator.
mu = 0
sigma = 20

n = 300 # number of key cells
gamma = 1/2 # ratio between the length of green domain and red domain
# Half-widths p of the perturbation range, as multiples of sigma. 1*sigma and
# 2*sigma stay ints so the file names built from p match the saved files.
p_s = [0.5*sigma, 1*sigma, 1.5*sigma, 2*sigma, 2.5*sigma]
k = 500 # number of equal-width segments of [-p, p] used when embedding

# NOTE: the parser is built but parse_args() is never called in this cell, so
# the -seed option currently has no effect.
parser = argparse.ArgumentParser()
parser.add_argument("-seed", type=int, default=10000)

# Identifiers used to locate the saved watermark files.
seed = 10000
dataset = 'synthetic'

# Load the un-watermarked data set; it provides the held-out test split.
original_file = '/home/zhengyihao/TabularMark/datasets/synthetic_dataset/synthetic_data.csv'
origin = pd.read_csv(original_file)
np.random.seed(seed)

# Build the evaluation set from the ORIGINAL (un-watermarked) data.

# Fit a label encoder on the target column and encode it as integer labels;
# every remaining column is used as a feature.
le = LabelEncoder()
X, y = origin.drop(columns=['target']), le.fit_transform(origin['target'])

# Keep only the 30% test portion; the training portion is discarded here.
# The fixed random_state makes the split reproducible.
_, X_test, _, y_test = train_test_split(X, y, random_state=42, test_size=0.3)


# For every perturbation range p, train a classifier on the watermarked data
# and score it on the held-out split (X_test / y_test) of the original data.
for p in p_s:
    # A dedicated loop variable keeps the module-level `seed` from being rebound.
    for run_seed in range(10000, 10001):
        # NOTE(security): allow_pickle=True unpickles arbitrary objects. Only
        # load files this notebook produced itself.
        loaded_results = np.load(f"/home/zhengyihao/TabularMark/datasets/synthetic_dataset/p_tradeoffs/{dataset}-{run_seed}-{p}-0.npy", allow_pickle=True).item()
        watermarked_data = loaded_results['watermarked_data']

        # Separate features from the target variable.
        X = watermarked_data.drop(columns=['target'])
        y = watermarked_data['target']

        # Label-encode the target.
        le = LabelEncoder()
        y = le.fit_transform(y)

        # Keep only the training portion. test_size and random_state match the
        # hold-out split of the original data.
        # NOTE(review): this assumes the watermarked frame has the same rows in
        # the same order as the original, so train and test rows do not overlap.
        X_train, _, y_train, _ = train_test_split(X, y, test_size=0.3, random_state=42)

        # Define the logistic-regression model.
        model = LogisticRegression(max_iter=1000)

        # Train on watermarked data.
        model.fit(X_train, y_train)

        # Predict on the un-watermarked hold-out set.
        y_pred = model.predict(X_test)

        # Compute accuracy.
        accuracy = accuracy_score(y_test, y_pred)

        print(f"{p}: Accuracy: {accuracy:.4f}")



10.0: Accuracy: 0.8717
20: Accuracy: 0.8617
30.0: Accuracy: 0.8617
40: Accuracy: 0.8617
50.0: Accuracy: 0.8533
