In [3]:
import os
import torch
import argparse
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# --- Watermark configuration -------------------------------------------------
n = 50  # number of key cells
gamma = 1/2  # ratio between the length of green domain and red domain
             # (not read in this cell; the green/red split below is fixed at k // 2)
p = 25  # perturbations are drawn from the range [-p, p]
k = 500  # number of equal-width segments the range [-p, p] is cut into

seed = 10000
dataset = 'housing'


original_file = '../../datasets/boston_housing_prices/HousingData.csv'
origin = pd.read_csv(original_file)

np.random.seed(seed)

# One seed per key cell; each seed drives that cell's segment shuffle and draw.
divide_seeds = np.random.randint(0, 2**32 - 1, size=n)

if len(origin) < n:
    raise ValueError("data中的记录数小于所请求的n个记录")

# Row positions (0 .. len(origin) - 1) of the key cells, drawn without replacement.
indices = np.random.choice(len(origin), size=n, replace=False)

# Sanity check: every key cell must have exactly one seed.
if len(indices) != len(divide_seeds):
    raise ValueError("索引文件和种子文件的长度不一致")

# Loop-invariant values, computed once instead of once per key cell.
intervals = np.linspace(-p, p, k + 1)  # k + 1 edges -> k equal-width segments
half_k = k // 2
medv_position = origin.columns.get_loc('MEDV')

# Perturb the MEDV value of every key cell.
for idx, divide_seed in zip(indices, divide_seeds):
    np.random.seed(divide_seed)
    # Rebuild the segment list each time: the shuffle below works in place.
    segments = [(intervals[i], intervals[i + 1]) for i in range(k)]
    np.random.shuffle(segments)
    # First half of the shuffled segments is green, the rest is red.
    green_domains = segments[:half_k]
    red_domains = segments[half_k:]

    # One uniform draw per green segment (upper bound nudged one ULP towards
    # `low`), then one of those draws is picked as the perturbation.
    green_domain_values = [np.random.uniform(low, np.nextafter(high, low)) for low, high in green_domains]
    perturb_value = np.random.choice(green_domain_values)

    # `idx` is a row position, so index positionally rather than by label.
    origin.iloc[idx, medv_position] += perturb_value

# NOTE: `origin` itself has been modified in place and is what gets stored.
results = {
    'watermarked_data': origin,
    'divide_seeds': divide_seeds,
    'indices': indices
}

# The dict is written as a pickled object array; reading it back needs
# np.load(..., allow_pickle=True).item().
np.save(f"../../datasets/boston_housing_prices/discussion/uniform-{dataset}-{seed}.npy", results)






In [8]:
# normal distribution
import os
import numpy as np
import pandas as pd
from scipy.stats import norm

# Watermark settings.
n, p, k = 50, 25, 500  # key-cell count, half-width of [-p, p], segment count
gamma = 0.5  # ratio between the length of green domain and red domain
seed, dataset = 10000, 'housing'

original_file = '/home/zhengyihao/TabularMark/datasets/boston_housing_prices/HousingData.csv'
origin = pd.read_csv(original_file)

# Seed the global generator, then draw one seed per key cell.
np.random.seed(seed)
divide_seeds = np.random.randint(0, 2**32 - 1, size=n)

# The table must hold at least n records to pick n distinct key cells.
if n > len(origin):
    raise ValueError("data中的记录数小于所请求的n个记录")

# Row positions of the key cells, sampled without replacement.
indices = np.random.choice(len(origin), size=n, replace=False)

# Each key cell needs exactly one seed.
if len(divide_seeds) != len(indices):
    raise ValueError("索引文件和种子文件的长度不一致")

# Parameters of the normal distribution used for the perturbation.
mu, sigma = 0, 1

# Probability mass of N(mu, sigma) inside [-p, p]; dividing the density by it
# makes the truncated density integrate to 1 over that range.
cdf_at_upper = norm.cdf(p, loc=mu, scale=sigma)
cdf_at_lower = norm.cdf(-p, loc=mu, scale=sigma)
normalization_factor = cdf_at_upper - cdf_at_lower

def truncated_normal_sample(mu, sigma, p):
    """Draw one sample from N(mu, sigma) truncated to the range [-p, p].

    Uses rejection sampling: a candidate is proposed uniformly on [-p, p) and
    accepted with probability proportional to the truncated normal density at
    that point. Randomness comes from the global ``np.random`` state, so seed
    it beforehand for reproducible output.

    Args:
        mu: Mean of the underlying normal distribution.
        sigma: Standard deviation of the underlying normal distribution (> 0).
        p: Half-width of the truncation range (> 0).

    Returns:
        A float in [-p, p).

    Raises:
        ValueError: If ``sigma`` or ``p`` is not positive, or if the normal
            distribution has no representable probability mass in [-p, p]
            (either case would otherwise loop forever).
    """
    if sigma <= 0 or p <= 0:
        raise ValueError("sigma and p must both be positive")

    # Mass of N(mu, sigma) inside [-p, p], derived from this call's own
    # arguments rather than from a module-level value.
    truncated_mass = norm.cdf(p, loc=mu, scale=sigma) - norm.cdf(-p, loc=mu, scale=sigma)
    if not truncated_mass > 0:
        raise ValueError("normal distribution has no probability mass inside [-p, p]")

    # The acceptance test compares a uniform(0, 1) draw with the density, which
    # is only valid while the density never exceeds 1. The density peaks at the
    # point of [-p, p] closest to mu; when that peak is above 1 (small sigma),
    # scale the uniform draw by it. When the peak is <= 1 the scale is exactly
    # 1.0, so draws are identical to the unscaled comparison.
    peak_density = norm.pdf(min(max(mu, -p), p), mu, sigma) / truncated_mass
    accept_scale = max(1.0, peak_density)

    while True:
        sample = np.random.uniform(-p, p)
        pdf_value = norm.pdf(sample, mu, sigma) / truncated_mass
        if np.random.uniform(0, 1) * accept_scale < pdf_value:
            return sample

# Loop-invariant values, computed once instead of once per key cell.
intervals = np.linspace(-p, p, k + 1)  # k + 1 edges -> k equal-width segments
half_k = k // 2
medv_position = origin.columns.get_loc('MEDV')

# Perturb the MEDV value of every key cell.
for idx, divide_seed in zip(indices, divide_seeds):
    np.random.seed(divide_seed)
    # Rebuild the segment list each time: the shuffle below works in place.
    segments = [(intervals[i], intervals[i + 1]) for i in range(k)]
    np.random.shuffle(segments)
    # First half of the shuffled segments is green, the rest is red.
    green_domains = segments[:half_k]
    red_domains = segments[half_k:]

    # Resample until the draw lands in one of this cell's green segments
    # (each segment is treated as half-open: low <= value < high).
    while True:
        perturb_value = truncated_normal_sample(mu, sigma, p)
        in_green_domain = any(low <= perturb_value < high for low, high in green_domains)
        if in_green_domain:
            break

    # `idx` is a row position, so index positionally rather than by label.
    origin.iloc[idx, medv_position] += perturb_value

# NOTE: `origin` itself has been modified in place and is what gets stored.
results = {
    'watermarked_data': origin,
    'divide_seeds': divide_seeds,
    'indices': indices
}

# The dict is written as a pickled object array; reading it back needs
# np.load(..., allow_pickle=True).item().
np.save(f"/home/zhengyihao/TabularMark/datasets/boston_housing_prices/discussion/gaussian-{dataset}-{seed}.npy", results)


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer

# Baseline: linear regression fitted and scored on the unmodified data.
origin = pd.read_csv("/home/zhengyihao/TabularMark/datasets/boston_housing_prices/HousingData.csv")

# MEDV is the regression target; every other column is a feature.
y = origin['MEDV']
X = origin.drop(columns=['MEDV'])

# 70/30 train/test split with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Keep only rows whose features are all present, and filter the targets with
# the same row mask so features and targets stay aligned.
train_complete = X_train.notna().all(axis=1)
X_train, y_train = X_train[train_complete], y_train[train_complete]

test_complete = X_test.notna().all(axis=1)
X_test, y_test = X_test[test_complete], y_test[test_complete]

# Fit the linear regression model on the cleaned training rows.
model = LinearRegression()
model.fit(X_train, y_train)

# Score on the cleaned test rows.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

# Report the mean squared error.
print(f"MSE: {mse:.4f}")


MSE: 23.1586


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
import numpy as np

dataset = "housing"
seed = 10000

# Unmodified data (used for testing) and the saved 'uniform-...' results dict.
origin = pd.read_csv("/home/zhengyihao/TabularMark/datasets/boston_housing_prices/HousingData.csv")
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load
# result files that were produced locally.
loaded_results = np.load(f"/home/zhengyihao/TabularMark/datasets/boston_housing_prices/discussion/uniform-{dataset}-{seed}.npy", allow_pickle=True).item()
watermarked_data = loaded_results['watermarked_data']
# Alternative input, currently disabled:
# watermarked_data = pd.read_csv("/home/zhengyihao/TabularMark/datasets/boston_housing_prices/perturbed/HOG-10000-1.0-0.csv")
divide_seeds, indices = loaded_results['divide_seeds'], loaded_results['indices']


# Training rows come from the watermarked table (MEDV is the target).
y = watermarked_data['MEDV']
X = watermarked_data.drop(columns=['MEDV'])
X_train, _, y_train, _ = train_test_split(X, y, test_size=0.3, random_state=42)

# Drop training rows with any missing feature; same mask for the targets.
train_complete = X_train.notna().all(axis=1)
X_train, y_train = X_train[train_complete], y_train[train_complete]

# Test rows come from the unmodified table, split with the same test_size and
# random_state (presumably both tables have the same number of rows, so the
# two partitions line up -- verify).
y = origin['MEDV']
X = origin.drop(columns=['MEDV'])
_, X_test, _, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Drop test rows with any missing feature; same mask for the targets.
test_complete = X_test.notna().all(axis=1)
X_test, y_test = X_test[test_complete], y_test[test_complete]

# Linear regression model, fitted on the watermarked training rows.
model = LinearRegression()
model.fit(X_train, y_train)

# Score against the unmodified test rows.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

# Report the mean squared error.
print(f"MSE: {mse:.4f}")



MSE: 24.3045


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
import numpy as np

dataset = "housing"
seed = 10000

# Unmodified data (used for testing) and the saved 'gaussian-...' results dict.
origin = pd.read_csv("/home/zhengyihao/TabularMark/datasets/boston_housing_prices/HousingData.csv")
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load
# result files that were produced locally.
loaded_results = np.load(f"/home/zhengyihao/TabularMark/datasets/boston_housing_prices/discussion/gaussian-{dataset}-{seed}.npy", allow_pickle=True).item()
watermarked_data = loaded_results['watermarked_data']
# Alternative input, currently disabled:
# watermarked_data = pd.read_csv("/home/zhengyihao/TabularMark/datasets/boston_housing_prices/perturbed/HOG-10000-1.0-0.csv")
divide_seeds, indices = loaded_results['divide_seeds'], loaded_results['indices']


# Training rows come from the watermarked table (MEDV is the target).
y = watermarked_data['MEDV']
X = watermarked_data.drop(columns=['MEDV'])
X_train, _, y_train, _ = train_test_split(X, y, test_size=0.3, random_state=42)

# Drop training rows with any missing feature; same mask for the targets.
train_complete = X_train.notna().all(axis=1)
X_train, y_train = X_train[train_complete], y_train[train_complete]

# Test rows come from the unmodified table, split with the same test_size and
# random_state (presumably both tables have the same number of rows, so the
# two partitions line up -- verify).
y = origin['MEDV']
X = origin.drop(columns=['MEDV'])
_, X_test, _, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Drop test rows with any missing feature; same mask for the targets.
test_complete = X_test.notna().all(axis=1)
X_test, y_test = X_test[test_complete], y_test[test_complete]

# Linear regression model, fitted on the watermarked training rows.
model = LinearRegression()
model.fit(X_train, y_train)

# Score against the unmodified test rows.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

# Report the mean squared error.
print(f"MSE: {mse:.4f}")



MSE: 23.1753
