In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Evaluate how a random forest trained on altered HOG data performs on the
# clean, original test split, for several alteration proportions.

# --- Experiment configuration -------------------------------------------
# `n` and `gamma` are not referenced anywhere in this cell.
n = 150 # number of key cells
gamma = 1/2 # ratio between the length of green domain and red domain

seed = 10000  # identifies the watermark run in file names; also seeds the forest below
dataset='HOG'
proportions = [0.2, 0.4, 0.6, 0.8, 1.0]  # substituted into the altered-file name

data_dir = "../../../../datasets/HOG"

# --- Clean reference data -------------------------------------------------
# Load the original dataset; only its test split is used for scoring.
origin = pd.read_csv(f"{data_dir}/digits_HOG.csv")

# Separate the features from the target column.
X = origin.drop(columns=['target'])
y = origin['target']

# Fit the label encoder ONCE on the original targets. The same fitted encoder
# is reused for every altered file so that train and test labels always share
# one class -> integer mapping (re-fitting per file could silently produce a
# different mapping if a file's set of classes differs).
le = LabelEncoder()
y = le.fit_transform(y)

# Keep only the held-out 30% of the original data as the test set.
_, X_test, _, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Loop-invariant: the path does not depend on `proportion`, so load it once
# instead of once per iteration. The result is not used by this cell.
# NOTE(review): allow_pickle=True unpickles the file and can execute arbitrary
# code -- only load trusted files; drop this line if no other cell reads
# `loaded_results`.
loaded_results = np.load(f"{data_dir}/watermarked/{dataset}-{seed}.npy", allow_pickle=True).item()

# --- Train on each altered dataset, score on the clean test split ---------
for proportion in proportions:
    watermarked_data = pd.read_csv(f"{data_dir}/alteration/{dataset}-{seed}-{proportion}-0.csv")

    # Separate the features from the target column.
    X = watermarked_data.drop(columns=['target'])
    # Encode with the encoder fitted on the original targets; this raises a
    # ValueError if the altered file contains a label absent from the original.
    y = le.transform(watermarked_data['target'])

    # Take the 70% training portion using the same test_size/random_state as
    # the original split.
    # NOTE(review): this only keeps the training rows disjoint from the test
    # rows if the altered file has the same number of rows, in the same order,
    # as the original file -- confirm.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.3, random_state=42)

    # Seed the forest so the reported accuracies are reproducible across runs.
    model = RandomForestClassifier(n_estimators=100, n_jobs=4, random_state=seed)
    model.fit(X_train, y_train)

    # Predict on the clean test split and report the accuracy.
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"{proportion}: Accuracy: {accuracy:.4f}")


0.2: Accuracy: 0.7889
0.4: Accuracy: 0.7204
0.6: Accuracy: 0.6167
0.8: Accuracy: 0.4000
1.0: Accuracy: 0.1204


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
import numpy as np

# Evaluate how a linear regression trained on altered housing data performs
# on the clean, original test split, for several alteration proportions.

# --- Experiment configuration -------------------------------------------
# `n`, `gamma`, `p` and `k` are not referenced anywhere in this cell.
n = 50 # number of key cells
gamma = 1/2 # ratio between the length of green domain and red domain
p = 25
k = 500

seed = 10000  # identifies the watermark run in file names
dataset='housing'

data_dir = '../../../../datasets/boston_housing_prices'

# --- Clean reference data -------------------------------------------------
original_file = f'{data_dir}/HousingData.csv'
origin = pd.read_csv(original_file)

# Separate the features from the target column.
X = origin.drop(columns=['MEDV'])
y = origin['MEDV']

# Keep only the held-out 30% of the original data as the test set.
_, X_test, _, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Drop test rows whose features contain NaN, then keep the matching targets
# (label-based lookup on the surviving row index).
X_test = X_test.dropna()
y_test = y_test.loc[X_test.index]

proportions = [0.2, 0.4, 0.6, 0.8, 1.0]  # substituted into the altered-file name

# Loop-invariant: the path does not depend on `proportion`, so load it once
# instead of once per iteration. The result is not used by this cell.
# NOTE(review): allow_pickle=True unpickles the file and can execute arbitrary
# code -- only load trusted files; drop this line if no other cell reads
# `loaded_results`.
loaded_results = np.load(f"{data_dir}/watermarked/{dataset}-{seed}.npy", allow_pickle=True).item()

# --- Train on each altered dataset, score on the clean test split ---------
for proportion in proportions:
    watermarked_data = pd.read_csv(f"{data_dir}/alteration/{dataset}-{seed}-{proportion}-0.csv")

    # Separate the features from the target column.
    X = watermarked_data.drop(columns=['MEDV'])
    y = watermarked_data['MEDV']

    # Take the 70% training portion using the same test_size/random_state as
    # the original split.
    # NOTE(review): this only keeps the training rows disjoint from the test
    # rows if the altered file has the same number of rows, in the same order,
    # as the original file -- confirm.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.3, random_state=42)

    # Drop training rows whose features contain NaN, then keep the matching targets.
    X_train = X_train.dropna()
    y_train = y_train.loc[X_train.index]

    # Fit an ordinary least-squares linear regression model.
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Predict on the clean test split and report the mean squared error.
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    print(f"{proportion}: MSE is {mse:.4f}")







0.2: MSE is 28.5618
0.4: MSE is 34.4025
0.6: MSE is 29.5125
0.8: MSE is 54.3652
1.0: MSE is 67.2484
