In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer

class MLUtilityLinearRegression:
    """Measure how training on watermarked data affects linear-regression error.

    A ``LinearRegression`` model is fitted on a train split of the original
    dataset and scored (MSE) on the held-out test split. When a watermarked
    dataset is supplied, the model is refitted on a train split of that
    dataset and scored on the *same* original test split, so the two errors
    can be compared directly.
    """

    def __init__(self, dataset_path, label, watermarked_data_path=None, test_size=0.3, random_state=45):
        """Store the experiment configuration.

        Args:
            dataset_path: Path to the original dataset (``.csv`` or ``.npy``).
            label: Name of the target column to predict.
            watermarked_data_path: Optional path to the watermarked dataset.
            test_size: Value forwarded to ``train_test_split(test_size=...)``.
            random_state: Seed forwarded to ``train_test_split``.
        """
        self.dataset_path = dataset_path
        self.watermarked_data_path = watermarked_data_path
        self.test_size = test_size
        self.random_state = random_state
        self.label = label
        self.model = LinearRegression()

    def load_data(self, file_path):
        """Load a dataset from ``file_path`` based on its file extension.

        ``.csv`` files are read with ``pandas.read_csv``. ``.npy`` files are
        expected to hold a pickled dict whose ``'watermarked_data'`` entry is
        returned. The extension comparison is case-insensitive.

        Raises:
            ValueError: If the extension is neither ``.csv`` nor ``.npy``.
        """
        _, file_extension = os.path.splitext(file_path)
        file_extension = file_extension.lower()
        if file_extension == '.csv':
            return pd.read_csv(file_path)
        if file_extension == '.npy':
            # NOTE(security): allow_pickle=True unpickles arbitrary objects,
            # which can execute code. Only load files from a trusted source.
            loaded_results = np.load(file_path, allow_pickle=True).item()
            return loaded_results['watermarked_data']
        raise ValueError(
            f"Unsupported file extension {file_extension!r} for {file_path!r}; expected '.csv' or '.npy'"
        )

    def preprocess_data(self, data):
        """Split ``data`` into a feature matrix and target vector.

        Rows with a missing feature value are dropped, the remaining features
        are one-hot encoded with ``pandas.get_dummies``, and the target is
        restricted to the rows that survived.

        Returns:
            Tuple ``(X, y)``.
        """
        X = data.drop(columns=[self.label])
        y = data[self.label]
        X = X.dropna()
        X = pd.get_dummies(X)

        # Keep the target aligned with the rows kept in X.
        y = y.loc[X.index]
        return X, y

    def split_data(self, X, y):
        """Return ``train_test_split(X, y)`` using the configured size and seed."""
        return train_test_split(X, y, test_size=self.test_size, random_state=self.random_state)

    def train_model(self, X_train, y_train):
        """Fit the stored model on the given training data."""
        self.model.fit(X_train, y_train)

    def predict_and_evaluate(self, X_test, y_test):
        """Return the mean squared error of the stored model on the given test data."""
        y_pred = self.model.predict(X_test)
        return mean_squared_error(y_test, y_pred)

    def compute_mse(self):
        """Compute the test MSE for the original and (optionally) watermarked data.

        Both errors are measured on the test split of the ORIGINAL dataset.

        Returns:
            Tuple ``(mse_watermarked, mse_origin)``. ``mse_watermarked`` is
            ``None`` when no watermarked dataset path was configured, so the
            result can always be unpacked into two values.
        """
        origin_data = self.load_data(self.dataset_path)
        X_origin, y_origin = self.preprocess_data(origin_data)
        X_train, X_test, y_train, y_test = self.split_data(X_origin, y_origin)

        self.train_model(X_train, y_train)
        mse_origin = self.predict_and_evaluate(X_test, y_test)

        if not self.watermarked_data_path:
            # Still report the baseline instead of implicitly returning None.
            return None, mse_origin

        watermarked_data = self.load_data(self.watermarked_data_path)
        X_watermarked, y_watermarked = self.preprocess_data(watermarked_data)
        # Only the train part of the watermarked split is used; evaluation
        # stays on the original test split obtained above.
        X_train, _, y_train, _ = self.split_data(X_watermarked, y_watermarked)

        self.train_model(X_train, y_train)
        mse_watermarked = self.predict_and_evaluate(X_test, y_test)
        return mse_watermarked, mse_origin



In [None]:
from watermarking_schemes.B2Mark import B2MarkWatermarkEmbedding
import pandas as pd

seed_range = range(10000, 10020)
datasets = ["energy", "student"]  # trim this list to process a single dataset

# Arguments shared by every B2Mark run.
g = 2
secret_key = "4d3f2e1c"

# Per-dataset inputs: the source CSV and the constructor arguments that differ
# between datasets.
b2mark_settings = {
    "energy": (
        "dataset/ENB2012_data.csv",
        dict(k=15, columns_of_interest=['X1', 'X2'], watermarked_column='Y2'),
    ),
    "student": (
        "dataset/student-por.csv",
        dict(k=10, columns_of_interest=['age', 'Medu'], watermarked_column='G3'),
    ),
}

for dataset in datasets:
    if dataset not in b2mark_settings:
        # Dataset names without settings are skipped.
        continue
    original_data_path, scheme_kwargs = b2mark_settings[dataset]

    for seed in seed_range:
        # Embed the watermark for this seed and store the result as .npy.
        b2Mark_embedding = B2MarkWatermarkEmbedding(
            dataset=dataset,
            seed=seed,
            g=g,
            secret_key=secret_key,
            **scheme_kwargs,
        )
        b2Mark_embedding.load_data(original_data_path)
        b2Mark_embedding.generate_segments()
        b2Mark_embedding.process_data()
        b2Mark_embedding.save_results(f"dataset/B2Mark_dataset/B2Mark_{dataset}_{seed}.npy")
      

In [None]:
# HeMark
from watermarking_schemes.HeMark import HeMarkWatermarkEmbedding

seed_range = range(10000, 10020)
datasets = ["energy", "student"]

# Per-dataset inputs: (source CSV, column that receives the watermark).
hemark_settings = {
    "energy": ("dataset/ENB2012_data.csv", 'Y2'),
    "student": ("dataset/student-por.csv", 'G3'),
}

for dataset in datasets:
    if dataset not in hemark_settings:
        # Dataset names without settings are skipped.
        continue
    original_data_path, target_column = hemark_settings[dataset]

    for seed in seed_range:
        # Embed the watermark for this seed and store the result as .npy.
        HeMark_embedding = HeMarkWatermarkEmbedding(
            dataset=dataset,
            seed=seed,
            watermarked_column=target_column,
        )
        HeMark_embedding.load_data(original_data_path)
        HeMark_embedding.generate_segments()
        HeMark_embedding.process_data()
        HeMark_embedding.save_results(f"dataset/HeMark_dataset/HeMark_{dataset}_{seed}.npy")
   

In [None]:
# NgoMark
from watermarking_schemes.NgoMark import NgoMarkWatermarkEmbedding

seed_range = range(10000, 10020)
datasets = ["energy", "student"]  # trim this list to process a single dataset

# Per-dataset inputs: the source CSV and the constructor arguments that differ
# between datasets.
ngomark_settings = {
    "energy": ("dataset/ENB2012_data.csv", dict(b=25, watermarked_column='Y2')),
    "student": ("dataset/student-por.csv", dict(b=15, watermarked_column='G3')),
}

for dataset in datasets:
    if dataset not in ngomark_settings:
        # Dataset names without settings are skipped.
        continue
    original_data_path, scheme_kwargs = ngomark_settings[dataset]

    for seed in seed_range:
        # Embed the watermark for this seed and store the result as .npy.
        NgoMark_embedding = NgoMarkWatermarkEmbedding(dataset=dataset, seed=seed, **scheme_kwargs)
        NgoMark_embedding.load_data(original_data_path)
        NgoMark_embedding.generate_segments()
        NgoMark_embedding.process_data()
        NgoMark_embedding.save_results(f"dataset/NgoMark_dataset/NgoMark_{dataset}_{seed}.npy")
         

In [None]:
from watermarking_schemes.TabularMark import TabualrMarkWatermarkEmbedding

seed_range = range(10000, 10020)
datasets = ["energy", "student"]  # trim this list to process a single dataset

# Per-dataset inputs: the source CSV and the constructor arguments that differ
# between datasets.
tabularmark_settings = {
    "energy": (
        "dataset/ENB2012_data.csv",
        dict(n=int(500/2), p=18, k=60, watermarked_column='Y2'),
    ),
    "student": (
        "dataset/student-por.csv",
        dict(n=int(500/2), p=6, k=40, watermarked_column='G3'),
    ),
}

for dataset in datasets:
    if dataset not in tabularmark_settings:
        # Dataset names without settings are skipped.
        continue
    original_data_path, scheme_kwargs = tabularmark_settings[dataset]

    for seed in seed_range:
        # Embed the watermark for this seed and store the result as .npy.
        tabularMark_embedding = TabualrMarkWatermarkEmbedding(
            dataset=dataset,
            original_file=original_data_path,
            seed=seed,
            **scheme_kwargs,
        )
        tabularMark_embedding.apply_watermark()
        tabularMark_embedding.save_results(f"dataset/TabularMark_dataset/TabularMark_{dataset}_{seed}.npy")
        

align_nonintrusiveness

In [None]:
watermark_schemes = ['B2Mark', 'TabularMark', 'HeMark', 'NgoMark']
label = 'Y2'
dataset = "energy"
dataset_path = "dataset/ENB2012_data.csv"

for watermark_scheme in watermark_schemes:
    # Collected per-seed errors for this scheme.
    mses, mses_origin = [], []

    for seed in range(10000, 10020):
        watermarked_data_path = f"dataset/{watermark_scheme}_dataset/{watermark_scheme}_{dataset}_{seed}.npy"

        # Build the evaluator and compute both MSE values for this seed.
        measure_ml_util = MLUtilityLinearRegression(
            dataset_path, label, watermarked_data_path, random_state=seed
        )
        mse, mse_orgin = measure_ml_util.compute_mse()
        mses += [mse]
        mses_origin += [mse_orgin]

    # Report the averages over all seeds.
    print(f"Watermark scheme: {watermark_scheme}")
    print(f"Mean MSE: {np.mean(mses):.4f}")
    print(f"Mean MSE Origin: {np.mean(mses_origin):.4f}")
    
        

In [None]:
watermark_schemes = ['B2Mark', 'TabularMark', 'HeMark', 'NgoMark']
label = 'G3'
dataset = "student"
dataset_path = "dataset/student-por.csv"

for watermark_scheme in watermark_schemes:
    # Collected per-seed errors for this scheme.
    mses, mses_origin = [], []

    for seed in range(10000, 10020):
        watermarked_data_path = f"dataset/{watermark_scheme}_dataset/{watermark_scheme}_{dataset}_{seed}.npy"

        # Build the evaluator and compute both MSE values for this seed.
        measure_ml_util = MLUtilityLinearRegression(
            dataset_path, label, watermarked_data_path, random_state=seed
        )
        mse, mse_orgin = measure_ml_util.compute_mse()
        mses += [mse]
        mses_origin += [mse_orgin]

    # Report the averages over all seeds.
    print(f"Watermark scheme: {watermark_scheme}")
    print(f"Mean MSE: {np.mean(mses):.4f}")
    print(f"Mean MSE Origin: {np.mean(mses_origin):.4f}")
    
        