Generate watermarked data

In [None]:
import sys
import os
import string
import numpy as np

from watermarking_schemes.B2Mark import WatermarkEmbedding

# B2Mark

# Seeds that identify the individual runs of this experiment.
seed_range = range(10000, 10010)


def generate_secret_keys(n, length=6):
    """Return two lists of ``n`` pseudo-random alphanumeric secret keys.

    The global NumPy random generator is re-seeded with 99 on every call, so
    the same ``(n, length)`` always yields the same keys.

    Args:
        n: Number of keys to put in each of the two lists.
        length: Number of characters per key (6 by default).

    Returns:
        A tuple ``(secret_key_1s, secret_key_2s)`` of two lists of strings.
    """
    np.random.seed(99)  # fixed seed: the keys must be reproducible
    alphabet = list(string.ascii_letters + string.digits)

    def _draw_batch():
        # One key per iteration, each drawn with replacement from the alphabet.
        batch = []
        for _ in range(n):
            symbols = np.random.choice(alphabet, size=length)
            batch.append(''.join(symbols))
        return batch

    # The first batch must be drawn before the second to keep the key order.
    first_keys = _draw_batch()
    second_keys = _draw_batch()
    return first_keys, second_keys

g = 6  # forwarded unchanged to WatermarkEmbedding as its `g` argument

secret_key_1s, secret_key_2s = generate_secret_keys(30)

# Loop-invariant settings, hoisted out of the per-seed loop.
dataset = "covertype"
original_data_path = "dataset/covtype_with_key.subset.data"

for idx, seed in enumerate(seed_range):
    # Each run uses its own seed and its own key pair. Previously the seed
    # (10000) and the keys (index 0) were hard-coded, so all ten output files
    # were produced with identical parameters, while the B2Mark detection
    # cell of this notebook reads file `seed` with `seed=seed` and the
    # idx-th key pair.
    b2Mark_embedding = WatermarkEmbedding(
        dataset=dataset,
        watermark_information="1010110011",
        g=g,
        seed=seed,
        secret_key_1=secret_key_1s[idx],
        secret_key_2=secret_key_2s[idx],
        original_file=original_data_path,
    )
    b2Mark_embedding.load_dataset()
    b2Mark_embedding.get_quality_domains()
    b2Mark_embedding.apply_watermark()
    b2Mark_embedding.save_results(f"B2Mark_dataset/B2Mark_{dataset}_{seed}.npy")

In [None]:

import sys
import os
import string
import numpy as np
 
from watermarking_schemes.GAHSW import WatermarkEmbedding

dataset = "covertype"


def generate_secret_keys(n, length=6):
    """Build two reproducible lists of ``n`` random alphanumeric keys.

    Seeds the global NumPy generator with 99, then draws ``2 * n`` keys of
    ``length`` characters each; the first ``n`` form the first list and the
    remaining ``n`` form the second.

    Args:
        n: Number of keys per list.
        length: Characters per key (default 6).

    Returns:
        ``(secret_key_1s, secret_key_2s)``, a tuple of two lists of strings.
    """
    np.random.seed(99)  # fixed seed so every cell derives the same keys
    symbols = list(string.ascii_letters + string.digits)

    keys_a = []
    keys_b = []
    # Fill the first list completely before the second so the random stream
    # is consumed in the same order as the key lists are returned.
    for target in (keys_a, keys_b):
        for _ in range(n):
            target.append(''.join(np.random.choice(symbols, size=length)))
    return keys_a, keys_b

secret_key_1s, secret_key_2s = generate_secret_keys(30)

watermark_information = "1010110011"

# Loop-invariant input path, hoisted out of the per-seed loop.
original_file = 'dataset/covtype_with_key.subset.data'

for idx, seed in enumerate(range(10000, 10010)):
    # Use the per-run seed and key. Previously `seed=10000` and
    # `secret_key_1s[0]` were hard-coded, so every output file was generated
    # with the same parameters, whereas the GAHSW detection cell of this
    # notebook reads file `seed` with `seed=seed` and `Ks=secret_key_1s[idx]`.
    gahsw_embedding = WatermarkEmbedding(
        dataset=dataset,
        watermark_information=watermark_information,
        seed=seed,
        original_file=original_file,
        Ks=secret_key_1s[idx],
        N_g=len(watermark_information),
    )
    gahsw_embedding.load_dataset()
    gahsw_embedding.apply_watermark()
    gahsw_embedding.save_results(f"GAHSW_dataset/GAHSW_{dataset}_{seed}.npy")

In [None]:

import sys
import os
import string
import numpy as np

from watermarking_schemes.SCPW import WatermarkEmbedding

dataset = "covertype"


def generate_secret_keys(n, length=6):
    """Generate two deterministic lists of random alphanumeric keys.

    The global NumPy random state is reset to seed 99 first, so repeated
    calls with the same arguments return identical keys.

    Args:
        n: How many keys each list should contain.
        length: Length of every key in characters (default 6).

    Returns:
        Tuple ``(secret_key_1s, secret_key_2s)`` of two lists of strings.
    """
    np.random.seed(99)  # fixed seed for reproducible keys
    pool = list(string.ascii_letters + string.digits)

    # Draw all 2*n keys in a single pass; the first n belong to the first
    # list and the rest to the second, which matches drawing the two lists
    # one after the other.
    drawn = []
    for _ in range(n):
        drawn.append(''.join(np.random.choice(pool, size=length)))
    for _ in range(n):
        drawn.append(''.join(np.random.choice(pool, size=length)))

    split_at = len(drawn) // 2
    return drawn[:split_at], drawn[split_at:]

secret_key_1s, secret_key_2s = generate_secret_keys(30)

watermark_information = "1010110011"

# Loop-invariant input path, hoisted out of the per-seed loop.
original_file = "dataset/covtype_with_key.subset.data"

for idx, seed in enumerate(range(10000, 10010)):
    # Use the per-run seed and key. Previously `seed=10000` and
    # `secret_key_1s[1]` were hard-coded for every iteration, while the SCPW
    # detection cell of this notebook reads file `seed` with `seed=seed` and
    # `Ks=secret_key_1s[idx]`.
    scpw_embedding = WatermarkEmbedding(
        dataset=dataset,
        watermark_information=watermark_information,
        seed=seed,
        original_file=original_file,
        Ks=secret_key_1s[idx],
    )
    scpw_embedding.load_dataset()
    scpw_embedding.apply_watermark()
    scpw_embedding.save_results(f"SCPW_dataset/SCPW_{dataset}_{seed}.npy")

Align the non-intrusiveness (compare F1-scores of models trained on original vs. watermarked data)

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import xgboost as xgb

class MLUtilityXGBoost:
    """Measure ML utility as the weighted F1-score of an XGBoost classifier.

    A classifier is trained on a split of the original dataset and scored on
    the held-out part. If a watermarked dataset is supplied, a second model
    is trained on the watermarked training split and scored on the same
    held-out part of the original data.
    """

    def __init__(self, dataset_path, watermarked_data_path=None, test_size=0.3, random_state=42):
        """Store the configuration and create the classifier.

        Args:
            dataset_path: Path of the original dataset (CSV, or ``.npy``).
            watermarked_data_path: Optional path of the watermarked dataset.
            test_size: Fraction of rows held out for evaluation.
            random_state: Seed forwarded to ``train_test_split``.
        """
        self.dataset_path = dataset_path
        self.watermarked_data_path = watermarked_data_path
        self.test_size = test_size
        self.random_state = random_state
        self.model = xgb.XGBClassifier(n_estimators=30, max_depth=10, n_jobs=4)

    def load_data(self, file_path):
        """Load a dataset from ``file_path``.

        ``.npy`` files are expected to hold a pickled dict whose
        ``'watermarked_data'`` entry is the table; every other extension is
        read as CSV with pandas.
        """
        _, file_extension = os.path.splitext(file_path)
        if file_extension == '.npy':
            # NOTE: allow_pickle=True executes pickled code on load; only
            # open files produced by this project.
            loaded_results = np.load(file_path, allow_pickle=True).item()
            return loaded_results['watermarked_data']
        return pd.read_csv(file_path)

    def preprocess_data(self, data):
        """Split ``data`` into features and label-encoded ``Cover_Type``.

        Returns:
            ``(X, y)``: ``X`` is ``data`` without the ``Cover_Type`` column,
            ``y`` is that column passed through a freshly fitted
            ``LabelEncoder``.
        """
        X = data.drop(columns=['Cover_Type'])
        # NOTE(review): a new encoder is fitted on every call, so the
        # encodings of two datasets only agree when they contain the same
        # set of labels.
        y = LabelEncoder().fit_transform(data['Cover_Type'])
        return X, y

    def split_data(self, X, y):
        """Return ``train_test_split(X, y)`` using the configured size and seed."""
        return train_test_split(X, y, test_size=self.test_size, random_state=self.random_state)

    def train_model(self, X_train, y_train):
        """Fit the classifier on the given training data."""
        self.model.fit(X_train, y_train)

    def predict_and_evaluate(self, X_test, y_test):
        """Return the weighted F1-score of the current model on the test data."""
        y_pred = self.model.predict(X_test)
        return f1_score(y_test, y_pred, average="weighted")

    def compute_f1(self):
        """Compute F1-scores for the original and, if given, watermarked data.

        Returns:
            ``(f1_watermarked, f1_origin)``. ``f1_watermarked`` is ``None``
            when no ``watermarked_data_path`` was configured (previously this
            case raised ``UnboundLocalError``).
        """
        # Load the original data; the key column is not a feature.
        origin_data = self.load_data(self.dataset_path)
        origin_data = origin_data.drop(columns=['primary_key'])
        X_origin, y_origin = self.preprocess_data(origin_data)
        X_train, X_test, y_train, y_test = self.split_data(X_origin, y_origin)

        # Baseline: train and evaluate on the original data.
        self.train_model(X_train, y_train)
        f1_origin = self.predict_and_evaluate(X_test, y_test)

        # Stays None unless a watermarked dataset is evaluated below.
        f1_watermarked = None
        if self.watermarked_data_path:
            watermarked_data = self.load_data(self.watermarked_data_path)
            # Keep the same columns, in the same order, as the original data.
            watermarked_data = watermarked_data[origin_data.columns]
            X_watermarked, y_watermarked = self.preprocess_data(watermarked_data)
            X_train, _, y_train, _ = self.split_data(X_watermarked, y_watermarked)

            # Train on watermarked rows, evaluate on the ORIGINAL test split.
            self.train_model(X_train, y_train)
            f1_watermarked = self.predict_and_evaluate(X_test, y_test)

        return f1_watermarked, f1_origin
            


In [None]:
watermark_schemes = ['B2Mark', 'GAHSW', 'SCPW']

# For every scheme, average the F1-scores over the ten seeded datasets.
for watermark_scheme in watermark_schemes:
    dataset = "covertype"
    dataset_path = "dataset/covtype_with_key.subset.data"

    # One (f1_watermarked, f1_origin) pair per seed.
    per_seed_scores = []
    for seed in range(10000, 10010):
        watermarked_data_path = f"{watermark_scheme}_dataset/{watermark_scheme}_{dataset}_{seed}.npy"
        measure_ml_util = MLUtilityXGBoost(dataset_path, watermarked_data_path)
        per_seed_scores.append(measure_ml_util.compute_f1())

    f1_scores = [pair[0] for pair in per_seed_scores]
    f1_scores_origin = [pair[1] for pair in per_seed_scores]

    print(f"Watermark scheme: {watermark_scheme}")
    print(f"Mean F1: {np.mean(f1_scores):.4f}")
    print(f"Mean F1 Origin: {np.mean(f1_scores_origin):.4f}")
    
        

Attack and compute BER (bit error rate)

In [None]:
import os
import numpy as np
import pandas as pd

class AlterationNumericalAttack:
    """Alteration attack that overwrites one column of a watermarked dataset.

    A random subset of rows is selected and the perturbed attribute of those
    rows is replaced by integers drawn uniformly from 1 to 7. The attacked
    table is written back into the loaded result dict and saved with
    ``np.save``.
    """

    def __init__(self, watermarked_data_path, attack_proportions=None, dataset='covertype', p=3, perturbed_attribute = 'Cover_Type', random_seed=10000):
        """Load the watermarked data and seed the global NumPy generator.

        Args:
            watermarked_data_path: ``.npy`` file holding a pickled dict with a
                ``'watermarked_data'`` DataFrame.
            attack_proportions: Fractions of rows to attack; defaults to
                ``[0.2, 0.4, 0.6, 0.8, 1.0]``.
            dataset: Dataset name (stored, not used by the attack itself).
            p: Perturbation range. It only parameterises a legacy random draw
                whose result is discarded (see ``apply_attack``).
            perturbed_attribute: Name of the column to overwrite.
            random_seed: Seed for ``np.random.seed``.
        """
        self.watermarked_data_path = watermarked_data_path
        self.attack_proportions = attack_proportions if attack_proportions is not None else [0.2, 0.4, 0.6, 0.8, 1.0]
        self.p = p
        self.random_seed = random_seed
        self.perturbed_attribute = perturbed_attribute
        self.dataset = dataset
        np.random.seed(self.random_seed)

        # Load the watermarked data.
        # NOTE: allow_pickle=True executes pickled code on load; only open
        # files produced by this project.
        self.loaded_results = np.load(self.watermarked_data_path, allow_pickle=True).item()
        self.watermarked_data = self.loaded_results['watermarked_data']

    def apply_attack(self, proportion, save_path):
        """Attack ``proportion`` of the rows and save the result to ``save_path``.

        The attack always starts from the unmodified watermarked data; the
        attacked copy replaces ``self.loaded_results['watermarked_data']``.

        Raises:
            KeyError: If the perturbed attribute is not a column of the data.
        """
        temp = self.watermarked_data.copy()
        n_rows = len(temp)
        # Resolve the column up front so a misspelled attribute fails loudly
        # instead of silently creating a new column.
        column_pos = temp.columns.get_loc(self.perturbed_attribute)

        indices = np.random.choice(n_rows, size=int(proportion * n_rows), replace=False)
        # Legacy draw: its values were always overwritten by the categorical
        # draw below. It is kept, with the result discarded, so the random
        # stream (and therefore the attacked files) stays identical for a
        # given seed.
        np.random.uniform(-self.p, self.p, size=len(indices))
        perturb_choices = np.arange(1, 8)  # assumes Cover_Type values range from 1 to 7
        perturb_values = np.random.choice(perturb_choices, size=len(indices))

        if len(indices):
            # `indices` are row POSITIONS, so write through iloc; .loc would
            # treat them as index labels and break on a non-default index.
            temp.iloc[indices, column_pos] = perturb_values

        self.loaded_results['watermarked_data'] = temp
        np.save(save_path, self.loaded_results)

    def execute(self, save_path):
        """Run the attack for every configured proportion.

        All proportions are written to the same ``save_path``, so only the
        last one remains on disk; pass a single proportion per instance to
        keep each result.
        """
        for proportion in self.attack_proportions:
            self.apply_attack(proportion, save_path)



In [None]:

# Robustness test: create one attacked copy of every watermarked dataset for
# each (scheme, p, attack proportion, seed) combination.
attack_range = [3]
attack_proportions = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

seeds = range(10000, 10010)
dataset = "covertype"
watermark_schemes = ['B2Mark', 'GAHSW', 'SCPW']

for watermark_scheme in watermark_schemes:
    for p in attack_range:
        for attack_proportion in attack_proportions:
            for seed in seeds:
                watermarked_data_path = f"{watermark_scheme}_dataset/{watermark_scheme}_{dataset}_{seed}.npy"
                save_path = f"{watermark_scheme}_dataset/{watermark_scheme}_{dataset}_{seed}_{p}_{attack_proportion}.npy"
                # One attack object per output file; it is seeded with the
                # dataset's own seed and given a single proportion.
                attack = AlterationNumericalAttack(
                    watermarked_data_path,
                    attack_proportions=[attack_proportion],
                    dataset=dataset,
                    p=p,
                    random_seed=seed,
                )
                attack.execute(save_path)
        
        
        
    

In [None]:
import sys
import os
import string
import numpy as np
import json

from watermarking_schemes.B2Mark import WatermarkDetection

# Seeds that identify the individual runs of this experiment.
seed_range = range(10000, 10010)


def generate_secret_keys(n, length=6):
    """Randomly generate the key lists ``secret_key_1s`` and ``secret_key_2s``.

    Each list holds ``n`` alphanumeric strings of ``length`` characters
    (6 by default). The global NumPy generator is seeded with 99 first, so
    the result is the same on every call.

    Returns:
        Tuple ``(secret_key_1s, secret_key_2s)`` of two lists of strings.
    """
    np.random.seed(99)  # fixed random seed
    alphabet = list(string.ascii_letters + string.digits)

    def _draw_batch():
        # One key per iteration, each drawn with replacement from the alphabet.
        batch = []
        for _ in range(n):
            symbols = np.random.choice(alphabet, size=length)
            batch.append(''.join(symbols))
        return batch

    # The first batch must be drawn before the second to keep the key order.
    first_keys = _draw_batch()
    second_keys = _draw_batch()
    return first_keys, second_keys

g = 6

secret_key_1s, secret_key_2s = generate_secret_keys(30)

attack_range = [3]
attack_proportions = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

original_data_path = "dataset/covtype_with_key.subset.data"

watermark_information="1010110011"
dataset = "covertype"

# The reference bits are the same for every file, so build them once.
true_bits = np.array(list(map(int, watermark_information)))

for p in attack_range:
    for attack_proportion in attack_proportions:
        BERs = []
        for idx, seed in enumerate(seed_range):
            file_path = f"B2Mark_dataset/B2Mark_{dataset}_{seed}_{p}_{attack_proportion}.npy"
            b2Mark_detection = WatermarkDetection(
                dataset=dataset,
                seed=seed,
                g=g,
                secret_key_1=secret_key_1s[idx],
                secret_key_2=secret_key_2s[idx],
                watermark_information="1010110011",
                threshold=3,
            )
            detected_watermark = b2Mark_detection.run_detection(file_path)
            z_scores = b2Mark_detection.get_z_scores()

            # Bit error rate of the detected watermark: the fraction of
            # positions that differ from the embedded bit string.
            detected_bits = np.array(list(map(int, detected_watermark)))
            mismatches = np.sum(detected_bits != true_bits)
            ber = mismatches / len(true_bits)
            BERs.append(ber)

        # Mean BER over all seeds for this attack proportion.
        print(f"p: {attack_proportion}, BER: {np.mean(BERs):.4f}")



In [None]:
import sys
import os
import string
import numpy as np
import json

from watermarking_schemes.GAHSW import WatermarkDetection

# Seeds that identify the individual runs of this experiment.
seed_range = range(10000, 10010)


def generate_secret_keys(n, length=6):
    """Randomly generate the key lists ``secret_key_1s`` and ``secret_key_2s``.

    Both lists contain ``n`` alphanumeric strings of ``length`` characters
    (6 by default). Seeding the global NumPy generator with 99 makes the
    keys reproducible.

    Returns:
        Tuple ``(secret_key_1s, secret_key_2s)`` of two lists of strings.
    """
    np.random.seed(99)  # fixed random seed
    symbols = list(string.ascii_letters + string.digits)

    keys_a = []
    keys_b = []
    # Fill the first list completely before the second so the random stream
    # is consumed in the same order as the key lists are returned.
    for target in (keys_a, keys_b):
        for _ in range(n):
            target.append(''.join(np.random.choice(symbols, size=length)))
    return keys_a, keys_b

g = 2

secret_key_1s, secret_key_2s = generate_secret_keys(30)

attack_range = [3]
# attack_proportions = [0.2, 0.4, 0.6, 0.8, 1.0]
attack_proportions = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]


original_data_path = "dataset/covtype_with_key.subset.data"

watermark_information="1010110011"
dataset = "covertype"

# The reference bits are the same for every file, so build them once.
true_bits = np.array(list(map(int, watermark_information)))

for p in attack_range:
    for attack_proportion in attack_proportions:
        BERs = []
        for idx, seed in enumerate(seed_range):
            file_path = f"GAHSW_dataset/GAHSW_{dataset}_{seed}_{p}_{attack_proportion}.npy"
            gahsw_detection = WatermarkDetection(
                dataset=dataset,
                watermark_information="1010110011",
                seed=seed,
                Ks=secret_key_1s[idx],
            )
            detected_watermark = gahsw_detection.run_detection(file_path)

            # Bit error rate: the fraction of detected bits that differ from
            # the embedded bit string.
            detected_bits = np.array(list(map(int, detected_watermark)))
            mismatches = np.sum(detected_bits != true_bits)
            ber = mismatches / len(true_bits)
            BERs.append(ber)

        # Mean BER over all seeds for this attack proportion.
        print(f"p: {attack_proportion}, BER: {np.mean(BERs):.4f}")



In [None]:
import sys
import os
import string
import numpy as np
import json

from watermarking_schemes.SCPW import WatermarkDetection


# Seeds that identify the individual runs of this experiment.
seed_range = range(10000, 10010)


def generate_secret_keys(n, length=6):
    """Randomly generate the key lists ``secret_key_1s`` and ``secret_key_2s``.

    Every key is an alphanumeric string of ``length`` characters (6 by
    default) and each list holds ``n`` of them. The global NumPy generator
    is seeded with 99 so the keys are identical on every call.

    Returns:
        Tuple ``(secret_key_1s, secret_key_2s)`` of two lists of strings.
    """
    np.random.seed(99)  # fixed random seed
    pool = list(string.ascii_letters + string.digits)

    # Draw all 2*n keys in a single pass; the first n belong to the first
    # list and the rest to the second, which matches drawing the two lists
    # one after the other.
    drawn = []
    for _ in range(n):
        drawn.append(''.join(np.random.choice(pool, size=length)))
    for _ in range(n):
        drawn.append(''.join(np.random.choice(pool, size=length)))

    split_at = len(drawn) // 2
    return drawn[:split_at], drawn[split_at:]


secret_key_1s, secret_key_2s = generate_secret_keys(30)

attack_range = [3]
# attack_proportions = [0.2, 0.4, 0.6, 0.8, 1.0]
attack_proportions = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

original_data_path = "dataset/covtype_with_key.subset.data"

watermark_information="1010110011"
dataset = "covertype"

# The reference bits are the same for every file, so build them once.
true_bits = np.array(list(map(int, watermark_information)))

for p in attack_range:
    for attack_proportion in attack_proportions:
        BERs = []
        for idx, seed in enumerate(seed_range):
            file_path = f"SCPW_dataset/SCPW_{dataset}_{seed}_{p}_{attack_proportion}.npy"
            scpw_detection = WatermarkDetection(
                dataset=dataset,
                watermark_information="1010110011",
                seed=seed,
                Ks=secret_key_1s[idx],
            )
            detected_watermark = scpw_detection.run_detection(file_path)

            # Bit error rate: the fraction of detected bits that differ from
            # the embedded bit string.
            detected_bits = np.array(list(map(int, detected_watermark)))
            mismatches = np.sum(detected_bits != true_bits)
            ber = mismatches / len(true_bits)
            BERs.append(ber)

        # Mean BER over all seeds for this attack proportion.
        print(f"p: {attack_proportion}, BER: {np.mean(BERs):.4f}")

