B2Mark

In [None]:
import os
import torch
import argparse
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Fixed seed so the alteration attack below draws the same rows/values on every run.
seed = 111
np.random.seed(seed)

# Fractions of rows whose 'Cover_Type' gets overwritten in each attacked copy.
attack_proportion = [0.2, 0.4, 0.6, 0.8, 1.0]
dataset = "covertype"

# The sorted distinct labels of the original data are the pool of replacement values.
origin = pd.read_csv("../dataset/covtype_with_key.subset.data")
cover_types = origin['Cover_Type'].unique()
cover_types.sort()

data_path = "different_version_datasets/original"

# to_csv does not create missing directories, so make the output folders up front.
for proportion in attack_proportion:
    os.makedirs(f"different_version_datasets/{proportion}attack", exist_ok=True)

# os.listdir returns entries in arbitrary order; sorting keeps the sequence of
# random draws (and therefore every attacked file) the same on every machine.
for file_name in sorted(os.listdir(data_path)):
    if not file_name.startswith("covertype-"):
        continue

    # Parse the number after the last "-" in the file name (not used further
    # by this attack loop, but a non-numeric suffix fails loudly here).
    base_name, ext = os.path.splitext(file_name)
    leble_decimal_number = int(base_name.split("-")[-1])

    # NOTE: allow_pickle=True unpickles arbitrary objects; only load trusted files.
    loaded_results = np.load(f"{data_path}/{file_name}", allow_pickle=True).item()
    watermarked_data = loaded_results['watermarked_data']

    for proportion in attack_proportion:
        temp = watermarked_data.copy()
        n_attacked = int(proportion * len(temp))
        indices = np.random.choice(len(temp), size=n_attacked, replace=False)
        perturb_values = np.random.choice(cover_types, size=n_attacked)
        # `indices` are row positions, so assign positionally; label-based .loc
        # only happens to work when the frame carries a default RangeIndex.
        temp.iloc[indices, temp.columns.get_loc('Cover_Type')] = perturb_values
        temp.to_csv(f"different_version_datasets/{proportion}attack/{file_name}", index=False)



In [20]:
# detect
import os
import torch
import argparse
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math
import hashlib


# Detection parameters.
# g: a row is counted only when its keyed hash modulo g equals 0.
g = 8
seed = 10000
# Secret keys mixed into the two keyed hashes (bit index / row selection).
# NOTE(review): keys are hard-coded in the notebook; presumably they must match
# the values used at embedding time -- confirm against the embedding code.
secret_key_1 = "123"
secret_key_2 = "456"
# A bit is decoded as '1' when its z-score is strictly above this threshold.
threshold = 3
# Number of watermark bits decoded per file.
watermark_information_length = 8

dataset='covertype'


# Original (un-watermarked) data; its labels define the green/red split below.
original_file = '../dataset/covtype_with_key.subset.data'
origin = pd.read_csv(original_file)

np.random.seed(seed)

# Sorted array of the distinct 'Cover_Type' values.
cover_types = origin['Cover_Type'].unique()
cover_types.sort()

def hash_mod(key, mod_value, secret_key):
    """Map ``key`` to a bucket in ``range(mod_value)`` with a keyed SHA-256 hash.

    The secret key is prepended to the formatted ``key``, the resulting text is
    encoded and hashed with SHA-256, and the digest (read as one big-endian
    integer) is reduced modulo ``mod_value``.

    Args:
        key: Value to hash; formatted into the message as text.
        mod_value: Modulus applied to the digest.
        secret_key: Secret prefix mixed into the hashed message.

    Returns:
        int: The digest modulo ``mod_value``.
    """
    message = "{}{}".format(secret_key, key).encode()
    digest = hashlib.sha256(message).digest()
    return int.from_bytes(digest, "big") % mod_value

# Columns whose leading digits are combined into the per-row key that is hashed during detection.
columns_of_interest = ['Elevation', 'Aspect']  # adjust as needed
origin[columns_of_interest] = origin[columns_of_interest].fillna(0)  # replace missing values with 0

# Helper that extracts the first two leading (non-zero-prefixed) digits of a value.
def first_two_digits(x):
    """Return a two-character digit prefix of ``x`` as a string.

    ``0`` maps to ``"00"``. Otherwise leading ``'0'`` and ``'.'`` characters are
    stripped from ``str(x)``, any remaining decimal points are removed, and the
    first two characters of what is left are returned. A single remaining
    character is right-padded with ``"0"``. Only ``'0'`` and ``'.'`` are
    stripped, so other leading characters (e.g. a minus sign) are kept.
    """
    if x == 0:
        return "00"
    text = str(x).lstrip('0.')
    text = text.replace('.', '')
    if len(text) != 1:
        return text[:2]
    # Exactly one character left: pad to two.
    return text + "0"

# Sorted distinct labels of the original data.
cover_types = np.sort(origin['Cover_Type'].unique())

# Seeded shuffle of the sorted labels; the first half becomes the "green"
# domain and the remainder the "red" domain.
np.random.seed(seed)
shuffled_cover_types = sorted(cover_types)
np.random.shuffle(shuffled_cover_types)
half_size = len(shuffled_cover_types) // 2
green_domain, red_domain = shuffled_cover_types[:half_size], shuffled_cover_types[half_size:]


# Robustness evaluation: for every attack strength, decode the watermark bits
# from each attacked file and compare them with the number in the file name.
attack_proportion = [0.2, 0.4, 0.6, 0.8, 1.0]
for proportion in attack_proportion:
    matches = []
    data_path = f"different_version_datasets/{proportion}attack"
    for file_name in os.listdir(data_path):
        if not file_name.startswith("covertype-"):
            continue

        # The integer after the last "-" is the expected decoded value.
        base_name, ext = os.path.splitext(file_name)
        leble_decimal_number = int(base_name.split("-")[-1])
        watermarked_data = pd.read_csv(f"{data_path}/{file_name}")

        # Replace missing key values with 0, as done for the original data.
        watermarked_data[columns_of_interest] = watermarked_data[columns_of_interest].fillna(0)

        # Per-bit counters: counted rows, counted rows with a green label, z-score.
        green_cells = np.zeros(watermark_information_length)
        n_cells = np.zeros(watermark_information_length)
        z_scores = np.zeros(watermark_information_length)

        for idx in range(len(watermarked_data)):
            # Row key: concatenated two-digit prefixes of the key columns.
            selected_data = watermarked_data.loc[idx, columns_of_interest]
            first_two_digits_data = selected_data.apply(first_two_digits)
            composite_numbers = ''.join(first_two_digits_data.values)

            # Only rows whose second keyed hash is 0 modulo g are counted, so
            # skip the bit-index hash for all other rows.
            if hash_mod(composite_numbers, g, secret_key_2) != 0:
                continue

            w_index = hash_mod(composite_numbers, watermark_information_length, secret_key_1)
            n_cells[w_index] += 1
            if watermarked_data.loc[idx, 'Cover_Type'] in green_domain:
                green_cells[w_index] += 1

        # z-score of the green count against an expected n/2 with variance n/4;
        # bits without any counted row keep their initial score of 0.
        for idx in range(watermark_information_length):
            if n_cells[idx] != 0:
                z_scores[idx] = (green_cells[idx] - n_cells[idx] / 2) / math.sqrt(n_cells[idx] / 4)

        # A bit is '1' when its z-score is strictly above the threshold.
        detected_watermark_information = ''.join(
            '1' if score > threshold else '0' for score in z_scores
        )

        matches.append(int(int(detected_watermark_information, 2) == leble_decimal_number))

    if matches:
        print(f"Proportion: {proportion}, Accuracy: {sum(matches) / len(matches)}")
    else:
        # Previously this case raised ZeroDivisionError.
        print(f"Proportion: {proportion}, Accuracy: n/a (no 'covertype-' files in {data_path})")
        
        





Proportion: 0.2, Accuracy: 1.0
Proportion: 0.4, Accuracy: 1.0
Proportion: 0.6, Accuracy: 0.5078125
Proportion: 0.8, Accuracy: 0.0
Proportion: 1.0, Accuracy: 0.0


TabularMark

In [None]:
# match detect
import os
import numpy as np
import pandas as pd
import math
from datetime import datetime
import argparse

class WatermarkDetection:
    """Detect a watermark by counting "green" labels at the recorded key rows.

    The rows that carry the watermark (``indices``) and the seed used to split
    the label domain for each of them (``divide_seeds``) are read from a saved
    results dictionary. Rows of the original data are matched to the suspect
    data on ``primary_key_cols`` and the green hits are turned into a z-score.
    """

    def __init__(self, origin_file, watermarked_file, results_file, n=1400, gamma=1/2, seed=10000, dataset='covertype', primary_key_cols=None):
        """Load the data sets and the embedding metadata.

        Args:
            origin_file: Path of the original data (CSV, or ``.npy`` dict).
            watermarked_file: Path of the data to test (CSV, or ``.npy`` dict).
            results_file: ``.npy`` file holding a pickled dict with the keys
                ``'divide_seeds'`` and ``'indices'``.
            n: Count used as the sample size in the z-score formula.
            gamma: Stored on the instance; not used by the methods of this class.
            seed: Seed applied to NumPy's global RNG during construction.
            dataset: Stored on the instance; not used by the methods of this class.
            primary_key_cols: Columns used to match rows; defaults to
                ``['Elevation', 'Aspect']`` when falsy.
        """
        self.origin_data = self.load_data(origin_file)
        # NOTE: allow_pickle=True unpickles arbitrary objects; only load trusted files.
        self.results = np.load(results_file, allow_pickle=True).item()
        self.watermarked_data = self.load_data(watermarked_file)

        if primary_key_cols:
            self.primary_key_cols = primary_key_cols
        else:
            self.primary_key_cols = ['Elevation', 'Aspect']

        self.n = n
        self.gamma = gamma
        self.seed = seed
        self.dataset = dataset

        # Embedding metadata stored alongside the watermarked data.
        self.divide_seeds = self.results['divide_seeds']
        self.indices = self.results['indices']

        # Seed the global NumPy RNG.
        np.random.seed(self.seed)

        # Sorted distinct labels found in the data under test.
        self.cover_types = self.watermarked_data['Cover_Type'].unique()
        self.cover_types.sort()

    def load_data(self, file_path):
        """Load a data set from ``file_path``.

        A ``.npy`` file is read as a pickled dict and its ``'watermarked_data'``
        entry is returned; any other extension is read with ``pd.read_csv``.
        """
        extension = os.path.splitext(file_path)[1]
        if extension != '.npy':
            return pd.read_csv(file_path)
        payload = np.load(file_path, allow_pickle=True).item()
        return payload['watermarked_data']

    def binary_search(self, arr, key):
        """Return an index of ``key`` in the ascending-sorted ``arr``, or -1 if absent."""
        lo = 0
        hi = len(arr) - 1
        while lo <= hi:
            mid = (lo + hi) // 2
            candidate = arr[mid]
            if candidate == key:
                return mid
            if candidate < key:
                lo = mid + 1
            else:
                hi = mid - 1
        return -1

    def match_tuples(self, origin_data, watermarked_data, indices):
        """Find, for each original row in ``indices``, its row in ``watermarked_data``.

        Rows are compared on ``self.primary_key_cols``. The lookup is a binary
        search, so ``watermarked_data`` has to be sorted on those columns
        already (``detect_watermark`` sorts before calling this).

        Returns:
            list: One entry per element of ``indices``: the index label of the
            matching watermarked row, or -1 when no row has the same key.
        """
        watermarked_keys = list(map(tuple, watermarked_data[self.primary_key_cols].values))

        matched = []
        for idx in indices:
            wanted = tuple(origin_data.loc[idx, self.primary_key_cols])
            position = self.binary_search(watermarked_keys, wanted)
            matched.append(-1 if position == -1 else watermarked_data.index[position])
        return matched

    def detect_watermark(self):
        """Compute the detection z-score for the loaded data.

        The data under test is sorted on the key columns and re-indexed (this
        replaces ``self.watermarked_data``). For each recorded row that is
        still present, the label domain is shuffled with that row's seed, the
        first half is taken as the green domain, and a hit is counted when the
        row's ``'Cover_Type'`` lies in it.

        Returns:
            float: ``(green - n/2) / sqrt(n/4)`` with ``n = self.n``.
        """
        self.watermarked_data = (
            self.watermarked_data
            .sort_values(by=self.primary_key_cols)
            .reset_index(drop=True)
        )

        # Locate every recorded original row inside the sorted data under test.
        match_indices = self.match_tuples(self.origin_data, self.watermarked_data, self.indices)

        green_cell = 0
        for _, match_idx, divide_seed in zip(self.indices, match_indices, self.divide_seeds):
            if match_idx == -1:
                # Key row no longer present in the data under test.
                continue

            # Rebuild this row's green domain from its seed.
            np.random.seed(divide_seed)
            shuffled_cover_types = list(self.cover_types)
            np.random.shuffle(shuffled_cover_types)
            green_domain = shuffled_cover_types[:len(shuffled_cover_types) // 2]

            label = self.watermarked_data.loc[match_idx, 'Cover_Type']
            if label in green_domain:
                green_cell += 1

        return (green_cell - self.n / 2) / math.sqrt(self.n / 4)

    


In [None]:
# Attack
import os
import numpy as np
import pandas as pd

class AlterationNumericalAttack:
    """Alteration attack: overwrite a random share of one column with random labels.

    The watermarked data is read from a ``.npy`` file holding a pickled dict
    with a ``'watermarked_data'`` DataFrame. The attacked copy is saved in the
    same dict layout, so the other entries of the dict are carried over.
    """

    def __init__(self, watermarked_data_path, attack_proportions=None, dataset='covertype', perturbed_attribute = 'Cover_Type', random_seed=10000):
        """Load the watermarked data and store the attack settings.

        Args:
            watermarked_data_path: Path of the ``.npy`` results file to attack.
            attack_proportions: Fractions of rows to alter; defaults to
                ``[0.2, 0.4, 0.6, 0.8, 1.0]``.
            dataset: Stored on the instance; not used by the methods of this class.
            perturbed_attribute: Column whose values are overwritten.
            random_seed: Seed applied to NumPy's global RNG during construction.
        """
        self.watermarked_data_path = watermarked_data_path
        self.attack_proportions = attack_proportions if attack_proportions is not None else [0.2, 0.4, 0.6, 0.8, 1.0]
        self.random_seed = random_seed
        self.perturbed_attribute = perturbed_attribute
        self.dataset = dataset
        np.random.seed(self.random_seed)

        # Load the watermarked data.
        # NOTE: allow_pickle=True unpickles arbitrary objects; only load trusted files.
        self.loaded_results = np.load(self.watermarked_data_path, allow_pickle=True).item()
        self.watermarked_data = self.loaded_results['watermarked_data']

    def apply_attack(self, proportion, save_path):
        """Alter ``proportion`` of the rows and save the attacked results dict.

        Args:
            proportion: Fraction of rows (0..1) whose attribute is overwritten.
            save_path: Destination passed to ``np.save``.
        """
        # NOTE(review): the RNG is re-seeded with a fixed 111 here, so the
        # constructor's ``random_seed`` does not influence which rows or values
        # are drawn -- confirm this is intended before changing it.
        np.random.seed(111)
        temp = self.watermarked_data.copy()
        indices = np.random.choice(len(temp), size=int(proportion * len(temp)), replace=False)
        perturb_choices = np.arange(1, 8)  # assumes the label values range from 1 to 7
        perturb_values = np.random.choice(perturb_choices, size=len(indices))

        # `indices` are row positions, so write positionally; label-based .loc
        # breaks when the frame does not carry a default RangeIndex.
        column_position = temp.columns.get_loc(self.perturbed_attribute)
        temp.iloc[indices, column_position] = perturb_values

        # Save a shallow copy so the loaded results keep pointing at the
        # un-attacked data instead of being overwritten by each attack.
        attacked_results = dict(self.loaded_results)
        attacked_results['watermarked_data'] = temp
        np.save(save_path, attacked_results)

    def execute(self, save_path):
        """Run ``apply_attack`` for every configured proportion.

        NOTE(review): every proportion is written to the same ``save_path``, so
        with more than one proportion only the last result remains on disk.
        """
        for proportion in self.attack_proportions:
            self.apply_attack(proportion, save_path)



In [None]:

# Robustness test: write one attacked copy per (attack proportion, seed) pair.
attack_range = [3]
attack_proportions = [0.2, 0.4, 0.6, 0.8, 1.0]

seeds = range(10000, 10128)
dataset = "covertype"

for attack_proportion in attack_proportions:
    for seed in seeds:
        watermarked_data_path = f"dataset/watermarked/covertype-{seed}.npy"
        save_path = f"dataset/watermarked/{dataset}_{seed}_{attack_proportion}.npy"
        # A single proportion per attacker, so each run gets its own save_path.
        attacker = AlterationNumericalAttack(
            watermarked_data_path,
            attack_proportions=[attack_proportion],
            dataset=dataset,
            random_seed=seed,
        )
        attacker.execute(save_path)
        
        
        
    

In [None]:
import numpy as np

# Identification test: every attacked file is checked against the embedding
# metadata of the candidate seeds (in shuffled order) until one exceeds the
# threshold; a hit is counted only when that candidate is the file's own seed.
detected_seeds = range(10000, 10128)
seeds = list(range(10000, 10128))
np.random.shuffle(seeds)

threshold = 4

matches = 0

attack_proportions = [0.2, 0.4, 0.6, 0.8, 1.0]

dataset = 'covertype'
# NOTE(review): this path starts with "datasets/" while the watermarked files
# are read from "dataset/" -- confirm the directory name.
origin_file = "datasets/covtype_with_key.subset.data"

for attack_proportion in attack_proportions:
    matches = 0
    for detected_seed in detected_seeds:
        watermarked_file = f"dataset/watermarked/covertype_{detected_seed}_{attack_proportion}.npy"
        for seed in seeds:
            results_file = f"dataset/watermarked/covertype_{seed}_{attack_proportion}.npy"

            watermark_detection = WatermarkDetection(
                origin_file,
                watermarked_file,
                results_file,
                n=int(14000/10),
                gamma=1/2,
                seed=seed,
                dataset=dataset,
            )
            z_score = watermark_detection.detect_watermark()
            if z_score > threshold:
                # Stop at the first candidate above the threshold, right or wrong.
                matches += int(detected_seed == seed)
                break

    print(f"{attack_proportion}", matches / len(detected_seeds))
        

        
        