B2Mark

In [None]:
import os
import numpy as np
import pandas as pd
import math
import time
import hashlib

# --- B2Mark detection configuration -------------------------------------
# Modulus for the row-selection hash: the detection loop below only counts
# rows whose hash (under secret_key_2) is 0 modulo g.
g = 8
# Seed applied to NumPy's global random generator further down.
seed = 10000
# secret_key_1 keys the hash that assigns a row to a watermark bit position;
# secret_key_2 keys the hash that decides whether a row is counted at all.
secret_key_1 = "123"
secret_key_2 = "456"
# A bit is decoded as '1' when its z-score is strictly above this value.
threshold = 3
# Number of bit positions in the decoded watermark string.
watermark_information_length = 8

# Dataset label; not referenced again in the visible part of this cell.
dataset = 'covertype'

# Load the original (un-watermarked) table.
original_file = '../dataset/covtype_with_key.subset.data'
origin = pd.read_csv(original_file)

np.random.seed(seed)

# Sorted distinct values of the Cover_Type column of the original table.
cover_types = origin['Cover_Type'].unique()
cover_types.sort()

# Keyed hash reduced modulo a given value.
def hash_mod(key, mod_value, secret_key):
    """Return SHA-256(secret_key + key) interpreted as an integer, modulo mod_value.

    Both ``secret_key`` and ``key`` are formatted to text and concatenated
    (secret first) before hashing, so the result lies in
    ``range(mod_value)`` for a positive ``mod_value``.
    """
    message = "{}{}".format(secret_key, key).encode()
    digest = hashlib.sha256(message).digest()
    # A big-endian read of the raw digest equals int(hexdigest, 16).
    return int.from_bytes(digest, "big") % mod_value

# Columns whose leading digits are concatenated into the per-row hash key
# in the detection loop below.
columns_of_interest = ['Elevation', 'Aspect']
# Replace missing values in those columns with 0 (first_two_digits maps 0 to "00").
origin[columns_of_interest] = origin[columns_of_interest].fillna(0)

# Extract the first two significant characters of a value's text form.
def first_two_digits(x):
    """Return a two-character prefix derived from ``str(x)``.

    Zero maps to ``"00"``. Otherwise leading ``'0'`` and ``'.'`` characters
    are stripped, any remaining ``'.'`` is removed, and the first two
    characters are returned; a single remaining character is padded with a
    trailing ``"0"``.
    """
    if x == 0:
        return "00"
    stripped = str(x).lstrip('0.')
    significant = stripped.replace('.', '')
    # Only the exactly-one-character case is padded; longer text is truncated.
    return significant + "0" if len(significant) == 1 else significant[:2]

# Measure the average B2Mark detection time for increasing numbers of files.
file_counts = [16, 32, 48, 64, 80, 96, 112, 128]
timing_results = {}

data_path = "different_version_datasets/original"

for file_count in file_counts:
    print(f"Processing {file_count} files...")
    total_time = 0  # accumulated detection time for this batch

    # Keep only the watermarked covertype files *before* taking the first
    # file_count entries, so unrelated files in the directory cannot shrink
    # the batch that is actually timed.
    selected_files = [
        name for name in sorted(os.listdir(data_path))
        if name.startswith("covertype-")
    ][:file_count]

    for file_name in selected_files:
        # perf_counter is monotonic and intended for measuring intervals.
        start_time = time.perf_counter()

        # NOTE(review): allow_pickle=True unpickles arbitrary objects; only
        # load files from a trusted source.
        watermarked_data = np.load(f"{data_path}/{file_name}", allow_pickle=True).item()
        watermarked_data = watermarked_data['watermarked_data']

        watermarked_data[columns_of_interest] = watermarked_data[columns_of_interest].fillna(0)

        # Per bit position: green-cell hits, counted rows, and z-score.
        green_cells = np.zeros(watermark_information_length)
        n_cells = np.zeros(watermark_information_length)
        z_scores = np.zeros(watermark_information_length)

        # Row-by-row scan. This is the workload being timed, so it is
        # intentionally left un-vectorised.
        # Assumes the frame has a default 0..len-1 index — TODO confirm.
        for idx in range(len(watermarked_data)):
            selected_data = watermarked_data.loc[idx, columns_of_interest]
            first_two_digits_data = selected_data.apply(first_two_digits)
            composite_numbers = ''.join(first_two_digits_data.values)

            # Bit position this row contributes to.
            w_index = hash_mod(composite_numbers, watermark_information_length, secret_key_1)

            # Only rows whose keyed hash is 0 modulo g are counted.
            if hash_mod(composite_numbers, g, secret_key_2) == 0:
                n_cells[w_index] += 1
                # NOTE(review): green_domain is not defined anywhere in the
                # visible part of this cell; it must already exist in the
                # session or this line raises NameError — confirm.
                if watermarked_data.loc[idx, 'Cover_Type'] in green_domain:
                    green_cells[w_index] += 1

        # z-score per bit position; positions with no counted rows stay 0.
        for idx in range(watermark_information_length):
            if n_cells[idx] != 0:
                z_scores[idx] = (green_cells[idx] - n_cells[idx] / 2) / math.sqrt(n_cells[idx] / 4)
            else:
                z_scores[idx] = 0

        # Decode: a bit is '1' when its z-score exceeds the threshold.
        detected_watermark_information = ''.join(
            '1' if z > threshold else '0' for z in z_scores
        )

        total_time += time.perf_counter() - start_time

    # Average over the files that were actually processed, not the number
    # requested; guard against an empty batch.
    processed = len(selected_files)
    average_time = total_time / processed if processed else 0.0
    timing_results[file_count] = average_time
    print(f"Average time for {file_count} files: {average_time:.2f} seconds\n")

# Print the summary.
print("Timing results:")
for file_count, duration in timing_results.items():
    print(f"{file_count} files: {duration:.2f} seconds")


TabularMark

In [None]:
import os
import numpy as np
import pandas as pd
import math
import time

# --- TabularMark detection configuration --------------------------------
# First seed of the per-seed result files; the files are named
# "<dataset>-<seed>.npy" in the loading code below.
seed_start = 10000
seed_end = 10000 + 128  # 128 files in total
dataset = 'covertype'
# Detection stops at the first candidate whose z-score is strictly above this.
threshold = 3

# Load the original (un-watermarked) table.
original_file = '../dataset/covtype_with_key.subset.data'
origin = pd.read_csv(original_file)

# n is the sample size plugged into the z-score formula in the timing loop:
# one eighth of the original row count, truncated to an int.
n = int(len(origin) / 8)
# Not referenced in the visible part of this cell.
gamma = 1 / 2

# Columns that together identify a row when matching original rows to
# watermarked rows.
primary_key_cols = ['Elevation', 'Aspect']

# Binary search over a sorted sequence.
def binary_search(arr, key):
    """Return an index ``i`` with ``arr[i] == key``, or ``-1`` if there is none.

    ``arr`` must be sorted in ascending order. When ``key`` occurs more than
    once, the index returned is whichever occurrence the midpoint probing
    reaches first, not necessarily the leftmost.
    """
    lo = 0
    hi = len(arr) - 1
    while lo <= hi:
        probe = (lo + hi) // 2
        candidate = arr[probe]
        if candidate == key:
            return probe
        if candidate < key:
            lo = probe + 1  # key can only be to the right of the probe
        else:
            hi = probe - 1  # key can only be to the left of the probe
    return -1

# Match rows of the original table to rows of the watermarked table.
def match_tuples(origin_data, watermarked_data, indices):
    """Map each original row label in ``indices`` to a watermarked row label.

    A row is identified by its values in ``primary_key_cols``. The key tuples
    of ``watermarked_data`` are searched with ``binary_search``, so that frame
    must already be sorted by ``primary_key_cols``. The result has one entry
    per element of ``indices``: the matching label from
    ``watermarked_data.index``, or ``-1`` when no row has the same key.
    """
    sorted_keys = list(map(tuple, watermarked_data[primary_key_cols].values))
    wm_labels = watermarked_data.index

    def locate(row_label):
        # Key of the original row, then its position among the sorted keys.
        wanted = tuple(origin_data.loc[row_label, primary_key_cols])
        position = binary_search(sorted_keys, wanted)
        return wm_labels[position] if position != -1 else -1

    return [locate(row_label) for row_label in indices]

# Measure how long TabularMark detection takes as the number of candidate
# seed files grows.
file_counts = [16, 32, 48, 64, 80, 96, 112, 128]
timing_results = {}

# Number of randomly chosen suspect files timed per file_count. The same
# value is used for sampling and for averaging so the two cannot drift apart.
trials_per_count = 10

for file_count in file_counts:
    print(f"Processing {file_count} files...")

    # Offsets (relative to seed 10000) of the suspect files to time.
    random_files = np.random.choice(file_count, trials_per_count, replace=False)

    elapsed_total = 0.0
    for file in random_files:
        # perf_counter is monotonic and intended for measuring intervals.
        start_time = time.perf_counter()

        # NOTE(review): allow_pickle=True unpickles arbitrary objects; only
        # load files from a trusted source.
        watermarked_data = np.load(f"different_version_datasets/tabularmark/{dataset}-{file+10000}.npy", allow_pickle=True).item()
        watermarked_data = watermarked_data['watermarked_data']
        # match_tuples binary-searches the key columns, so sort by them first.
        watermarked_data = watermarked_data.sort_values(by=primary_key_cols).reset_index(drop=True)

        # Try each candidate seed until one yields a z-score above threshold.
        for seed in range(seed_start, seed_start + file_count):
            loaded_results = np.load(f"different_version_datasets/tabularmark/{dataset}-{seed}.npy", allow_pickle=True).item()

            divide_seeds = loaded_results['divide_seeds']
            indices = loaded_results['indices']

            # Left inside the timed loop on purpose so the measured workload
            # is unchanged.
            cover_types = watermarked_data['Cover_Type'].unique()
            cover_types.sort()

            match_indices = match_tuples(origin, watermarked_data, indices)
            green_cell = 0
            for idx, divide_seed in zip(match_indices, divide_seeds):
                # Re-seed the global generator, then shuffle: the first half
                # of the shuffled values is this cell's green domain.
                np.random.seed(divide_seed)
                shuffled_cover_types = list(cover_types)
                np.random.shuffle(shuffled_cover_types)

                half_size = len(shuffled_cover_types) // 2
                green_domain = shuffled_cover_types[:half_size]

                if idx != -1 and watermarked_data.loc[idx, 'Cover_Type'] in green_domain:
                    green_cell += 1

            z_score = (green_cell - n / 2) / math.sqrt(n / 4)
            if z_score > threshold:
                break

        elapsed_total += time.perf_counter() - start_time

    timing_results[file_count] = elapsed_total / trials_per_count
    print(f"Time taken for {file_count} files: {timing_results[file_count]:.2f} seconds\n")

# Print the summary.
print("Timing results:")
for file_count, duration in timing_results.items():
    print(f"{file_count} files: {duration:.2f} seconds")
