In [12]:
import random
n = 400 # number of key cells
gamma = 1/2 # ratio between the length of green domain and red domain

# 使用random模块生成n个随机数种子
seeds = [random.randrange(4294967296) for _ in range(n)]  # 4294967296是2的32次方

# 将生成的种子保存到文件中
with open('tabularmark_seed.txt', 'w') as f:
    for seed in seeds:
        f.write(str(seed) + '\n')



In [13]:
import pandas as pd
import random

data = pd.read_csv("/home/zhengyihao/dataset/covertype/covtype_without_key.data")

if len(data) < n:
    raise ValueError("data中的记录数小于所请求的n个记录")

random_indices = random.sample(range(len(data)), n)

with open('tabularmark_index.txt', 'w') as findex:
    for index in random_indices:
        findex.write(str(index) + '\n')

print(f"{n}个随机索引已生成并保存到文件 'tabularmark_index.txt' 中。")

400个随机索引已生成并保存到文件 'tabularmark_index.txt' 中。


### 添加水印

In [30]:
# generate candidate set for every key cell
import pandas as pd
import random
import numpy as np

# 假设df是你的DataFrame，已包含Cover_Type属性列
# 假设Cover_Type允许的所有可能的类别值已被加载到cover_types列表中

# 加载索引列表
with open('tabularmark_index.txt', 'r') as f:
    indices = [int(line.strip()) for line in f.readlines()]

# 加载种子列表
with open('tabularmark_seed.txt', 'r') as f:
    seeds = [int(line.strip()) for line in f.readlines()]

df = data
cover_types = data['Cover_Type'].unique()
 
# 验证索引列表和种子列表的长度是否一致
if len(indices) != len(seeds):
    raise ValueError("索引文件和种子文件的长度不一致")

cover_types.sort()

# 开始替换Cover_Type值
saved_state = random.getstate()
for idx, seed in zip(indices, seeds):
    random.seed(seed)
    candidate_set = cover_types
     # 打乱cover_types的顺序
    shuffled_cover_types = list(cover_types)
    # print(shuffled_cover_types)
    random.shuffle(shuffled_cover_types)

    # 确保cover_types能被划分为两个相等大小的部分
    half_size = len(shuffled_cover_types) // 2

    # 划分成green_domain和red_domain
    green_domain = shuffled_cover_types[:half_size]
    red_domain = shuffled_cover_types[half_size:]
    # print(seed)
    # print(green_domain)

    perturb_value = random.choice(green_domain)

    # 将df中对应索引的Cover_Type属性替换为这个选定的值
    df.loc[idx, 'Cover_Type'] = perturb_value
    
df.to_csv('/home/zhengyihao/dataset/covertype/tabularmark_covertype.data.csv', index=False)



In [32]:
# 获取Cover_Type的值域（唯一值）
unique_values = df['Cover_Type'].unique()

# 获取每个值的计数
value_counts = df['Cover_Type'].value_counts()

# 打印结果
print("Cover_Type的值域: ", unique_values)
print("\n每个值的大小:")
print(value_counts)

Cover_Type的值域:  [5 2 1 3 4 7 6]

每个值的大小:
Cover_Type
1    2009
7    2003
3    2003
6    2000
5    2000
4    1997
2    1988
Name: count, dtype: int64


### 检测水印

In [31]:
import pandas as pd
import random
import math

# define hyperparameters
n = 400 # number of key cells
gamma = 1/2 # ratio between the length of green domain and red domain

green_cell = 0

file_path = '/home/zhengyihao/dataset/covertype/tabularmark_covertype.data.csv'
detected_data = pd.read_csv(file_path)

# 加载索引列表
with open('tabularmark_index.txt', 'r') as f:
    indices = [int(line.strip()) for line in f.readlines()]

# 加载种子列表
with open('tabularmark_seed.txt', 'r') as f:
    seeds = [int(line.strip()) for line in f.readlines()]

df = detected_data
cover_types = detected_data['Cover_Type'].unique()

cover_types.sort()

# 开始替换Cover_Type值
random.setstate(saved_state)
for idx, seed in zip(indices, seeds):
    random.seed(seed)
    candidate_set = cover_types
     # 打乱cover_types的顺序
    shuffled_cover_types = list(cover_types)
    # print(shuffled_cover_types)
    random.shuffle(shuffled_cover_types)

    # 确保cover_types能被划分为两个相等大小的部分
    half_size = len(shuffled_cover_types) // 2

    # 划分成green_domain和red_domain
    green_domain = shuffled_cover_types[:half_size]
    red_domain = shuffled_cover_types[half_size:]

    if df.loc[idx, 'Cover_Type'] in green_domain:
        green_cell += 1
    
print(green_cell)

# calculate z-score
z_score = (green_cell - n/2) / math.sqrt(n/4)

print("The z-score is ",z_score)

400
The z-score is  20.0
