In [1]:
# define hyperparameters
# p = 2 # size of candidate set
n = 400 # number of key cells
gamma = 1/2 # ratio between the length of green domain and red domain

In [4]:
# show info of dataset
import pandas as pd

# 数据集文件路径
file_path = '/home/zhengyihao/dataset/covertype/covtype.data'

# 根据所提供的信息，创建一个包含所有列名的列表
column_names = [
    'Elevation', 'Aspect', 'Slope',
    'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon',
    'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points'
]

# 添加Wilderness_Area的4个二进制列
wilderness_areas = ['Wilderness_Area' + str(i) for i in range(1, 5)]
column_names += wilderness_areas

# 添加Soil_Type的40个二进制列
soil_types = ['Soil_Type' + str(i) for i in range(1, 41)]
column_names += soil_types

# 添加Cover_Type列
column_names.append('Cover_Type')

# 使用列名列表读取数据
data = pd.read_csv(file_path, names=column_names)

# 获取Cover_Type的值域（唯一值）
unique_values = data['Cover_Type'].unique()

# 获取每个值的计数
value_counts = data['Cover_Type'].value_counts()

# 打印结果
print("Cover_Type的值域: ", unique_values)
print("\n每个值的大小:")
print(value_counts)

Cover_Type的值域:  [5 2 1 7 3 6 4]

每个值的大小:
Cover_Type
2    283301
1    211840
3     35754
7     20510
6     17367
5      9493
4      2747
Name: count, dtype: int64


In [5]:
import random

# 使用random模块生成n个随机数种子
seeds = [random.randrange(4294967296) for _ in range(n)]  # 4294967296是2的32次方

# 将生成的种子保存到文件中
with open('tabularmark_seed.txt', 'w') as f:
    for seed in seeds:
        f.write(str(seed) + '\n')



In [6]:
import pandas as pd
import random

if len(data) < n:
    raise ValueError("data中的记录数小于所请求的n个记录")

random_indices = random.sample(range(len(data)), n)

with open('tabularmark_index.txt', 'w') as findex:
    for index in random_indices:
        findex.write(str(index) + '\n')

print(f"{n}个随机索引已生成并保存到文件 'tabularmark_index.txt' 中。")

400个随机索引已生成并保存到文件 'tabularmark_index.txt' 中。


### 添加水印

In [7]:
# generate candidate set for every key cell
import pandas as pd
import random

# 假设df是你的DataFrame，已包含Cover_Type属性列
# 假设Cover_Type允许的所有可能的类别值已被加载到cover_types列表中

# 加载索引列表
with open('tabularmark_index.txt', 'r') as f:
    indices = [int(line.strip()) for line in f.readlines()]

# 加载种子列表
with open('tabularmark_seed.txt', 'r') as f:
    seeds = [int(line.strip()) for line in f.readlines()]

df = data
cover_types = data['Cover_Type'].unique()
 
# 验证索引列表和种子列表的长度是否一致
if len(indices) != len(seeds):
    raise ValueError("索引文件和种子文件的长度不一致")

# 开始替换Cover_Type值
for idx, seed in zip(indices, seeds):
    random.seed(seed)
    candidate_set = cover_types
     # 打乱cover_types的顺序
    shuffled_cover_types = list(cover_types)
    random.shuffle(shuffled_cover_types)

    # 确保cover_types能被划分为两个相等大小的部分
    half_size = len(shuffled_cover_types) // 2

    # 划分成green_domain和red_domain
    green_domain = shuffled_cover_types[:half_size]
    red_domain = shuffled_cover_types[half_size:]

    perturb_value = random.choice(green_domain)

    # 将df中对应索引的Cover_Type属性替换为这个选定的值
    df.loc[idx, 'Cover_Type'] = perturb_value
    
df.to_csv('/home/zhengyihao/dataset/covertype/tabularmark_covertype.data.csv', index=False)



### 检测水印

In [2]:
import pandas as pd
import random
import math

# define hyperparameters
# p = 2 # size of candidate set
n = 400 # number of key cells
gamma = 1/2 # ratio between the length of green domain and red domain

green_cell = 0

file_path = '/home/zhengyihao/dataset/covertype/tabularmark_covertype.data.csv'
detected_data = pd.read_csv(file_path)

# 加载索引列表
with open('tabularmark_index.txt', 'r') as f:
    indices = [int(line.strip()) for line in f.readlines()]

# 加载种子列表
with open('tabularmark_seed.txt', 'r') as f:
    seeds = [int(line.strip()) for line in f.readlines()]

df = detected_data
cover_types = detected_data['Cover_Type'].unique()


# 开始替换Cover_Type值
for idx, seed in zip(indices, seeds):
    random.seed(seed)
    candidate_set = cover_types
     # 打乱cover_types的顺序
    shuffled_cover_types = list(cover_types)
    random.shuffle(shuffled_cover_types)

    # 确保cover_types能被划分为两个相等大小的部分
    half_size = len(shuffled_cover_types) // 2

    # 划分成green_domain和red_domain
    green_domain = shuffled_cover_types[:half_size]
    red_domain = shuffled_cover_types[half_size:]

    if df.loc[idx, 'Cover_Type'] in green_domain:
        green_cell += 1
    
print(green_cell)

# calculate z-score
z_score = (green_cell - n/2) / math.sqrt(n/4)

print("The z-score is ",z_score)

400
The z-score is  20.0
