In [3]:
# 读取数据
# show info of dataset
import pandas as pd

# 数据集文件路径
file_path = '/home/zhengyihao/dataset/covertype/covtype.data'

# 根据所提供的信息，创建一个包含所有列名的列表
column_names = [
    'Elevation', 'Aspect', 'Slope',
    'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology',
    'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon',
    'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points'
]

# 添加Wilderness_Area的4个二进制列
wilderness_areas = ['Wilderness_Area' + str(i) for i in range(1, 5)]
column_names += wilderness_areas

# 添加Soil_Type的40个二进制列
soil_types = ['Soil_Type' + str(i) for i in range(1, 41)]
column_names += soil_types

# 添加Cover_Type列
column_names.append('Cover_Type')

# 使用列名列表读取数据
data = pd.read_csv(file_path, names=column_names)

# 获取Cover_Type的值域（唯一值）
unique_values = data['Cover_Type'].unique()

# 获取每个值的计数
value_counts = data['Cover_Type'].value_counts()

# 打印结果
print("Cover_Type的值域: ", unique_values)
print("\n每个值的大小:")
print(value_counts)



# 在原始数据上加上 primary key列
# 通过reset_index函数为数据集添加索引列，这列就是“primary_key”
data = data.reset_index()

# 设置新的列名，将之前的index变为 primary_key
data = data.rename(columns={'index':'primary_key'})

# 将新的数据帧保存到硬盘上
data.to_csv('/home/zhengyihao/dataset/covertype/covtype_with_key.data', index=False)



# 定义用户私钥 Ks
import secrets

# 生成 16 bits 的随机密钥
Ks = secrets.token_hex(2)

print("Generated 16 bits private key: ", Ks)



# 定义水印长度 l 与组数 N_g（数值相同）
l = 400
N_g = 400

  

Cover_Type的值域:  [5 2 1 7 3 6 4]

每个值的大小:
Cover_Type
2    283301
1    211840
3     35754
7     20510
6     17367
5      9493
4      2747
Name: count, dtype: int64
Generated 16 bits private key:  c002


In [5]:
# 分组
import hashlib

def hash_function(Ks, PK, N_g):
    # 使用sha256作为哈希函数
    hash_obj = hashlib.sha256()
    # 计算H(Ks|tu.PK)
    inner_value = (str(Ks) + str(PK)).encode('utf-8')
    hash_obj.update(inner_value)
    inner_hash = hash_obj.hexdigest()
    # 计算H(Ks|H(Ks|tu.PK))
    outer_value = (str(Ks) + inner_hash).encode('utf-8')
    hash_obj.update(outer_value)
    outer_hash = hash_obj.hexdigest()
    # 组别序号
    group_number = int(outer_hash, 16) % N_g
    return group_number

import pandas as pd

# 加载数据集
data = pd.read_csv('/home/zhengyihao/dataset/covertype/covtype_with_key.data')

# 应用 hash 函数
data['group_number'] = data['primary_key'].apply(lambda x: hash_function(Ks, x, N_g))

# 按 group_number 排序重置
sorted_data = data.sort_values(by='group_number')

# 将结果保存到新的 CSV 文件
sorted_data.to_csv('/home/zhengyihao/dataset/covertype/covtype_with_group.data', index=False)

In [16]:
# test
# 加载数据集
data = pd.read_csv('/home/zhengyihao/dataset/covertype/covtype_with_group.data')

# 统计每个组的 tuples 的数量
group_counts = data.groupby('group_number').size()

# 打印统计结果
print(group_counts)

group_number
0      1428
1      1474
2      1445
3      1477
4      1477
       ... 
395    1440
396    1474
397    1541
398    1546
399    1384
Length: 400, dtype: int64


## 嵌入水印

In [17]:
# 生成嵌入水印信息

import random

# 生成一个长度为 400 的字符串，它由随机的 0 和 1 组成
watermark = ''.join(random.choice('01') for _ in range(400))
# 打印二进制字符串
print("要嵌入的水印信息是",watermark)

# 统计二进制字符串中 1 的个数
count_1 = watermark.count('1')

# 统计二进制字符串中 0 的个数
count_0 = watermark.count('0')

# 打印结果
print("Count of 1: ", count_1)
print("Count of 0: ", count_0)

要嵌入的水印信息是 0111100011010110111011111111001111001000001100110100111011111011010101000110110111111111101100011011110001101001101101100010010001110111010010000011101110000011101001001010011011001010100001010110011011101001100101110011001000100101101010110010011001100001010001101000000111010100100101100010001100101001011011010111010111010010101001101001000011101011110000100010101010111100010110110111101001110110
Count of 1:  208
Count of 0:  192


In [18]:
import pandas as pd
import numpy as np
from collections import Counter

# 初始化 group_number
group_number = 0
_max = data['Cover_Type'].max()
_min = data['Cover_Type'].min()

# 创建 y_hat 列
data['y_hat'] = (_max + _min) / 2
pa = {}
mp = []

for bit in watermark:
    # 选取当前 group 的数据
    mask = data['group_number'] == group_number
    group_data = data[mask]

    # 计算 p_e (忽略 min 和 max)
    mask2 = (group_data['Cover_Type'] != _max) & (group_data['Cover_Type'] != _min)
    p_e = group_data.loc[mask2, 'Cover_Type'] - group_data.loc[mask2, 'y_hat']

    # 使用 Counter 统计每个 p_e 出现的频数
    counter = Counter(np.abs(p_e))

    # 找出出现频数最高的 p_e 并赋值给 p
    p = counter.most_common(1)[0][0]

    # 把当前 group 中, Cover_Type 为 min 或 max 的 primary_key 存入 mp
    mask_min_or_max = (group_data['Cover_Type'] == _max) | (group_data['Cover_Type'] == _min)
    mp.extend(group_data.loc[mask_min_or_max, 'primary_key'].values.tolist())

    # 对于那些不等于 min 或 max 的 'Cover_Type'，更新原始数据集数据
    mask &= mask2

    group_cover_type = data.loc[mask, 'Cover_Type']
    y_hat = (_max + _min) / 2
    p_e = group_cover_type - y_hat

    # 更新 p_e 的值
    p_e = np.where((p_e == p) & (bit == '0'), p_e,
                   np.where((p_e == p) & (bit == '1'), p_e + 1,
                            np.where((p_e == -p) & (bit == '0'), p_e,
                                     np.where((p_e == -p) & (bit == '1'), p_e - 1,
                                              np.where(p_e >= p + 1, p_e + 1,
                                                       np.where(p_e <= -(p + 1), p_e - 1, p_e))))))

    # 计算 y_prime，并更新 'Cover_Type'
    data.loc[mask, 'Cover_Type'] = p_e + y_hat

    pa[group_number] = p

    # 更新 group_number 到下一组
    group_number += 1

data.to_csv('/home/zhengyihao/dataset/covertype/histogrammark_covertype.data.csv', index=False)




In [19]:
file_path = '/home/zhengyihao/dataset/covertype/histogrammark_covertype.data.csv'

# 使用列名列表读取数据
data = pd.read_csv(file_path)

# 获取Cover_Type的值域（唯一值）
unique_values = data['Cover_Type'].unique()

# 获取每个值的计数
value_counts = data['Cover_Type'].value_counts()

# 打印结果
print("Cover_Type的值域: ", unique_values)
print("\n每个值的大小:")
print(value_counts)



Cover_Type的值域:  [2 1 3 6 4 5 7]

每个值的大小:
Cover_Type
1    359075
2    136066
3     35754
7     29589
5      9493
6      8288
4      2747
Name: count, dtype: int64


## 提取水印

In [35]:
import pandas as pd
import numpy as np
from collections import Counter

#提取水印
file_path = '/home/zhengyihao/dataset/covertype/histogrammark_covertype.data.csv'
detected_data = pd.read_csv(file_path)



_max = detected_data['Cover_Type'].max()
_min = detected_data['Cover_Type'].min()
y_hat = (_max + _min) / 2

group_number = 0

W_det = ""

for bit in watermark:
    w_det = 0
    mask = detected_data['group_number'] == group_number
    group_data = detected_data[mask]
    a = 0 # count bit = 0
    b = 0 # count bit = 1
    for _,row in group_data.iterrows():
        pe = row['Cover_Type'] - y_hat
        p = pa.get(group_number)
        if row['primary_key'] not in mp:
            if pe == p:
                w_det = 0
            elif pe == p+1 or pe == p-1:
                w_det = 1

            if w_det == 1:
                b += 1
            else:
                a += 1
    if a > b:
        w_det = 0
    else:
        w_det = 1
    print(w_det)
    W_det += str(w_det)

    group_number += 1




0
1
1
1
1
0
0
0
1
1
0
1
0
1
1


KeyboardInterrupt: 

In [1]:
import pandas as pd
import numpy as np

file_path = '/home/zhengyihao/dataset/covertype/histogrammark_covertype.data.csv'
detected_data = pd.read_csv(file_path)

# 计算 y_hat
_max = detected_data['Cover_Type'].max()
_min = detected_data['Cover_Type'].min()
y_hat = (_max + _min) / 2

# 计算 pe 值
detected_data['pe'] = detected_data['Cover_Type'] - y_hat

# 将原有的list类型转化为集合数据类型，提高在其中查找项的速度
mp_set = set(mp)
W_det = ""

for group_number, bit in enumerate(watermark):
    # 对当前组进行操作
    group_data = detected_data[detected_data['group_number'] == group_number]
    p = pa[group_number]

    a = 0 # count bit = 0
    b = 0 # count bit = 1

    # 通过将一组条件（每行是否满足要求）应用于数据框并进行求和，避免了逐行运算
    mask = ~group_data['primary_key'].isin(mp_set) & ((group_data['pe'] == p+1) | (group_data['pe'] == p-1))
    b = mask.sum()
    mask = ~group_data['primary_key'].isin(mp_set) & (group_data['pe'] == p)
    a = mask.sum()

    W_det += '0' if a > b else '1'
print(W_det)

NameError: name 'mp' is not defined

In [2]:
#计算 BER
def mismatch_percentage(W, W_det):
    # 首先检查 W 和 W_det 是否长度相同
    if len(W) != len(W_det):
        print('Error: The lengths of W and W_det are not the same!')
        return
    # 计算不匹配度
    count_mismatch = sum(a != b for a, b in zip(W, W_det))
    percentage = count_mismatch / len(W)
    return percentage

# W = '1010101010'
# W_det = '1010001010'
print("W: ",watermark)
print("W_det: ",W_det)
print(f'Mismatch Percentage: {mismatch_percentage(watermark, W_det):.2%}')

NameError: name 'watermark' is not defined