## alteration attack

### tabularmark

In [1]:
# 攻击
import pandas as pd
import numpy as np

# Load the TabularMark-watermarked dataset that the alteration attack perturbs.
data = pd.read_csv('/home/zhengyihao/dataset/covertype/tabularmark_covertype.data.csv')

# Fractions of rows whose Cover_Type is overwritten by the attack.
alter_percentages = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# Fixed seed so the attacked files (and every score computed from them) can be
# regenerated exactly after a kernel restart.
ATTACK_SEED = 42
rng = np.random.default_rng(ATTACK_SEED)

for alter_percentage in alter_percentages:
    # Work on a copy so every attack level starts from the untouched data.
    altered_data = data.copy()

    # Pick the victim rows without replacement and overwrite their label with
    # a uniform draw from 1..7 (upper bound is exclusive). The draw ignores the
    # current label, so some "altered" cells keep their original value.
    num_alter = int(len(data) * alter_percentage)
    indices_to_alter = rng.choice(data.index.to_numpy(), size=num_alter, replace=False)
    altered_data.loc[indices_to_alter, 'Cover_Type'] = rng.integers(1, 8, size=num_alter)

    # round() instead of int(): a float product such as 28.999999999999996
    # must not be truncated into the wrong file-name label.
    percent_label = round(alter_percentage * 100)
    altered_data.to_csv(f'/home/zhengyihao/dataset/covertype/tabularmark_alter{percent_label}.data.csv', index=False)

In [3]:
import pandas as pd
import random
import math

# Cache shared by every call: seed -> green domain (list of Cover_Type values).
# NOTE(review): the cached domain is built from the Cover_Type values of the
# first file that meets a given seed and is then reused for later files.
seed_covertype_map = {}

def calculate_mismatch(file_path):
    """Score how many key cells of a CSV still fall in their green domain.

    Reads the cell indices from ``tabularmark_index.txt`` and the matching
    seeds from ``tabularmark_seed.txt`` (one integer per line, paired by
    position). For every pair, the sorted distinct ``Cover_Type`` values are
    shuffled with the seed and the first half of the shuffle is the green
    domain; the cell counts as green when its ``Cover_Type`` is in that domain.

    Parameters
    ----------
    file_path : str
        Path of the CSV to inspect; it must contain a ``Cover_Type`` column and
        be addressable by the stored row indices.

    Returns
    -------
    tuple of (float, float)
        ``(percentage, z_score)`` where ``percentage`` is the share of key
        cells that are not green and ``z_score`` is
        ``(green - gamma*n) / sqrt(n*gamma*(1-gamma))`` with ``gamma = 1/2``.

    Raises
    ------
    ValueError
        If the index and seed files have different lengths or are empty.
    """
    # Green-cell probability assumed by the z-test.
    gamma = 1/2

    detected_data = pd.read_csv(file_path)

    # Load the key material, ignoring blank lines such as a trailing newline.
    with open('tabularmark_index.txt', 'r') as f:
        indices = [int(line) for line in f if line.strip()]
    with open('tabularmark_seed.txt', 'r') as f:
        seeds = [int(line) for line in f if line.strip()]

    if len(indices) != len(seeds):
        raise ValueError(
            f'tabularmark_index.txt has {len(indices)} entries but '
            f'tabularmark_seed.txt has {len(seeds)}'
        )
    # The number of key cells comes from the key files, not a fixed constant.
    n = len(indices)
    if n == 0:
        raise ValueError('tabularmark_index.txt contains no key cells')

    labels = detected_data['Cover_Type']
    cover_types = sorted(labels.unique())

    green_cell = 0
    for idx, seed in zip(indices, seeds):
        green_domain = seed_covertype_map.get(seed)
        if green_domain is None:
            # A private generator yields the same shuffle as
            # random.seed(seed); random.shuffle(...) without reseeding the
            # process-wide RNG.
            shuffled_cover_types = list(cover_types)
            random.Random(seed).shuffle(shuffled_cover_types)

            # First half of the shuffle is the green domain, the rest is red.
            half_size = len(shuffled_cover_types) // 2
            green_domain = shuffled_cover_types[:half_size]
            seed_covertype_map[seed] = green_domain

        if labels.loc[idx] in green_domain:
            green_cell += 1

    percentage = (n - green_cell) / n
    z_score = (green_cell - gamma * n) / math.sqrt(n * gamma * (1 - gamma))
    return percentage,z_score


# Score every attacked TabularMark file and print its mismatch percentage and
# z-score, one line per file.
file_paths = list(map(
    lambda percentage: f'/home/zhengyihao/dataset/covertype/tabularmark_alter{int(percentage*100)}.data.csv',
    [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
))

for file_path in file_paths:
    mismatch_percentage, z_score = calculate_mismatch(file_path)
    print(
        f'File: {file_path} - Mismatch Percentage: {mismatch_percentage:.2%}',
        f"z-score:{z_score}",
    )
    

File: /home/zhengyihao/dataset/covertype/tabularmark_alter0.data.csv - Mismatch Percentage: 0.00% z-score:20.0
File: /home/zhengyihao/dataset/covertype/tabularmark_alter10.data.csv - Mismatch Percentage: 5.00% z-score:18.0
File: /home/zhengyihao/dataset/covertype/tabularmark_alter20.data.csv - Mismatch Percentage: 12.50% z-score:15.0
File: /home/zhengyihao/dataset/covertype/tabularmark_alter30.data.csv - Mismatch Percentage: 19.50% z-score:12.2
File: /home/zhengyihao/dataset/covertype/tabularmark_alter40.data.csv - Mismatch Percentage: 25.50% z-score:9.8
File: /home/zhengyihao/dataset/covertype/tabularmark_alter50.data.csv - Mismatch Percentage: 28.75% z-score:8.5
File: /home/zhengyihao/dataset/covertype/tabularmark_alter60.data.csv - Mismatch Percentage: 36.25% z-score:5.5
File: /home/zhengyihao/dataset/covertype/tabularmark_alter70.data.csv - Mismatch Percentage: 37.25% z-score:5.1
File: /home/zhengyihao/dataset/covertype/tabularmark_alter80.data.csv - Mismatch Percentage: 43.75% z-s

### histogrammark

In [4]:
# 攻击
import pandas as pd
import numpy as np

# Load the histogram-watermarked dataset that the alteration attack perturbs.
data = pd.read_csv('/home/zhengyihao/dataset/covertype/histogrammark_covertype.data.csv')

# Fractions of rows whose Cover_Type is overwritten by the attack.
alter_percentages = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# Fixed seed so the attacked files (and every score computed from them) can be
# regenerated exactly after a kernel restart.
ATTACK_SEED = 42
rng = np.random.default_rng(ATTACK_SEED)

for alter_percentage in alter_percentages:
    # Work on a copy so every attack level starts from the untouched data.
    altered_data = data.copy()

    # Pick the victim rows without replacement and overwrite their label with
    # a uniform draw from 1..7 (upper bound is exclusive). The draw ignores the
    # current label, so some "altered" cells keep their original value.
    num_alter = int(len(data) * alter_percentage)
    indices_to_alter = rng.choice(data.index.to_numpy(), size=num_alter, replace=False)
    altered_data.loc[indices_to_alter, 'Cover_Type'] = rng.integers(1, 8, size=num_alter)

    # round() instead of int(): a float product such as 28.999999999999996
    # must not be truncated into the wrong file-name label.
    percent_label = round(alter_percentage * 100)
    altered_data.to_csv(f'/home/zhengyihao/dataset/covertype/histogrammark_alter{percent_label}.data.csv', index=False)

In [16]:
import pandas as pd
import random
import math

def calculate_mismatch(file_path):
    """Extract the histogram watermark from a CSV and compare it to the original.

    Side inputs are read from the working directory: ``histogram_pa.csv``
    (columns ``Number``/``Value``: group number -> reference value ``p``),
    ``histogram_mp.csv`` (column ``Key``: primary keys to exclude) and
    ``histogram_mark.txt`` (the embedded bit string).

    For bit ``i`` only rows of group ``i`` whose primary key is not in the
    exclusion list are used. With ``pe = Cover_Type - (max + min) / 2``, the
    bit is detected as ``'0'`` when more rows have ``pe == p`` than
    ``pe == p +/- 1``, and as ``'1'`` otherwise (ties and empty groups
    included).

    Parameters
    ----------
    file_path : str
        CSV with ``primary_key``, ``group_number`` and ``Cover_Type`` columns.

    Returns
    -------
    float
        Fraction of watermark bits whose detected value differs from the
        stored watermark.

    Raises
    ------
    ValueError
        If ``histogram_mark.txt`` holds no bits.
    KeyError
        If a group number below the watermark length is missing from
        ``histogram_pa.csv``.
    """
    detected_data = pd.read_csv(file_path)

    # group number -> reference value p
    df_pa = pd.read_csv('histogram_pa.csv')
    pa = dict(zip(df_pa['Number'], df_pa['Value']))

    # primary keys that must not take part in the vote
    df_mp = pd.read_csv('histogram_mp.csv')
    mp_set = set(df_mp['Key'].tolist())

    # Strip surrounding whitespace so a trailing newline is not read as a bit.
    with open("histogram_mark.txt", "r") as file:
        watermark = file.read().strip()
    if not watermark:
        raise ValueError('histogram_mark.txt contains no watermark bits')

    # y_hat is the midpoint of the observed Cover_Type range.
    _max = detected_data['Cover_Type'].max()
    _min = detected_data['Cover_Type'].min()
    y_hat = (_max + _min) / 2

    detected_data['pe'] = detected_data['Cover_Type'] - y_hat

    # Filter and split once instead of rescanning the whole frame per bit.
    candidates = detected_data[~detected_data['primary_key'].isin(mp_set)]
    pe_by_group = {
        group: errors
        for group, errors in candidates.groupby('group_number')['pe']
    }

    detected_bits = []
    for group_number in range(len(watermark)):
        p = pa[group_number]
        group_pe = pe_by_group.get(group_number)
        if group_pe is None:
            # No usable row in this group: both counters stay at zero.
            a = b = 0
        else:
            a = int((group_pe == p).sum())                                # votes for bit 0
            b = int(((group_pe == p + 1) | (group_pe == p - 1)).sum())    # votes for bit 1
        detected_bits.append('0' if a > b else '1')
    W_det = ''.join(detected_bits)

    # W_det has exactly one character per watermark bit by construction.
    count_mismatch = sum(a != b for a, b in zip(watermark, W_det))
    percentage = count_mismatch / len(watermark)
    return percentage


# Score every attacked histogram-watermark file and print its mismatch
# percentage, one line per file.
file_paths = list(map(
    lambda percentage: f'/home/zhengyihao/dataset/covertype/histogrammark_alter{int(percentage*100)}.data.csv',
    [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
))

for file_path in file_paths:
    mismatch_percentage = calculate_mismatch(file_path)
    print(
        f'File: {file_path} - Mismatch Percentage: {mismatch_percentage:.2%}'
    )
    

File: /home/zhengyihao/dataset/covertype/histogrammark_alter0.data.csv - Mismatch Percentage: 17.75%
File: /home/zhengyihao/dataset/covertype/histogrammark_alter10.data.csv - Mismatch Percentage: 19.00%
File: /home/zhengyihao/dataset/covertype/histogrammark_alter20.data.csv - Mismatch Percentage: 21.00%
File: /home/zhengyihao/dataset/covertype/histogrammark_alter30.data.csv - Mismatch Percentage: 21.00%
File: /home/zhengyihao/dataset/covertype/histogrammark_alter40.data.csv - Mismatch Percentage: 27.00%
File: /home/zhengyihao/dataset/covertype/histogrammark_alter50.data.csv - Mismatch Percentage: 30.25%
File: /home/zhengyihao/dataset/covertype/histogrammark_alter60.data.csv - Mismatch Percentage: 34.00%
File: /home/zhengyihao/dataset/covertype/histogrammark_alter70.data.csv - Mismatch Percentage: 35.50%
File: /home/zhengyihao/dataset/covertype/histogrammark_alter80.data.csv - Mismatch Percentage: 39.75%
File: /home/zhengyihao/dataset/covertype/histogrammark_alter90.data.csv - Mismatch 

### semanticmark

In [17]:
# 攻击
import pandas as pd
import numpy as np

# Load the semantic-watermarked dataset that the alteration attack perturbs.
data = pd.read_csv('/home/zhengyihao/dataset/covertype/semanticmark_covertype.data.csv')

# Fractions of rows whose Cover_Type is overwritten by the attack.
alter_percentages = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# Fixed seed so the attacked files (and every score computed from them) can be
# regenerated exactly after a kernel restart.
ATTACK_SEED = 42
rng = np.random.default_rng(ATTACK_SEED)

for alter_percentage in alter_percentages:
    # Work on a copy so every attack level starts from the untouched data.
    altered_data = data.copy()

    # Pick the victim rows without replacement and overwrite their label with
    # a uniform draw from 1..7 (upper bound is exclusive). The draw ignores the
    # current label, so some "altered" cells keep their original value.
    num_alter = int(len(data) * alter_percentage)
    indices_to_alter = rng.choice(data.index.to_numpy(), size=num_alter, replace=False)
    altered_data.loc[indices_to_alter, 'Cover_Type'] = rng.integers(1, 8, size=num_alter)
    # Alternative attack kept for reference: cyclic shift of the label within 1..7.
    # altered_data.loc[indices_to_alter, 'Cover_Type'] = altered_data.loc[indices_to_alter, 'Cover_Type'] - 1
    # altered_data.loc[indices_to_alter, 'Cover_Type'] = altered_data.loc[indices_to_alter, 'Cover_Type'] % 7
    # altered_data.loc[indices_to_alter, 'Cover_Type'] = altered_data.loc[indices_to_alter, 'Cover_Type'] + 1

    # round() instead of int(): a float product such as 28.999999999999996
    # must not be truncated into the wrong file-name label.
    output_path = f'/home/zhengyihao/dataset/covertype/semanticmark_alter{round(alter_percentage * 100)}.data.csv'
    altered_data.to_csv(output_path, index=False)

    print("Finish produce ", output_path)



Finish produce  /home/zhengyihao/dataset/covertype/semanticmark_alter0.data.csv
Finish produce  /home/zhengyihao/dataset/covertype/semanticmark_alter10.data.csv
Finish produce  /home/zhengyihao/dataset/covertype/semanticmark_alter20.data.csv
Finish produce  /home/zhengyihao/dataset/covertype/semanticmark_alter30.data.csv
Finish produce  /home/zhengyihao/dataset/covertype/semanticmark_alter40.data.csv
Finish produce  /home/zhengyihao/dataset/covertype/semanticmark_alter50.data.csv
Finish produce  /home/zhengyihao/dataset/covertype/semanticmark_alter60.data.csv
Finish produce  /home/zhengyihao/dataset/covertype/semanticmark_alter70.data.csv
Finish produce  /home/zhengyihao/dataset/covertype/semanticmark_alter80.data.csv
Finish produce  /home/zhengyihao/dataset/covertype/semanticmark_alter90.data.csv
Finish produce  /home/zhengyihao/dataset/covertype/semanticmark_alter100.data.csv


In [18]:
import pandas as pd
import random
import math
import hashlib

# Read the two secrets used by the hashes below: Ks from semantic_Ks.txt and
# k from semantic_k.txt. The file contents are used verbatim.
with open('semantic_Ks.txt', 'r') as file:
    Ks = file.read()
with open('semantic_k.txt', 'r') as file:
    k = file.read()

# Record table written next to the watermark: K_Hash -> Second_Hash.
df_record_table = pd.read_csv('semantic_record_table.csv')
record_table = {
    key_hash: second_hash
    for key_hash, second_hash in zip(df_record_table['K_Hash'], df_record_table['Second_Hash'])
}


def calculate_mismatch(file_path):
    """Extract the semantic watermark from a CSV and compare it to the original.

    Uses the module-level secrets ``k`` and ``Ks`` and the ``record_table``
    mapping (K_Hash -> Second_Hash), plus the bit string stored in
    ``semantic_mark.txt``.

    For every row, ``k_hash = blake2b(primary_key + k)`` and
    ``C = int(blake2b(Ks + k_hash), 16)``. Rows with ``C % gamma == 0`` vote
    for the bit at position ``int(blake2b(Ks + k_hash + "1"), 16) % L``. The
    vote is 0 when the recorded hash equals ``blake2b(f"{x}{k}")`` for the
    row's ``Cover_Type`` value ``x``, and 1 when it equals
    ``blake2b(f"{x}{k}{s}")`` for one of the suffixes ``s`` listed for ``x``
    below; any other recorded hash casts no vote. A bit is detected as ``'0'``
    only when it has strictly more 0-votes than 1-votes.

    Parameters
    ----------
    file_path : str
        CSV with ``primary_key`` and ``Cover_Type`` columns.

    Returns
    -------
    float
        Fraction of watermark positions whose detected bit differs from the
        stored watermark.

    Raises
    ------
    ValueError
        If ``semantic_mark.txt`` is empty.
    KeyError
        If a selected row with ``Cover_Type`` in 1..7 has no ``record_table``
        entry for its ``k_hash``.
    """
    data = pd.read_csv(file_path)

    # NOTE(review): the content is deliberately not stripped, because L feeds
    # the `% L` position hash and presumably has to equal the length used when
    # the watermark was embedded - confirm the file has no trailing newline.
    with open("semantic_mark.txt", "r") as file:
        watermark = file.read()

    L = len(watermark)
    if L == 0:
        raise ValueError('semantic_mark.txt contains no watermark bits')

    # 14000 / 400 / 4 == 8.75, so `C % gamma` below is a floating-point modulo.
    # NOTE(review): C is a very large integer that gets rounded to a float
    # first; the expression is kept unchanged because row selection presumably
    # has to match the embedding code - confirm before touching it.
    gamma = 14000 / 400 / 4

    def _digest(text):
        # Hex blake2b digest of a string.
        return hashlib.blake2b(text.encode()).hexdigest()

    # The reference hashes depend only on k, so build them once per call
    # instead of re-hashing for every selected row.
    # Cover_Type -> suffixes whose hash f"{x}{k}{suffix}" is a vote for bit 1.
    one_vote_suffixes = {2: (2,), 3: (2, 1), 4: (1, 0), 5: (0, 1), 6: (1, 2)}
    bit_by_label = {}
    for label in range(1, 8):
        votes = {_digest(f"{label}{k}"): 0}
        for suffix in one_vote_suffixes.get(label, ()):
            # setdefault keeps the first match, like the original elif order.
            votes.setdefault(_digest(f"{label}{k}{suffix}"), 1)
        bit_by_label[label] = votes

    # count[i] = [votes for bit 0, votes for bit 1] at watermark position i
    count = [[0, 0] for _ in range(L)]

    for _, row in data.iterrows():
        k_hash = _digest(f"{row['primary_key']}{k}")
        Ks_hash = _digest(f"{Ks}{k_hash}")

        C = int(Ks_hash, 16)
        if C % gamma != 0:
            continue  # row does not carry a watermark vote

        zeta = int(_digest(f"{Ks}{k_hash}{1}"), 16) % L
        x = row['Cover_Type']

        votes = bit_by_label.get(x)
        if votes is None:
            continue  # label outside 1..7: record_table is not consulted

        bit = votes.get(record_table[k_hash])
        if bit is not None:
            count[zeta][bit] += 1

    # Ties (including positions that received no vote) are detected as '1'.
    W_det = ''.join('0' if zeros > ones else '1' for zeros, ones in count)

    # W_det has exactly L characters by construction.
    count_mismatch = sum(a != b for a, b in zip(watermark, W_det))
    percentage = count_mismatch / L

    return percentage



# Score every attacked semantic-watermark file and print its mismatch
# percentage, one line per file.
file_paths = list(map(
    lambda percentage: f'/home/zhengyihao/dataset/covertype/semanticmark_alter{int(percentage*100)}.data.csv',
    [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
))

for file_path in file_paths:
    mismatch_percentage = calculate_mismatch(file_path)
    print(
        f'File: {file_path} - Mismatch Percentage: {mismatch_percentage:.2%}'
    )
    

File: /home/zhengyihao/dataset/covertype/semanticmark_alter0.data.csv - Mismatch Percentage: 18.00%
File: /home/zhengyihao/dataset/covertype/semanticmark_alter10.data.csv - Mismatch Percentage: 20.00%
File: /home/zhengyihao/dataset/covertype/semanticmark_alter20.data.csv - Mismatch Percentage: 24.00%
File: /home/zhengyihao/dataset/covertype/semanticmark_alter30.data.csv - Mismatch Percentage: 24.50%
File: /home/zhengyihao/dataset/covertype/semanticmark_alter40.data.csv - Mismatch Percentage: 27.00%
File: /home/zhengyihao/dataset/covertype/semanticmark_alter50.data.csv - Mismatch Percentage: 28.50%
File: /home/zhengyihao/dataset/covertype/semanticmark_alter60.data.csv - Mismatch Percentage: 32.00%
File: /home/zhengyihao/dataset/covertype/semanticmark_alter70.data.csv - Mismatch Percentage: 35.75%
File: /home/zhengyihao/dataset/covertype/semanticmark_alter80.data.csv - Mismatch Percentage: 35.00%
File: /home/zhengyihao/dataset/covertype/semanticmark_alter90.data.csv - Mismatch Percentage