# Code Block Index

- [data_gen](#data_gen)


<a id="data_gen"></a>

In [None]:
# data_gen
import os
import json
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm  # 进度条工具（可选）

# --------------------------
# 1. Path configuration
# --------------------------
# All locations are hard-coded absolute paths under /mnt/d/VsCode/Data/input.
raw_image_dir = "/mnt/d/VsCode/Data/input/images"                     # source images to be anonymized
label_dir = "/mnt/d/VsCode/Data/input/breast_bm_b-mode/labels"        # JSON annotation files
output_image_dir = "/mnt/d/VsCode/Data/input/images_anonymous"        # destination for the renamed image copies
output_csv = "/mnt/d/VsCode/Data/input/features_anonymous.csv"        # destination for the anonymized feature CSV

# Make sure the output image directory exists; exist_ok avoids an error on re-runs.
os.makedirs(output_image_dir, exist_ok=True)

# --------------------------
# 2. Anonymization (copy images under neutral names + collect labels)
# --------------------------
# One entry per image: (original file name, anonymous file name, label).
anonymous_mapping = []

# Pick the image files up front, in sorted order:
# - os.listdir() returns entries in arbitrary order, so sorting keeps the
#   original-name -> anonymous-ID assignment reproducible between runs;
# - filtering before enumerate() keeps the IDs contiguous (img_000, img_001,
#   ...), so the position of an image in anonymous_mapping equals the number
#   in its anonymous name instead of being shifted by skipped entries;
# - the extension test is case-insensitive so ".JPG" / ".PNG" are not dropped;
# - isfile() skips directories that merely happen to end in an image suffix.
image_files = sorted(
    name
    for name in os.listdir(raw_image_dir)
    if name.lower().endswith(('.jpg', '.png'))
    and os.path.isfile(os.path.join(raw_image_dir, name))
)

# Wrapping the list (rather than an enumerate object) lets tqdm show a total.
for idx, img_file in enumerate(tqdm(image_files, desc="匿名化处理")):
    stem, ext = os.path.splitext(img_file)

    # Anonymous name keeps the original extension, e.g. img_001.jpg.
    anonymous_id = f"img_{idx:03d}{ext}"

    # Plain byte-for-byte copy under the new name (no image library needed).
    src_path = os.path.join(raw_image_dir, img_file)
    dst_path = os.path.join(output_image_dir, anonymous_id)
    with open(src_path, 'rb') as f_src, open(dst_path, 'wb') as f_dst:
        f_dst.write(f_src.read())

    # Look up the annotation that shares the image's file stem.
    json_file = stem + ".json"
    json_path = os.path.join(label_dir, json_file)
    label = -1  # default when no usable annotation is available
    if os.path.exists(json_path):
        with open(json_path, 'r') as f:
            label_data = json.load(f)
        try:
            # The label of the first annotated shape is used as the image label.
            label = int(label_data["shapes"][0]["label"])
        except (KeyError, IndexError, TypeError, ValueError) as exc:
            # An annotation without shapes or with a non-numeric label should
            # not abort the whole run; report it and keep the default label.
            # tqdm.write() prints without corrupting the progress bar.
            tqdm.write(
                f"[warn] {json_file}: no usable label ({exc!r}); using {label}"
            )

    anonymous_mapping.append((img_file, anonymous_id, label))

# --------------------------
# 3. Synthetic features (10 generic columns)
# --------------------------
n_features = 10
n_samples = len(anonymous_mapping)  # one feature row per anonymized image

# Fixed seed so the generated feature table is reproducible.
np.random.seed(42)

# Independent draws from N(0.5, 0.2), clipped into [0, 1], then min-max
# scaled column by column.
scaler = MinMaxScaler()
features = scaler.fit_transform(
    np.clip(
        np.random.normal(loc=0.5, scale=0.2, size=(n_samples, n_features)),
        0,
        1,
    )
)

# --------------------------
# 4. Build and save the anonymized CSV
# --------------------------
# Row identifiers (patient_000, patient_001, ...) and the labels collected
# during anonymization, both in anonymous_mapping order.
anonymous_ids = [f"patient_{row:03d}" for row in range(n_samples)]
labels = [entry[2] for entry in anonymous_mapping]

# Start from the feature matrix, then put the id and label columns in front
# so the column order is: anonymous_id, label, feature_1 ... feature_N.
feature_cols = [f"feature_{col + 1}" for col in range(n_features)]
df = pd.DataFrame(features, columns=feature_cols, copy=True)
df.insert(0, "anonymous_id", anonymous_ids)
df.insert(1, "label", labels)

# Write the table without the pandas index column and report the outputs.
df.to_csv(output_csv, index=False)
print(f"匿名化数据已生成：\n- 图像目录: {output_image_dir}\n- CSV文件: {output_csv}")

# Optional debugging aid: persist the original-name -> anonymous-name mapping
# together with the label.
mapping_df = pd.DataFrame(
    anonymous_mapping,
    columns=["original_name", "anonymous_name", "label"],
)
mapping_df.to_csv("/mnt/d/VsCode/Data/input/name_mapping.csv", index=False)