<a href="https://colab.research.google.com/github/t8101349/group-project-202503/blob/main/sampling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !pip install pandas
# !pip install tqdm
# pickle ships with the Python standard library; no pip install is needed

In [None]:
# 分布統計
import pandas as pd
from tqdm import tqdm
import pickle

# Parameters for the counting pass.
filename = "train.csv"
chunksize = 1_000_000  # value passed to pd.read_csv(chunksize=...)
bb_columns = [
    "buildingblock1_smiles",
    "buildingblock2_smiles",
    "buildingblock3_smiles",
]

# Count positive / negative rows per stratum.
def compute_strata_counts(bb_col):
    """Count rows with ``binds == 1`` and ``binds == 0`` for each value of *bb_col*.

    The CSV named by the module-level ``filename`` is streamed in chunks of
    ``chunksize`` rows, reading only *bb_col* and ``binds``.

    Returns a dict with three keys:
      * ``"strata_counts"``: ``{"pos": {value: count}, "neg": {value: count}}``
      * ``"total_pos"``: sum of all positive counts
      * ``"total_neg"``: sum of all negative counts
    """
    print(f"🔍 統計 {bb_col} 分層分布...")
    pos_by_bb = {}
    neg_by_bb = {}
    reader = pd.read_csv(filename, chunksize=chunksize, usecols=[bb_col, "binds"])
    for chunk in tqdm(reader, desc=f"Counting {bb_col}"):
        for bb, rows in chunk.groupby(bb_col):
            labels = rows["binds"]
            # int(...) keeps the accumulated counts as plain Python ints.
            pos_by_bb[bb] = pos_by_bb.get(bb, 0) + int((labels == 1).sum())
            neg_by_bb[bb] = neg_by_bb.get(bb, 0) + int((labels == 0).sum())
    return {
        "strata_counts": {"pos": pos_by_bb, "neg": neg_by_bb},
        "total_pos": sum(pos_by_bb.values()),
        "total_neg": sum(neg_by_bb.values()),
    }

# Run the counting pass for every building-block column and keep the results.
strata_data = {}
for bb_col in bb_columns:
    strata_data[bb_col] = compute_strata_counts(bb_col)
    # Report the total positive / negative row counts found for this column.
    print(f"{bb_col} - 總正類: {strata_data[bb_col]['total_pos']}, 總負類: {strata_data[bb_col]['total_neg']}")

# Save the pre-computed counts to disk as a pickle file.
with open("strata_data.pkl", "wb") as f:
    pickle.dump(strata_data, f)
print("✅ 分布統計已儲存至 'strata_data.pkl'")

In [None]:
# 抽樣
import pandas as pd
from tqdm import tqdm
import pickle

# Load the pre-computed per-stratum counts.
# NOTE(review): pickle.load executes arbitrary code from the file; presumably this is
# the file written by the counting cell of this notebook — only load trusted files.
with open("strata_data.pkl", "rb") as f:
    strata_data = pickle.load(f)

# Parameters: the per-feature sample sizes and the positive/negative split
# can be tuned here.
filename = "train.csv"
chunksize = 1_000_000
targets = [
    {"bb": bb, "pos": pos, "neg": neg}
    for bb, pos, neg in (
        ("buildingblock1_smiles", 80000, 240000),  # feature 1
        ("buildingblock2_smiles", 10000, 30000),  # feature 2
        ("buildingblock3_smiles", 10000, 30000),  # feature 3
    )
]

# Sampling function.
def _stratum_quotas(counts, total, target):
    """Return ``{stratum: rows to draw}``: each stratum's proportional share of *target*."""
    if total <= 0:
        # No rows of this class were counted, so nothing can be drawn
        # (and dividing by the total would raise ZeroDivisionError).
        return {}
    # int() truncates, so the quotas may sum to slightly less than *target*
    # and very rare strata can end up with a quota of zero.
    return {bb: int(target * (count / total)) for bb, count in counts.items()}


def _draw_share(rows, bb, quotas, counts, seen, taken):
    """Sample from *rows* the part of stratum *bb*'s quota that is owed so far.

    ``seen`` and ``taken`` are running per-stratum tallies (rows encountered /
    rows already sampled) and are updated in place. Returns the sampled
    DataFrame, or ``None`` when nothing is due from this chunk.
    """
    seen[bb] = seen.get(bb, 0) + len(rows)
    quota = quotas.get(bb, 0)
    if quota <= 0 or rows.empty:
        return None
    # Spread the quota evenly over the stratum's rows: after seeing a fraction
    # of the stratum, that same fraction of the quota should have been drawn.
    owed = min(quota, quota * seen[bb] // counts[bb])
    n = min(len(rows), owed - taken.get(bb, 0))
    if n <= 0:
        return None
    taken[bb] = taken.get(bb, 0) + n
    return rows.sample(n=n, random_state=42)


def stratified_sample(bb_col, pos_target, neg_target, strata_data):
    """Draw a sample stratified by *bb_col*, separately for each ``binds`` class.

    Every distinct value of *bb_col* is a stratum. Its quota is its share of
    all positive (``binds == 1``) or negative (``binds == 0``) rows, multiplied
    by the class target and truncated to an int. The CSV named by the
    module-level ``filename`` is streamed in chunks of ``chunksize`` rows, and
    each stratum's quota is drawn across the chunks in proportion to the rows
    seen, so a stratum is never sampled beyond its quota.

    Args:
        bb_col: column whose values define the strata.
        pos_target: desired number of positive rows.
        neg_target: desired number of negative rows.
        strata_data: mapping ``bb_col -> {"strata_counts": {"pos": {...},
            "neg": {...}}, "total_pos": int, "total_neg": int}``.

    Returns:
        A DataFrame with the positive rows followed by the negative rows, each
        class shuffled. Because quotas are truncated, a class may contain
        slightly fewer rows than its target. An empty DataFrame (with the
        expected columns) is returned when nothing could be sampled.
    """
    strata_counts = strata_data[bb_col]["strata_counts"]
    total_pos = strata_data[bb_col]["total_pos"]
    total_neg = strata_data[bb_col]["total_neg"]

    print(f"🎲 進行 {bb_col} 分層抽樣...")
    pos_samples = []
    neg_samples = []
    required_cols = ["molecule_smiles", "buildingblock1_smiles", "buildingblock2_smiles", "buildingblock3_smiles", "protein_name", "binds"]

    pos_quota = _stratum_quotas(strata_counts["pos"], total_pos, pos_target)
    neg_quota = _stratum_quotas(strata_counts["neg"], total_neg, neg_target)
    # Reading can stop once every stratum has received its full quota.
    pos_goal = sum(pos_quota.values())
    neg_goal = sum(neg_quota.values())

    pos_seen, pos_taken = {}, {}
    neg_seen, neg_taken = {}, {}
    # Row counters: len() of the sample lists would count DataFrames, not rows.
    pos_rows = 0
    neg_rows = 0

    for chunk in tqdm(pd.read_csv(filename, chunksize=chunksize, usecols=required_cols), desc=f"Sampling {bb_col}"):
        for bb, group in chunk.groupby(bb_col):
            pos_sample = _draw_share(
                group[group["binds"] == 1], bb, pos_quota, strata_counts["pos"], pos_seen, pos_taken
            )
            if pos_sample is not None:
                pos_samples.append(pos_sample)
                pos_rows += len(pos_sample)

            neg_sample = _draw_share(
                group[group["binds"] == 0], bb, neg_quota, strata_counts["neg"], neg_seen, neg_taken
            )
            if neg_sample is not None:
                neg_samples.append(neg_sample)
                neg_rows += len(neg_sample)

        if pos_rows >= pos_goal and neg_rows >= neg_goal:
            break

    frames = pos_samples + neg_samples
    if not frames:
        # pd.concat raises ValueError on an empty list.
        return pd.DataFrame(columns=required_cols)

    df = pd.concat(frames, ignore_index=True)
    pos_part = df[df["binds"] == 1]
    neg_part = df[df["binds"] == 0]
    # Shuffle each class; min() caps the class at its target as a safeguard.
    df_pos = pos_part.sample(n=min(pos_target, len(pos_part)), random_state=42)
    df_neg = neg_part.sample(n=min(neg_target, len(neg_part)), random_state=42)
    return pd.concat([df_pos, df_neg], ignore_index=True)

# Run one stratified sampling pass per configured building-block column.
train_dfs = []
for target in targets:
    df = stratified_sample(
        bb_col=target["bb"],
        pos_target=target["pos"],
        neg_target=target["neg"],
        strata_data=strata_data,
    )
    train_dfs.append(df)

# Merge the per-column samples and shuffle the rows.
# NOTE(review): rows picked by more than one pass are not de-duplicated here —
# confirm that this is intended.
train_df = pd.concat(train_dfs, ignore_index=True).sample(frac=1, random_state=42)

# Sanity-check the result.
print("🔎 檢查建構塊分布...")
bb1_unique, bb2_unique, bb3_unique = (
    train_df[col].nunique()
    for col in ("buildingblock1_smiles", "buildingblock2_smiles", "buildingblock3_smiles")
)

print(f"總樣本數: {len(train_df)}")
print(f"正類記錄數: {len(train_df[train_df['binds'] == 1])}")
print(f"負類記錄數: {len(train_df[train_df['binds'] == 0])}")
print(f"獨特分子數: {train_df['molecule_smiles'].nunique()}")
print(f"buildingblock1_smiles相異計數: {bb1_unique}（原始271）")
print(f"buildingblock2_smiles相異計數: {bb2_unique}（原始693）")
print(f"buildingblock3_smiles相異計數: {bb3_unique}（原始872）")

# Write the sampled training set to disk.
train_df.to_csv("1030_40_data.csv", index=False)