In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dataset1/Data_Transformed__MorganFingerprint(100k100k)/train_transformed__morgan(100k,100k).parquet
/kaggle/input/dataset1/Data_Transformed__TopologicalFingerprint(100k100k)/train_transformed__topological(100k,100k).parquet
/kaggle/input/train1/train_transformed_morgan(100k100k).parquet
/kaggle/input/leash-BELKA/sample_submission.csv
/kaggle/input/leash-BELKA/train.parquet
/kaggle/input/leash-BELKA/test.parquet
/kaggle/input/leash-BELKA/train.csv
/kaggle/input/leash-BELKA/test.csv


In [3]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2024.9.5-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2024.9.5-cp310-cp310-manylinux_2_28_x86_64.whl (34.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.3/34.3 MB[0m [31m43.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.9.5


In [6]:
import duckdb
import pandas as pd
from tqdm import tqdm
import numpy as np # linear algebra
from rdkit import Chem
from rdkit.Chem import AllChem


In [None]:
import pyarrow.parquet as pq
import pandas as pd
import gc  # 引入垃圾回收模組

# Split the raw training Parquet file into up to `total_batches` smaller Parquet
# files, reading one row group at a time so the whole file never sits in memory.
filename = "/kaggle/input/leash-BELKA/train.parquet"
columns_to_read = ["molecule_smiles", "protein_name", "binds"]

# Only used to derive row_groups_per_batch below.
# NOTE(review): this assumes each row group holds about 60,000 rows — verify
# against the file's actual row-group size.
batch_size = 60000
target_rows = 1200000  # rows to collect before a part file is written
total_batches = 15     # number of part files to attempt
total_rows = 0         # running count of rows read across all rounds

parquet_file = pq.ParquetFile(filename)

num_row_groups = parquet_file.num_row_groups
print(f"Total row groups in file: {num_row_groups}")

# Number of consecutive row groups assigned to each round.
row_groups_per_batch = target_rows // batch_size

for i in range(total_batches):
    chunks = []
    current_rows = 0  # rows collected in this round only
    batch_start_row_group = i * row_groups_per_batch
    # min() already clamps the end index to the number of row groups in the file.
    batch_end_row_group = min(batch_start_row_group + row_groups_per_batch, num_row_groups)

    print(f"✅ 第 {i+1} 次處理：從行組 {batch_start_row_group} 到行組 {batch_end_row_group}")

    for row_group_idx in range(batch_start_row_group, batch_end_row_group):
        try:
            batch = parquet_file.read_row_groups([row_group_idx], columns=columns_to_read)
            chunk = batch.to_pandas()
        except Exception as e:
            # Best effort: report the unreadable row group and carry on with the next one.
            print(f"⚠️ 讀取行組 {row_group_idx} 時發生錯誤: {e}")
            continue

        if not chunk.empty:
            chunks.append(chunk)
            current_rows += len(chunk)
            total_rows += len(chunk)

        # Stop once THIS round has enough rows. Comparing the per-round counter
        # (instead of the cumulative total) keeps a large earlier round from
        # shortening the later ones.
        if current_rows >= target_rows:
            break

    if chunks:
        batch_df = pd.concat(chunks, ignore_index=True)

        # Each round is written to its own numbered part file.
        output_filename = f"/kaggle/working/train_part{i+1}.parquet"
        batch_df.to_parquet(output_filename, index=False)

        print(f"✅ 第 {i+1} 次存檔：{len(batch_df)} 筆，已累積 {total_rows} 筆")

        # Drop the large intermediates before the next round to keep memory flat.
        del batch_df, chunks
        gc.collect()
    else:
        print(f"⚠️ 第 {i+1} 次處理未讀取到任何資料，跳過該批次。")

In [None]:
# Draw a random sample from each of the 15 part files: up to 15,000 rows with
# binds = 0 and up to 5,000 rows with binds = 1 per file. Each per-file sample
# is saved on its own and all of them are merged into one Parquet file.
parquet_files = [f"/kaggle/working/train_part{i+1}.parquet" for i in range(0, 15)]

con = duckdb.connect()

# Per-file samples, concatenated once after the loop.
all_samples = []

try:
    for i, file in enumerate(parquet_files):
        # The export step skips rounds that read no data, so a part file may be absent.
        if not os.path.exists(file):
            print(f"⚠️ 找不到檔案 {file}，跳過。")
            continue

        print(f"📂 正在處理檔案: {file}")

        # The label column is named `binds` (see columns_to_read in the export step).
        df = con.query(f"""(SELECT * FROM parquet_scan('{file}')
                            WHERE binds = 0
                            ORDER BY random()
                            LIMIT 15000)
                            UNION ALL
                            (SELECT * FROM parquet_scan('{file}')
                            WHERE binds = 1
                            ORDER BY random()
                            LIMIT 5000)""").df()

        # Save this file's sample.
        output_filename = f"/kaggle/working/sampled_test_part{i+1}.parquet"
        df.to_parquet(output_filename, index=False)
        print(f"✅ 已儲存抽樣結果: {output_filename}（共 {len(df)} 筆）")

        all_samples.append(df)
finally:
    # Release the DuckDB connection even when a query or a write fails.
    con.close()

# Merge every per-file sample into a single frame and persist it.
final_test_df = pd.concat(all_samples, ignore_index=True)

final_test_output = "/kaggle/working/sampled_train_all.parquet"
final_test_df.to_parquet(final_test_output, index=False)
print(f"🎯 全部 15 個檔案已處理完畢，最終合併檔案: {final_test_output}（共 {len(final_test_df)} 筆）")

In [4]:
# Build a shuffled training sample from the merged sample file:
# up to 150,000 rows with binds = 0 and up to 50,000 rows with binds = 1.
train_path = '/kaggle/working/sampled_train_all.parquet'

con = duckdb.connect()

try:
    # Two queries, one progress tick each.
    with tqdm(total=2, desc="Processing Data") as pbar:
        # Rows labelled binds = 0.
        df_part1 = con.query(f"""SELECT *
                              FROM parquet_scan('{train_path}')
                              WHERE binds = 0
                              ORDER BY random()
                              LIMIT 150000""").df()
        pbar.update(1)

        # Rows labelled binds = 1.
        df_part2 = con.query(f"""SELECT *
                              FROM parquet_scan('{train_path}')
                              WHERE binds = 1
                              ORDER BY random()
                              LIMIT 50000""").df()
        pbar.update(1)
finally:
    # Release the connection even if one of the queries raises.
    con.close()

# Stack both classes into one frame.
df = pd.concat([df_part1, df_part2], ignore_index=True)

# Shuffle all rows (frac=1 keeps every row); the fixed seed makes the shuffle
# repeatable for a given input frame.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

Processing Data:   0%|          | 0/2 [00:00<?, ?it/s]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing Data:  50%|█████     | 1/2 [00:41<00:41, 41.29s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing Data: 100%|██████████| 2/2 [00:56<00:00, 28.36s/it]


In [5]:
# Preview the first rows; a bare last expression is rendered as a rich table,
# whereas print() falls back to wrapped plain text.
df.head()

          id                              buildingblock1_smiles  \
0  273353723  O=C(O)C[C@H](Cc1cccs1)NC(=O)OCC1c2ccccc2-c2ccc...   
1  248628175  O=C(O)C[C@@H](Cc1ccc(F)cc1)NC(=O)OCC1c2ccccc2-...   
2   83368046  O=C(NC[C@H]1CC[C@H](C(=O)O)CC1)OCC1c2ccccc2-c2...   
3   14589280  CC(C)(C)OC(=O)N1CCN(C(=O)OCC2c3ccccc3-c3ccccc3...   
4  221909719         O=C(Nc1cnccc1C(=O)O)OCC1c2ccccc2-c2ccccc21   

        buildingblock2_smiles         buildingblock3_smiles  \
0           Nc1cccc2[nH]ccc12               Cl.Nc1cc(O)ccn1   
1           COc1c(F)cc(N)cc1F    Cl.Cl.NCc1ccc(-n2cncn2)cc1   
2  Cl.NCC1CCN(c2ccc(Br)cc2)C1                  NCCc1ccncc1F   
3                 Cc1cnc(N)s1  CC1(C)OB(c2ccc(N)cc2)OC1(C)C   
4              NCC1(OCCO)CCC1               CC(CN)Sc1ccccc1   

                                     molecule_smiles protein_name  binds  
0  O=C(C[C@H](Cc1cccs1)Nc1nc(Nc2cc(O)ccn2)nc(Nc2c...          sEH      0  
1  COc1c(F)cc(Nc2nc(NCc3ccc(-n4cncn4)cc3)nc(N[C@@...          HSA   

In [6]:
# Morgan generators keyed by fingerprint size, built on first use and then reused
# so the generator is not reconstructed for every molecule.
_MORGAN_GENERATORS = {}


def smiles_to_morgan_fingerprint(smiles, n_bits=2048):
    """Convert a SMILES string into a radius-2 Morgan fingerprint array.

    Args:
        smiles: SMILES string to parse with RDKit.
        n_bits: Fingerprint size passed to the generator as ``fpSize``.

    Returns:
        A NumPy integer array built from the fingerprint. When RDKit cannot
        parse ``smiles`` (``MolFromSmiles`` returns ``None``), an all-zero
        array of length ``n_bits`` is returned instead.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return np.zeros(n_bits, dtype=int)

    generator = _MORGAN_GENERATORS.get(n_bits)
    if generator is None:
        generator = AllChem.GetMorganGenerator(radius=2, fpSize=n_bits)
        _MORGAN_GENERATORS[n_bits] = generator
    return np.array(generator.GetFingerprint(mol), dtype=int)

# Replace each SMILES string with its Morgan fingerprint, showing a progress bar.
# NOTE: this overwrites df["molecule_smiles"] in place, so the cell cannot be
# re-run without first rebuilding `df` from the sampling step.
tqdm.pandas(desc="Transforming molecule_smiles")
df["molecule_smiles"] = df["molecule_smiles"].progress_apply(smiles_to_morgan_fingerprint)

# One-hot encode the protein name into integer protein_* columns.
protein_one_hot = pd.get_dummies(df["protein_name"], prefix="protein").astype(int)

# Keep only id, fingerprint and label, followed by the one-hot protein columns.
# (A single concat; the earlier full-frame concat was discarded immediately.)
df_one_hot = pd.concat([df[["id", "molecule_smiles", "binds"]], protein_one_hot], axis=1)

Transforming molecule_smiles: 100%|██████████| 200000/200000 [05:16<00:00, 631.51it/s]


In [7]:
# Show the encoded frame as this cell's output.
df_one_hot

Unnamed: 0,id,molecule_smiles,binds,protein_BRD4,protein_HSA,protein_sEH
0,273353723,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",0,0,0,1
1,248628175,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,1,0
2,83368046,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,0,0,1
3,14589280,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,1,0
4,221909719,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",0,0,1,0
...,...,...,...,...,...,...
199995,161814009,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,1,0,0
199996,90661748,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,1
199997,231061522,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,1,0
199998,108976265,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,1


In [8]:
# Keep the id, fingerprint and label columns plus every one-hot protein column.
columns_to_keep = ["id", "molecule_smiles", "binds", *protein_one_hot.columns]
df_filtered = df_one_hot.loc[:, columns_to_keep]

# Write the transformed sample to Parquet (path is relative to the working directory).
output_filename = "train_transformed_morgan(150k,50k).parquet"
df_filtered.to_parquet(output_filename, index=False)

In [9]:
# Load the Morgan-fingerprint training sample from /kaggle/working and display it.
# Presumably this is the file saved by the export cell, which writes a relative
# path — that only matches if the working directory is /kaggle/working.
morganfile = '/kaggle/working/train_transformed_morgan(150k,50k).parquet'
morgan = pd.read_parquet(morganfile)
morgan

Unnamed: 0,id,molecule_smiles,binds,protein_BRD4,protein_HSA,protein_sEH
0,273353723,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",0,0,0,1
1,248628175,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,1,0
2,83368046,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,0,0,1
3,14589280,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,1,0
4,221909719,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",0,0,1,0
...,...,...,...,...,...,...
199995,161814009,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,1,0,0
199996,90661748,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,1
199997,231061522,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,1,0
199998,108976265,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,1


In [10]:
# Load the topological-fingerprint training sample from the attached input
# dataset and display it.
topolfile = '/kaggle/input/dataset1/Data_Transformed__TopologicalFingerprint(100k100k)/train_transformed__topological(100k,100k).parquet'
topol = pd.read_parquet(topolfile)
topol

Unnamed: 0,id,molecule_smiles,binds,protein_BRD4,protein_HSA,protein_sEH
0,285164593,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,1,0
1,25996919,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,1
2,164112964,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,1,0
3,24667451,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,1
4,267754999,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, ...",0,0,1,0
...,...,...,...,...,...,...
199995,83746448,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,0,0,1
199996,292275356,"[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,0,0,1
199997,276655525,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,0,1,0
199998,274412733,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,1,0,0


In [11]:
# Build a shuffled, class-balanced training sample straight from the full
# training file: up to 100,000 rows with binds = 0 and up to 100,000 with binds = 1.
train_path = '/kaggle/input/leash-BELKA/train.parquet'

con = duckdb.connect()

try:
    # Two queries, one progress tick each.
    with tqdm(total=2, desc="Processing Data") as pbar:
        # Rows labelled binds = 0.
        df_part1 = con.query(f"""SELECT *
                              FROM parquet_scan('{train_path}')
                              WHERE binds = 0
                              ORDER BY random()
                              LIMIT 100000""").df()
        pbar.update(1)

        # Rows labelled binds = 1.
        df_part2 = con.query(f"""SELECT *
                              FROM parquet_scan('{train_path}')
                              WHERE binds = 1
                              ORDER BY random()
                              LIMIT 100000""").df()
        pbar.update(1)
finally:
    # Release the connection even if one of the queries raises.
    con.close()

# Stack both classes into one frame.
df = pd.concat([df_part1, df_part2], ignore_index=True)

# Shuffle all rows (frac=1 keeps every row); the fixed seed makes the shuffle
# repeatable for a given input frame.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

Processing Data:   0%|          | 0/2 [00:00<?, ?it/s]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing Data:  50%|█████     | 1/2 [00:36<00:36, 36.28s/it]

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

Processing Data: 100%|██████████| 2/2 [00:52<00:00, 26.34s/it]


In [12]:
# Preview the first rows; a bare last expression is rendered as a rich table,
# whereas print() falls back to wrapped plain text.
df.head()

          id                              buildingblock1_smiles  \
0  136048134     O=C(Nc1c(Br)cccc1C(=O)O)OCC1c2ccccc2-c2ccccc21   
1  152714460  O=C(Nc1c(I)c(C(=O)O)c(I)c(C(=O)O)c1I)OCC1c2ccc...   
2   18142268        CC(C)CC(NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O   
3   49740501  Cc1cc(Br)cc(C(=O)O)c1NC(=O)OCC1c2ccccc2-c2ccccc21   
4  156634059  O=C(Nc1cc(-n2cccn2)ccc1C(=O)O)OCC1c2ccccc2-c2c...   

   buildingblock2_smiles  buildingblock3_smiles  \
0     Nc1ccc2c(c1)CNC2=O    NCc1cn2cc(Cl)ccc2n1   
1     COC(=O)c1cnc(N)cn1           COc1ccncc1CN   
2         N#Cc1cccc(N)n1  Cc1cc2cc(CN)ccc2[nH]1   
3           N#Cc1cccnc1N  Cc1cc2cc(CN)ccc2[nH]1   
4  Nc1c2ccccc2nc2ccccc12   NCc1cccc(C(F)(F)F)c1   

                                     molecule_smiles protein_name  binds  
0  O=C1NCc2cc(Nc3nc(NCc4cn5cc(Cl)ccc5n4)nc(Nc4c(B...         BRD4      1  
1  COC(=O)c1cnc(Nc2nc(NCc3cnccc3OC)nc(Nc3c(I)c(C(...         BRD4      0  
2  Cc1cc2cc(CNc3nc(Nc4cccc(C#N)n4)nc(NC(CC(C)C)C(...          sEH

In [None]:
# Morgan generators keyed by fingerprint size, built on first use and then reused
# so the generator is not reconstructed for every molecule.
_MORGAN_GENERATORS = {}


def smiles_to_morgan_fingerprint(smiles, n_bits=2048):
    """Convert a SMILES string into a radius-2 Morgan fingerprint array.

    Args:
        smiles: SMILES string to parse with RDKit.
        n_bits: Fingerprint size passed to the generator as ``fpSize``.

    Returns:
        A NumPy integer array built from the fingerprint. When RDKit cannot
        parse ``smiles`` (``MolFromSmiles`` returns ``None``), an all-zero
        array of length ``n_bits`` is returned instead.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return np.zeros(n_bits, dtype=int)

    generator = _MORGAN_GENERATORS.get(n_bits)
    if generator is None:
        generator = AllChem.GetMorganGenerator(radius=2, fpSize=n_bits)
        _MORGAN_GENERATORS[n_bits] = generator
    return np.array(generator.GetFingerprint(mol), dtype=int)

# Replace each SMILES string with its Morgan fingerprint, showing a progress bar.
# NOTE: this overwrites df["molecule_smiles"] in place, so the cell cannot be
# re-run without first rebuilding `df` from the sampling step.
tqdm.pandas(desc="Transforming molecule_smiles")
df["molecule_smiles"] = df["molecule_smiles"].progress_apply(smiles_to_morgan_fingerprint)

# One-hot encode the protein name into integer protein_* columns.
protein_one_hot = pd.get_dummies(df["protein_name"], prefix="protein").astype(int)

# Keep only id, fingerprint and label, followed by the one-hot protein columns.
# (A single concat; the earlier full-frame concat was discarded immediately.)
df_one_hot = pd.concat([df[["id", "molecule_smiles", "binds"]], protein_one_hot], axis=1)

Transforming molecule_smiles:  41%|████      | 82419/200000 [02:10<03:04, 636.56it/s]

In [None]:
# Show the encoded frame as this cell's output.
df_one_hot

In [None]:
# Keep the id, fingerprint and label columns plus every one-hot protein column.
columns_to_keep = ["id", "molecule_smiles", "binds", *protein_one_hot.columns]
df_filtered = df_one_hot.loc[:, columns_to_keep]

# Write the transformed sample to Parquet (path is relative to the working directory).
output_filename = "train_transformed_morgan(100k,100k).parquet"
df_filtered.to_parquet(output_filename, index=False)

In [9]:
# Load the Morgan-fingerprint training sample from the attached `train1` input
# dataset and display it.
morganfile = '/kaggle/input/train1/train_transformed_morgan(100k100k).parquet'
morgan = pd.read_parquet(morganfile)
morgan

Unnamed: 0,id,molecule_smiles,binds,protein_BRD4,protein_HSA,protein_sEH
0,136048134,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,1,0,0
1,152714460,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",0,1,0,0
2,18142268,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,0,0,1
3,49740501,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,1,0,0
4,156634059,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,1,0,0
...,...,...,...,...,...,...
199995,82456949,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,0,0,1
199996,76774351,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,0,1,0
199997,272008538,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,0,0,1
199998,47355506,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,0,0,1


In [10]:
# Load the Morgan-fingerprint file from the `testset` input dataset and display it.
# NOTE(review): this rebinds `morganfile` and `morgan`, so the frame previously
# held under `morgan` is no longer reachable by that name after this cell runs.
morganfile = '/kaggle/input/testset/test_transformed__morgan(180k20k).parquet'
morgan = pd.read_parquet(morganfile)
morgan

Unnamed: 0,id,molecule_smiles,binds,protein_BRD4,protein_HSA,protein_sEH
0,294957690,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,1,0,0
1,292929104,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,1
2,139650626,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,1
3,185087966,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,0,1
4,46267039,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0,1,0
...,...,...,...,...,...,...
199995,84824826,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",0,1,0,0
199996,234107412,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,1,0,0
199997,147934901,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,0,0,1
199998,90180048,"[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,1,0,0
