<a href="https://colab.research.google.com/github/t8101349/group-project-202503/blob/main/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install rdkit
!pip install numpy
!pip install pandas

In [None]:
import os
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from joblib import Parallel, delayed

# Convert a single SMILES string into a Morgan fingerprint bit vector
def smiles_to_morgan_fingerprint(smiles, n_bits=2048):
    """Return the radius-2 Morgan fingerprint of *smiles* as an integer array.

    Args:
        smiles: SMILES string describing the molecule.
        n_bits: Length of the fingerprint (number of bits).

    Returns:
        A 1-D NumPy array of length ``n_bits`` with dtype ``int`` holding the
        fingerprint bits. An all-zero array is returned when *smiles* is not
        a string or cannot be parsed by RDKit.
    """
    # Non-string values (e.g. the NaN pandas produces for an empty CSV cell)
    # are not accepted by MolFromSmiles; treat them like an unparseable SMILES
    # instead of letting one bad row abort the whole batch.
    if not isinstance(smiles, str):
        return np.zeros(n_bits, dtype=int)

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # RDKit signals a parse failure by returning None.
        return np.zeros(n_bits, dtype=int)

    generator = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=n_bits)
    return np.array(generator.GetFingerprint(mol), dtype=int)

# Convert many SMILES strings to fingerprints using parallel workers
def parallel_smiles_conversion(smiles_series, n_jobs=4):
    """Apply ``smiles_to_morgan_fingerprint`` to every item of *smiles_series*.

    Args:
        smiles_series: Iterable of SMILES values (e.g. a pandas Series).
        n_jobs: Number of joblib workers to use.

    Returns:
        The per-item fingerprints as returned by joblib's ``Parallel`` call,
        one entry per input item.
    """
    worker_pool = Parallel(n_jobs=n_jobs, backend='loky')
    tasks = (delayed(smiles_to_morgan_fingerprint)(item) for item in smiles_series)
    return worker_pool(tasks)

# Load the input file
input_file = '1030_40_data.csv'
df_test = pd.read_csv(input_file)

# Batch-processing parameters
batch_size = 100_000  # rows per batch; adjust to the available memory
n_batches = (len(df_test) + batch_size - 1) // batch_size
n_jobs = 4  # joblib workers used for fingerprint generation

# Fix the set of protein one-hot columns from the WHOLE dataset. Encoding each
# batch independently would drop the column of any protein missing from that
# batch, and the final concat would then fill the gaps with NaN (float columns).
protein_dtype = pd.CategoricalDtype(sorted(df_test["protein_name"].dropna().unique()))

# Directory for the intermediate per-batch results
output_X_dir = 'temp_X_batches'
os.makedirs(output_X_dir, exist_ok=True)

# Paths of the batch files written by THIS run, in row order
batch_files = []

for i in range(n_batches):
    start_idx = i * batch_size
    end_idx = min(start_idx + batch_size, len(df_test))
    batch_df = df_test.iloc[start_idx:end_idx]

    print(f"Processing batch {i+1}/{n_batches} ({start_idx} to {end_idx})")

    # Convert the "molecule_smiles" column of this batch to fingerprints and
    # stack them straight into an int8 matrix (one row per molecule). Casting
    # here avoids a platform-dependent int64-only conversion afterwards.
    fingerprints = parallel_smiles_conversion(batch_df["molecule_smiles"], n_jobs=n_jobs)
    fingerprints_df = pd.DataFrame(np.asarray(fingerprints, dtype=np.int8))

    # One-hot encode the protein name with the dataset-wide categories; reset
    # the index so rows line up positionally with fingerprints_df.
    protein_onehot = pd.get_dummies(
        batch_df["protein_name"].astype(protein_dtype),
        prefix="protein",
        dtype=np.int8,
    ).reset_index(drop=True)

    X_batch = pd.concat([fingerprints_df, protein_onehot], axis=1)
    # Parquet requires string column names
    X_batch.columns = X_batch.columns.astype(str)

    # Save the current batch to a temporary file
    batch_file = os.path.join(output_X_dir, f'X_batch_{i}.parquet')
    X_batch.to_parquet(batch_file)
    batch_files.append(batch_file)

    # Free memory before the next batch
    del batch_df, fingerprints, fingerprints_df, protein_onehot, X_batch

# Merge all X batches in the order they were produced. Listing the directory
# instead would return files in arbitrary order (and pick up stale files from
# earlier runs), which would misalign the rows of X with the labels in y.
X_test = pd.concat([pd.read_parquet(path) for path in batch_files],
                   axis=0, ignore_index=True)

# Build y
df_test['binds'] = df_test['binds'].astype(np.int8)
y_test = df_test['binds'].reset_index(drop=True)  # reset to a contiguous RangeIndex

# Save X_test as a Parquet file
X_test.to_parquet('mg1030_X.parquet', index=False)

# Convert y_test to a DataFrame and save it as a Parquet file
y_test.to_frame().to_parquet('mg1030_y.parquet', index=False)

# Optional: remove the temporary directory
import shutil
shutil.rmtree(output_X_dir)

# Restart the kernel afterwards to release memory