In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/leash-BELKA/sample_submission.csv
/kaggle/input/leash-BELKA/train.parquet
/kaggle/input/leash-BELKA/test.parquet
/kaggle/input/leash-BELKA/train.csv
/kaggle/input/leash-BELKA/test.csv
/kaggle/input/new-xgb/pytorch/default/1/15_05_new_xgb_model.bin


In [3]:

!pip install pandas
!pip install tqdm
!pip install pickle
!pip install rdkit
!pip install numpy
     

[31mERROR: Could not find a version that satisfies the requirement pickle (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for pickle[0m[31m
[0mCollecting rdkit
  Downloading rdkit-2024.9.6-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2024.9.6-cp310-cp310-manylinux_2_28_x86_64.whl (34.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.3/34.3 MB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.9.6


In [None]:
# 分布統計
import pandas as pd
from tqdm import tqdm
import pickle

# 參數設定
filename = "/kaggle/input/leash-BELKA/train.csv"
chunksize = 1_000_000
bb_columns = ["buildingblock1_smiles", "buildingblock2_smiles", "buildingblock3_smiles"]

# 計算分層分布
def compute_strata_counts(bb_col):
    print(f"🔍 統計 {bb_col} 分層分布...")
    strata_counts = {"pos": {}, "neg": {}}
    for chunk in tqdm(pd.read_csv(filename, chunksize=chunksize, usecols=[bb_col, "binds"]), desc=f"Counting {bb_col}"):
        for bb, group in chunk.groupby(bb_col):
            pos_count = len(group[group["binds"] == 1])
            neg_count = len(group[group["binds"] == 0])
            strata_counts["pos"][bb] = strata_counts["pos"].get(bb, 0) + pos_count
            strata_counts["neg"][bb] = strata_counts["neg"].get(bb, 0) + neg_count
    total_pos = sum(strata_counts["pos"].values())
    total_neg = sum(strata_counts["neg"].values())
    return {"strata_counts": strata_counts, "total_pos": total_pos, "total_neg": total_neg}

# 執行並儲存
strata_data = {}
for bb_col in bb_columns:
    strata_data[bb_col] = compute_strata_counts(bb_col)
    print(f"{bb_col} - 總正類: {strata_data[bb_col]['total_pos']}, 總負類: {strata_data[bb_col]['total_neg']}")

# 儲存預處理資料
with open("strata_data.pkl", "wb") as f:
    pickle.dump(strata_data, f)
print("✅ 分布統計已儲存至 'strata_data.pkl'")
     

# 抽樣
import pandas as pd
from tqdm import tqdm
import pickle

# 載入預處理資料
with open("strata_data.pkl", "rb") as f:
    strata_data = pickle.load(f)



In [None]:
# 參數設定 可調整特徵抽樣比例與正負類比例
filename = "/kaggle/input/leash-BELKA/train.csv"
chunksize = 1_000_000
targets = [
    {"bb": "buildingblock1_smiles", "pos": 80000, "neg": 240000}, # 特徵一
    {"bb": "buildingblock2_smiles", "pos": 10000, "neg": 30000}, # 特徵二
    {"bb": "buildingblock3_smiles", "pos": 10000, "neg": 30000}, # 特徵三
]

# 抽樣函數
def stratified_sample(bb_col, pos_target, neg_target, strata_data):
    strata_counts = strata_data[bb_col]["strata_counts"]
    total_pos = strata_data[bb_col]["total_pos"]
    total_neg = strata_data[bb_col]["total_neg"]

    print(f"🎲 進行 {bb_col} 分層抽樣...")
    pos_samples = []
    neg_samples = []
    required_cols = ["molecule_smiles", "buildingblock1_smiles", "buildingblock2_smiles", "buildingblock3_smiles", "protein_name", "binds"]

    for chunk in tqdm(pd.read_csv(filename, chunksize=chunksize, usecols=required_cols), desc=f"Sampling {bb_col}"):
        for bb, group in chunk.groupby(bb_col):
            pos_chunk = group[group["binds"] == 1]
            neg_chunk = group[group["binds"] == 0]

            pos_size = min(len(pos_chunk), int(pos_target * (strata_counts["pos"].get(bb, 0) / total_pos)))
            neg_size = min(len(neg_chunk), int(neg_target * (strata_counts["neg"].get(bb, 0) / total_neg)))

            if pos_size > 0 and len(pos_samples) < pos_target:
                pos_sample = pos_chunk.sample(n=min(pos_size, pos_target - len(pos_samples)), random_state=42)
                pos_samples.append(pos_sample)

            if neg_size > 0 and len(neg_samples) < neg_target:
                neg_sample = neg_chunk.sample(n=min(neg_size, neg_target - len(neg_samples)), random_state=42)
                neg_samples.append(neg_sample)

        if len(pos_samples) >= pos_target and len(neg_samples) >= neg_target:
            break

    df = pd.concat(pos_samples + neg_samples, ignore_index=True)
    df_pos = df[df["binds"] == 1].sample(n=min(pos_target, len(df[df["binds"] == 1])), random_state=42)
    df_neg = df[df["binds"] == 0].sample(n=min(neg_target, len(df[df["binds"] == 0])), random_state=42)
    return pd.concat([df_pos, df_neg], ignore_index=True)

# 執行分層抽樣
train_dfs = []
for target in targets:
    df = stratified_sample(target["bb"], target["pos"], target["neg"], strata_data)
    train_dfs.append(df)

# 合併樣本
train_df = pd.concat(train_dfs, ignore_index=True).sample(frac=1, random_state=42)

# 檢查結果
print("🔎 檢查建構塊分布...")
bb1_unique = train_df["buildingblock1_smiles"].nunique()
bb2_unique = train_df["buildingblock2_smiles"].nunique()
bb3_unique = train_df["buildingblock3_smiles"].nunique()

print(f"總樣本數: {len(train_df)}")
print(f"正類記錄數: {len(train_df[train_df['binds'] == 1])}")
print(f"負類記錄數: {len(train_df[train_df['binds'] == 0])}")
print(f"獨特分子數: {train_df['molecule_smiles'].nunique()}")
print(f"buildingblock1_smiles相異計數: {bb1_unique}（原始271）")
print(f"buildingblock2_smiles相異計數: {bb2_unique}（原始693）")
print(f"buildingblock3_smiles相異計數: {bb3_unique}（原始872）")

# 儲存結果
train_df.to_csv("1030_40_data.csv", index=False)
     

In [None]:
import os
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from joblib import Parallel, delayed

# 定義 SMILES 轉換函數
def smiles_to_morgan_fingerprint(smiles, n_bits=2048):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return np.zeros(n_bits, dtype=int)
    else:
        generator = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=n_bits)
        return np.array(generator.GetFingerprint(mol), dtype=int)

# 並行處理 SMILES 轉換
def parallel_smiles_conversion(smiles_series, n_jobs=4):
    results = Parallel(n_jobs=n_jobs, backend='loky')(
        delayed(smiles_to_morgan_fingerprint)(smiles) for smiles in smiles_series
    )
    return results

# 載入檔案
input_file = '/kaggle/working/1030_40_data.csv'
df_test = pd.read_csv(input_file)

# 分批處理參數
batch_size = 100_000  # 每批處理 10 萬筆，可根據記憶體情況調整
n_batches = (len(df_test) + batch_size - 1) // batch_size

# 儲存中間結果
output_X_dir = 'temp_X_batches'
os.makedirs(output_X_dir, exist_ok=True)

for i in range(n_batches):
    start_idx = i * batch_size
    end_idx = min((i + 1) * batch_size, len(df_test))
    batch_df = df_test.iloc[start_idx:end_idx].copy()

    print(f"Processing batch {i+1}/{n_batches} ({start_idx} to {end_idx})")

    # 對當前批次的 "molecule_smiles" 進行轉換
    batch_df["molecule_smiles"] = parallel_smiles_conversion(batch_df["molecule_smiles"], n_jobs=4)

    # 轉換為指紋數據框
    fingerprints_df = pd.DataFrame(batch_df['molecule_smiles'].to_list())
    protein_onehot = pd.get_dummies(batch_df["protein_name"], prefix="protein").astype(int).reset_index(drop=True)
    X_batch = pd.concat([fingerprints_df, protein_onehot], axis=1)
    X_batch.columns = X_batch.columns.astype(str)

    # X 轉成 int8
    int_cols = X_batch.select_dtypes(include=['int64']).columns
    for col in int_cols:
        X_batch[col] = X_batch[col].astype(np.int8)

    # 儲存當前批次到臨時檔案
    batch_file = os.path.join(output_X_dir, f'X_batch_{i}.parquet')
    X_batch.to_parquet(batch_file)

    # 清理記憶體
    del batch_df, fingerprints_df, protein_onehot, X_batch

# 合併所有 X 批次
X_test = pd.concat([pd.read_parquet(os.path.join(output_X_dir, f))
                   for f in os.listdir(output_X_dir) if f.endswith('.parquet')],
                   axis=0)

# 處理 y
df_test['binds'] = df_test['binds'].astype(np.int8)
y_test = df_test['binds'].reset_index(drop=True)  # 重置索引為連續的 RangeIndex

from sklearn.utils import shuffle

# 打亂數據，但保持 X 和 y 的對應關係
X_test, y_test = shuffle(X_test, y_test, random_state=42)

# 再次儲存為 Parquet
X_test.to_parquet('mg1030_X.parquet', index=False)
y_test.to_frame().to_parquet('mg1030_y.parquet', index=False)

'''
# 儲存 X_test 為 Parquet 檔案
X_test.to_parquet('mg1030_X.parquet', index=False)

# 將 y_test 轉換為 DataFrame 並儲存為 Parquet 檔案
y_test.to_frame().to_parquet('mg1030_y.parquet', index=False)
'''

# 可選：清理臨時目錄
import shutil
shutil.rmtree('temp_X_batches')

# restart kernal 釋放記憶體

In [None]:
X_train = pd.read_parquet('/kaggle/working/mg1030_X.parquet')
y_train = pd.read_parquet('/kaggle/working/mg1030_y.parquet')

In [None]:
X_train.shape

In [None]:
# -------------------- XGBoost --------------------
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from scipy.stats import randint
# 最佳 param
xgb_model = XGBClassifier(colsample_bytree=0.7, gamma=0.3, learning_rate=0.5,
                          max_depth=11, n_estimators=258, reg_alpha=0, reg_lambda=10,
                          subsample=1.0)
xgb_model.fit(X_train, y_train)


In [None]:
#儲存模型
import pickle
with open("/kaggle/working/15_05_new_xgb_model.bin", "wb") as f:
    pickle.dump(xgb_model, f)

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score
from sklearn.preprocessing import OneHotEncoder

import duckdb
import pandas as pd
from tqdm import tqdm
import numpy as np # linear algebra


In [5]:
import pickle
xg =  open("/kaggle/input/new-xgb/pytorch/default/1/15_05_new_xgb_model.bin", "rb")
xgb_15_05_model =  pickle.load(xg) #載入model
xgb_15_05_model

In [6]:
def smiles_to_morgan_fingerprint(smiles, n_bits=2048):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return np.zeros(n_bits, dtype=int)
    else:
        generator = AllChem.GetMorganGenerator(radius=2, fpSize=n_bits)
        return np.array(generator.GetFingerprint(mol), dtype=int)


In [7]:
import os

# Process the test.parquet file chunk by chunk
test_file = '/kaggle/input/leash-BELKA/test.csv' #載入檔案名稱

df_test = pd.read_csv(test_file)

In [8]:
df_test.shape

(1674896, 6)

In [9]:
from rdkit.Chem import AllChem
from rdkit import Chem

output_file = 'submission15_05_.csv'  # 輸出檔案名稱

# Read the test.parquet file into a pandas DataFrame
for df_test in pd.read_csv(test_file, chunksize=104681):
    
    
    # 對 "molecule_smiles" 欄位進行轉換並顯示進度條
    tqdm.pandas(desc="Transforming molecule_smiles")
    df_test["molecule_smiles"] = df_test["molecule_smiles"].progress_apply(lambda x: smiles_to_morgan_fingerprint(x))
    df_test.columns = df_test.columns.astype(str)
    
    
    # 轉成int8
    int_cols = df_test.select_dtypes(include=['int64']).columns
    for col in int_cols:
        df_test[col] = df_test[col].astype(np.int8)
    
    
    
    fingerprints_df = pd.DataFrame(df_test['molecule_smiles'].to_list())
    print(f"fingerprints_df shape: {fingerprints_df.shape}")  # 應該是 (104681, 2048)
    
    protein_onehot = pd.get_dummies(df_test["protein_name"], prefix="protein").astype(int).reset_index(drop=True)
    print(f"protein_onehot shape: {protein_onehot.shape}")  # 應該是 (104681, X)
    
    X_test = pd.concat([fingerprints_df, protein_onehot], axis=1)
    print(f"X_test shape: {X_test.shape}")  # 應該是 (104681, 2048 + X)

    
    print(X_test)
    
    # Predict the probabilities
    probabilities = xgb_15_05_model.predict_proba(X_test)[:, 1]
    
    threshold = 0.5
    predictions = (probabilities >= threshold).astype(int)
    
    # 產生新的 id，範圍從 295246830 到 296921725
    df_test['id'] = range(295246830, 295246830 + len(df_test))
    
    # 建立輸出 DataFrame
    output_df = pd.DataFrame({'id': df_test['id'], 'binds': predictions})
    
    
    # Save the output DataFrame to a CSV file
    output_df.to_csv(output_file, index=False, mode='a', header=not os.path.exists(output_file))

Transforming molecule_smiles: 100%|██████████| 104681/104681 [03:22<00:00, 518.03it/s]


fingerprints_df shape: (104681, 2048)
protein_onehot shape: (104681, 3)
X_test shape: (104681, 2051)
        0  1  2  3  4  5  6  7  8  9  ...  2041  2042  2043  2044  2045  2046  \
0       0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
1       0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
2       0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
3       0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
4       0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
...    .. .. .. .. .. .. .. .. .. ..  ...   ...   ...   ...   ...   ...   ...   
104676  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104677  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104678  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104679  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104680  

Transforming molecule_smiles: 100%|██████████| 104681/104681 [03:24<00:00, 512.05it/s]


fingerprints_df shape: (104681, 2048)
protein_onehot shape: (104681, 3)
X_test shape: (104681, 2051)
        0  1  2  3  4  5  6  7  8  9  ...  2041  2042  2043  2044  2045  2046  \
0       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
1       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
2       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
3       0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
4       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
...    .. .. .. .. .. .. .. .. .. ..  ...   ...   ...   ...   ...   ...   ...   
104676  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104677  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104678  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104679  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104680  

Transforming molecule_smiles: 100%|██████████| 104681/104681 [03:16<00:00, 533.46it/s]


fingerprints_df shape: (104681, 2048)
protein_onehot shape: (104681, 3)
X_test shape: (104681, 2051)
        0  1  2  3  4  5  6  7  8  9  ...  2041  2042  2043  2044  2045  2046  \
0       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
1       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
2       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
3       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
4       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
...    .. .. .. .. .. .. .. .. .. ..  ...   ...   ...   ...   ...   ...   ...   
104676  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104677  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104678  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104679  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104680  

Transforming molecule_smiles: 100%|██████████| 104681/104681 [03:35<00:00, 485.45it/s]


fingerprints_df shape: (104681, 2048)
protein_onehot shape: (104681, 3)
X_test shape: (104681, 2051)
        0  1  2  3  4  5  6  7  8  9  ...  2041  2042  2043  2044  2045  2046  \
0       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
1       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
2       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
3       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
4       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
...    .. .. .. .. .. .. .. .. .. ..  ...   ...   ...   ...   ...   ...   ...   
104676  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104677  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104678  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104679  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104680  

Transforming molecule_smiles: 100%|██████████| 104681/104681 [03:28<00:00, 502.91it/s]


fingerprints_df shape: (104681, 2048)
protein_onehot shape: (104681, 3)
X_test shape: (104681, 2051)
        0  1  2  3  4  5  6  7  8  9  ...  2041  2042  2043  2044  2045  2046  \
0       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
1       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
2       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
3       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
4       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
...    .. .. .. .. .. .. .. .. .. ..  ...   ...   ...   ...   ...   ...   ...   
104676  0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104677  0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104678  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104679  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104680  

Transforming molecule_smiles: 100%|██████████| 104681/104681 [03:31<00:00, 495.11it/s]


fingerprints_df shape: (104681, 2048)
protein_onehot shape: (104681, 3)
X_test shape: (104681, 2051)
        0  1  2  3  4  5  6  7  8  9  ...  2041  2042  2043  2044  2045  2046  \
0       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
1       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
2       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
3       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
4       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
...    .. .. .. .. .. .. .. .. .. ..  ...   ...   ...   ...   ...   ...   ...   
104676  0  0  0  0  0  1  0  0  0  0  ...     0     0     0     0     0     0   
104677  0  0  0  0  0  1  0  0  0  0  ...     0     0     0     0     0     0   
104678  0  0  0  0  0  1  0  0  0  0  ...     0     0     0     0     0     0   
104679  0  0  0  0  0  1  0  0  0  0  ...     0     0     0     0     0     0   
104680  

Transforming molecule_smiles: 100%|██████████| 104681/104681 [03:40<00:00, 474.83it/s]


fingerprints_df shape: (104681, 2048)
protein_onehot shape: (104681, 3)
X_test shape: (104681, 2051)
        0  1  2  3  4  5  6  7  8  9  ...  2041  2042  2043  2044  2045  2046  \
0       0  0  0  0  0  1  0  0  0  0  ...     0     0     0     0     0     0   
1       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
2       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
3       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
4       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
...    .. .. .. .. .. .. .. .. .. ..  ...   ...   ...   ...   ...   ...   ...   
104676  0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104677  0  1  0  0  0  0  0  0  0  0  ...     0     0     0     1     0     0   
104678  0  1  0  0  0  0  0  0  0  0  ...     0     0     0     1     0     0   
104679  0  1  0  0  0  0  0  0  0  0  ...     0     0     0     1     0     0   
104680  

Transforming molecule_smiles: 100%|██████████| 104681/104681 [03:29<00:00, 498.61it/s]


fingerprints_df shape: (104681, 2048)
protein_onehot shape: (104681, 3)
X_test shape: (104681, 2051)
        0  1  2  3  4  5  6  7  8  9  ...  2041  2042  2043  2044  2045  2046  \
0       0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
1       0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
2       0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
3       0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
4       0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
...    .. .. .. .. .. .. .. .. .. ..  ...   ...   ...   ...   ...   ...   ...   
104676  0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     1     0   
104677  0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     1     0   
104678  0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104679  0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104680  

Transforming molecule_smiles: 100%|██████████| 104681/104681 [03:30<00:00, 496.79it/s]


fingerprints_df shape: (104681, 2048)
protein_onehot shape: (104681, 3)
X_test shape: (104681, 2051)
        0  1  2  3  4  5  6  7  8  9  ...  2041  2042  2043  2044  2045  2046  \
0       0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
1       0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
2       0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
3       0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
4       0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
...    .. .. .. .. .. .. .. .. .. ..  ...   ...   ...   ...   ...   ...   ...   
104676  0  1  0  0  0  0  0  0  0  1  ...     0     0     0     0     0     0   
104677  0  0  0  0  0  0  0  0  0  1  ...     0     0     0     0     0     0   
104678  0  0  0  0  0  0  0  0  0  1  ...     0     0     0     0     0     0   
104679  0  0  0  0  0  0  0  0  0  1  ...     0     0     0     0     0     0   
104680  

Transforming molecule_smiles: 100%|██████████| 104681/104681 [03:45<00:00, 464.61it/s]


fingerprints_df shape: (104681, 2048)
protein_onehot shape: (104681, 3)
X_test shape: (104681, 2051)
        0  1  2  3  4  5  6  7  8  9  ...  2041  2042  2043  2044  2045  2046  \
0       0  0  0  0  0  0  0  0  0  1  ...     0     0     0     0     0     0   
1       0  0  0  0  0  0  0  0  0  1  ...     0     0     0     0     0     0   
2       0  0  0  0  0  0  0  0  0  1  ...     0     0     0     0     0     0   
3       0  0  0  0  0  0  0  0  0  1  ...     0     0     0     0     0     0   
4       0  0  0  0  0  0  0  0  0  1  ...     0     0     0     0     0     0   
...    .. .. .. .. .. .. .. .. .. ..  ...   ...   ...   ...   ...   ...   ...   
104676  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104677  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104678  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104679  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104680  

Transforming molecule_smiles: 100%|██████████| 104681/104681 [03:33<00:00, 490.41it/s]


fingerprints_df shape: (104681, 2048)
protein_onehot shape: (104681, 3)
X_test shape: (104681, 2051)
        0  1  2  3  4  5  6  7  8  9  ...  2041  2042  2043  2044  2045  2046  \
0       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
1       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
2       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
3       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
4       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
...    .. .. .. .. .. .. .. .. .. ..  ...   ...   ...   ...   ...   ...   ...   
104676  0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104677  0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104678  0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104679  0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104680  

Transforming molecule_smiles: 100%|██████████| 104681/104681 [03:34<00:00, 487.88it/s]


fingerprints_df shape: (104681, 2048)
protein_onehot shape: (104681, 3)
X_test shape: (104681, 2051)
        0  1  2  3  4  5  6  7  8  9  ...  2041  2042  2043  2044  2045  2046  \
0       0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
1       0  1  0  0  1  0  0  0  0  0  ...     0     0     0     0     0     0   
2       0  1  0  0  1  0  0  0  0  0  ...     0     0     0     0     0     0   
3       0  1  0  0  1  0  0  0  0  0  ...     0     0     0     0     0     0   
4       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
...    .. .. .. .. .. .. .. .. .. ..  ...   ...   ...   ...   ...   ...   ...   
104676  0  1  0  0  0  0  0  0  1  0  ...     0     0     0     0     0     0   
104677  0  0  1  0  0  0  0  0  1  0  ...     0     0     0     0     0     0   
104678  0  0  1  0  0  0  0  0  1  0  ...     0     0     0     0     0     0   
104679  0  0  1  0  0  0  0  0  1  0  ...     0     0     0     0     0     0   
104680  

Transforming molecule_smiles: 100%|██████████| 104681/104681 [03:38<00:00, 478.88it/s]


fingerprints_df shape: (104681, 2048)
protein_onehot shape: (104681, 3)
X_test shape: (104681, 2051)
        0  1  2  3  4  5  6  7  8  9  ...  2041  2042  2043  2044  2045  2046  \
0       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
1       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
2       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
3       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
4       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
...    .. .. .. .. .. .. .. .. .. ..  ...   ...   ...   ...   ...   ...   ...   
104676  0  1  0  0  0  1  0  0  0  0  ...     0     0     0     0     0     0   
104677  0  1  0  0  0  1  0  0  0  0  ...     0     0     0     0     0     0   
104678  0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104679  0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104680  

Transforming molecule_smiles: 100%|██████████| 104681/104681 [03:45<00:00, 464.30it/s]


fingerprints_df shape: (104681, 2048)
protein_onehot shape: (104681, 3)
X_test shape: (104681, 2051)
        0  1  2  3  4  5  6  7  8  9  ...  2041  2042  2043  2044  2045  2046  \
0       0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
1       0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
2       0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
3       0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
4       0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
...    .. .. .. .. .. .. .. .. .. ..  ...   ...   ...   ...   ...   ...   ...   
104676  0  1  0  0  0  0  0  1  0  0  ...     0     0     0     0     0     0   
104677  0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104678  0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104679  0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104680  

Transforming molecule_smiles: 100%|██████████| 104681/104681 [03:30<00:00, 497.41it/s]


fingerprints_df shape: (104681, 2048)
protein_onehot shape: (104681, 3)
X_test shape: (104681, 2051)
        0  1  2  3  4  5  6  7  8  9  ...  2041  2042  2043  2044  2045  2046  \
0       0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
1       0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
2       0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
3       0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
4       0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
...    .. .. .. .. .. .. .. .. .. ..  ...   ...   ...   ...   ...   ...   ...   
104676  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104677  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104678  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104679  0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104680  

Transforming molecule_smiles: 100%|██████████| 104681/104681 [03:33<00:00, 490.49it/s]


fingerprints_df shape: (104681, 2048)
protein_onehot shape: (104681, 3)
X_test shape: (104681, 2051)
        0  1  2  3  4  5  6  7  8  9  ...  2041  2042  2043  2044  2045  2046  \
0       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
1       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
2       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
3       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
4       0  0  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
...    .. .. .. .. .. .. .. .. .. ..  ...   ...   ...   ...   ...   ...   ...   
104676  0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104677  0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104678  0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104679  0  1  0  0  0  0  0  0  0  0  ...     0     0     0     0     0     0   
104680  

In [12]:
import pandas as pd

input_file = '/kaggle/working/submission15_05_.csv'
output_file = '/kaggle/working/submission15_05_new.csv'
# 讀取已儲存的 CSV
output_df = pd.read_csv(input_file)

print(len(output_df))

# 修改 id 欄位
output_df['id'] = range(295246830, 295246830 + len(output_df))

print(output_df.shape)

# 將修改後的 DataFrame 儲存回 CSV
output_df.to_csv(output_file, index=False)

1674896
(1674896, 2)
