In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and echo the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pyarrow.parquet as pq
import pandas as pd
import gc
import duckdb
import os
from tqdm import tqdm  # 引入進度條

# Split the big training parquet into up to `total_batches` smaller part files,
# reading one row group at a time and keeping only the columns needed downstream.
filename = "/kaggle/input/leash-BELKA/train.parquet"
columns_to_read = ["molecule_smiles", "protein_name", "binds"]

# NOTE(review): batch_size is only used to derive row_groups_per_batch below, i.e.
# it presumes a row group holds about batch_size rows — confirm against the file.
batch_size = 60000
target_rows = 1200000  # a batch stops reading once it holds at least this many rows
total_batches = 15  # number of part files to attempt
total_rows = 0  # rows accumulated over all batches

parquet_file = pq.ParquetFile(filename)
num_row_groups = parquet_file.num_row_groups
row_groups_per_batch = target_rows // batch_size  # width of each batch's row-group window

for i in range(total_batches):
    chunks = []
    # Window of row groups assigned to this batch, clipped to the end of the file.
    batch_start_row_group = i * row_groups_per_batch
    batch_end_row_group = min(batch_start_row_group + row_groups_per_batch, num_row_groups)
    current_rows = 0  # rows accumulated in the current batch

    for row_group_idx in range(batch_start_row_group, batch_end_row_group):
        try:
            batch = parquet_file.read_row_groups([row_group_idx], columns=columns_to_read)
            chunk = batch.to_pandas()

            if not chunk.empty:
                chunks.append(chunk)
                current_rows += len(chunk)
                total_rows += len(chunk)

            if current_rows >= target_rows:
                break  # target reached: the rest of this batch's window is skipped
        except Exception as e:
            # Best effort: report the failing row group and move on to the next one.
            print(f"⚠️ 讀取行組 {row_group_idx} 時發生錯誤: {e}")

    if chunks:
        batch_df = pd.concat(chunks, ignore_index=True)
        output_filename = f"/kaggle/working/train_part{i+1}.parquet"
        batch_df.to_parquet(output_filename, index=False)

        print(f"✅ 第 {i+1} 次存檔：{len(batch_df)} 筆，已累積 {total_rows} 筆")

        # Release the batch before reading the next one.
        del batch_df
        gc.collect()
    else:
        # Nothing was read (window past the end of the file, or every read failed).
        print(f"⚠️ 第 {i+1} 次處理未讀取到任何資料，跳過該批次。")


# **DuckDB sampling**
# For each part file written above, draw up to 15,000 random non-binders and up to
# 5,000 random binders, save the per-part sample, then merge all samples.
# The part list follows total_batches instead of a second hard-coded 15.
parquet_files = [f"/kaggle/working/train_part{i+1}.parquet" for i in range(total_batches)]
con = duckdb.connect()
all_samples = []

for i, file in enumerate(parquet_files):
    if not os.path.exists(file):
        print(f"❌ 檔案不存在: {file}")
        continue

    print(f"📂 正在處理檔案: {file}")

    # ORDER BY random() + LIMIT picks a random subset of each class.
    query = f"""
    (SELECT * FROM parquet_scan('{file}')
        WHERE "binds" = 0
        ORDER BY random()
        LIMIT 15000)
    UNION ALL
    (SELECT * FROM parquet_scan('{file}')
        WHERE "binds" = 1
        ORDER BY random()
        LIMIT 5000)
    """

    df = con.query(query).df()

    output_filename = f"/kaggle/working/sampled_test_part{i+1}.parquet"
    df.to_parquet(output_filename, index=False)
    print(f"✅ 已儲存抽樣結果: {output_filename}（共 {len(df)} 筆）")

    all_samples.append(df)

# All queries are done; release the connection before merging.
con.close()

# pd.concat([]) fails with an unhelpful message, so say what actually went wrong.
if not all_samples:
    raise ValueError("No train_part parquet file was found, so there is nothing to sample.")

final_test_df = pd.concat(all_samples, ignore_index=True)
final_test_output = "/kaggle/working/sampled_train_all.parquet"
final_test_df.to_parquet(final_test_output, index=False)
# Report the number of files really processed (missing parts are skipped above).
print(f"🎯 全部 {len(all_samples)} 個檔案已處理完畢，最終合併檔案: {final_test_output}（共 {len(final_test_df)} 筆）")

# **Final assembly**
# Draw the final sample from the merged file: up to 150,000 rows with binds = 0
# and up to 50,000 rows with binds = 1, then shuffle them together.
con = duckdb.connect()
train_path = '/kaggle/working/sampled_train_all.parquet'

with tqdm(total=2, desc="Processing Data") as pbar:
    # Random non-binders.
    df_part1 = con.query(f"""
        SELECT * FROM parquet_scan('{train_path}')
        WHERE "binds" = 0
        ORDER BY random()
        LIMIT 150000
    """).df()
    pbar.update(1)

    # Random binders.
    df_part2 = con.query(f"""
        SELECT * FROM parquet_scan('{train_path}')
        WHERE "binds" = 1
        ORDER BY random()
        LIMIT 50000
    """).df()
    pbar.update(1)

df = pd.concat([df_part1, df_part2], ignore_index=True)
# frac=1 returns every row in shuffled order; the seed fixes the shuffle only,
# not the random() ordering used inside the queries.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
con.close()


In [None]:
# Path of the full training parquet.
# NOTE(review): this cell rebinds train_path, con, df_part1, df_part2 and df, so the
# sample built at the end of the previous cell is discarded — confirm which is wanted.
train_path = '/kaggle/input/leash-BELKA/train.parquet'

# Open a DuckDB connection.
con = duckdb.connect()

# Progress bar with one tick per query.
with tqdm(total=2, desc="Processing Data") as pbar:
    # First part: up to 150,000 randomly ordered rows with binds = 0.
    df_part1 = con.query(f"""SELECT *
                              FROM parquet_scan('{train_path}')
                              WHERE binds = 0
                              ORDER BY random()
                              LIMIT 150000""").df()
    pbar.update(1)  # advance the progress bar

    # Second part: up to 50,000 randomly ordered rows with binds = 1.
    df_part2 = con.query(f"""SELECT *
                              FROM parquet_scan('{train_path}')
                              WHERE binds = 1
                              ORDER BY random()
                              LIMIT 50000""").df()
    pbar.update(1)  # advance the progress bar

# Stack the two parts.
df = pd.concat([df_part1, df_part2], ignore_index=True)

# Shuffle the rows (frac=1 keeps every row; the seed fixes the shuffle order).
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Close the connection.
con.close()

In [None]:
print(df.head())

In [None]:
# Load the pre-computed training set (Morgan fingerprints; 100k/100k per the file name).
# The path previously began with a doubled slash ('//kaggle/...'), unlike every other
# input path in this notebook; it is normalised to a single leading slash.
data = pd.read_parquet('/kaggle/input/data-all/train_transformed_morgan(100k100k).parquet')
data.head()

In [None]:
# 轉換成 DataFrame
X_fingerprints_df = pd.DataFrame(data['molecule_smiles'].to_list())
X_fingerprints_df

In [None]:
# 合併數值特徵
X = pd.concat([X_fingerprints_df, data[['protein_BRD4', 'protein_HSA', 'protein_sEH']]], axis=1)
X

In [None]:
# 轉換欄位名稱都是str
X.columns = X.columns.astype(str)

In [None]:
# Downcast every int64 feature column to int8 in a single astype call
# (assumes the values fit in int8 — presumably 0/1 flags; verify).
int_cols = X.select_dtypes(include=['int64']).columns
X = X.astype({name: np.int8 for name in int_cols})

X.dtypes

In [None]:
# 轉成int8
data['binds'] = data['binds'].astype(np.int8)
data['binds'].dtypes

In [None]:
y = data['binds']
y

In [None]:
X.dtypes
y.dtypes

In [None]:
# Split the data into a training set and a small hold-out set.
# test_size=0.001 holds out 0.1% of the rows (not 10%); the remaining 99.9% is
# used for training and cross-validation.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.001, random_state=42)

In [None]:
# -------------------- XGBoost --------------------
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from scipy.stats import randint  # NOTE(review): not used in this cell

# Candidate values per hyper-parameter (five two-valued lists -> 32 combinations).
param_dist = {
    'n_estimators': [195],  # number of trees
    'max_depth': [8],  # maximum tree depth
    'learning_rate': [0.1, 0.5],  # learning rate
    'subsample': [0.9, 1.0],  # fraction of rows sampled per tree
    'colsample_bytree': [0.7, 1.0],  # fraction of columns sampled per tree
    'gamma': [0.2, 0.3],  # minimum loss reduction required to split
    'reg_alpha': [0, 0.01],  # L1 regularisation
    'reg_lambda': [10],  # L2 regularisation
}

# Base XGBoost estimator.
xgb = XGBClassifier(random_state=42)

# Randomised search over param_dist.
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=20,  # 20 sampled parameter settings
    cv=5,  # 5-fold cross-validation
    scoring='f1',  # candidates are ranked by F1 (not AUC)
    n_jobs=-1,  # use all available processors
    verbose=1,  # print progress
    random_state=42,
    refit=True  # refit the best setting on all of X_train
)

# Run the search.
random_search.fit(X_train, y_train)

# Best parameters and their mean cross-validated F1.
best_params = random_search.best_params_
best_score = random_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best F1 Score: {best_score}")



In [None]:
# -------------------- XGBoost --------------------
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from scipy.stats import randint
# Train the final model with hard-coded "best" parameters.
# NOTE(review): the values are typed in by hand rather than read from best_params,
# so they will not follow a re-run of the search — confirm they are still current.
xgb_model = XGBClassifier(colsample_bytree=0.7, gamma=0.3, learning_rate=0.5,
                          max_depth=8, n_estimators=195, reg_alpha=0, reg_lambda=10,
                          subsample=1.0)
xgb_model.fit(X_train, y_train)

In [None]:
#儲存模型
import pickle
with open("/kaggle/working/10_10_xgb_model.bin", "wb") as f:
    pickle.dump(xgb_model, f)

In [None]:
import gc  # 引入垃圾回收模組
gc.collect()  # 執行垃圾回收

In [None]:
# -------------------- LightGBM --------------------
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Base LightGBM estimator with default settings.
lgb_clf = lgb.LGBMClassifier()

# Candidate values per hyper-parameter (2 x 3 x 2 = 12 combinations).
param_dist = {
    'num_leaves': [20, 50],
    'learning_rate': [0.05, 0.1, 0.2],
    'n_estimators': [100, 200],
    # 'boosting_type': ['gbdt', 'dart'],
    # 'feature_fraction': [0.6, 0.7, 0.8, 0.9],
    # 'bagging_fraction': [0.6, 0.7, 0.8],
    # 'bagging_freq': [1, 3, 5, 7]
}

# Configure the randomised search.
# NOTE(review): n_iter=15 is larger than the 12 combinations listed above; per the
# scikit-learn docs the search is then capped at the grid size — confirm.
random_search = RandomizedSearchCV(
    estimator=lgb_clf,
    param_distributions=param_dist,
    n_iter=15,  # requested number of sampled settings
    scoring='f1',
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Run the search.
random_search.fit(X_train, y_train)

# Best parameters and their mean cross-validated F1.
best_params = random_search.best_params_
best_score = random_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best F1 Score: {best_score}")

In [None]:
# -------------------- LightGBM --------------------
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# 最佳 param
lgbm_model = lgb.LGBMClassifier(num_leaves=50, n_estimators=200, learning_rate=0.2)
lgbm_model.fit(X_train, y_train)

In [None]:
#儲存模型
import pickle
with open("/kaggle/working/10_10_lgbm_model.bin", "wb") as f:
    pickle.dump(lgbm_model, f)

In [None]:
import gc  # 引入垃圾回收模組
gc.collect()  # 執行垃圾回收

In [None]:
data = pd.read_parquet('/kaggle/input/data-all/train_transformed_morgan(150k50k).parquet')
data.head()

In [None]:
# 轉換成 DataFrame
X_fingerprints_df = pd.DataFrame(data['molecule_smiles'].to_list())
X_fingerprints_df

In [None]:
# 合併數值特徵
X = pd.concat([X_fingerprints_df, data[['protein_BRD4', 'protein_HSA', 'protein_sEH']]], axis=1)
X

In [None]:
# 轉換欄位名稱都是str
X.columns = X.columns.astype(str)

In [None]:
# Downcast every int64 feature column to int8 in a single astype call
# (assumes the values fit in int8 — presumably 0/1 flags; verify).
int_cols = X.select_dtypes(include=['int64']).columns
X = X.astype({name: np.int8 for name in int_cols})

X.dtypes

In [None]:
# 轉成int8
data['binds'] = data['binds'].astype(np.int8)
data['binds'].dtypes

In [None]:
y = data['binds']
y

In [None]:
X.dtypes
y.dtypes

In [None]:
# Split the data into a training set and a small hold-out set.
# test_size=0.001 holds out 0.1% of the rows (not 10%); the remaining 99.9% is
# used for training and cross-validation.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.001, random_state=42)

In [None]:
# -------------------- XGBoost --------------------
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier
from scipy.stats import randint  # NOTE(review): not used in this cell

# Candidate values per hyper-parameter (five two-valued lists -> 32 combinations).
param_dist = {
    'n_estimators': [195],  # number of trees
    'max_depth': [8],  # maximum tree depth
    'learning_rate': [0.1, 0.5],  # learning rate
    'subsample': [0.9, 1.0],  # fraction of rows sampled per tree
    'colsample_bytree': [0.7, 1.0],  # fraction of columns sampled per tree
    'gamma': [0.2, 0.3],  # minimum loss reduction required to split
    'reg_alpha': [0, 0.01],  # L1 regularisation
    'reg_lambda': [10],  # L2 regularisation
}

# Base XGBoost estimator.
xgb = XGBClassifier(random_state=42)

# Randomised search over param_dist.
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=20,  # 20 sampled parameter settings
    cv=5,  # 5-fold cross-validation
    scoring='f1',  # candidates are ranked by F1 (not AUC)
    n_jobs=-1,  # use all available processors
    verbose=1,  # print progress
    random_state=42,
    refit=True  # refit the best setting on all of X_train
)

# Run the search.
random_search.fit(X_train, y_train)

# Best parameters and their mean cross-validated F1.
best_params = random_search.best_params_
best_score = random_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best F1 Score: {best_score}")



In [None]:
# 最佳 param
xgb_model = XGBClassifier(**best_params)
xgb_model.fit(X_train, y_train)

In [None]:
#儲存模型
import pickle
with open("/kaggle/working/15_05_xgb_model.bin", "wb") as f:
    pickle.dump(xgb_model, f)

In [None]:
import gc  # 引入垃圾回收模組
gc.collect()  # 執行垃圾回收

In [None]:
# -------------------- LightGBM --------------------
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Base LightGBM estimator with default settings.
lgb_clf = lgb.LGBMClassifier()

# Candidate values per hyper-parameter (2 x 3 x 2 = 12 combinations).
param_dist = {
    'num_leaves': [20, 50],
    'learning_rate': [0.05, 0.1, 0.2],
    'n_estimators': [100, 200],
    # 'boosting_type': ['gbdt', 'dart'],
    # 'feature_fraction': [0.6, 0.7, 0.8, 0.9],
    # 'bagging_fraction': [0.6, 0.7, 0.8],
    # 'bagging_freq': [1, 3, 5, 7]
}

# Configure the randomised search.
# NOTE(review): n_iter=15 is larger than the 12 combinations listed above; per the
# scikit-learn docs the search is then capped at the grid size — confirm.
random_search = RandomizedSearchCV(
    estimator=lgb_clf,
    param_distributions=param_dist,
    n_iter=15,  # requested number of sampled settings
    scoring='f1',
    cv=5,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Run the search.
random_search.fit(X_train, y_train)

# Best parameters and their mean cross-validated F1.
best_params = random_search.best_params_
best_score = random_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best F1 Score: {best_score}")

In [None]:
# 最佳 param
lgbm_model = lgb.LGBMClassifier(**best_params)
lgbm_model.fit(X_train, y_train)

In [None]:
#儲存模型
import pickle
with open("/kaggle/working/15_05_lgbm_model.bin", "wb") as f:
    pickle.dump(lgbm_model, f)

In [None]:
import gc  # 引入垃圾回收模組
gc.collect()  # 執行垃圾回收

In [None]:
data = pd.read_parquet('/kaggle/input/data-all/train_transformed_morgan(150k50k).parquet')
data.head()

In [None]:
# 轉換成 DataFrame
X_fingerprints_df = pd.DataFrame(data['molecule_smiles'].to_list())
X_fingerprints_df

In [None]:
# 合併數值特徵
X = pd.concat([X_fingerprints_df, data[['protein_BRD4', 'protein_HSA', 'protein_sEH']]], axis=1)
X

In [None]:
# 轉換欄位名稱都是str
X.columns = X.columns.astype(str)

In [None]:
# Downcast every int64 feature column to int8 in a single astype call
# (assumes the values fit in int8 — presumably 0/1 flags; verify).
int_cols = X.select_dtypes(include=['int64']).columns
X = X.astype({name: np.int8 for name in int_cols})

X.dtypes

In [None]:
# 轉成int8
data['binds'] = data['binds'].astype(np.int8)
data['binds'].dtypes

In [None]:
y = data['binds']
y

In [None]:
# Split the data: 90% for training / cross-validation, 10% held out as a test set.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)  # 90% train+validation & 10% test

In [None]:
# -------------------- Random Forest --------------------
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint  # NOTE(review): not used in this cell
from sklearn.model_selection import RandomizedSearchCV

# Candidate values per hyper-parameter (only n_estimators varies -> 2 combinations).
param_dist = {
    'n_estimators': [138, 160], # number of trees in the forest
    'max_depth': [None],  # maximum depth of each tree (None = no limit)
    'min_samples_leaf': [1],  # minimum number of samples required at a leaf
}

# Base random forest estimator.
rf = RandomForestClassifier(random_state=42)

# Randomised search.
# NOTE(review): n_iter=30 is far larger than the 2 combinations listed above; per the
# scikit-learn docs the search is then capped at the grid size — confirm.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,  # candidate values defined above
    n_iter=30,  # requested number of sampled settings
    cv=5,  # 5-fold cross-validation
    scoring='f1',  # candidates are ranked by F1
    n_jobs=-1,  # use all available processors
    verbose=1,   # print progress
    random_state=42,
    refit=True  # refit the best setting on all of X_train
)

# Run the search.
random_search.fit(X_train, y_train)

# Best parameters and their mean cross-validated F1.
best_params = random_search.best_params_
best_score = random_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best F1 Score: {best_score}")

In [None]:
# Train the final random forest with the best parameters from the search.
# random_state is pinned to the same seed as the search estimator; without it the
# forest is rebuilt differently on every run and the saved model is not reproducible.
rf_model = RandomForestClassifier(max_depth=None, min_samples_leaf=1, n_estimators=160,
                                  random_state=42)
rf_model.fit(X_train, y_train)

In [None]:
#儲存模型
import pickle
with open("/kaggle/working/15_05_rf_model.bin", "wb") as f:
    pickle.dump(rf_model, f)

In [None]:
!zip rf_model.zip /kaggle/working/15_05_rf_model.bin

In [None]:
import gc  # 引入垃圾回收模組
gc.collect()  # 執行垃圾回收

In [None]:
# Load the previously saved 15_05 models.
# Each file is opened in a `with` block so its handle is closed after unpickling.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
import pickle
with open("/kaggle/input/15_05_model/keras/default/1/15_05_lgbm_model.bin", "rb") as model_file:
    lgbm_model = pickle.load(model_file)
with open("/kaggle/input/1505model/scikitlearn/default/1/mg15_05_rf_model.bin", "rb") as model_file:
    rf_model = pickle.load(model_file)
with open("/kaggle/input/15_05_model/keras/default/1/15_05_xgb_model.bin", "rb") as model_file:
    xgb_model = pickle.load(model_file)
print("ok")

In [None]:
# -------------------- Voting --------------------
from sklearn.ensemble import VotingClassifier

# Build the soft-voting ensemble from the three base models.
voting_clf = VotingClassifier(
    estimators=[('rf', rf_model), ('lgb', lgbm_model), ('xgb', xgb_model)],
    voting='soft'  # average the predicted probabilities
)

# voting:
# 'hard': the final label is the majority class (for classifiers without probability output).
# 'soft': the final label comes from the averaged probabilities (needs predict_proba()).

# Fit the ensemble.
# NOTE(review): per the scikit-learn docs, VotingClassifier.fit retrains clones of the
# listed estimators, so the loaded models would act only as templates — confirm intent.
voting_clf.fit(X_train, y_train)

# Predict on the hold-out split (y_pred is not used again in this cell).
y_pred = voting_clf.predict(X_test)

# Accuracy on the training and hold-out splits.
print("Voting accuracy (training)：", voting_clf.score(X_train, y_train))
print("Voting accuracy (test)：", voting_clf.score(X_test, y_test))

In [None]:
#儲存模型
import pickle
with open("/kaggle/working/voting_model_mg1505.bin", "wb") as f:
    pickle.dump(voting_clf, f)

In [None]:
!zip voting_model_mg1505.zip /kaggle/working/voting_model_mg1505.bin

In [None]:
import gc  # 引入垃圾回收模組
gc.collect()  # 執行垃圾回收

In [None]:
data = pd.read_parquet('/kaggle/input/data-all/test_transformed__morgan(180k20k).parquet')
data.head()

In [None]:
# 轉換成 DataFrame
X_fingerprints_df = pd.DataFrame(data['molecule_smiles'].to_list())
X_fingerprints_df

In [None]:
# 合併數值特徵
X = pd.concat([X_fingerprints_df, data[['protein_BRD4', 'protein_HSA', 'protein_sEH']]], axis=1)
X

In [None]:
# 轉換欄位名稱都是str
X.columns = X.columns.astype(str)

In [None]:
# Downcast every int64 feature column to int8 in a single astype call
# (assumes the values fit in int8 — presumably 0/1 flags; verify).
int_cols = X.select_dtypes(include=['int64']).columns
X = X.astype({name: np.int8 for name in int_cols})

X.dtypes

In [None]:
X_test=X
X_test

In [None]:
# 轉成int8
data['binds'] = data['binds'].astype(np.int8)
data['binds'].dtypes

In [None]:
y_test = data['binds']
y_test

In [None]:
import pickle
# Load the 15_05 XGBoost model; the `with` block closes the file after unpickling.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open("/kaggle/input/15_05_model/keras/default/1/15_05_xgb_model.bin", "rb") as model_file:
    xgb_15_05_model = pickle.load(model_file)
xgb_15_05_model

In [None]:
import pickle
# Load the 10_10 XGBoost model; the `with` block closes the file after unpickling.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open("/kaggle/input/1010model/keras/default/1/10_10_xgb_model.bin", "rb") as model_file:
    xgb_10_10_model = pickle.load(model_file)
xgb_10_10_model

In [None]:
# Load the 15_05 LightGBM model; the `with` block closes the file after unpickling.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
import pickle
with open("/kaggle/input/15_05_model/keras/default/1/15_05_lgbm_model.bin", "rb") as model_file:
    lgbm_15_05_model = pickle.load(model_file)
lgbm_15_05_model

In [None]:
# Load the 10_10 LightGBM model; the `with` block closes the file after unpickling.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
import pickle
with open("/kaggle/input/1010model/keras/default/1/10_10_lgbm_model.bin", "rb") as model_file:
    lgbm_10_10_model = pickle.load(model_file)
lgbm_10_10_model

In [None]:
# Load the 15_05 random forest; the `with` block closes the file after unpickling.
# pickle is imported here so the cell no longer depends on an earlier cell's import.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
import pickle
with open("/kaggle/input/1505model/scikitlearn/default/1/mg15_05_rf_model.bin", "rb") as model_file:
    rf_15_05_model = pickle.load(model_file)
rf_15_05_model

In [None]:
# Load the saved voting ensemble; the `with` block closes the file after unpickling.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
import pickle
with open("/kaggle/input/1505-voting/scikitlearn/default/1/kaggle/working/voting_model_mg1505.bin", "rb") as model_file:
    voting_clf = pickle.load(model_file)
voting_clf

In [None]:
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score
# LightGBM (15_05 model) predictions
y_pred_lgb = lgbm_15_05_model.predict(X_test)
# NOTE(review): predict() presumably already returns 0/1 labels, so the 0.5
# threshold here (and for the other models below) only re-casts them — confirm.
y_pred_lgb_binary = (y_pred_lgb >= 0.5).astype(int)
y_proba_lgb = lgbm_15_05_model.predict_proba(X_test)[:, 1]  # column 1 of predict_proba

# XGBoost (15_05 model) predictions
y_pred_xgb = xgb_15_05_model.predict(X_test)
y_pred_xgb_binary = (y_pred_xgb >= 0.5).astype(int)
y_proba_xgb = xgb_15_05_model.predict_proba(X_test)[:, 1]

# Random Forest (15_05 model) predictions
y_pred_rf = rf_15_05_model.predict(X_test)
y_pred_rf_binary = (y_pred_rf >= 0.5).astype(int)
y_proba_rf = rf_15_05_model.predict_proba(X_test)[:, 1]


# Evaluation helper
def evaluate_model(name, y_test, y_pred_binary, y_proba):
    """Print F1, ROC AUC, recall and precision for one model.

    Args:
        name: Label printed in the report header.
        y_test: Ground-truth labels.
        y_pred_binary: Predicted class labels, used for F1, recall and precision.
        y_proba: Scores passed to roc_auc_score.
    """
    print(f"🔹 {name} Scores")
    print(f"F1 Score: {f1_score(y_test, y_pred_binary):.4f}")
    print(f"AUC Score: {roc_auc_score(y_test, y_proba):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred_binary):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred_binary):.4f}")
    print("="*50)

# Report the metrics of the three models.
evaluate_model("LightGBM", y_test, y_pred_lgb_binary, y_proba_lgb)

evaluate_model("XGBoost", y_test, y_pred_xgb_binary, y_proba_xgb)

evaluate_model("Random Forest", y_test, y_pred_rf_binary, y_proba_rf)

In [None]:
# Voting ensemble predictions
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score

# Predicted probabilities (only available with 'soft' voting)
y_proba = voting_clf.predict_proba(X_test)[:, 1]  # column 1 = positive class

# Turn probabilities into 0/1 labels with a 0.5 threshold
y_pred_binary = (y_proba >= 0.5).astype(int)

# Report the metrics
print("F1 Score:", f1_score(y_test, y_pred_binary))
print("AUC Score:", roc_auc_score(y_test, y_proba))  # AUC is computed from the probabilities
print("Recall:", recall_score(y_test, y_pred_binary))
print("Precision:", precision_score(y_test, y_pred_binary))

In [None]:
# LightGBM (10_10 model) predictions
# This cell uses the sklearn.metrics functions imported in an earlier cell.
y_pred_lgb = lgbm_10_10_model.predict(X_test)
# NOTE(review): predict() presumably already returns 0/1 labels, so the 0.5
# threshold only re-casts them — confirm.
y_pred_lgb_binary = (y_pred_lgb >= 0.5).astype(int)
y_proba_lgb = lgbm_10_10_model.predict_proba(X_test)[:, 1]  # column 1 of predict_proba

# XGBoost (10_10 model) predictions
y_pred_xgb = xgb_10_10_model.predict(X_test)
y_pred_xgb_binary = (y_pred_xgb >= 0.5).astype(int)
y_proba_xgb = xgb_10_10_model.predict_proba(X_test)[:, 1]

# Random Forest predictions (disabled)
# y_pred_rf = rf_model.predict(X_test)
# y_pred_rf_binary = (y_pred_rf >= 0.5).astype(int)
# y_proba_rf = rf_model.predict_proba(X_test)[:, 1]


# Evaluation helper
def evaluate_model(name, y_test, y_pred_binary, y_proba):
    """Print F1, ROC AUC, recall and precision for one model.

    Args:
        name: Label printed in the report header.
        y_test: Ground-truth labels.
        y_pred_binary: Predicted class labels, used for F1, recall and precision.
        y_proba: Scores passed to roc_auc_score.
    """
    print(f"🔹 {name} Scores")
    print(f"F1 Score: {f1_score(y_test, y_pred_binary):.4f}")
    print(f"AUC Score: {roc_auc_score(y_test, y_proba):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred_binary):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred_binary):.4f}")
    print("="*50)

# Report the metrics of the two active models (Random Forest is disabled).
evaluate_model("LightGBM", y_test, y_pred_lgb_binary, y_proba_lgb)

evaluate_model("XGBoost", y_test, y_pred_xgb_binary, y_proba_xgb)

# evaluate_model("Random Forest", y_test, y_pred_rf_binary, y_proba_rf)

In [None]:
data = pd.read_parquet('/kaggle/input/data-all/test_transformed_morgan(100k100k).parquet')
data.head()

In [None]:
# 轉換成 DataFrame
X_fingerprints_df = pd.DataFrame(data['molecule_smiles'].to_list())
X_fingerprints_df

In [None]:
# 合併數值特徵
X = pd.concat([X_fingerprints_df, data[['protein_BRD4', 'protein_HSA', 'protein_sEH']]], axis=1)
X

In [None]:
# 轉換欄位名稱都是str
X.columns = X.columns.astype(str)

In [None]:
# Downcast every int64 feature column to int8 in a single astype call
# (assumes the values fit in int8 — presumably 0/1 flags; verify).
int_cols = X.select_dtypes(include=['int64']).columns
X = X.astype({name: np.int8 for name in int_cols})

X.dtypes

In [None]:
X_test=X
X_test

In [None]:
# 轉成int8
data['binds'] = data['binds'].astype(np.int8)
data['binds'].dtypes

In [None]:
y_test = data['binds']
y_test

In [None]:
import pickle
# Load the 15_05 XGBoost model; the `with` block closes the file after unpickling.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open("/kaggle/input/15_05_model/keras/default/1/15_05_xgb_model.bin", "rb") as model_file:
    xgb_15_05_model = pickle.load(model_file)
xgb_15_05_model

In [None]:
import pickle
# Load the 10_10 XGBoost model; the `with` block closes the file after unpickling.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open("/kaggle/input/1010model/keras/default/1/10_10_xgb_model.bin", "rb") as model_file:
    xgb_10_10_model = pickle.load(model_file)
xgb_10_10_model

In [None]:
# Load the 15_05 LightGBM model; the `with` block closes the file after unpickling.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
import pickle
with open("/kaggle/input/15_05_model/keras/default/1/15_05_lgbm_model.bin", "rb") as model_file:
    lgbm_15_05_model = pickle.load(model_file)
lgbm_15_05_model

In [None]:
# Load the 10_10 LightGBM model; the `with` block closes the file after unpickling.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
import pickle
with open("/kaggle/input/1010model/keras/default/1/10_10_lgbm_model.bin", "rb") as model_file:
    lgbm_10_10_model = pickle.load(model_file)
lgbm_10_10_model

In [None]:
#rf

In [None]:
#voting

In [None]:
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score
# LightGBM (15_05 model) predictions
y_pred_lgb = lgbm_15_05_model.predict(X_test)
# NOTE(review): predict() presumably already returns 0/1 labels, so the 0.5
# threshold only re-casts them — confirm.
y_pred_lgb_binary = (y_pred_lgb >= 0.5).astype(int)
y_proba_lgb = lgbm_15_05_model.predict_proba(X_test)[:, 1]  # column 1 of predict_proba

# XGBoost (15_05 model) predictions
y_pred_xgb = xgb_15_05_model.predict(X_test)
y_pred_xgb_binary = (y_pred_xgb >= 0.5).astype(int)
y_proba_xgb = xgb_15_05_model.predict_proba(X_test)[:, 1]

# Random Forest predictions (disabled)
# y_pred_rf = rf_model.predict(X_test)
# y_pred_rf_binary = (y_pred_rf >= 0.5).astype(int)
# y_proba_rf = rf_model.predict_proba(X_test)[:, 1]


# Evaluation helper
def evaluate_model(name, y_test, y_pred_binary, y_proba):
    """Print F1, ROC AUC, recall and precision for one model.

    Args:
        name: Label printed in the report header.
        y_test: Ground-truth labels.
        y_pred_binary: Predicted class labels, used for F1, recall and precision.
        y_proba: Scores passed to roc_auc_score.
    """
    print(f"🔹 {name} Scores")
    print(f"F1 Score: {f1_score(y_test, y_pred_binary):.4f}")
    print(f"AUC Score: {roc_auc_score(y_test, y_proba):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred_binary):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred_binary):.4f}")
    print("="*50)

# Report the metrics of the two active models (Random Forest is disabled).
evaluate_model("LightGBM", y_test, y_pred_lgb_binary, y_proba_lgb)

evaluate_model("XGBoost", y_test, y_pred_xgb_binary, y_proba_xgb)

# evaluate_model("Random Forest", y_test, y_pred_rf_binary, y_proba_rf)

In [None]:
# Voting ensemble predictions
# NOTE(review): voting_clf is not loaded in this section (the "#voting" cell above is
# empty), so it presumably comes from an earlier cell — confirm on a fresh kernel.
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score

# Predicted probabilities (only available with 'soft' voting)
y_proba = voting_clf.predict_proba(X_test)[:, 1]  # column 1 = positive class

# Turn probabilities into 0/1 labels with a 0.5 threshold
y_pred_binary = (y_proba >= 0.5).astype(int)

# Report the metrics
print("F1 Score:", f1_score(y_test, y_pred_binary))
print("AUC Score:", roc_auc_score(y_test, y_proba))  # AUC is computed from the probabilities
print("Recall:", recall_score(y_test, y_pred_binary))
print("Precision:", precision_score(y_test, y_pred_binary))

In [None]:
# LightGBM (10_10 model) predictions
# This cell uses the sklearn.metrics functions imported in an earlier cell.
y_pred_lgb = lgbm_10_10_model.predict(X_test)
# NOTE(review): predict() presumably already returns 0/1 labels, so the 0.5
# threshold only re-casts them — confirm.
y_pred_lgb_binary = (y_pred_lgb >= 0.5).astype(int)
y_proba_lgb = lgbm_10_10_model.predict_proba(X_test)[:, 1]  # column 1 of predict_proba

# XGBoost (10_10 model) predictions
y_pred_xgb = xgb_10_10_model.predict(X_test)
y_pred_xgb_binary = (y_pred_xgb >= 0.5).astype(int)
y_proba_xgb = xgb_10_10_model.predict_proba(X_test)[:, 1]

# Random Forest predictions (disabled)
# y_pred_rf = rf_model.predict(X_test)
# y_pred_rf_binary = (y_pred_rf >= 0.5).astype(int)
# y_proba_rf = rf_model.predict_proba(X_test)[:, 1]


# Evaluation helper
def evaluate_model(name, y_test, y_pred_binary, y_proba):
    """Print F1, ROC AUC, recall and precision for one model.

    Args:
        name: Label printed in the report header.
        y_test: Ground-truth labels.
        y_pred_binary: Predicted class labels, used for F1, recall and precision.
        y_proba: Scores passed to roc_auc_score.
    """
    print(f"🔹 {name} Scores")
    print(f"F1 Score: {f1_score(y_test, y_pred_binary):.4f}")
    print(f"AUC Score: {roc_auc_score(y_test, y_proba):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred_binary):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred_binary):.4f}")
    print("="*50)

# Report the metrics of the two active models (Random Forest is disabled).
evaluate_model("LightGBM", y_test, y_pred_lgb_binary, y_proba_lgb)

evaluate_model("XGBoost", y_test, y_pred_xgb_binary, y_proba_xgb)

# evaluate_model("Random Forest", y_test, y_pred_rf_binary, y_proba_rf)

In [None]:
# test

In [None]:
!pip install rdkit

In [None]:
import pickle
# Load the 15_05 XGBoost model; the `with` block closes the file after unpickling.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open("/kaggle/input/15_05_model/keras/default/1/15_05_xgb_model.bin", "rb") as model_file:
    xgb_15_05_model = pickle.load(model_file)
xgb_15_05_model

In [None]:
import pickle
# Load the 10_10 XGBoost model; the `with` block closes the file after unpickling.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open("/kaggle/input/1010model/keras/default/1/10_10_xgb_model.bin", "rb") as model_file:
    xgb_10_10_model = pickle.load(model_file)
xgb_10_10_model

In [None]:
import pickle
# Load the saved voting ensemble; the `with` block closes the file after unpickling.
# NOTE: pickle.load can execute arbitrary code; only load files you produced yourself.
with open("/kaggle/input/1505-voting/scikitlearn/default/1/kaggle/working/voting_model_mg1505.bin", "rb") as model_file:
    vot_15_05_model = pickle.load(model_file)
vot_15_05_model

In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score
from sklearn.preprocessing import OneHotEncoder

import duckdb
import pandas as pd
from tqdm import tqdm
import numpy as np # linear algebra


In [None]:
import functools


@functools.lru_cache(maxsize=None)
def _morgan_generator(n_bits):
    """Return a radius-2 Morgan fingerprint generator for ``n_bits`` bits, built once per size."""
    return AllChem.GetMorganGenerator(radius=2, fpSize=n_bits)


def smiles_to_morgan_fingerprint(smiles, n_bits=2048):
    """Convert a SMILES string into a Morgan fingerprint bit array.

    Args:
        smiles: SMILES string of the molecule.
        n_bits: Length of the fingerprint.

    Returns:
        A 1-D integer numpy array of length ``n_bits``. When the SMILES cannot be
        parsed, an all-zero array of the same length is returned instead.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Unparseable SMILES: keep the row, but with an empty fingerprint.
        return np.zeros(n_bits, dtype=int)
    # The generator depends only on n_bits, so it is cached rather than being
    # rebuilt for every molecule.
    return np.array(_morgan_generator(n_bits).GetFingerprint(mol), dtype=int)



In [None]:
import os

# Paths for inference: the test set is a CSV (not parquet) and is scored chunk by
# chunk in the following cells.
test_file = '/kaggle/input/leash-BELKA/test.csv'
output_file = 'submission15_05_v.csv'  # path and filename of the submission to write

# Load the whole test file here only to display its shape.
test = pd.read_csv(test_file)
test.shape

In [None]:

# Score test.csv chunk by chunk with the voting ensemble and write one submission
# row per test row.

# One-hot protein columns, in the order they were concatenated for training.
PROTEIN_COLUMNS = ['protein_BRD4', 'protein_HSA', 'protein_sEH']

# Registering progress_apply once is enough; it does not need to run per chunk.
tqdm.pandas(desc="Transforming molecule_smiles")

is_first_chunk = True
for df_test in pd.read_csv(test_file, chunksize=104681):

    # SMILES -> Morgan fingerprint bit array, with a progress bar.
    df_test["molecule_smiles"] = df_test["molecule_smiles"].progress_apply(smiles_to_morgan_fingerprint)

    # The raw df_test columns are deliberately NOT cast to int8 any more: 'id' holds
    # values far outside the int8 range, so the cast wrapped the ids and the
    # submission was written with corrupted ids. Only the feature matrix is down-cast.
    fingerprints_df = pd.DataFrame(df_test['molecule_smiles'].to_list()).astype(np.int8)
    print(f"fingerprints_df shape: {fingerprints_df.shape}")  # expected (rows in chunk, 2048)

    # reindex guarantees all three protein columns, in training order, even when a
    # chunk happens to miss one of the proteins.
    protein_onehot = (
        pd.get_dummies(df_test["protein_name"], prefix="protein")
        .reindex(columns=PROTEIN_COLUMNS, fill_value=0)
        .astype(np.int8)
        .reset_index(drop=True)
    )
    print(f"protein_onehot shape: {protein_onehot.shape}")  # expected (rows in chunk, 3)

    X_test = pd.concat([fingerprints_df, protein_onehot], axis=1)
    print(f"X_test shape: {X_test.shape}")  # expected (rows in chunk, 2048 + 3)

    # The models were fitted on string column names.
    X_test.columns = X_test.columns.astype(str)

    print(X_test)

    # Positive-class probability from the ensemble.
    probabilities = vot_15_05_model.predict_proba(X_test)[:, 1]

    # NOTE(review): hard 0/1 labels are written; confirm the competition metric does
    # not expect the probabilities themselves.
    threshold = 0.5
    predictions = (probabilities >= threshold).astype(int)

    # 'id' comes straight from test.csv; 'binds' is the thresholded prediction.
    output_df = pd.DataFrame({'id': df_test['id'], 'binds': predictions})

    # The first chunk (re)creates the file with a header and later chunks append, so
    # re-running the cell no longer stacks a second copy of every row onto an old file.
    output_df.to_csv(output_file, index=False,
                     mode='w' if is_first_chunk else 'a', header=is_first_chunk)
    is_first_chunk = False


In [None]:
'''
# 轉成int8
int_cols = df_test.select_dtypes(include=['int64']).columns 
for col in int_cols:
    df_test[col] = df_test[col].astype(np.int8)


# One-hot encode the protein_name
protein_onehot = pd.get_dummies(df_test["protein_name"], prefix="protein").astype(int)

# Combine ECFPs and one-hot encoded protein_name
# X_test = [ecfp + protein for ecfp, protein in zip(df_test['molecule_smiles'].tolist(), protein_onehot)]
fingerprints_df = pd.DataFrame(df_test['molecule_smiles'].to_list())
X_test = pd.concat([fingerprints_df, protein_onehot], axis=1)

# 轉成int8
int_cols = X_test.select_dtypes(include=['int64']).columns
for col in int_cols:
    X_test[col] = X_test[col].astype(np.int8)

X_test.columns = X_test.columns.astype(str)
print(X_test)
'''

In [None]:
'''
# Predict the probabilities
probabilities = vot_15_05_model.predict_proba(X_test)[:, 1]

threshold = 0.5
predictions = (probabilities >= threshold).astype(int)


# Create a DataFrame with 'id' and 'probability' columns
output_df = pd.DataFrame({'id': df_test['id'], 'binds': predictions})

# Save the output DataFrame to a CSV file
output_df.to_csv(output_file, index=False, mode='a', header=not os.path.exists(output_file))
'''

In [None]:
stop

In [None]:
output_file = 'submission15_05_2.csv'  # path and filename of the submission to write

# Score test.csv chunk by chunk with the 15_05 XGBoost model.

# One-hot protein columns, in the order they were concatenated for training.
PROTEIN_COLUMNS = ['protein_BRD4', 'protein_HSA', 'protein_sEH']

# Registering progress_apply once is enough; it does not need to run per chunk.
tqdm.pandas(desc="Transforming molecule_smiles")

is_first_chunk = True
for df_test in pd.read_csv(test_file, chunksize=104681):

    # SMILES -> Morgan fingerprint bit array, with a progress bar.
    df_test["molecule_smiles"] = df_test["molecule_smiles"].progress_apply(smiles_to_morgan_fingerprint)

    # The raw df_test columns are deliberately NOT cast to int8 any more: 'id' holds
    # values far outside the int8 range, so the cast wrapped the ids into garbage.
    # Only the feature matrix is down-cast.
    fingerprints_df = pd.DataFrame(df_test['molecule_smiles'].to_list()).astype(np.int8)
    print(f"fingerprints_df shape: {fingerprints_df.shape}")  # expected (rows in chunk, 2048)

    # reindex guarantees all three protein columns, in training order, even when a
    # chunk happens to miss one of the proteins.
    protein_onehot = (
        pd.get_dummies(df_test["protein_name"], prefix="protein")
        .reindex(columns=PROTEIN_COLUMNS, fill_value=0)
        .astype(np.int8)
        .reset_index(drop=True)
    )
    print(f"protein_onehot shape: {protein_onehot.shape}")  # expected (rows in chunk, 3)

    X_test = pd.concat([fingerprints_df, protein_onehot], axis=1)
    print(f"X_test shape: {X_test.shape}")  # expected (rows in chunk, 2048 + 3)

    # The models were fitted on string column names.
    X_test.columns = X_test.columns.astype(str)

    print(X_test)

    # Positive-class probability from the model.
    probabilities = xgb_15_05_model.predict_proba(X_test)[:, 1]

    # NOTE(review): hard 0/1 labels are written; confirm the competition metric does
    # not expect the probabilities themselves.
    threshold = 0.5
    predictions = (probabilities >= threshold).astype(int)

    # The ids read from test.csv are kept as they are. The old workaround rebuilt
    # them with range(295246830, ...) inside the loop, which restarted at 295246830
    # for every chunk and so produced duplicate ids across chunks.
    # NOTE(review): assumes test.csv ids are the intended submission ids — confirm.
    output_df = pd.DataFrame({'id': df_test['id'], 'binds': predictions})

    # The first chunk (re)creates the file with a header and later chunks append, so
    # re-running the cell no longer stacks a second copy of every row onto an old file.
    output_df.to_csv(output_file, index=False,
                     mode='w' if is_first_chunk else 'a', header=is_first_chunk)
    is_first_chunk = False


In [None]:
# Re-number ids and append.
# NOTE(review): this cell reuses df_test and predictions left over from the last
# iteration of the chunk loop above, so it appends that final chunk to output_file
# again under new ids — confirm this is intended before running it.
# Pick the first new id: continue after the largest id already in output_file,
# or start from 295246830 when the file does not exist yet.
if os.path.exists(output_file):
    existing_df = pd.read_csv(output_file)
    start_id = existing_df['id'].max() + 1  # continue right after the current maximum id
else:
    start_id = 295246830  # initial starting id

# Overwrite df_test['id'] with consecutive ids.
df_test['id'] = range(start_id, start_id + len(df_test))

# Build the output frame.
output_df = pd.DataFrame({'id': df_test['id'], 'binds': predictions})

# Append to the CSV (the header is written only when the file is new).
output_df.to_csv(output_file, index=False, mode='a', header=not os.path.exists(output_file))


In [None]:
import pandas as pd

# Re-number the ids of an existing submission and save it under a new name.
input_file = '/kaggle/input/submssion-10-10/submission10_10_v.csv'
output_file = 'submission10_10_new.csv'
# Read the saved CSV.
output_df = pd.read_csv(input_file)

print(len(output_df))

# Replace the id column with consecutive ids starting at 295246830.
output_df['id'] = range(295246830, 295246830 + len(output_df))

print(output_df.shape)

# Write the modified frame to the new CSV.
output_df.to_csv(output_file, index=False)


In [None]:
import pandas as pd

# Re-number the ids of an existing submission and save it under a new name.
input_file = '/kaggle/input/d/t8101349/1505-voting/submission15_05_v (1).csv'
output_file = 'submission15_05_v_new.csv'
# Read the saved CSV.
output_df = pd.read_csv(input_file)

print(len(output_df))

# Replace the id column with consecutive ids starting at 295246830.
output_df['id'] = range(295246830, 295246830 + len(output_df))

print(output_df.shape)

# Write the modified frame to the new CSV.
output_df.to_csv(output_file, index=False)


In [None]:
import os
import pandas as pd


# Trim surplus rows from a previously written submission and re-number its ids.
input_file = '/kaggle/input/voting-submission/submission15_05_v_new.csv'
output_file = 'submission15_05_v_new.csv'

# Only read the file when it exists.
if os.path.exists(input_file):
    output_df = pd.read_csv(input_file)

    # Drop the LAST 219362 rows: iloc[:-n] keeps everything before them.
    output_df = output_df.iloc[:-219362].reset_index(drop=True)

    # Re-assign consecutive ids starting at 295246830.
    output_df['id'] = range(295246830, 295246830 + len(output_df))

    # Write the result to the working directory.
    output_df.to_csv(output_file, index=False)

    print("已成功去除後 219362 筆資料並更新 ID！")
else:
    # Report the path that was actually checked (input_file, not output_file).
    print(f"檔案 {input_file} 不存在，無法處理！")
