In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split


In [2]:
import joblib
import pandas as pd

data_kcat = joblib.load('/Users/pinchichen/2025S lab/AI drug project/Catpred/dataset/km_with_features.joblib')
data_kcat.head()

Unnamed: 0,Sequence,sequence_source,uniprot,Smiles,value,ec,taxonomy_id,log10_value,log10km_mean,metabolite_features,protein_features
0,MGQEKLYIEKELSWLSFNERVLQEAADKSNPLIERMRFLGIYSNNL...,sabio,P0A7B1,Nc1nc2c(ncn2C2OC(COP(=O)(O)OP(=O)(O)O)C(O)C2O)...,0.63,2.7.4.1,562,-0.200659,-0.200659,"[0.25370342, -0.3417107, -0.28024313, -0.04292...","[-0.06592023, -0.062496405, -0.012068139, 0.06..."
1,MSQQPHTEQFPKNQPPLAERLASARQLVTKAISAVPPHPEPLPSPN...,brenda,A0A0S2SWE4,CC/C=C\C/C=C\C/C=C\CCCCCCCC(=O)O,0.0264,1.13.11.60,474922,-1.578396,-1.553572,"[0.61067283, 0.46248585, -0.051634412, -0.7397...","[0.16962084, -0.09671225, 0.116663285, 0.05065..."
2,MSAAADRLNLTSGHLNAGRKRSSSSVSLKAVEKPFKVTVIGSGNWG...,uniprot_search,A0A060KZ16,O=C(CO)COP(=O)(O)O,0.61,1.1.1.8,114524,-0.21467,-0.21467,"[0.29938427, -0.0972216, -0.3334697, -0.721026...","[0.0933136, -0.117130496, 0.0676241, 0.0187789..."
3,MLAEKTRSIIKATVPVLEQQGTVITRTFYKNMLTEHTELLNIFNRT...,sabio,P39676,NC(=O)C1=CN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4c...,0.028,1.14.12.17,4932,-1.552842,-1.552842,"[0.639636, -0.14536878, -0.52627945, 0.1240742...","[-0.02827879, -0.04372119, 0.03182794, 0.19197..."
4,MASAERVPVSFNKPGRVPFGEVQGYAPGHIPAYSNKHDHFFSGERS...,uniprot_search,O60993,Nc1ncnc2c1ncn2[C@H]1O[C@H](COP(=O)([O-])OP(=O)...,0.4,6.3.1.9,5656,-0.39794,-0.39794,"[0.25855744, 0.43888408, -0.7136499, 0.0234168...","[0.074476704, -0.050253637, 0.05497695, 0.1402..."


In [3]:
data_kcat.rename(columns={'log10km_mean':'label'},inplace=True)
data_kcat.head()

Unnamed: 0,Sequence,sequence_source,uniprot,Smiles,value,ec,taxonomy_id,log10_value,label,metabolite_features,protein_features
0,MGQEKLYIEKELSWLSFNERVLQEAADKSNPLIERMRFLGIYSNNL...,sabio,P0A7B1,Nc1nc2c(ncn2C2OC(COP(=O)(O)OP(=O)(O)O)C(O)C2O)...,0.63,2.7.4.1,562,-0.200659,-0.200659,"[0.25370342, -0.3417107, -0.28024313, -0.04292...","[-0.06592023, -0.062496405, -0.012068139, 0.06..."
1,MSQQPHTEQFPKNQPPLAERLASARQLVTKAISAVPPHPEPLPSPN...,brenda,A0A0S2SWE4,CC/C=C\C/C=C\C/C=C\CCCCCCCC(=O)O,0.0264,1.13.11.60,474922,-1.578396,-1.553572,"[0.61067283, 0.46248585, -0.051634412, -0.7397...","[0.16962084, -0.09671225, 0.116663285, 0.05065..."
2,MSAAADRLNLTSGHLNAGRKRSSSSVSLKAVEKPFKVTVIGSGNWG...,uniprot_search,A0A060KZ16,O=C(CO)COP(=O)(O)O,0.61,1.1.1.8,114524,-0.21467,-0.21467,"[0.29938427, -0.0972216, -0.3334697, -0.721026...","[0.0933136, -0.117130496, 0.0676241, 0.0187789..."
3,MLAEKTRSIIKATVPVLEQQGTVITRTFYKNMLTEHTELLNIFNRT...,sabio,P39676,NC(=O)C1=CN(C2OC(COP(=O)(O)OP(=O)(O)OCC3OC(n4c...,0.028,1.14.12.17,4932,-1.552842,-1.552842,"[0.639636, -0.14536878, -0.52627945, 0.1240742...","[-0.02827879, -0.04372119, 0.03182794, 0.19197..."
4,MASAERVPVSFNKPGRVPFGEVQGYAPGHIPAYSNKHDHFFSGERS...,uniprot_search,O60993,Nc1ncnc2c1ncn2[C@H]1O[C@H](COP(=O)([O-])OP(=O)...,0.4,6.3.1.9,5656,-0.39794,-0.39794,"[0.25855744, 0.43888408, -0.7136499, 0.0234168...","[0.074476704, -0.050253637, 0.05497695, 0.1402..."


In [4]:
# Define dataset
class MPI_Dataset(Dataset):
    def __init__(self, dataframe):
        self.dataframe = dataframe

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        return {
            'metabolite_features': torch.tensor(np.asarray(row['metabolite_features'], dtype=np.float32)),
            'protein_features': torch.tensor(np.asarray(row['protein_features'], dtype=np.float32)),
            'label': torch.tensor(float(row['label']), dtype=torch.float32),
        }

In [5]:
# Separate the dataset by unique proteins and drugs
unique_proteins = data_kcat['Sequence'].unique()
unique_mols = data_kcat['Smiles'].unique()
# Set the seed for reproducibility
torch.manual_seed(42)
# Function to perform a cold split
def cold_split(unique_items, test_size=0.2, val_size=0.1):
    train_items, test_items = train_test_split(unique_items, test_size=test_size, random_state=42)
    train_items, val_items = train_test_split(train_items, test_size=val_size / (1 - test_size), random_state=42)
    return train_items, val_items, test_items
# Cold split by proteins
train_proteins, val_proteins, test_proteins = cold_split(unique_proteins)
train_cold_protein = data_kcat[data_kcat['Sequence'].isin(train_proteins)]
val_cold_protein = data_kcat[data_kcat['Sequence'].isin(val_proteins)]
test_cold_protein = data_kcat[data_kcat['Sequence'].isin(test_proteins)]
# Cold split by molecules
train_mols, val_mols, test_mols = cold_split(unique_mols)
train_cold_mols = data_kcat[data_kcat['Smiles'].isin(train_mols)]
val_cold_mols = data_kcat[data_kcat['Smiles'].isin(val_mols)]
test_cold_mols = data_kcat[data_kcat['Smiles'].isin(test_mols)]

In [6]:
def df2array(df):
    X = np.array([
    np.concatenate([m, p])
    for m, p in zip(df['metabolite_features'], df['protein_features'])])
    y = df['label']
    return X, y

train_X, train_y = df2array(train_cold_protein)
val_X, val_y = df2array(val_cold_protein)
test_X, test_y = df2array(test_cold_protein)

train_X_mols, train_y_mols = df2array(train_cold_mols)
val_X_mols, val_y_mols = df2array(val_cold_mols)
test_X_mols, test_y_mols = df2array(test_cold_mols)


In [7]:
#model initialization
import catboost as cat
ca = cat.CatBoostRegressor()
ca_2 = cat.CatBoostRegressor()

In [8]:
'''
#Grid search
#ref: https://xgboost.readthedocs.io/en/stable/parameter.html
from sklearn.model_selection import GridSearchCV
param_grid = dict(
    max_depth = [5,10,15,20,None],
    n_estimators = [50,100,150,200]
)
regressor = GridSearchCV(estimator = xgb, param_grid = param_grid, cv = pds, verbose = 3, n_jobs = -1)
'''
#Train the model
ca.fit(train_X,train_y)
ca_2.fit(train_X_mols,train_y_mols)

Learning rate set to 0.06949
0:	learn: 1.2636717	total: 118ms	remaining: 1m 58s
1:	learn: 1.2455691	total: 154ms	remaining: 1m 16s
2:	learn: 1.2296843	total: 182ms	remaining: 1m
3:	learn: 1.2146578	total: 261ms	remaining: 1m 5s
4:	learn: 1.2010955	total: 321ms	remaining: 1m 3s
5:	learn: 1.1889994	total: 352ms	remaining: 58.3s
6:	learn: 1.1776010	total: 380ms	remaining: 54s
7:	learn: 1.1679823	total: 406ms	remaining: 50.3s
8:	learn: 1.1589088	total: 435ms	remaining: 47.9s
9:	learn: 1.1503675	total: 466ms	remaining: 46.1s
10:	learn: 1.1429688	total: 498ms	remaining: 44.8s
11:	learn: 1.1365032	total: 524ms	remaining: 43.2s
12:	learn: 1.1295816	total: 552ms	remaining: 41.9s
13:	learn: 1.1234504	total: 580ms	remaining: 40.9s
14:	learn: 1.1171758	total: 627ms	remaining: 41.1s
15:	learn: 1.1120863	total: 653ms	remaining: 40.1s
16:	learn: 1.1072856	total: 675ms	remaining: 39s
17:	learn: 1.1023870	total: 696ms	remaining: 38s
18:	learn: 1.0979697	total: 716ms	remaining: 37s
19:	learn: 1.0934276	

<catboost.core.CatBoostRegressor at 0x306ade210>

In [9]:
# Train
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error, explained_variance_score
from scipy.stats import pearsonr

def metrics_cal(model,X,y):
    y_pred = model.predict(X)
    mse = mean_squared_error(y, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y, y_pred)
    r2 = r2_score(y, y_pred)
    pearson_corr, _ = pearsonr(y, y_pred)
    median_ae = median_absolute_error(y, y_pred)
    explained_var = explained_variance_score(y, y_pred)
    return mse, rmse, mae, r2, pearson_corr, median_ae, explained_var


train_mse, train_rmse, train_mae, train_r2, train_pearson_corr, train_median_ae, train_explained_var = metrics_cal(ca, train_X, train_y)
val_mse, val_rmse, val_mae, val_r2, val_pearson_corr, val_median_ae, val_explained_var = metrics_cal(ca, val_X, val_y)
test_mse, test_rmse, test_mae, test_r2, test_pearson_corr, test_median_ae, test_explained_var = metrics_cal(ca, test_X, test_y)


In [10]:
train_mse_mols, train_rmse_mols, train_mae_mols, train_r2_mols, train_pearson_corr_mols, train_median_ae_mols, train_explained_var_mols = metrics_cal(ca_2, train_X_mols, train_y_mols)
val_mse_mols, val_rmse_mols, val_mae_mols, val_r2_mols, val_pearson_corr_mols, val_median_ae_mols, val_explained_var_mols = metrics_cal(ca_2, val_X_mols, val_y_mols)
test_mse_mols, test_rmse_mols, test_mae_mols, test_r2_mols, test_pearson_corr_mols, test_median_ae_mols, test_explained_var_mols = metrics_cal(ca_2, test_X_mols, test_y_mols)

In [11]:
# append the performance to the csv file
df = {
    'Model':['CatBoost','CatBoost','CatBoost'],
    'Dataset':['Train','Validation','Test'],
    'MSE':[train_mse,val_mse,test_mse],
    'RMSE':[train_rmse,val_rmse,test_rmse],
    'MAE':[train_mae,val_mae,test_mae],
    'R2':[train_r2,val_r2,test_r2],
    'Pearson':[train_pearson_corr,val_pearson_corr,test_pearson_corr],
    'Median_AE':[train_median_ae,val_median_ae,test_median_ae],
    'Explained_VAR':[train_explained_var,val_explained_var,test_explained_var],
    'Dataspliting Mode':['cold protein','cold protein','cold protein']
}
df = pd.DataFrame(df)

df.to_csv('/Users/pinchichen/2025S lab/AI drug project/Catpred/Km/model performance metrics_Catpred_Km.csv', mode='a', header=False)

In [12]:
# append the performance to the csv file
df = {
    'Model':['CatBoost','CatBoost','CatBoost'],
    'Dataset':['Train','Validation','Test'],
    'MSE':[train_mse_mols,val_mse_mols,test_mse_mols],
    'RMSE':[train_rmse_mols,val_rmse_mols,test_rmse_mols],
    'MAE':[train_mae_mols,val_mae_mols,test_mae_mols],
    'R2':[train_r2_mols,val_r2_mols,test_r2_mols],
    'Pearson':[train_pearson_corr_mols,val_pearson_corr_mols,test_pearson_corr_mols],
    'Median_AE':[train_median_ae_mols,val_median_ae_mols,test_median_ae_mols],
    'Explained_VAR':[train_explained_var_mols,val_explained_var_mols,test_explained_var_mols],
    'Dataspliting Mode':['cold mols','cold mols','cold mols']
}
df = pd.DataFrame(df)

df.to_csv('/Users/pinchichen/2025S lab/AI drug project/Catpred/Km/model performance metrics_Catpred_Km.csv', mode='a', header=False)

In [13]:
# save the model
ca.save_model("/Users/pinchichen/2025S lab/AI drug project/Catpred/Km/trained_model/cold_protein/CatBoost model_Catpred_Km_cold_protein.json", format="json")
ca_2.save_model("/Users/pinchichen/2025S lab/AI drug project/Catpred/Km/trained_model/cold_mols/CatBoost model_Catpred_Km_cold_mols.json",format='json')

# load the model
#model = CatBoostRegressor()
#model.load_model("catboost_model.cbm"