In [51]:
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
from tdc.single_pred import Tox
from tdc.utils import retrieve_label_name_list
from xgboost import XGBRegressor


In [52]:
# Load the Tox21 dataset for a single label and split it.
# `Tox` is the single-prediction loader from `tdc.single_pred`; it must be
# imported in the imports cell, otherwise this cell only works with leftover
# kernel state and fails on "Restart Kernel -> Run All".
label_list = retrieve_label_name_list('Tox21')  # label names available for Tox21
data = Tox(name = 'Tox21', label_name = label_list[0])  # only the first label is modelled
split = data.get_split()  # mapping of partitions; 'train' and 'test' are used below

Found local copy...
Loading...
Done!


In [53]:
# Pull the training and test partitions out of the split mapping.
train, test = split['train'], split['test']

In [54]:
# Row counts per label value in the training partition (class balance check).
train.groupby(by='Y').count()

Unnamed: 0_level_0,Drug_ID,Drug
Y,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,4878,4878
1.0,208,208


In [55]:
# Base random seed and number of cross-validation folds used by the cells below.
SEED = 9582
N_FOLD = 5

# Keyword arguments forwarded verbatim to XGBRegressor(**params).
# NOTE(review): XGBoost documents `eta` as an alias of `learning_rate`; both are
# set here with different values (0.0473... vs 0.023) -- confirm which one is
# intended and drop the other.
model_params = dict(
    reg_alpha=0.0008774661176012108,
    reg_lambda=2.542812743920178,
    colsample_bynode=0.7839026197349153,
    subsample=0.8994226268096415,
    eta=0.04730766698056879,
    max_depth=3,
    n_estimators=500,
    random_state=SEED,
    eval_metric='rmse',
    n_jobs=-1,
    learning_rate=0.023,
)

class ModelTrainer():
    """Train and evaluate an ``XGBRegressor`` with stratified K-fold CV.

    Parameters
    ----------
    data_X :
        Feature data. Rows are selected by position, so a pandas object
        (via ``.iloc``) or any array-like works.
    data_Y :
        Target values aligned row-by-row with ``data_X``.
    params : dict
        Keyword arguments for ``XGBRegressor``. Must contain an integer
        ``"random_state"``; it is offset per model and never mutated in place.

    Relies on the module-level ``N_FOLD`` and ``SEED`` constants for the
    cross-validation splitter.
    """

    def __init__(self, data_X, data_Y, params):
        self.params = params
        self.X = data_X
        self.Y = data_Y
        self.model = None

    def get_model(self):
        """Return the most recently created model (``None`` before training)."""
        return self.model

    @staticmethod
    def _take_rows(data, positions):
        """Select rows of ``data`` by integer *position*.

        Pandas objects are indexed with ``.iloc`` so the result does not depend
        on the index labels; anything else is converted to a NumPy array first.
        """
        if hasattr(data, "iloc"):
            return data.iloc[positions]
        return np.asarray(data)[positions]

    def _init_model(self, add_to_seed):
        """Create a fresh, unfitted model whose seed is offset by ``add_to_seed + 1``."""
        new_params = self.params.copy()  # keep the caller's dict untouched
        new_params["random_state"] += add_to_seed + 1
        self.model = XGBRegressor(**new_params)

    def train_model(self):
        """Run stratified ``N_FOLD``-fold cross-validation on the stored data.

        A new model is fitted for every fold; after the call ``self.model`` is
        the model of the last fold (per-fold models are not retained).

        Returns
        -------
        dict
            ``"avg_rmse"``: mean of the per-fold validation RMSEs,
            ``"predictions"``: out-of-fold predictions in fold order,
            ``"actual"``: the matching target values,
            ``"all_importances"``: one ``feature_importances_`` array per fold.
        """
        skf = StratifiedKFold(n_splits=N_FOLD, random_state=SEED, shuffle=True)
        # Stratify on the string form of the targets so each distinct target
        # value is treated as a class by the splitter.
        strat_labels = np.asarray(self.Y).astype(str)

        fold_rmses = []
        all_predictions = []
        all_actual = []
        all_importances = []
        for fold, (train_index, valid_index) in enumerate(skf.split(self.X, strat_labels)):
            # The splitter yields positions, so X and Y must both be indexed
            # positionally (label-based ``Y[idx]`` breaks on a non-default index).
            train_x = self._take_rows(self.X, train_index)
            train_y = self._take_rows(self.Y, train_index)
            valid_x = self._take_rows(self.X, valid_index)
            valid_y = self._take_rows(self.Y, valid_index)

            self._init_model(fold + 1)
            self.model.fit(train_x, train_y, verbose=0)
            all_importances.append(self.model.feature_importances_)

            # do NOT clip the prediction, so the error is accurate
            predictions = self.model.predict(valid_x)
            # sqrt(MSE) instead of ``squared=False``: that keyword was
            # deprecated in scikit-learn 1.4 and removed in 1.6.
            rmse = float(np.sqrt(mean_squared_error(y_true=valid_y, y_pred=predictions)))
            fold_rmses.append(rmse)

            all_predictions.extend(predictions)
            all_actual.extend(valid_y)

        avg_rmse = np.mean(fold_rmses)
        return {
            "avg_rmse": avg_rmse,
            "predictions": all_predictions,
            "actual": all_actual,
            "all_importances": all_importances
        }

    def train_model_on_all_data(self, add_to_seed):
        """Fit a fresh model on the complete stored data set.

        ``add_to_seed`` offsets the random seed exactly as in ``_init_model``.
        A new model is always created, so this works without a prior
        ``train_model`` call and never refits a leftover fold model.
        """
        self._init_model(add_to_seed)
        self.model.fit(self.X, self.Y, verbose=0)

    def _predict(self, test_x):
        """Predict ``test_x`` with the current model; returns a float64 array.

        Only ``self.model`` is kept by this class, so averaging ``N_FOLD``
        repeated predictions of that same model (as done previously) yields
        exactly the single prediction returned here.

        Raises
        ------
        RuntimeError
            If no model has been trained yet.
        """
        if self.model is None:
            raise RuntimeError(
                "No trained model: call train_model() or train_model_on_all_data() first."
            )
        # do NOT clip the prediction, so the error is accurate
        return np.asarray(self.model.predict(test_x), dtype=np.float64)

In [56]:
def gen_feats(dataset):
    """Return a copy of ``dataset`` with a ``num_oxygen`` feature column added.

    ``num_oxygen`` is the number of capital ``"O"`` characters in each value of
    the ``"Drug"`` column (lowercase ``"o"`` is not counted).
    NOTE(review): presumably ``"Drug"`` holds SMILES strings -- confirm; if so,
    aromatic oxygens written as ``o`` are missed by this count.

    The input frame is left unmodified; a new DataFrame is returned, so the
    function has no side effects on the caller's data.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain a string column named ``"Drug"``.

    Returns
    -------
    pandas.DataFrame
        All original columns plus ``"num_oxygen"``.
    """
    return dataset.assign(num_oxygen=dataset["Drug"].str.count("O"))
# Add the engineered feature column to both partitions.
train = gen_feats(train)
test = gen_feats(test)

In [57]:
def fit_model(train_dataset):
    """Cross-validate the model on ``train_dataset`` and print the mean RMSE.

    Uses the ``"num_oxygen"`` column as the only feature and ``"Y"`` as the
    target, with the module-level ``model_params``. Nothing is returned; the
    average fold RMSE is printed.
    """
    features = train_dataset["num_oxygen"]
    target = train_dataset["Y"]
    cv_result = ModelTrainer(features, target, model_params).train_model()
    print(cv_result["avg_rmse"])


fit_model(train)

0.1984982844885025
