# Evaluate the default parameters of multiple algorithms
1) AdaBoost
2) Decision tree
3) Random Forest
4) Extra trees
5) XGBoost

In [1]:
import pandas as pd
import numpy as np
import time
import sys
import os
import argparse
import joblib
import re

In [2]:
import warnings

# Silence UserWarning (and its subclasses); other warning categories are still shown.
warnings.filterwarnings("ignore", category=UserWarning)

# Use the Intel extension (scikit-learn-intelex) to accelerate scikit-learn.
# Install with: pip install scikit-learn-intelex
# NOTE(review): patch_sklearn() runs here, before the sklearn imports further down in this
# file - presumably so that the patched implementations are the ones imported; confirm
# against the sklearnex documentation before reordering cells.
from sklearnex import patch_sklearn
patch_sklearn()

In [3]:
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold, StratifiedGroupKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler, label_binarize
from sklearn.metrics import roc_auc_score, f1_score, matthews_corrcoef, balanced_accuracy_score, confusion_matrix
from sklearn.impute import SimpleImputer
from scipy.stats import pearsonr

In [4]:
# Root directory of the DDMut data; the remaining locations are sub-folders of it.
data_path = '/home/korawich/Desktop/AutoML/Dataset/DDMut_data/'
gbsig_path = data_path + 'Graph-based/'
result_path = data_path + 'Results/'
final_path = data_path + 'Final/'

In [6]:
# Input files - every path below is final_path followed by a file name.

# "info" tables for the training set and the three test sets
train_info_path = f'{final_path}train_info.csv'
test_1_info_path = f'{final_path}test_1_info.csv'
test_2_info_path = f'{final_path}test_2_info.csv'
test_3_info_path = f'{final_path}test_3_info.csv'

# Feature matrices (X)
train_X_path = f'{final_path}final_train_X.csv'
test_1_X_path = f'{final_path}final_test_1_X.csv'
test_2_X_path = f'{final_path}final_test_2_X.csv'
test_3_X_path = f'{final_path}final_test_3_X.csv'

# Targets (y)
train_y_path = f'{final_path}train_y.csv'
test_1_y_path = f'{final_path}test_1_y.csv'
test_2_y_path = f'{final_path}test_2_y.csv'
test_3_y_path = f'{final_path}test_3_y.csv'

In [7]:
# Load the feature matrices; the first CSV column is used as the row index.
X_train, X_test_1, X_test_2, X_test_3 = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (train_X_path, test_1_X_path, test_2_X_path, test_3_X_path)
)

# Load the matching targets the same way (kept as DataFrames, not Series).
y_train, y_test_1, y_test_2, y_test_3 = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (train_y_path, test_1_y_path, test_2_y_path, test_3_y_path)
)

In [9]:
# Report the (rows, columns) shape of every feature matrix as a quick sanity check.
for label, frame in (
    ('X_train', X_train),
    ('X_test_1', X_test_1),
    ('X_test_2', X_test_2),
    ('X_test_3', X_test_3),
):
    print(f'{label}.shape = ', frame.shape)

X_train.shape =  (9028, 896)
X_test_1.shape =  (552, 896)
X_test_2.shape =  (1304, 896)
X_test_3.shape =  (2024, 896)


In [11]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.model_selection import cross_validate, KFold
from sklearn.metrics import mean_squared_error, r2_score

Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


In [12]:
# Benchmark several regressors with their default hyper-parameters on the full
# feature matrix: 10-fold cross-validation on the training set, then a refit on
# all training rows and an evaluation on each held-out test set. One row per
# (model, evaluation) is collected in `results` and written to a CSV file.

# List of regression models to test
models = {
    "XGBoost": XGBRegressor(objective='reg:squarederror', random_state=42),
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "Support Vector Regression": SVR()
}

# Define 10-fold cross-validation. The integer seed makes every call to
# kfold.split() produce the same folds, so one splitter is shared by all models.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# List to store the results (one dict per output row)
results = []

# List of test sets
test_sets = [
    ('Test Set 1', X_test_1, y_test_1),
    ('Test Set 2', X_test_2, y_test_2),
    ('Test Set 3', X_test_3, y_test_3)
]

# Not used by the manual CV loop below; kept because other cells may read it.
scoring = ['neg_mean_squared_error', 'r2']


def _regression_metrics(y_true, y_pred):
    """Return (RMSE, R2, Pearson r) for predictions `y_pred` against `y_true`.

    `y_true` is a pandas object and `y_pred` a NumPy array; both are flattened
    to 1-D before the Pearson correlation is computed.
    """
    rmse_value = np.sqrt(mean_squared_error(y_true, y_pred))
    r2_value = r2_score(y_true, y_pred)
    pearson_value, _ = pearsonr(y_true.values.flatten(), y_pred.flatten())
    return rmse_value, r2_value, pearson_value


# Iterate over each model, train, cross-validate, and evaluate on multiple test sets
for name, model in models.items():
    print('running - ', name)

    rmse_scores = []
    r2_scores = []
    pearson_corrs = []

    for train_index, test_index in kfold.split(X_train):
        # KFold.split() yields *positional* row numbers, so rows must be taken
        # with .iloc. Label-based .loc only gives the same rows when the index
        # happens to be exactly 0..n-1; otherwise it raises KeyError or
        # silently selects different rows.
        X_train_fold, X_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]

        # Train the model on the current fold
        model.fit(X_train_fold, y_train_fold)
        y_pred_fold = model.predict(X_test_fold)

        # Calculate RMSE, R2, and Pearson correlation
        rmse, r2, pearson_corr = _regression_metrics(y_test_fold, y_pred_fold)

        # Store the scores
        rmse_scores.append(rmse)
        r2_scores.append(r2)
        pearson_corrs.append(pearson_corr)

    # Train the model on the full training data
    model.fit(X_train, y_train)

    results.append({"Model": name,
                    "CV Mean RMSE (10-fold)": round(np.mean(rmse_scores), 3),
                    "CV Std RMSE (10-fold)": round(np.std(rmse_scores), 3),
                    "CV Mean R2 (10-fold)": round(np.mean(r2_scores), 3),
                    "CV Std R2 (10-fold)": round(np.std(r2_scores), 3),
                    "CV Mean Pearson (10-fold)": round(np.mean(pearson_corrs), 3),
                    "CV Std Pearson (10-fold)": round(np.std(pearson_corrs), 3),
                    "Test Set": '10-fold CV'
                   })

    # Evaluate the refitted model on each test set
    for test_name, X_test, y_test in test_sets:
        # Predict on the test data
        y_pred = model.predict(X_test)

        # Calculate performance metrics on test data
        rmse_score, r2, pearson_corr = _regression_metrics(y_test, y_pred)

        # Append results to the list
        results.append({
            "Model": name,
            "Test Set": test_name,
            "Test RMSE": round(rmse_score, 3),
            "Test R2": round(r2, 3),
            "Test Pearson": round(pearson_corr, 3)
        })

# Convert results to a DataFrame
results_df = pd.DataFrame(results)

results_df_all_save_path = result_path + 'benchmark_default_params_all.csv'
results_df.to_csv(results_df_all_save_path)

running -  XGBoost
running -  Linear Regression
running -  Decision Tree
running -  Random Forest
running -  Gradient Boosting
running -  Support Vector Regression


In [15]:
# Display the benchmark results table (CV summary rows and per-test-set rows).
results_df

Unnamed: 0,Model,CV Mean RMSE (10-fold),CV Std RMSE (10-fold),CV Mean R2 (10-fold),CV Std R2 (10-fold),CV Mean Pearson (10-fold),CV Std Pearson (10-fold),Test Set,Test RMSE,Test R2,Test Pearson
0,XGBoost,1.332,0.039,0.517,0.035,0.722,0.023,10-fold CV,,,
1,XGBoost,,,,,,,Test Set 1,1.181,0.299,0.601
2,XGBoost,,,,,,,Test Set 2,1.604,0.293,0.555
3,XGBoost,,,,,,,Test Set 3,2.257,0.131,0.409
4,Linear Regression,1.596,0.054,0.307,0.046,0.574,0.027,10-fold CV,,,
5,Linear Regression,,,,,,,Test Set 1,1.496,-0.125,0.393
6,Linear Regression,,,,,,,Test Set 2,1.821,0.089,0.422
7,Linear Regression,,,,,,,Test Set 3,2.64,-0.189,0.218
8,Decision Tree,1.894,0.06,0.024,0.054,0.524,0.024,10-fold CV,,,
9,Decision Tree,,,,,,,Test Set 1,1.896,-0.807,0.287


#FOR EACH FEATURE SET

In [40]:
# Feature-list CSVs, one per feature set (gbsig / seq / str, going by the file names).
feat_list_path = '/home/korawich/Desktop/AutoML/Dataset/DDMut_data/Feat_list/'
gbsig_feat_list_path = f'{feat_list_path}gbsig_feat_list.csv'
seq_feat_list_path = f'{feat_list_path}seq_feat_list.csv'
str_feat_list_path = f'{feat_list_path}str_feat_list.csv'

# Output CSV for the benchmark of each feature set.
gbsig_feat_result_path = f'{result_path}benchmark_default_params_gbsig.csv'
seq_feat_result_path = f'{result_path}benchmark_default_params_seq.csv'
str_feat_result_path = f'{result_path}benchmark_default_params_str.csv'

# Read the lists; the first CSV column is used as the row index.
gbs_feat, seq_feat, str_feat = (
    pd.read_csv(list_path, index_col=0)
    for list_path in (gbsig_feat_list_path, seq_feat_list_path, str_feat_list_path)
)

In [41]:
# Repeat the default-parameter benchmark separately for each feature set:
# 10-fold cross-validation on the training rows restricted to that set's
# columns, a refit on all training rows, and an evaluation on each test set.
# Each feature set's results are written to its own CSV file.
features_sets = [
    ('seq_feat', seq_feat, seq_feat_result_path),
    ('str_feat', str_feat, str_feat_result_path),
    ('gbs_feat', gbs_feat, gbsig_feat_result_path)
]


def _regression_metrics(y_true, y_pred):
    """Return (RMSE, R2, Pearson r) for predictions `y_pred` against `y_true`.

    `y_true` is a pandas object and `y_pred` a NumPy array; both are flattened
    to 1-D before the Pearson correlation is computed.
    """
    rmse_value = np.sqrt(mean_squared_error(y_true, y_pred))
    r2_value = r2_score(y_true, y_pred)
    pearson_value, _ = pearsonr(y_true.values.flatten(), y_pred.flatten())
    return rmse_value, r2_value, pearson_value


for feat_name, feat_list, save_path in features_sets:
    print('running feat - ', feat_name)

    # Column names of this feature set and the matching training matrix,
    # computed once here instead of being re-sliced inside every fold.
    feat_cols = feat_list.values.flatten()
    X_train_feat = X_train[feat_cols]

    # List of regression models to test (fresh estimators for every feature set)
    models = {
        "XGBoost": XGBRegressor(objective='reg:squarederror', random_state=42),
        "Linear Regression": LinearRegression(),
        "Decision Tree": DecisionTreeRegressor(random_state=42),
        "Random Forest": RandomForestRegressor(random_state=42),
        "Gradient Boosting": GradientBoostingRegressor(random_state=42),
        "Support Vector Regression": SVR()
    }

    # Define 10-fold cross-validation. The integer seed makes every call to
    # kfold.split() produce the same folds, so one splitter is shared by all models.
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)

    # List to store the results (one dict per output row)
    results = []

    # List of test sets, already restricted to this feature set's columns
    test_sets = [
        ('Test Set 1', X_test_1[feat_cols], y_test_1),
        ('Test Set 2', X_test_2[feat_cols], y_test_2),
        ('Test Set 3', X_test_3[feat_cols], y_test_3)
    ]

    # Not used by the manual CV loop below; kept because other cells may read it.
    scoring = ['neg_mean_squared_error', 'r2']

    # Iterate over each model, train, cross-validate, and evaluate on multiple test sets
    for name, model in models.items():
        print('running - ', name)

        rmse_scores = []
        r2_scores = []
        pearson_corrs = []

        for train_index, test_index in kfold.split(X_train_feat):
            # KFold.split() yields *positional* row numbers, so rows must be
            # taken with .iloc. Label-based .loc only gives the same rows when
            # the index happens to be exactly 0..n-1; otherwise it raises
            # KeyError or silently selects different rows.
            X_train_fold, X_test_fold = X_train_feat.iloc[train_index], X_train_feat.iloc[test_index]
            y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]

            # Train the model on the current fold
            model.fit(X_train_fold, y_train_fold)
            y_pred_fold = model.predict(X_test_fold)

            # Calculate RMSE, R2, and Pearson correlation
            rmse, r2, pearson_corr = _regression_metrics(y_test_fold, y_pred_fold)

            # Store the scores
            rmse_scores.append(rmse)
            r2_scores.append(r2)
            pearson_corrs.append(pearson_corr)

        # Train the model on the full training data
        model.fit(X_train_feat, y_train)

        results.append({"Model": name,
                        "CV Mean RMSE (10-fold)": round(np.mean(rmse_scores), 3),
                        "CV Std RMSE (10-fold)": round(np.std(rmse_scores), 3),
                        "CV Mean R2 (10-fold)": round(np.mean(r2_scores), 3),
                        "CV Std R2 (10-fold)": round(np.std(r2_scores), 3),
                        "CV Mean Pearson (10-fold)": round(np.mean(pearson_corrs), 3),
                        "CV Std Pearson (10-fold)": round(np.std(pearson_corrs), 3),
                        "Test Set": '10-fold CV'
                       })

        # Evaluate the refitted model on each test set
        for test_name, X_test, y_test in test_sets:
            # Predict on the test data (X_test already holds only feat_cols)
            y_pred = model.predict(X_test)

            # Calculate performance metrics on test data
            rmse_score, r2, pearson_corr = _regression_metrics(y_test, y_pred)

            # Append results to the list
            results.append({
                "Model": name,
                "Test Set": test_name,
                "Test RMSE": round(rmse_score, 3),
                "Test R2": round(r2, 3),
                "Test Pearson": round(pearson_corr, 3)
            })

    # Convert results to a DataFrame and save this feature set's table
    results_df = pd.DataFrame(results)
    results_df.to_csv(save_path)

running feat -  str_feat
running -  XGBoost
running -  Linear Regression
running -  Decision Tree
running -  Random Forest
running -  Gradient Boosting
running -  Support Vector Regression
running feat -  gbs_feat
running -  XGBoost
running -  Linear Regression
running -  Decision Tree
running -  Random Forest
running -  Gradient Boosting
running -  Support Vector Regression
