In [1]:
###TPOT Script for Regression###
import numpy as np
import pandas as pd
from tpot import TPOTRegressor
from scipy.stats import pearsonr

import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.model_selection import cross_validate, KFold
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
#Set Code path 
alg_path = '/home/korawich/Desktop/AutoML/Algorithm/TPOT'

In [4]:
#Datafile
data_path = '/home/korawich/Desktop/AutoML/Dataset/DDMut_data/'
gbsig_path = '/home/korawich/Desktop/AutoML/Dataset/DDMut_data/Graph-based/'
result_path = '/home/korawich/Desktop/AutoML/Dataset/DDMut_data/Results/'
final_path = '/home/korawich/Desktop/AutoML/Dataset/DDMut_data/Final/'

train_info_path  = final_path + 'train_info.csv'
test_1_info_path = final_path + 'test_1_info.csv'
test_2_info_path = final_path + 'test_2_info.csv'
test_3_info_path = final_path + 'test_3_info.csv'

train_X_path  = final_path + 'final_train_X.csv'
test_1_X_path = final_path + 'final_test_1_X.csv'
test_2_X_path = final_path + 'final_test_2_X.csv'
test_3_X_path = final_path + 'final_test_3_X.csv'

train_y_path  = final_path + 'train_y.csv'
test_1_y_path = final_path + 'test_1_y.csv'
test_2_y_path = final_path + 'test_2_y.csv'
test_3_y_path = final_path + 'test_3_y.csv'

In [5]:
X_train = pd.read_csv(train_X_path, index_col=0)
X_test_1 = pd.read_csv(test_1_X_path, index_col=0)
X_test_2 = pd.read_csv(test_2_X_path, index_col=0)
X_test_3 = pd.read_csv(test_3_X_path, index_col=0)

y_train = pd.read_csv(train_y_path, index_col=0)
y_test_1 = pd.read_csv(test_1_y_path, index_col=0)
y_test_2 = pd.read_csv(test_2_y_path, index_col=0)
y_test_3 = pd.read_csv(test_3_y_path, index_col=0)

In [6]:
print('X_train.shape = ', X_train.shape)
print('X_test_1.shape = ', X_test_1.shape)
print('X_test_2.shape = ', X_test_2.shape)
print('X_test_3.shape = ', X_test_3.shape)

X_train.shape =  (9028, 896)
X_test_1.shape =  (552, 896)
X_test_2.shape =  (1304, 896)
X_test_3.shape =  (2024, 896)


In [7]:
#load list of features
feat_list_path = '/home/korawich/Desktop/AutoML/Dataset/DDMut_data/Feat_list/'
gbsig_feat_list_path = feat_list_path + 'gbsig_feat_list.csv'
seq_feat_list_path = feat_list_path + 'seq_feat_list.csv'
str_feat_list_path = feat_list_path + 'str_feat_list.csv'

all_feat_result_path = result_path + 'tpot_default_params.csv'
gbsig_feat_result_path = result_path + 'tpot_default_params_gbsig.csv'
seq_feat_result_path = result_path + 'tpot_default_params_seq.csv'
str_feat_result_path = result_path + 'tpot_default_params_str.csv'

gbs_feat = pd.read_csv(gbsig_feat_list_path, index_col=0)
seq_feat = pd.read_csv(seq_feat_list_path, index_col=0)
str_feat = pd.read_csv(str_feat_list_path, index_col=0)
all_feat = pd.concat([seq_feat, str_feat, gbs_feat], ignore_index=True)

In [8]:
features_sets = [
    #('all_feat', all_feat, all_feat_result_path),
    #('seq_feat', seq_feat, seq_feat_result_path),
    ('str_feat', str_feat, str_feat_result_path),
    #('gbs_feat', gbs_feat, gbsig_feat_result_path)
]

2. TPOT

In [12]:
#The default parameter is set to be the same as in TPOT_paper
for feat_name, feat_list, result_path in features_sets:
    print('running - feat_name')
    X_train_sel  = X_train[feat_list.values.flatten()]
    X_test_1_sel = X_test_1[feat_list.values.flatten()]
    X_test_2_sel = X_test_2[feat_list.values.flatten()]
    X_test_3_sel = X_test_3[feat_list.values.flatten()]
    
    tpot = TPOTRegressor(generations=100, population_size=100,
                             offspring_size=None, mutation_rate=0.9,
                             crossover_rate=0.1,
                             scoring='neg_mean_squared_error', cv=5,
                             subsample=1.0, n_jobs=-1,
                             max_time_mins=None, max_eval_time_mins=5,
                             random_state=42, config_dict=None, #The config dict is none, use the default
                             template=None,
                             warm_start=False,
                             memory=None,
                             use_dask=False,
                             periodic_checkpoint_folder=None,
                             early_stop=None,
                             verbosity=2,
                             disable_update_check=False)
    
    tpot.fit(X_train, y_train)
    #pipeline_save_path = alg_path + '/tpot_digits_pipeline.py'
    #tpot.export('tpot_digits_pipeline.py')

    #Test in test dataset
    y_pred_test_1 = tpot.predict(X_test_1)
    rmse_1 = np.sqrt(mean_squared_error(y_test_1, y_pred_test_1))
    r2_1 = r2_score(y_test_1, y_pred_test_1)
    pearson_corr_1, _ = pearsonr(y_test_1.values.flatten(), y_pred_test_1.flatten())
    print('score for test 1 - RMSE = ', rmse_1, ' - R2 = ', r2_1, ' - Pearson = ', pearson_corr_1)
    
    y_pred_test_2 = tpot.predict(X_test_2)
    rmse_2 = np.sqrt(mean_squared_error(y_test_2, y_pred_test_2))
    r2_2 = r2_score(y_test_2, y_pred_test_2)
    pearson_corr_2, _ = pearsonr(y_test_2.values.flatten(), y_pred_test_2.flatten())
    print('score for test 2 - RMSE = ', rmse_2, ' - R2 = ', r2_2, ' - Pearson = ', pearson_corr_2)
    
    y_pred_test_3 = tpot.predict(X_test_3)
    rmse_3 = np.sqrt(mean_squared_error(y_test_3, y_pred_test_3))
    r2_3 = r2_score(y_test_3, y_pred_test_3)
    pearson_corr_3, _ = pearsonr(y_test_3.values.flatten(), y_pred_test_3.flatten())
    print('score for test 3 - RMSE = ', rmse_3, ' - R2 = ', r2_3, ' - Pearson = ', pearson_corr_3)
    #Run 10-fold CV with results params
    #TO_DO !
    

    #Have some code to print out all results as a doc file.
    result = []
    results.append({
            "Model params": params,
                        "CV Mean RMSE (10-fold)": round(np.mean(rmse_scores), 3), 
                        "CV Std RMSE (10-fold)": round(np.std(rmse_scores), 3),
                        "CV Mean R2 (10-fold)": round(np.mean(r2_scores), 3),
                        "CV Std R2 (10-fold)": round(np.std(r2_scores), 3),
                        "CV Mean Pearson (10-fold)": round(np.mean(pearson_corrs), 3),
                        "CV Std Pearson (10-fold)": round(np.std(pearson_corrs), 3),
                        "Test 1 RMSE": round(rmse_1, 3),
                        "Test 1 R2"  : round(R2_1, 3),
                        "Test 1 Pearson" : round(pearson_corr_1, 3),
                        "Test 2 RMSE": round(rmse_2, 3),
                        "Test 2 R2"  : round(R2_2, 3),
                        "Test 2 Pearson" : round(pearson_corr_2, 3),
                        "Test 3 RMSE": round(rmse_3, 3),
                        "Test 3 R2"  : round(R2_3, 3),
                        "Test 3 Pearson" : round(pearson_corr_3, 3)
    })
    results_df = pd.DataFrame(results)
    results_df.to_csv(result_path)

IndentationError: unexpected indent (436471827.py, line 54)