In [1]:
# Purpose: run Auto-sklearn v1 on the DDMut regression data.

# ---- Adjustable parameters -------------------------------------------------
# Project root; the data, feature-list and result paths used later in the
# notebook are all built by appending to this string (note the trailing '/').
main_path = '/home/korawich/Desktop/AutoML/automl_ddmut/'

# Budget values handed to AutoSklearnRegressor further down.
time_left_for_this_task = 2 * 60 * 60  # 7200 s = 2 hrs for the whole search
per_run_time_limit = 10 * 60           # 600; presumably seconds per single model run - confirm
memory_limit = 9280                    # presumably MB (auto-sklearn convention) - confirm

In [2]:
# Data file locations - change here to use different data.
# NOTE(review): data_path carries an extra 'AutoML/' segment compared with the
# sibling directories below and is not referenced in the visible cells - confirm
# whether it is intentional.
data_path = f'{main_path}AutoML/Dataset/DDMut_data/'
gbsig_path = f'{main_path}Dataset/DDMut_data/Graph-based/'
result_path = f'{main_path}Dataset/DDMut_data/Results/'
final_path = f'{main_path}Dataset/DDMut_data/Final/'

# Per-split info files (currently disabled):
#train_info_path  = final_path + 'train_info.csv'
#test_1_info_path = final_path + 'test_1_info.csv'
#test_2_info_path = final_path + 'test_2_info.csv'
#test_3_info_path = final_path + 'test_3_info.csv'

# Feature matrices: one training set and three blind test sets.
train_X_path = f'{final_path}final_train_X.csv'
test_1_X_path = f'{final_path}final_test_1_X.csv'
test_2_X_path = f'{final_path}final_test_2_X.csv'
test_3_X_path = f'{final_path}final_test_3_X.csv'

# Matching target files for the four splits above.
train_y_path = f'{final_path}train_y.csv'
test_1_y_path = f'{final_path}test_1_y.csv'
test_2_y_path = f'{final_path}test_2_y.csv'
test_3_y_path = f'{final_path}test_3_y.csv'

In [3]:
import warnings
from pprint import pprint

import numpy as np
import pandas as pd
import sklearn.datasets
import sklearn.metrics
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold

import autosklearn.regression

warnings.filterwarnings('ignore')

In [4]:
# Directory holding the Auto-sklearn code for this project.
alg_path = f'{main_path}Algorithm/Auto-sklearn/'

Loading data

In [5]:
# Read the feature matrices; the first CSV column becomes the row index.
X_train, X_test_1, X_test_2, X_test_3 = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (train_X_path, test_1_X_path, test_2_X_path, test_3_X_path)
)

# Read the matching targets the same way (each is loaded as a DataFrame).
y_train, y_test_1, y_test_2, y_test_3 = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (train_y_path, test_1_y_path, test_2_y_path, test_3_y_path)
)

In [6]:
# Sanity check: report (rows, columns) of every feature matrix.
for split_name, split_frame in (
    ('X_train', X_train),
    ('X_test_1', X_test_1),
    ('X_test_2', X_test_2),
    ('X_test_3', X_test_3),
):
    print(f'{split_name}.shape = ', split_frame.shape)

X_train.shape =  (9028, 896)
X_test_1.shape =  (552, 896)
X_test_2.shape =  (1304, 896)
X_test_3.shape =  (2024, 896)


In [7]:
# Locations of the per-group feature-name lists.
feat_list_path = f'{main_path}Dataset/DDMut_data/Feat_list/'
gbsig_feat_list_path = f'{feat_list_path}gbsig_feat_list.csv'
seq_feat_list_path = f'{feat_list_path}seq_feat_list.csv'
str_feat_list_path = f'{feat_list_path}str_feat_list.csv'

# Per-feature-set result files.
# NOTE(review): the 'tpot_' prefix looks like a leftover from a TPOT notebook -
# confirm the intended filenames for this Auto-sklearn run.
all_feat_result_path = f'{result_path}tpot_default_params.csv'
gbsig_feat_result_path = f'{result_path}tpot_default_params_gbsig.csv'
seq_feat_result_path = f'{result_path}tpot_default_params_seq.csv'
str_feat_result_path = f'{result_path}tpot_default_params_str.csv'

# Load each feature list (first CSV column is the index), then stack the three
# lists - sequence, structure, graph-based - into the combined "all" list.
gbs_feat, seq_feat, str_feat = (
    pd.read_csv(list_path, index_col=0)
    for list_path in (gbsig_feat_list_path, seq_feat_list_path, str_feat_list_path)
)
all_feat = pd.concat([seq_feat, str_feat, gbs_feat], ignore_index=True)

In [8]:
# One (name, feature-list DataFrame, result-file path) tuple per feature subset,
# in the order the subsets are evaluated.
features_sets = list(zip(
    ('all_feat', 'seq_feat', 'str_feat', 'gbs_feat'),
    (all_feat, seq_feat, str_feat, gbs_feat),
    (all_feat_result_path, seq_feat_result_path, str_feat_result_path, gbsig_feat_result_path),
))

Auto-sklearn

In [9]:
# Search space, modelled on table 18 of the Auto-sklearn 2.0 paper (2020):
#   https://arxiv.org/pdf/2007.04074
# That table restricts the search to iterative models, with preprocessing used
# only to encode the data into a suitable format. It is defined for
# classification, so the regression counterparts are listed here instead.
search_space_dict = dict(
    regressor=[
        "extra_trees",
        "gradient_boosting",
        "mlp",
        # "passive_aggressive" is omitted: no PA in regression
        "random_forest",
        "sgd",
    ],
    feature_preprocessor=["no_preprocessing"],
)

In [10]:
# Build the Auto-sklearn regressor that the evaluation loop fits once per
# feature set. Arguments are grouped by concern; values are unchanged.
automl = autosklearn.regression.AutoSklearnRegressor(
    # --- budget (values come from the parameter cell at the top) ---
    time_left_for_this_task=time_left_for_this_task,  # need to be changed
    per_run_time_limit=per_run_time_limit,  # need to be changed
    memory_limit=memory_limit,
    n_jobs=-1,
    # --- search space ---
    include=search_space_dict,
    exclude=None,
    initial_configurations_via_metalearning=0,
    allow_string_features=False,
    # --- validation during the search ---
    resampling_strategy="cv",
    resampling_strategy_arguments={"folds": 5},  # should be ten
    # --- ensembling ---
    ensemble_size=3,  # use in the paper, always choosing the current best model
    # --- reproducibility / housekeeping ---
    seed=1,
    delete_tmp_folder_after_terminate=False,
)

In [12]:
# Evaluate Auto-sklearn on every feature subset.
#
# For each entry of `features_sets` this cell:
#   1. runs the Auto-sklearn search on the training data,
#   2. refits the selected ensemble on the full training set,
#   3. scores it on the three blind test sets (RMSE, R2, Pearson r),
#   4. estimates 10-fold CV scores by refitting the selected ensemble per fold,
#   5. appends one summary row to `results` and checkpoints the table to CSV.


def _score_predictions(y_true, y_pred):
    """Return (RMSE, R2, Pearson r) for one set of predictions.

    Both inputs are flattened to 1-D first, so single-column DataFrames and
    (n,) / (n, 1) arrays are all handled the same way.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    pearson_corr, _ = pearsonr(y_true, y_pred)
    return rmse, r2, pearson_corr


results = []

# Built once from the results *directory* defined in the paths cell. The loop
# below deliberately does not reuse the name `result_path` for the per-set
# file path, so the directory is not clobbered between iterations.
# (If an older version of this cell already ran in this kernel, re-run the
# paths cell first to restore `result_path`.)
results_df_autosklearn_save_path = result_path + 'Autosklearn_ddmut_results.csv'

for feat_name, feat_list, _feat_result_path in features_sets:
    print('running - feat_name: ', feat_name)

    # Column names belonging to this feature subset.
    feat_cols = feat_list.values.flatten()
    X_train_sel = X_train[feat_cols]
    X_test_1_sel = X_test_1[feat_cols]
    X_test_2_sel = X_test_2[feat_cols]
    X_test_3_sel = X_test_3[feat_cols]

    automl.fit(X_train_sel, y_train, dataset_name="ddmut")

    print(automl.leaderboard())

    # During fit(), models are fit on individual cross-validation folds.
    # refit() trains the selected models on the whole training set.
    automl.refit(X_train_sel.copy(), y_train)

    # Report the models found by Auto-sklearn (kept for the results table too).
    model_description = automl.show_models()
    pprint(model_description, indent=4)

    # Report statistics about the search
    print(automl.sprint_statistics())

    # Blind tests, using the ensemble refitted on the full training set.
    y_pred_test_1 = automl.predict(X_test_1_sel)
    rmse_1, r2_1, pearson_corr_1 = _score_predictions(y_test_1, y_pred_test_1)
    print('score for test 1 - RMSE = ', rmse_1, ' - R2 = ', r2_1, ' - Pearson = ', pearson_corr_1)

    y_pred_test_2 = automl.predict(X_test_2_sel)
    rmse_2, r2_2, pearson_corr_2 = _score_predictions(y_test_2, y_pred_test_2)
    print('score for test 2 - RMSE = ', rmse_2, ' - R2 = ', r2_2, ' - Pearson = ', pearson_corr_2)

    y_pred_test_3 = automl.predict(X_test_3_sel)
    rmse_3, r2_3, pearson_corr_3 = _score_predictions(y_test_3, y_pred_test_3)
    print('score for test 3 - RMSE = ', rmse_3, ' - R2 = ', r2_3, ' - Pearson = ', pearson_corr_3)

    # 10-fold CV of the selected ensemble.
    # NOTE: the model/hyperparameter selection above already saw all training
    # rows, so these CV scores describe the refitted ensemble only and are
    # optimistic with respect to the search itself.
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    rmse_scores = []
    r2_scores = []
    pearson_corrs = []

    for train_index, test_index in kfold.split(X_train_sel):
        # KFold yields positional indices, so select with .iloc; .loc would
        # look the numbers up as index labels read from the CSV files.
        X_train_fold, X_test_fold = X_train_sel.iloc[train_index], X_train_sel.iloc[test_index]
        y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]

        # Retrain the selected ensemble on the current fold only.
        automl.refit(X_train_fold.copy(), y_train_fold)
        y_pred_fold = automl.predict(X_test_fold)

        # Calculate and store RMSE, R2, and Pearson correlation
        rmse, r2, pearson_corr = _score_predictions(y_test_fold, y_pred_fold)
        rmse_scores.append(rmse)
        r2_scores.append(r2)
        pearson_corrs.append(pearson_corr)

    # The CV loop leaves `automl` trained on the last fold; restore the
    # full-training-set fit so later cells see the model that was tested above.
    automl.refit(X_train_sel.copy(), y_train)

    results.append({
        "Method": "Auto-sklearn",
        "Feat set": feat_name,
        "Model params": model_description,
        "CV Mean RMSE (10-fold)": round(np.mean(rmse_scores), 3),
        "CV Std RMSE (10-fold)": round(np.std(rmse_scores), 3),
        "CV Mean R2 (10-fold)": round(np.mean(r2_scores), 3),
        "CV Std R2 (10-fold)": round(np.std(r2_scores), 3),
        "CV Mean Pearson (10-fold)": round(np.mean(pearson_corrs), 3),
        "CV Std Pearson (10-fold)": round(np.std(pearson_corrs), 3),
        "Test 1 RMSE": round(rmse_1, 3),
        "Test 1 R2"  : round(r2_1, 3),
        "Test 1 Pearson" : round(pearson_corr_1, 3),
        "Test 2 RMSE": round(rmse_2, 3),
        "Test 2 R2"  : round(r2_2, 3),
        "Test 2 Pearson" : round(pearson_corr_2, 3),
        "Test 3 RMSE": round(rmse_3, 3),
        "Test 3 R2"  : round(r2_3, 3),
        "Test 3 Pearson" : round(pearson_corr_3, 3)
        })

    # Checkpoint after every feature set so a later failure keeps earlier rows.
    results_df = pd.DataFrame(results)
    results_df.to_csv(results_df_autosklearn_save_path)

running - feat_name:  all_feat
          rank  ensemble_weight  type  cost duration
model_id                                            
1            1              1.0  <NA>  <NA>     <NA>


RuntimeError: No model found. Try increasing 'time_left_for_this_task'.