### This notebook have code for feature selection

### Univariate feature selection

#### 1. Load libraries

In [1]:
# Settings imported from other notebook Settings.ipynb
%run Settings.ipynb

#### 1. Load the processed data

In [2]:
labels = joblib.load("E:/Machine Learning/Output Data/Response Variable (PTSS Epic).pkl")
full_data = joblib.load("E:/Machine Learning/Output Data/GRRN_phenodata (cross_long Epic).pkl")

In [3]:
full_data

{'full_df_cross':      cg11188799  cg19586199  cg03354771  cg16246200  cg12700033  cg15500320  \
 0        0.0125      0.9566      0.0198      0.9633      0.0407      0.0166   
 2        0.0154      0.9574      0.0228      0.9728      0.0330      0.0139   
 5        0.0128      0.9573      0.0279      0.9758      0.0445      0.0161   
 6        0.0150      0.9566      0.0291      0.9692      0.0315      0.0154   
 11       0.0110      0.9678      0.0221      0.9686      0.0450      0.0157   
 ..          ...         ...         ...         ...         ...         ...   
 440      0.0104      0.9616      0.0206      0.9790      0.0428      0.0151   
 441      0.0132      0.9632      0.0312      0.9769      0.0368      0.0125   
 442      0.0143      0.9532      0.0225      0.9737      0.0355      0.0154   
 444      0.0112      0.9658      0.0231      0.9785      0.0440      0.0171   
 447      0.0128      0.9575      0.0238      0.9786      0.0359      0.0141   
 
      cg13374701  cg1

#### 2. Functions to fit the model and select the top features

In [4]:
# we will try reducing the features
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import mutual_info_regression

class SelectFeatures():
    
    def __init__(self):
        
        """
        Function constructor
        
        """
        self.fit_lis = []
        self.df_imp_lis = []
        
        
    def CallFit(self, df, labels, top_fea):
        """
        Fit model for feature selection. 
        The model will fit the data to find the predictive 
        features for response variable. 
        
        Parameters:
        ----------
        df : List of data frames you want to select features from
        labels : List of response variable. 
        top_fea : Number of features you want to select
        
        """
        top_selected = SelectKBest(score_func = f_regression, k = top_fea)
        fit = top_selected.fit(df, labels)
        self.fit_lis.append(fit)
        
        
    
    def UnivFeatureSelection(self, df, index):
        """
        This function will call the fitted models to select the features
        
        Parameters:
        ----------
        df : List of data frames you want to select features from
        index : Index of the model fitted on the data
        
        """
        cols = self.fit_lis[index].get_support(indices=True)
        df_impt_uvs = df.iloc[:,cols]
        self.df_imp_lis.append(df_impt_uvs)
        return self.df_imp_lis


  return f(*args, **kwds)


#### 3. Call the functions for cross-sectional and longitudinal data and select the features. We will save it to use for machine learning. We will call some functions which are in other notebook.

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn import preprocessing

# First we will import some functions from other notebook Epic_ML_3.ipynb
import nbimporter
import Epic_ML_3


Importing Jupyter notebook from Epic_ML_3.ipynb


  return f(*args, **kwds)


In [6]:
# Function to evaluate the model. It was in other notebook but "np" gives some issue
# when calling from other notebook. So I have copied that function 
# here to make it locally available

from sklearn import metrics
def evaluate_model(data_scaled, preds, df_indx, df_types, 
                   model_name, store ):
    for i in range(len(data_scaled)):
        print("\nModel : ........ ", model_name)
        print("\n", df_types[i], ": ")
        abe = metrics.mean_absolute_error(data_scaled[i][df_indx], preds[i])
        mse = metrics.mean_squared_error(data_scaled[i][df_indx], preds[i])
        rmse = np.sqrt(metrics.mean_squared_error(data_scaled[i][df_indx], preds[i]))
        r2 = metrics.r2_score(data_scaled[i][df_indx], preds[i])
        print('Mean Absolute Error:', abe)
        print('Mean Squared Error:', mse)
        print('Root Mean Squared Error:', rmse) 
        print("R2 on data:", r2)
        key = model_name+ '_' +df_types[i] 
        store[key] = [abe, mse, rmse, r2]

    return store


# Function to predict and evaluate the model
def PredictEvaluate(trained_models_base, df_types, model_names, 
                   data_scaled):
    base_pred = [{},{}]
    base_m_score = [{},{}]
    for i in range(2): # for train and test data
        if i == 0:
            print("Fitting training data -------")
            indx = 0
        else:
            print("Fitting test data ===========")
            indx = 2
        base_pred[i] = Epic_ML_3.make_predictions(data_scaled=data_scaled, 
                                        model_names=model_names, 
                                        df_types=df_types, indx=indx,
                                        trained_models=trained_models_base)


        # Get scores for each model on each data set
        for j in range(len(model_names)):
            print("Model :",model_names[j])
            predictions = [value for key,value in base_pred[i].items() if key.startswith(model_names[j])]
            base_m_score[i] = evaluate_model(data_scaled=data_scaled, 
                                                             preds=predictions, df_indx=indx+1, 
                                                             df_types=df_types, 
                                                             model_name = model_names[j],
                                                             store=base_m_score[i]
                                                            )
    return base_m_score

In [11]:
top = np.arange(10,600, 20)
top

array([ 10,  30,  50,  70,  90, 110, 130, 150, 170, 190, 210, 230, 250,
       270, 290, 310, 330, 350, 370, 390, 410, 430, 450, 470, 490, 510,
       530, 550, 570, 590])

In [12]:

# call training function
model_names = ['Random Forest', "Adaboost", "Gradient Boost"]
df_types = ["full_cros", "full_long", "imp_cros", "imp_long"]

# Set different set of features to find optimal set
# top = np.arange(50,600, 50) 
top = np.arange(10,600, 20)

# Base models scores for training and testing
final_score = [{} for i in range(len(top))]

# Now get each set of important features do the following steps
# 1. Split data into train and test
# 2. Perform scaling 
# 3. Train models
# 4. Predict and evaluate
for k in range(len(top)):
    FS = SelectFeatures()
    for index, key in enumerate(full_data):
        print("Patience, selecting features :", top[k])
        FS.CallFit(df = full_data[key],labels = labels[index], top_fea=top[k])
        important_features = FS.UnivFeatureSelection(df = full_data[key], index = index)
     
    TT = Epic_ML_3.TrainTest()
    
    # save the data, full data and import features together in a list
    dfs_final = [full_data['full_df_cross'], full_data['full_df_long'], 
                 important_features[0], important_features[1]]
    
    df_types1 = ["full_cros", "full_long"]
    df_types2 = ["imp_cros", "imp_long"]
    df_types = df_types1 + [s + str(top[k]) for s in df_types2] # add number to important features
    
    for i in range(len(dfs_final)):
        print(i)
        if i == 0 or i == 2:
            j = 0 # labels
        else:
            j = 1
            
        train_features, test_features, train_labels, test_labels = train_test_split(dfs_final[i],
                                                                                    labels[j], 
                                                                                    test_size = 0.25, 
                                                                                    random_state = 42)
        
        # Scale data
        data, data_scaled = TT.ScaleData(train_features, test_features, train_labels, test_labels)
        
    
    # Train models
    trained_models_base = Epic_ML_3.train_models(data_scaled=data_scaled, 
                                model_names=model_names, 
                                df_types=df_types,
                                indx1=0, indx2=1)
    
    # Predict and get the scores
    key = str(top[k]) + " Features"
    final_score[k][key] = PredictEvaluate(trained_models_base, df_types, model_names, 
                   data_scaled)
    

Patience, selecting features : 10
Patience, selecting features : 10
0
1
2
3
Training ......  Random Forest on full_cros
Wall time: 12.5 s
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)
Training ......  Adaboost on full_cros
Wall time: 4.63 s
AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                  n_estimators=50, random_state=42)
Training ......  Gradient Boost on full_cros
Wall time: 10.7 s
GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learnin

Model : Random Forest

Model : ........  Random Forest

 full_cros : 
Mean Absolute Error: 0.17789519378405577
Mean Squared Error: 0.07151825492989036
Root Mean Squared Error: 0.26742897174743496
R2 on data: 0.9284817450701096

Model : ........  Random Forest

 full_long : 
Mean Absolute Error: 0.2638132142461889
Mean Squared Error: 0.124774981485575
Root Mean Squared Error: 0.3532350230166525
R2 on data: 0.8752250185144249

Model : ........  Random Forest

 imp_cros10 : 
Mean Absolute Error: 0.13613600712602383
Mean Squared Error: 0.0549395971314988
Root Mean Squared Error: 0.23439197326593503
R2 on data: 0.9450604028685012

Model : ........  Random Forest

 imp_long10 : 
Mean Absolute Error: 0.19649672893160727
Mean Squared Error: 0.10205497163818979
Root Mean Squared Error: 0.31946043829900095
R2 on data: 0.8979450283618102
Model : Adaboost

Model : ........  Adaboost

 full_cros : 
Mean Absolute Error: 0.2128397205474921
Mean Squared Error: 0.08712205679565097
Root Mean Squared Err

Model : Random Forest

Model : ........  Random Forest

 full_cros : 
Mean Absolute Error: 0.09287387796945532
Mean Squared Error: 0.018924610666195562
Root Mean Squared Error: 0.13756674985691697
R2 on data: 0.9810753893338044

Model : ........  Random Forest

 full_long : 
Mean Absolute Error: 0.10747500039799918
Mean Squared Error: 0.02186663263078799
Root Mean Squared Error: 0.14787370500122052
R2 on data: 0.978133367369212

Model : ........  Random Forest

 imp_cros30 : 
Mean Absolute Error: 0.06721207977240172
Mean Squared Error: 0.01283318246957747
Root Mean Squared Error: 0.11328363725436022
R2 on data: 0.9871668175304226

Model : ........  Random Forest

 imp_long30 : 
Mean Absolute Error: 0.08187518293515665
Mean Squared Error: 0.01737587147644586
Root Mean Squared Error: 0.13181756892177104
R2 on data: 0.9826241285235542
Model : Adaboost

Model : ........  Adaboost

 full_cros : 
Mean Absolute Error: 0.14130647737637705
Mean Squared Error: 0.028575112966433443
Root Mean Squa

Wall time: 149 ms
GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
Training ......  Random Forest on imp_long50
Wall time: 277 ms
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_spl

Wall time: 15.9 s
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)
Training ......  Adaboost on full_long
Wall time: 6.35 s
AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                  n_estimators=50, random_state=42)
Training ......  Gradient Boost on full_long
Wall time: 14.2 s
GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                  

2
3
Training ......  Random Forest on full_cros
Wall time: 12.2 s
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)
Training ......  Adaboost on full_cros
Wall time: 4.59 s
AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                  n_estimators=50, random_state=42)
Training ......  Gradient Boost on full_cros
Wall time: 10.8 s
GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_featur

Model : Random Forest

Model : ........  Random Forest

 full_cros : 
Mean Absolute Error: 0.17789519378405577
Mean Squared Error: 0.07151825492989036
Root Mean Squared Error: 0.26742897174743496
R2 on data: 0.9284817450701096

Model : ........  Random Forest

 full_long : 
Mean Absolute Error: 0.2638132142461889
Mean Squared Error: 0.124774981485575
Root Mean Squared Error: 0.3532350230166525
R2 on data: 0.8752250185144249

Model : ........  Random Forest

 imp_cros90 : 
Mean Absolute Error: 0.1372767710016643
Mean Squared Error: 0.048613744593015186
Root Mean Squared Error: 0.22048524801676683
R2 on data: 0.9513862554069848

Model : ........  Random Forest

 imp_long90 : 
Mean Absolute Error: 0.21366756788182917
Mean Squared Error: 0.10769732978575317
Root Mean Squared Error: 0.3281727133473366
R2 on data: 0.8923026702142468
Model : Adaboost

Model : ........  Adaboost

 full_cros : 
Mean Absolute Error: 0.2128397205474921
Mean Squared Error: 0.08712205679565097
Root Mean Squared Err

Model : Random Forest

Model : ........  Random Forest

 full_cros : 
Mean Absolute Error: 0.09287387796945532
Mean Squared Error: 0.018924610666195562
Root Mean Squared Error: 0.13756674985691697
R2 on data: 0.9810753893338044

Model : ........  Random Forest

 full_long : 
Mean Absolute Error: 0.10747500039799918
Mean Squared Error: 0.02186663263078799
Root Mean Squared Error: 0.14787370500122052
R2 on data: 0.978133367369212

Model : ........  Random Forest

 imp_cros110 : 
Mean Absolute Error: 0.08249022003151052
Mean Squared Error: 0.016621453288362474
Root Mean Squared Error: 0.12892421529085402
R2 on data: 0.9833785467116375

Model : ........  Random Forest

 imp_long110 : 
Mean Absolute Error: 0.08919020637000334
Mean Squared Error: 0.01777023516550304
Root Mean Squared Error: 0.13330504553655514
R2 on data: 0.982229764834497
Model : Adaboost

Model : ........  Adaboost

 full_cros : 
Mean Absolute Error: 0.14130647737637705
Mean Squared Error: 0.028575112966433443
Root Mean Sq

Wall time: 282 ms
AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                  n_estimators=50, random_state=42)
Training ......  Gradient Boost on imp_cros130
Wall time: 381 ms
GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
Training ......  Random Forest on imp_long130
Wall time: 551 ms
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='

Wall time: 10.6 s
GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
Training ......  Random Forest on full_long
Wall time: 16.5 s
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_spli

3
Training ......  Random Forest on full_cros
Wall time: 12.5 s
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)
Training ......  Adaboost on full_cros
Wall time: 4.74 s
AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                  n_estimators=50, random_state=42)
Training ......  Gradient Boost on full_cros
Wall time: 11 s
GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=N

Model : Random Forest

Model : ........  Random Forest

 full_cros : 
Mean Absolute Error: 0.17789519378405577
Mean Squared Error: 0.07151825492989036
Root Mean Squared Error: 0.26742897174743496
R2 on data: 0.9284817450701096

Model : ........  Random Forest

 full_long : 
Mean Absolute Error: 0.2638132142461889
Mean Squared Error: 0.124774981485575
Root Mean Squared Error: 0.3532350230166525
R2 on data: 0.8752250185144249

Model : ........  Random Forest

 imp_cros170 : 
Mean Absolute Error: 0.14247687124978958
Mean Squared Error: 0.052448679879063875
Root Mean Squared Error: 0.22901676768102347
R2 on data: 0.9475513201209361

Model : ........  Random Forest

 imp_long170 : 
Mean Absolute Error: 0.24271840294882333
Mean Squared Error: 0.11844695922684711
Root Mean Squared Error: 0.34416124015764343
R2 on data: 0.8815530407731529
Model : Adaboost

Model : ........  Adaboost

 full_cros : 
Mean Absolute Error: 0.2128397205474921
Mean Squared Error: 0.08712205679565097
Root Mean Squared

Model : Random Forest

Model : ........  Random Forest

 full_cros : 
Mean Absolute Error: 0.09287387796945532
Mean Squared Error: 0.018924610666195562
Root Mean Squared Error: 0.13756674985691697
R2 on data: 0.9810753893338044

Model : ........  Random Forest

 full_long : 
Mean Absolute Error: 0.10747500039799918
Mean Squared Error: 0.02186663263078799
Root Mean Squared Error: 0.14787370500122052
R2 on data: 0.978133367369212

Model : ........  Random Forest

 imp_cros190 : 
Mean Absolute Error: 0.08169990394128257
Mean Squared Error: 0.016903729167132245
Root Mean Squared Error: 0.13001434215936428
R2 on data: 0.9830962708328678

Model : ........  Random Forest

 imp_long190 : 
Mean Absolute Error: 0.09483543609561074
Mean Squared Error: 0.01801218779978835
Root Mean Squared Error: 0.13420949221194584
R2 on data: 0.9819878122002117
Model : Adaboost

Model : ........  Adaboost

 full_cros : 
Mean Absolute Error: 0.14130647737637705
Mean Squared Error: 0.028575112966433443
Root Mean S

Wall time: 400 ms
AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                  n_estimators=50, random_state=42)
Training ......  Gradient Boost on imp_cros210
Wall time: 702 ms
GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
Training ......  Random Forest on imp_long210
Wall time: 707 ms
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='

Wall time: 11 s
GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
Training ......  Random Forest on full_long
Wall time: 16 s
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=No

2
3
Training ......  Random Forest on full_cros
Wall time: 12.5 s
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)
Training ......  Adaboost on full_cros
Wall time: 4.51 s
AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                  n_estimators=50, random_state=42)
Training ......  Gradient Boost on full_cros
Wall time: 10.3 s
GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_featur

Model : Random Forest

Model : ........  Random Forest

 full_cros : 
Mean Absolute Error: 0.17789519378405577
Mean Squared Error: 0.07151825492989036
Root Mean Squared Error: 0.26742897174743496
R2 on data: 0.9284817450701096

Model : ........  Random Forest

 full_long : 
Mean Absolute Error: 0.2638132142461889
Mean Squared Error: 0.124774981485575
Root Mean Squared Error: 0.3532350230166525
R2 on data: 0.8752250185144249

Model : ........  Random Forest

 imp_cros250 : 
Mean Absolute Error: 0.1538006764116712
Mean Squared Error: 0.056332989408402215
Root Mean Squared Error: 0.23734571706353205
R2 on data: 0.9436670105915977

Model : ........  Random Forest

 imp_long250 : 
Mean Absolute Error: 0.24787276635301922
Mean Squared Error: 0.12029734690782162
Root Mean Squared Error: 0.34683907926850116
R2 on data: 0.8797026530921783
Model : Adaboost

Model : ........  Adaboost

 full_cros : 
Mean Absolute Error: 0.2128397205474921
Mean Squared Error: 0.08712205679565097
Root Mean Squared 

Model : Random Forest

Model : ........  Random Forest

 full_cros : 
Mean Absolute Error: 0.09287387796945532
Mean Squared Error: 0.018924610666195562
Root Mean Squared Error: 0.13756674985691697
R2 on data: 0.9810753893338044

Model : ........  Random Forest

 full_long : 
Mean Absolute Error: 0.10747500039799918
Mean Squared Error: 0.02186663263078799
Root Mean Squared Error: 0.14787370500122052
R2 on data: 0.978133367369212

Model : ........  Random Forest

 imp_cros270 : 
Mean Absolute Error: 0.07952849092110194
Mean Squared Error: 0.01655571725008439
Root Mean Squared Error: 0.12866902210743808
R2 on data: 0.9834442827499156

Model : ........  Random Forest

 imp_long270 : 
Mean Absolute Error: 0.0961734827102994
Mean Squared Error: 0.018653994640266264
Root Mean Squared Error: 0.13657962747154592
R2 on data: 0.9813460053597337
Model : Adaboost

Model : ........  Adaboost

 full_cros : 
Mean Absolute Error: 0.14130647737637705
Mean Squared Error: 0.028575112966433443
Root Mean Sq

Wall time: 484 ms
AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                  n_estimators=50, random_state=42)
Training ......  Gradient Boost on imp_cros290
Wall time: 960 ms
GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
Training ......  Random Forest on imp_long290
Wall time: 924 ms
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='

Wall time: 11 s
GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
Training ......  Random Forest on full_long
Wall time: 16.1 s
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=

2
3
Training ......  Random Forest on full_cros
Wall time: 12.6 s
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)
Training ......  Adaboost on full_cros
Wall time: 5.03 s
AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                  n_estimators=50, random_state=42)
Training ......  Gradient Boost on full_cros
Wall time: 11 s
GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features

Model : Random Forest

Model : ........  Random Forest

 full_cros : 
Mean Absolute Error: 0.17789519378405577
Mean Squared Error: 0.07151825492989036
Root Mean Squared Error: 0.26742897174743496
R2 on data: 0.9284817450701096

Model : ........  Random Forest

 full_long : 
Mean Absolute Error: 0.2638132142461889
Mean Squared Error: 0.124774981485575
Root Mean Squared Error: 0.3532350230166525
R2 on data: 0.8752250185144249

Model : ........  Random Forest

 imp_cros330 : 
Mean Absolute Error: 0.15130113187283717
Mean Squared Error: 0.05503045785650789
Root Mean Squared Error: 0.2345857153718186
R2 on data: 0.944969542143492

Model : ........  Random Forest

 imp_long330 : 
Mean Absolute Error: 0.24917084894286562
Mean Squared Error: 0.12034486166196824
Root Mean Squared Error: 0.34690756933507266
R2 on data: 0.8796551383380318
Model : Adaboost

Model : ........  Adaboost

 full_cros : 
Mean Absolute Error: 0.2128397205474921
Mean Squared Error: 0.08712205679565097
Root Mean Squared Er

Model : Random Forest

Model : ........  Random Forest

 full_cros : 
Mean Absolute Error: 0.09287387796945532
Mean Squared Error: 0.018924610666195562
Root Mean Squared Error: 0.13756674985691697
R2 on data: 0.9810753893338044

Model : ........  Random Forest

 full_long : 
Mean Absolute Error: 0.10747500039799918
Mean Squared Error: 0.02186663263078799
Root Mean Squared Error: 0.14787370500122052
R2 on data: 0.978133367369212

Model : ........  Random Forest

 imp_cros350 : 
Mean Absolute Error: 0.08420778816819396
Mean Squared Error: 0.01748824470451223
Root Mean Squared Error: 0.13224312724868628
R2 on data: 0.9825117552954877

Model : ........  Random Forest

 imp_long350 : 
Mean Absolute Error: 0.09390654423086843
Mean Squared Error: 0.01868991035927828
Root Mean Squared Error: 0.13671104695407127
R2 on data: 0.9813100896407217
Model : Adaboost

Model : ........  Adaboost

 full_cros : 
Mean Absolute Error: 0.14130647737637705
Mean Squared Error: 0.028575112966433443
Root Mean Sq

Wall time: 586 ms
AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                  n_estimators=50, random_state=42)
Training ......  Gradient Boost on imp_cros370
Wall time: 1.29 s
GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
Training ......  Random Forest on imp_long370
Wall time: 1.15 s
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='

Wall time: 10.9 s
GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
Training ......  Random Forest on full_long
Wall time: 15.8 s
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_spli

3
Training ......  Random Forest on full_cros
Wall time: 12.4 s
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)
Training ......  Adaboost on full_cros
Wall time: 4.67 s
AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                  n_estimators=50, random_state=42)
Training ......  Gradient Boost on full_cros
Wall time: 10.4 s
GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features

Model : Random Forest

Model : ........  Random Forest

 full_cros : 
Mean Absolute Error: 0.17789519378405577
Mean Squared Error: 0.07151825492989036
Root Mean Squared Error: 0.26742897174743496
R2 on data: 0.9284817450701096

Model : ........  Random Forest

 full_long : 
Mean Absolute Error: 0.2638132142461889
Mean Squared Error: 0.124774981485575
Root Mean Squared Error: 0.3532350230166525
R2 on data: 0.8752250185144249

Model : ........  Random Forest

 imp_cros410 : 
Mean Absolute Error: 0.1609345091347819
Mean Squared Error: 0.06051813697575657
Root Mean Squared Error: 0.2460043434083158
R2 on data: 0.9394818630242434

Model : ........  Random Forest

 imp_long410 : 
Mean Absolute Error: 0.24986617572753866
Mean Squared Error: 0.11799354912984908
Root Mean Squared Error: 0.34350189101349804
R2 on data: 0.8820064508701508
Model : Adaboost

Model : ........  Adaboost

 full_cros : 
Mean Absolute Error: 0.2128397205474921
Mean Squared Error: 0.08712205679565097
Root Mean Squared Er

Model : Random Forest

Model : ........  Random Forest

 full_cros : 
Mean Absolute Error: 0.09287387796945532
Mean Squared Error: 0.018924610666195562
Root Mean Squared Error: 0.13756674985691697
R2 on data: 0.9810753893338044

Model : ........  Random Forest

 full_long : 
Mean Absolute Error: 0.10747500039799918
Mean Squared Error: 0.02186663263078799
Root Mean Squared Error: 0.14787370500122052
R2 on data: 0.978133367369212

Model : ........  Random Forest

 imp_cros430 : 
Mean Absolute Error: 0.08441514833048143
Mean Squared Error: 0.017436298502363265
Root Mean Squared Error: 0.13204657701872952
R2 on data: 0.9825637014976367

Model : ........  Random Forest

 imp_long430 : 
Mean Absolute Error: 0.09584173561574853
Mean Squared Error: 0.01931223231138062
Root Mean Squared Error: 0.1389684579729538
R2 on data: 0.9806877676886194
Model : Adaboost

Model : ........  Adaboost

 full_cros : 
Mean Absolute Error: 0.14130647737637705
Mean Squared Error: 0.028575112966433443
Root Mean Sq

Wall time: 702 ms
AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                  n_estimators=50, random_state=42)
Training ......  Gradient Boost on imp_cros450
Wall time: 1.56 s
GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
Training ......  Random Forest on imp_long450
Wall time: 1.36 s
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='

Wall time: 11 s
GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
Training ......  Random Forest on full_long
Wall time: 15.5 s
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=

2
3
Training ......  Random Forest on full_cros
Wall time: 12.3 s
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)
Training ......  Adaboost on full_cros
Wall time: 4.94 s
AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                  n_estimators=50, random_state=42)
Training ......  Gradient Boost on full_cros
Wall time: 10.3 s
GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_featur

Model : Random Forest

Model : ........  Random Forest

 full_cros : 
Mean Absolute Error: 0.17789519378405577
Mean Squared Error: 0.07151825492989036
Root Mean Squared Error: 0.26742897174743496
R2 on data: 0.9284817450701096

Model : ........  Random Forest

 full_long : 
Mean Absolute Error: 0.2638132142461889
Mean Squared Error: 0.124774981485575
Root Mean Squared Error: 0.3532350230166525
R2 on data: 0.8752250185144249

Model : ........  Random Forest

 imp_cros490 : 
Mean Absolute Error: 0.16038455799413298
Mean Squared Error: 0.06233646418982045
Root Mean Squared Error: 0.24967271414758252
R2 on data: 0.9376635358101796

Model : ........  Random Forest

 imp_long490 : 
Mean Absolute Error: 0.24825720231896686
Mean Squared Error: 0.12265880695989155
Root Mean Squared Error: 0.3502267936065023
R2 on data: 0.8773411930401084
Model : Adaboost

Model : ........  Adaboost

 full_cros : 
Mean Absolute Error: 0.2128397205474921
Mean Squared Error: 0.08712205679565097
Root Mean Squared E

Model : Random Forest

Model : ........  Random Forest

 full_cros : 
Mean Absolute Error: 0.09287387796945532
Mean Squared Error: 0.018924610666195562
Root Mean Squared Error: 0.13756674985691697
R2 on data: 0.9810753893338044

Model : ........  Random Forest

 full_long : 
Mean Absolute Error: 0.10747500039799918
Mean Squared Error: 0.02186663263078799
Root Mean Squared Error: 0.14787370500122052
R2 on data: 0.978133367369212

Model : ........  Random Forest

 imp_cros510 : 
Mean Absolute Error: 0.08616792857019483
Mean Squared Error: 0.018068977529785705
Root Mean Squared Error: 0.13442089692375106
R2 on data: 0.9819310224702142

Model : ........  Random Forest

 imp_long510 : 
Mean Absolute Error: 0.09836854265257766
Mean Squared Error: 0.019754945127845515
Root Mean Squared Error: 0.1405522860996772
R2 on data: 0.9802450548721545
Model : Adaboost

Model : ........  Adaboost

 full_cros : 
Mean Absolute Error: 0.14130647737637705
Mean Squared Error: 0.028575112966433443
Root Mean S

Wall time: 923 ms
AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                  n_estimators=50, random_state=42)
Training ......  Gradient Boost on imp_cros530
Wall time: 2.1 s
GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
Training ......  Random Forest on imp_long530
Wall time: 1.77 s
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='m

Wall time: 10.2 s
GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)
Training ......  Random Forest on full_long
Wall time: 15.9 s
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_spli

1
2
3
Training ......  Random Forest on full_cros
Wall time: 12.6 s
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)
Training ......  Adaboost on full_cros
Wall time: 4.78 s
AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                  n_estimators=50, random_state=42)
Training ......  Gradient Boost on full_cros
Wall time: 10.4 s
GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_feat

Model : Random Forest

Model : ........  Random Forest

 full_cros : 
Mean Absolute Error: 0.17789519378405577
Mean Squared Error: 0.07151825492989036
Root Mean Squared Error: 0.26742897174743496
R2 on data: 0.9284817450701096

Model : ........  Random Forest

 full_long : 
Mean Absolute Error: 0.2638132142461889
Mean Squared Error: 0.124774981485575
Root Mean Squared Error: 0.3532350230166525
R2 on data: 0.8752250185144249

Model : ........  Random Forest

 imp_cros570 : 
Mean Absolute Error: 0.15984773153959428
Mean Squared Error: 0.05949805186127791
Root Mean Squared Error: 0.2439222250252689
R2 on data: 0.9405019481387221

Model : ........  Random Forest

 imp_long570 : 
Mean Absolute Error: 0.24941542533765307
Mean Squared Error: 0.11824668766284148
Root Mean Squared Error: 0.3438701610533276
R2 on data: 0.8817533123371585
Model : Adaboost

Model : ........  Adaboost

 full_cros : 
Mean Absolute Error: 0.2128397205474921
Mean Squared Error: 0.08712205679565097
Root Mean Squared Er

Model : Random Forest

Model : ........  Random Forest

 full_cros : 
Mean Absolute Error: 0.09287387796945532
Mean Squared Error: 0.018924610666195562
Root Mean Squared Error: 0.13756674985691697
R2 on data: 0.9810753893338044

Model : ........  Random Forest

 full_long : 
Mean Absolute Error: 0.10747500039799918
Mean Squared Error: 0.02186663263078799
Root Mean Squared Error: 0.14787370500122052
R2 on data: 0.978133367369212

Model : ........  Random Forest

 imp_cros590 : 
Mean Absolute Error: 0.0866139485419076
Mean Squared Error: 0.018105165635821277
Root Mean Squared Error: 0.13455543703552553
R2 on data: 0.9818948343641787

Model : ........  Random Forest

 imp_long590 : 
Mean Absolute Error: 0.10050278229418828
Mean Squared Error: 0.020545959655866718
Root Mean Squared Error: 0.14333861885712001
R2 on data: 0.9794540403441333
Model : Adaboost

Model : ........  Adaboost

 full_cros : 
Mean Absolute Error: 0.14130647737637705
Mean Squared Error: 0.028575112966433443
Root Mean S

In [13]:
#save scores on different set of features
joblib.dump(final_score, "E:/Machine Learning/Output Data/Epic Scores on different set of features.pkl")

['E:/Machine Learning/Output Data/Epic Scores on different set of features.pkl']

#### 4. Now get the scores and calcuate average of the model for error and r2 and compare it with the full data to find out which feature set is optimal.

In [14]:
cols = ['Mean Absolute Error', 'Mean Square Error',
    "Root Mean Sq Error", "R Squared"]

def CreatDF(data, cols):
    """
    Funtion to create a dataframe out of various accuracy measures
    Parameters:
    data: A dictionary
    cols: Names for the columns of dataframe
    """
    score = pd.DataFrame(data, index=[cols]).T
    score.sort_index(inplace = True)
    return score


def GetAverage(score, patern, colname):
    """
    Get average of the accuracy measures
    Parameters: 
    score: dataframe of scores
    patern: A string mentioning the pattern of the score you want to pull out
    colname: Name for the column
    """
    imp = score.iloc[score.index.str.contains(patern),:]
    mean_score = imp[[colname]].mean()
    return mean_score
    

In [15]:
#store
test_rmse = {}
test_mabe = {}

# get all keys
keys = [z for y in (x.keys() for x in final_score) for z in y]

# Now get the data for each feature set, convert to data frame
# And calculate average score from all three models
for i in range(len(final_score)):
    key = keys[i]
    df = final_score[i][key][1]
    df_score = CreatDF(data=df, cols=cols)
    mean_mabe = GetAverage(score = df_score, patern='_imp_', 
                         colname='Mean Absolute Error')
    mean_rmse = GetAverage(score = df_score, patern='_imp_', 
                         colname='Root Mean Sq Error')
    
    test_mabe[key] = mean_mabe
    test_rmse[key] = mean_rmse
    

In [82]:
mabe = pd.DataFrame(test_mabe).T
rmse = pd.DataFrame(test_rmse).T
final = pd.merge(mabe, rmse, left_index=True, right_index=True, how='inner')

In [83]:
final.reset_index(inplace= True)
final.rename(columns = {'index': 'Features'}, inplace = True)
final.replace(regex=['Features'], value='', inplace=True)
print(final)
final.to_csv("E:/Machine Learning/Output Data/Error Rate on different feature sets.csv")

   Features Mean Absolute Error Root Mean Sq Error
0       10               0.2080             0.3032
1       30               0.2021             0.2896
2       50               0.1952             0.2745
3       70               0.2037             0.2902
4       90               0.2004             0.2910
5      110               0.2015             0.2947
6      130               0.2084             0.2994
7      150               0.1939             0.2850
8      170               0.2106             0.3035
9      190               0.2098             0.3033
10     210               0.2145             0.3070
11     230               0.2060             0.3032
12     250               0.2188             0.3148
13     270               0.2182             0.3088
14     290               0.2146             0.3038
15     310               0.2199             0.3105
16     330               0.2189             0.3107
17     350               0.2241             0.3135
18     370               0.2225

In [18]:
# Now when we have find out the best set of features, 
# we will save that set
FS = SelectFeatures()
for index, key in enumerate(full_data):
    FS.CallFit(df = full_data[key],labels = labels[index], top_fea=150)
    important_features = FS.UnivFeatureSelection(df = full_data[key], index = index)
    
# save the data, full data and import features together in a list
dfs_final = [full_data['full_df_cross'], full_data['full_df_long'], 
             important_features[0], important_features[1]]


    

In [20]:
[s.shape for s in dfs_final]

[(210, 2728), (148, 5356), (210, 150), (148, 150)]

In [21]:
joblib.dump(dfs_final, "E:/Machine Learning/Output Data/Epic Final set of features (n=150).pkl")

['E:/Machine Learning/Output Data/Epic Final set of features (n=150).pkl']

In [88]:
# Number of cpgs in important cross-sectional
dfs_final[2].loc[:, dfs_final[2].columns.str.contains('^cg')].shape

(210, 79)