### This notebook has code to implement various machine learning approaches for Epic Methylation data on GRRN genes. It has 4 datasets, cross-sectional and longitudinal full and important features

#### 1. Load libraries

In [1]:
# Settings imported from other notebook Settings.ipynb
%run Settings.ipynb

#### 2. Load the data, data and the labels (outcome variable) 

In [2]:
dfs_final = joblib.load("E:/Machine Learning/Output Data/Epic Final set of features (n=150).pkl")
labels = joblib.load("E:/Machine Learning/Output Data/Response Variable (PTSS Epic).pkl")

In [3]:
print("Shape of different datasets :")
[dfs_final[i].shape for i in range(len(dfs_final))]


Shape of different datasets :


[(210, 2728), (148, 5356), (210, 150), (148, 150)]

#### 3. Split the data into training and testing and transform

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn import preprocessing

class TrainTest:
    def __init__(self):
        self.data = []
        self.data_scaled = []
        self.scaler_trn_f = preprocessing.StandardScaler()
        self.scaler_tst_f = preprocessing.StandardScaler()
        self.scaler_trn_l = preprocessing.StandardScaler()
        self.scaler_tst_l = preprocessing.StandardScaler()

    def ScaleData(self, train_f, test_f, train_l, test_l):
        """
        Function to scale the data
        
        Parameters:
        -----------
        train_f : Training features
        test_f : Test features
        train_l : Training labels
        test_l : Test labels
        
        """
        
        self.scaler_trn_f.fit(train_f)
        self.scaler_tst_f.fit(test_f)
        self.scaler_trn_l.fit(train_l.reshape(-1,1))
        self.scaler_tst_l.fit(test_l.reshape(-1,1))
        
        scal_trn_f = self.scaler_trn_f.transform(train_f)
        scal_tst_f = self.scaler_tst_f.transform(test_f)
        scal_trn_l = self.scaler_trn_l.transform(train_l.reshape(-1,1))
        scal_tst_l = self.scaler_tst_l.transform(test_l.reshape(-1,1))

        self.data.append([train_f, train_l, test_f, test_l])
        self.data_scaled.append([scal_trn_f, scal_trn_l, scal_tst_f, scal_tst_l])
        return [self.data, self.data_scaled]
    
    

  return f(*args, **kwds)


In [5]:
# The following code splits the data sets with another single line:
# split into traning and testing
# Using Skicit-learn to split data into training and testing sets
# labels are for 3 prediction types

# Initialize constructor
Sdata = TrainTest()

# split into train and test and store
for i in range(len(dfs_final)):
    print(i)
    if i == 0 or i == 2:
        j = 0 # labels
    else:
        j = 1
    train_features, test_features, train_labels, test_labels = train_test_split(dfs_final[i],
                                                                                labels[j], 
                                                                                test_size = 0.25, 
                                                                                random_state = 42)
    
    data, data_scaled = Sdata.ScaleData(train_features, test_features, train_labels, test_labels)


0
1
2
3


In [6]:
# We can look at the shape of all the data to make sure we did everything correctly. 
# We expect the training features number of columns to match the testing feature 
# number of columns and the number of rows to match for the respective training 
# and testing features and the labels :

def CheckShape(df, name):
        """
        Function to check the shape of the data
        
        Parameters:
        ----------
        df: Data frame list
        name: Name of the data to print message
        
        """
        
        print(name + "..............")
        for i in range(len(df)):
            for j in range(len(df[i])):
                print(df[i][j].shape)

    
CheckShape(df = data, name="Without scaling")
CheckShape(df= data_scaled, name="With scaling")

Without scaling..............
(157, 2728)
(157,)
(53, 2728)
(53,)
(111, 5356)
(111,)
(37, 5356)
(37,)
(157, 150)
(157,)
(53, 150)
(53,)
(111, 150)
(111,)
(37, 150)
(37,)
With scaling..............
(157, 2728)
(157, 1)
(53, 2728)
(53, 1)
(111, 5356)
(111, 1)
(37, 5356)
(37, 1)
(157, 150)
(157, 1)
(53, 150)
(53, 1)
(111, 150)
(111, 1)
(37, 150)
(37, 1)


In [9]:
# Save scaled data
import joblib
joblib.dump(data_scaled, "E:/Machine Learning/Output Data/Final/Epic_ML_Scaled_Data.pkl")
joblib.dump(data, "E:/Machine Learning/Output Data/Final/Epic_ML_without_scaling_Data.pkl")

['E:/Machine Learning/Output Data/Final/Epic_ML_without_scaling_Data.pkl']

#### 4.  Training the models

In [8]:
# Train Model
# After all the work of data preparation, creating and training the model is pretty simple 
# using Scikit-learn. We import the random forest regression model from skicit-learn, 
# instantiate the model, and fit (scikit-learn’s name for training) the model on the training data. 
# (Again setting the random state for reproducible results). 
# Import the model we are using


from sklearn import preprocessing

def train_models(data_scaled, model_names, df_types, 
                 indx1, indx2):
    
    """
     Function to train the models(random forest, adaboost, gradient boost)
     on cross-sectional and longitudinal full and important data
        
     Parameters:
     ----------
     data_scaled: List of data frames containing scaled data 
     model_names: Model names e.g, random forest etc
     df_types: Data name e.g, full data
     indx1 : Index of the data, e.g indx1 = 0 for training features
     indx2 : Index of the labes, e.g indx1 = 1 for taining labels

    """
    
    
    store_models = []
    for i in range(len(data_scaled)):
        # default setting
        rforest = RandomForestRegressor(random_state=42)
        aboost = AdaBoostRegressor(random_state=42)
        gboost = GradientBoostingRegressor(random_state=42)
                
        training_models = [rforest, aboost, gboost]
        
        # After scaling, labels change into column. We need to reshape into an array
        scaled_trn_labels = data_scaled[i][indx2].ravel() 
        
        # loop over models for each data set
        for m in range(len(training_models)):
            print("Training ...... ", model_names[m], "on", df_types[i])
            %time training_models[m].fit(data_scaled[i][indx1], scaled_trn_labels) # train data and train features for each case
            print(training_models[m])
            store_models.append(training_models[m])
            
    return store_models
            
  

#### 5.  After training we will first check the accuracy on the training data. This will help us to find out if the model is overfitting the data. If the model performed well on the training data but didn't perform well on the test data, then the model is overfitting. 

In [9]:
# Using training data, we will predict the labels of the training set
# input: training set
# output: training labels

def make_predictions(data_scaled, model_names, df_types, indx, 
                    trained_models):
    """
    Function to evaluate the models

    Parameters:
    ----------
    data_scaled: List of data frames containing scaled data 
    model_names: Model names e.g, random forest etc
    df_types: Data name e.g, full data
    indx : Index of the data, e.g indx = 0 for training features
    """
    
    predictions = {}
    m = 0 # models 
    for i in range(len(data_scaled)):
        for j in range(len(model_names)):
            key = model_names[j]+"_"+df_types[i]
            predictions[key] = trained_models[m].predict(data_scaled[i][indx])
            m = m+1
    return predictions
            


In [10]:
# Check the error rate on the training set
# input: training labels and predicted labels on the training set
# output: error rate
# df_indx will be 1 for training and 3 for testing
from sklearn import metrics
 
def evaluate_model(data_scaled, preds, df_indx, df_types, 
                   model_name, store ):
    for i in range(len(data_scaled)):
        print("\nModel : ........ ", model_name)
        print("\n", df_types[i], ": ")
        abe = metrics.mean_absolute_error(data_scaled[i][df_indx], preds[i])
        mse = metrics.mean_squared_error(data_scaled[i][df_indx], preds[i])
        rmse = np.sqrt(metrics.mean_squared_error(data_scaled[i][df_indx], preds[i]))
        r2 = metrics.r2_score(data_scaled[i][df_indx], preds[i])
        print('Mean Absolute Error:', abe)
        print('Mean Squared Error:', mse)
        print('Root Mean Squared Error:', rmse) 
        print("R2 on data:", r2)
        key = model_name+ '_' +df_types[i] 
        store[key] = [abe, mse, rmse, r2]

    return store

In [11]:

# call training function
model_names = ['Random Forest', "Adaboost", "Gradient Boost"]
df_types = ["full_cros", "full_long", "imp_cros", "imp_long"]

# Base models scores for training and testing
base_pred = [{},{}]
base_m_score = [{},{}]

# base model on training and testing data to check overfitting
trained_models_base =  train_models(data_scaled=data_scaled, 
                                    model_names=model_names, 
                                    df_types=df_types,
                                    indx1=0, indx2=1)

for i in range(2): # for train and test data
    if i == 0:
        indx = 0
    else:
        indx = 2
       
    # Make predictions
    base_pred[i] = make_predictions(data_scaled=data_scaled, 
                                    model_names=model_names, 
                                    df_types=df_types, indx=indx,
                                    trained_models=trained_models_base)
    
    # Get scores for each model on each data set
    for j in range(len(model_names)):
        predictions = [value for key,value in base_pred[i].items() if key.startswith(model_names[j])]
        print(predictions)
        base_m_score[i] = evaluate_model(data_scaled=data_scaled, 
                                                         preds=predictions, df_indx=indx+1, 
                                                         df_types=df_types, 
                                                         model_name = model_names[j],
                                                         store=base_m_score[i]
                                                        )
        

Training ......  Random Forest on full_cros
Wall time: 9.66 s
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)
Training ......  Adaboost on full_cros
Wall time: 3.66 s
AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                  n_estimators=50, random_state=42)
Training ......  Gradient Boost on full_cros
Wall time: 8.2 s
GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=No

In [12]:
base_pred

[{'Random Forest_full_cros': array([ 0.02567745, -1.02531384, -0.96020275,  0.59632088, -0.86622165,
         -0.44422807, -0.33611909, -1.48170573,  1.41143803, -1.35578334,
         -0.6567605 ,  0.59202109, -0.76425522,  0.28673607,  1.69215282,
         -1.32077077, -0.0676894 , -0.72555712, -0.57322174, -0.794968  ,
          0.50541105, -1.55480215,  0.52322446, -0.50135384, -0.43255721,
          0.07236087,  0.9624172 , -0.60700579, -1.29312927, -1.06401194,
          0.01400659,  0.55823703, -1.20344796, -1.01671426, -0.74398479,
          0.29902118,  0.58833555,  0.47776955, -0.89877719,  0.15282835,
          2.19461388,  0.12887238,  1.05394128, -1.35271206, -0.89079187,
         -0.61376261, -0.72739989, -0.5916494 ,  2.13810237,  0.4992685 ,
         -0.75934118, -0.90737677,  0.30639225,  0.168799  , -0.40798699,
         -0.61499112,  0.50233977, -1.32752758, -0.25749437, -1.35455483,
          0.16142793, -0.22309606,  0.37518887,  0.82052416, -1.49399084,
          0

In [13]:
# Save trained models
top = data_scaled[2][0].shape[1] # significant features
joblib.dump(trained_models_base, 
            "E:/Machine Learning/Saved Models/Epic Base Trained Models New "+"n_fea "+str(top)+".pkl")

['E:/Machine Learning/Saved Models/Epic Base Trained Models New n_fea 150.pkl']

In [2]:
# Load if needed
import os
import joblib
def load_saved(path):
    print("Loading object ... done")
    saved_scores = joblib.load(path)
    return saved_scores 
    

path_bm = 'E:/Machine Learning/Saved Models/Epic Base Trained Models New n_fea 150.pkl'
if 'trained_models_base' not in locals():
    trained_models_base = load_saved(path = path_bm)



Loading object ... done


In [3]:
trained_models_base

[RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=100, n_jobs=None, oob_score=False,
                       random_state=42, verbose=0, warm_start=False),
 AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                   n_estimators=50, random_state=42),
 GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                           init=None, learning_rate=0.1, loss='ls', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_spli

#### Load the parameters if there

In [28]:

# Load the model back
name_id = ' '.join(("n_iter", str(n_iter), "n_fea", str(top))) # make id from iterations and num of top features
# if 'rand_s_parms' not in locals():
#     rand_s_parms = joblib.load("E:/Machine Learning/Saved Models/Hyper_Parameters_Epic_nIter_"+name_id+".pkl")

#### 4. Now search for the hyper-parameters to see if the base model improves

In [29]:
name_id

'n_iter 100 n_fea 150'

In [22]:
n_estimators = np.arange(100, 2000, 100)
# n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
n_estimators

[array([ 100,  200,  300,  400,  500,  600,  700,  800,  900, 1000, 1100,
        1200, 1300, 1400, 1500, 1600, 1700, 1800, 1900])]

In [26]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
# n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
n_estimators = np.arange(100, 2000, 100)

random_state = [42]

# Number of features to consider at every split
max_features = ['auto', 'sqrt', 'log2']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 4, 5, 8, 10, 12] 

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 6, 8, 10, 12]

max_leaf_nodes = [2, 3, 4, 5, 6, 7, 8, 10]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid_rf = {'n_estimators': n_estimators,
                  'max_features': max_features,
                  'max_depth': max_depth,
                  'min_samples_split': min_samples_split,
                  'min_samples_leaf': min_samples_leaf,
                  'max_leaf_nodes': max_leaf_nodes,
                  'bootstrap': bootstrap, 
                  'random_state': random_state}


learning_rate = [0.1, 0.2, 0.3, 0.5, 0.8, 1, 1.5]
loss_ab = ['linear', 'square', 'exponential']
random_grid_ab = {'n_estimators': n_estimators,
                    'learning_rate': learning_rate,
                    'loss': loss_ab}


loss_gb = ['ls', 'lad', 'huber', 'quantile']
random_grid_gb = {'n_estimators': n_estimators,
                  'max_features': max_features,
                  'max_depth': max_depth,
                  'min_samples_split': min_samples_split,
                  'min_samples_leaf': min_samples_leaf,
                  'max_leaf_nodes': max_leaf_nodes,
                  'learning_rate': learning_rate, 
                  'random_state': random_state,
                  'loss': loss_gb}

rf_g_search = RandomForestRegressor(random_state=42)
ab_g_search = AdaBoostRegressor(random_state=42)
gb_g_search = GradientBoostingRegressor(random_state=42)


def SearchParameters(model_name, grid_parm, n_iter, cv, scoring, train_f, train_l):
    grid_search = RandomizedSearchCV(model_name, grid_parm, n_iter=n_iter, 
                                     cv=cv, scoring=scoring)
    %time grid_search.fit(train_f, train_l) # train data and train features for each case
    return(grid_search)
    

In [27]:
# Search for hyperparameters
rand_s_parms = {}
g_search_models = [rf_g_search, ab_g_search, gb_g_search]
g_search_parms = [random_grid_rf, random_grid_ab, random_grid_gb]
n_iter=100
for i in range(len(data_scaled)):
    print("Searching parameter space", i)
    scaled_trn_labels = data_scaled[i][1].ravel()
    for j in range(len(g_search_models)):
        parms = SearchParameters(model_name= g_search_models[j], 
                                 grid_parm= g_search_parms[j],
                                 n_iter=n_iter, cv=5, scoring='neg_mean_squared_error',
                                 train_f=data_scaled[i][0], train_l=scaled_trn_labels)
        key = model_names[j]+ '_' +df_types[i] 
        rand_s_parms[key] = parms
   

Searching parameter space 0
Wall time: 3h 5min 43s
Wall time: 6h 59min 44s
Wall time: 2h 22min 7s
Searching parameter space 1
Wall time: 3h 13min 39s
Wall time: 11h 8min 12s
Wall time: 2h 14min 17s
Searching parameter space 2
Wall time: 14min 24s
Wall time: 24min 50s
Wall time: 8min 53s
Searching parameter space 3
Wall time: 14min 4s
Wall time: 21min 3s
Wall time: 9min 28s


#### Save the HyperParameters

In [30]:
joblib.dump(rand_s_parms, "E:/Machine Learning/Saved Models/Hyper_Parameters_Epic_nIter_new"+name_id+".pkl")

['E:/Machine Learning/Saved Models/Hyper_Parameters_Epic_nIter_newn_iter 100 n_fea 150.pkl']

In [4]:
# Load if needed
# Load if needed
path_parms = 'E:/Machine Learning/Saved Models/Hyper_Parameters_Epic_nIter_newn_iter 100 n_fea 150.pkl'
if 'rand_s_parms' not in locals():
    rand_s_parms = load_saved(path = path_parms)


Loading object ... done


In [6]:
# Best Estimators using RandomSearch
for key in rand_s_parms:
    print(key)
    print("Parameters :", rand_s_parms[key].best_params_)
    print("Estimaters :", rand_s_parms[key].best_estimator_)


Random Forest_full_cros
Parameters : {'random_state': 42, 'n_estimators': 1900, 'min_samples_split': 2, 'min_samples_leaf': 6, 'max_leaf_nodes': 8, 'max_features': 'auto', 'max_depth': 20, 'bootstrap': True}
Estimaters : RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=20, max_features='auto', max_leaf_nodes=8,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=6,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=1900, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)
Adaboost_full_cros
Parameters : {'n_estimators': 1600, 'loss': 'square', 'learning_rate': 0.5}
Estimaters : AdaBoostRegressor(base_estimator=None, learning_rate=0.5, loss='square',
                  n_estimators=1600, random_state=42)
Gradient Boost_full_cros
Parameters : {'random_state'

#### Evaluate Random Search. To determine if random search yielded a better model, we compare the base model with the best random search model.

In [32]:
# # # the best model we created using best hyperparameters
# # We have the parameters for full and important features
# # First three for full data (RF,AB,GB) and last three for important features
def score(features, labels):
    print('Scores')
    mabe = metrics.mean_absolute_error(features, labels)
    mse = metrics.mean_squared_error(features, labels)
    rmse = np.sqrt(metrics.mean_squared_error(features, labels))
    r2 = metrics.r2_score(features, labels)
    print("Mean Abs Error: ", mabe )
    print("Mean Squared Error: ",mse )
    print("Root Mean Squared Error: ", rmse )
    print('R Squared : ', r2 , "\n")
    return([mabe, mse, rmse, r2])

best_m_score = [{},{}]
for index, key in enumerate(rand_s_parms):
    print(index)
    print(key, "--------------------------")
    i = index//3 # we have three models
    print(df_types[i], ":")
    best_random = rand_s_parms[key].best_estimator_
    print(best_random)

    for j in range(2): # train and test
        if j == 0:
            msg = "Train"
            indx = 0
        else:
            msg = "Test"
            indx = 2
    
        # now trying the best model on the training data
        print("Best model on ", msg,  "data ................: \n")
        pred = best_random.predict(data_scaled[i][indx])
        best_score = score(data_scaled[i][indx+1].ravel(), pred)
        best_m_score[j][key] = best_score

0
Random Forest_full_cros --------------------------
full_cros :
RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=20, max_features='auto', max_leaf_nodes=8,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=6,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=1900, n_jobs=None, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)
Best model on  Train data ................: 

Scores
Mean Abs Error:  0.16086321572574488
Mean Squared Error:  0.05415792902941793
Root Mean Squared Error:  0.2327185618497543
R Squared :  0.945842070970582 

Best model on  Test data ................: 

Scores
Mean Abs Error:  0.17792551585431415
Mean Squared Error:  0.07271932022786129
Root Mean Squared Error:  0.26966520025368734
R Squared :  0.9272806797721387 

1
Adaboost_full_cros ----

#### 5. Lets evaluate base and the tuned models to determine if random search yielded a better model, we compare the base model with the best random search model. We will perfrom ten-fold cross-validation. 

#####  5.1. Cross-validation using base model

In [33]:
# n_jobs = 100, could be a problem to run that number of threads. Use n_jobs = -1  as a solution
# input: training data and training labels
# output: Score for each fold
# Scikit-Learn cross-validation features expect a utility function (greater is better) rather 
# than a cost function (lower is better), so the scoring function is actually the 
# opposite of the MSE (i.e., a negative value), which is why the preceding code computes -scores 
# before calculating the square root.

from sklearn.model_selection import cross_val_score

# Function to perform cross validation    
def perform_crossv(model, df1, df2, cv_fold, model_name, 
                   df_types, store_s, store_a):
    print("Model ..........: ", model_name)
    scores = cross_val_score(model, df1, df2, scoring="neg_mean_squared_error", cv = cv_fold)
    r2 = cross_val_score(model, df1, df2, scoring="r2", cv = cv_fold)
    key = model_name + "_"+ df_types +  "_" + 'mse'
    key1 = model_name + '_' + df_types +   "_" + 'r2'
#     tree_rmse_scores = np.sqrt(-scores)
#     print("RMSE Score :", tree_rmse_scores)
    store_s[key] = [scores]
    store_a[key1] = [r2]
    

cv_scores = [{},{}]
cv_r2 = [{},{}]

# call cross validation function
for i in range(len(data_scaled)):
    df_type = df_types[i]
    rfm = RandomForestRegressor(random_state = 42)
    ada_reg = AdaBoostRegressor(random_state=42)
    gbrt = GradientBoostingRegressor(random_state=42)
    models = [rfm, ada_reg, gbrt]
    
    for m in range(len(models)):
        
        for j in range(2):
            if j == 0:
                msg = "Train"
                indx = 0
            else:
                msg = "Test"
                indx = 2
            # cross validation training
            print("Performing cross validation on ", msg, "data" )
            perform_crossv(model=models[m], df1=data_scaled[i][indx], 
                           df2=data_scaled[i][indx+1].ravel(), cv_fold=10, 
                           model_name=model_names[m], df_types=df_type, 
                           store_s=cv_scores[j], store_a=cv_r2[j])
        
        if m == 2:
            m = 0


Performing cross validation on  Train data
Model ..........:  Random Forest
Performing cross validation on  Test data
Model ..........:  Random Forest
Performing cross validation on  Train data
Model ..........:  Adaboost
Performing cross validation on  Test data
Model ..........:  Adaboost
Performing cross validation on  Train data
Model ..........:  Gradient Boost
Performing cross validation on  Test data
Model ..........:  Gradient Boost
Performing cross validation on  Train data
Model ..........:  Random Forest
Performing cross validation on  Test data
Model ..........:  Random Forest
Performing cross validation on  Train data
Model ..........:  Adaboost
Performing cross validation on  Test data
Model ..........:  Adaboost
Performing cross validation on  Train data
Model ..........:  Gradient Boost
Performing cross validation on  Test data
Model ..........:  Gradient Boost
Performing cross validation on  Train data
Model ..........:  Random Forest
Performing cross validation on  Te

In [35]:
#save scores
base_id = ' '.join(('n_fea', str(top)))
joblib.dump([cv_scores, cv_r2], "E:/Machine Learning/Output Data/BaseModel Scores Epic New "+base_id+".pkl")

['E:/Machine Learning/Output Data/BaseModel Scores Epic New n_fea 150.pkl']

In [5]:
# load the base model results
  
# call for base scores 
path_base = "E:/Machine Learning/Output Data/BaseModel Scores Epic New n_fea 150.pkl"
if 'cv_scores' not in locals():
    cv_scores, cv_r2 = load_saved(path= path_base)


Loading object ... done


In [6]:
cv_scores

[{'Random Forest_full_cros_mse': [array([-0.11131708, -0.13729954, -0.14415538, -0.07908314, -0.13590623,
          -0.09625949, -0.28923703, -0.09319565, -0.05138408, -0.21808427])],
  'Adaboost_full_cros_mse': [array([-0.09905924, -0.14731475, -0.23154867, -0.10477705, -0.111415  ,
          -0.10271536, -0.29269233, -0.16101142, -0.07255728, -0.20547586])],
  'Gradient Boost_full_cros_mse': [array([-0.14346104, -0.15701661, -0.20535438, -0.06578425, -0.10683774,
          -0.0700115 , -0.29936777, -0.12068702, -0.01726851, -0.24261307])],
  'Random Forest_full_long_mse': [array([-0.25721257, -0.16421848, -0.07992322, -0.26077213, -0.11227248,
          -0.13222697, -0.30451873, -0.09891032, -0.07160586, -0.11140927])],
  'Adaboost_full_long_mse': [array([-0.20632998, -0.18516019, -0.14828948, -0.24704667, -0.08208457,
          -0.12727051, -0.34826006, -0.10420478, -0.06834157, -0.14535937])],
  'Gradient Boost_full_long_mse': [array([-0.20337134, -0.17386341, -0.07589559, -0.24347

##### 5.2 Cross validation on training data using best hyperparameters

In [914]:
model_names

['Random Forest', 'Adaboost', 'Gradient Boost']

In [37]:
# storage for best score and r2 values
cv_scores_best = [{},{}]
cv_r2_best = [{}, {}]

# call cross validation function
keys = list(rand_s_parms.keys())
k = 0 # counter to get keys
for i in range(len(data_scaled)):
    df_type = df_types[i]
    for m in range(len(model_names)):
        best_random = rand_s_parms[keys[k]].best_estimator_

        for j in range(2):
            if j == 0:
                msg = "Train"
                indx = 0
            else:
                msg = "Test"
                indx = 2
            print("Performing cross validation on ", msg, "data" )
            perform_crossv(model=best_random, df1=data_scaled[i][indx], 
                           df2=data_scaled[i][indx+1].ravel(), cv_fold=10, 
                           model_name=model_names[m], df_types=df_type, 
                           store_s=cv_scores_best[j], store_a=cv_r2_best[j])
        
        k=k+1
        if m == 2:
            m = 0

Performing cross validation on  Train data
Model ..........:  Random Forest
Performing cross validation on  Test data
Model ..........:  Random Forest
Performing cross validation on  Train data
Model ..........:  Adaboost
Performing cross validation on  Test data
Model ..........:  Adaboost
Performing cross validation on  Train data
Model ..........:  Gradient Boost
Performing cross validation on  Test data
Model ..........:  Gradient Boost
Performing cross validation on  Train data
Model ..........:  Random Forest
Performing cross validation on  Test data
Model ..........:  Random Forest
Performing cross validation on  Train data
Model ..........:  Adaboost
Performing cross validation on  Test data
Model ..........:  Adaboost
Performing cross validation on  Train data
Model ..........:  Gradient Boost
Performing cross validation on  Test data
Model ..........:  Gradient Boost
Performing cross validation on  Train data
Model ..........:  Random Forest
Performing cross validation on  Te

In [38]:
#save scores
joblib.dump([cv_scores_best, cv_r2_best], "E:/Machine Learning/Output Data/Hypertuned Model Scores Epic New "+name_id+".pkl")

['E:/Machine Learning/Output Data/Hypertuned Model Scores Epic New n_iter 100 n_fea 150.pkl']

In [7]:

# call for tuned scores 
path_tuned = "E:/Machine Learning/Output Data/Hypertuned Model Scores Epic New n_iter 100 n_fea 150.pkl"
if 'cv_scores_best' not in locals():
    cv_scores_best, cv_r2_best = load_saved(path= path_tuned)

    
# if 'cv_scores_best' not in locals():
#     print("Loading scores ... done")
#     saved_scores_best = joblib.load("E:/Machine Learning/Output Data/Hypertuned Model Scores Epic New n_iter 100 n_fea 150.pkl")
#     cv_scores_best, cv_r2_best = saved_scores_best
# else:
#     print("Already loaded ...")
    

Loading object ... done


#### 7. Final scores on test data

##### 7.1 Base model 

In [419]:
cols = ['Mean Absolute Error', 'Mean Square Error',
        "Root Mean Sq Error", "R Squared"]

def create_dataframe(data, score_cols):
    score = pd.DataFrame(data, index=[score_cols]).T # transpose the data
    score.sort_index(inplace = True)
    return(score)

In [54]:
# call function and save data
base_scores = create_dataframe(data = base_m_score[1], score_cols = cols)
base_scores.to_csv("E:/Machine Learning/Output Data/Base Model Test Score " +"n_fea "+ str(top)+ ".csv")
base_scores

Unnamed: 0,Mean Absolute Error,Mean Square Error,Root Mean Sq Error,R Squared
Adaboost_full_cros,0.2128,0.0871,0.2952,0.9129
Adaboost_full_long,0.2728,0.142,0.3768,0.858
Adaboost_imp_cros,0.2161,0.0834,0.2888,0.9166
Adaboost_imp_long,0.2452,0.1204,0.347,0.8796
Gradient Boost_full_cros,0.183,0.0752,0.2742,0.9248
Gradient Boost_full_long,0.2639,0.1204,0.3469,0.8796
Gradient Boost_imp_cros,0.1353,0.0471,0.2171,0.9529
Gradient Boost_imp_long,0.2059,0.0951,0.3084,0.9049
Random Forest_full_cros,0.1779,0.0715,0.2674,0.9285
Random Forest_full_long,0.2638,0.1248,0.3532,0.8752


In [422]:
base_scores.mean(axis=0)

Mean Absolute Error   0.211465
Mean Square Error     0.093585
Root Mean Sq Error    0.301963
R Squared             0.906415
dtype: float64

##### 7.2 Tuned Model

In [421]:
#Score on test data using hyperparameters
hyperTuned_scores = create_dataframe(data=best_m_score[1], score_cols=cols)
hyperTuned_scores.to_csv("E:/Machine Learning/Output Data/HyperTuned Test Score " + name_id+".csv")
hyperTuned_scores

Unnamed: 0,Mean Absolute Error,Mean Square Error,Root Mean Sq Error,R Squared
Adaboost_full_cros,0.194522,0.07779,0.278908,0.92221
Adaboost_full_long,0.251786,0.124107,0.352289,0.875893
Adaboost_imp_cros,0.211094,0.0776,0.278568,0.9224
Adaboost_imp_long,0.232637,0.108769,0.329802,0.891231
Gradient Boost_full_cros,0.181404,0.074168,0.272337,0.925832
Gradient Boost_full_long,0.228186,0.093056,0.305051,0.906944
Gradient Boost_imp_cros,0.167227,0.059376,0.243672,0.940624
Gradient Boost_imp_long,0.190469,0.115559,0.33994,0.884441
Random Forest_full_cros,0.177926,0.072719,0.269665,0.927281
Random Forest_full_long,0.260542,0.129969,0.360512,0.870031


In [423]:
hyperTuned_scores.mean(axis=0)

Mean Absolute Error   0.205631
Mean Square Error     0.091619
Root Mean Sq Error    0.300036
R Squared             0.908381
dtype: float64

#### 8. Bagging and voting approaches

In [56]:
from sklearn.ensemble import RandomForestRegressor 
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
 
lnr_reg = LinearRegression()
rnd_reg = RandomForestRegressor(random_state = 42) 
ab_reg = AdaBoostRegressor(random_state=42)
gb_reg = GradientBoostingRegressor(random_state=42)
svr_reg = SVR()
bag_reg = BaggingRegressor(DecisionTreeRegressor(random_state=42),  n_estimators=500,
                           bootstrap=True, n_jobs=-1, oob_score=True, 
                           bootstrap_features=True)

# ('lr', log_reg),
voting_reg = VotingRegressor([('lnr', lnr_reg),
                              ('rf', rnd_reg), 
                              ('ab', ab_reg),
                              ('gb', gb_reg),
                              ('svr', svr_reg),
                             ('bag_reg', bag_reg)])


In [63]:
from sklearn.metrics import accuracy_score
bag_score = {}
models_all = [lnr_reg, rnd_reg, ab_reg, gb_reg, svr_reg, voting_reg, bag_reg]
voting_models = [models_all]
for m in range(len(voting_models)):
    for clf in voting_models[m]:
        print(m)
        print('=====================')
        for i in range(len(data_scaled)):
            clf.fit(data_scaled[i][0], data_scaled[i][1].ravel())
            y_pred = clf.predict(data_scaled[i][2])
            clf_nm = clf.__class__.__name__ + "_" + df_types[i] + "_" + str(m)

            abe = metrics.mean_absolute_error(data_scaled[i][3].ravel(), y_pred)
            mse = metrics.mean_squared_error(data_scaled[i][3].ravel(), y_pred)
            rmse = np.sqrt(metrics.mean_squared_error(data_scaled[i][3].ravel(), y_pred))
            r2 = metrics.r2_score(data_scaled[i][3], y_pred)
            bag_score[clf_nm] = [abe, mse, rmse, r2 ]


0
0
0
0
0
0
0


In [68]:
# bag_score_tst = dict(filter(lambda item: '_1'  in item[0], bag_score.items()))
ensemble_score = create_dataframe(data=bag_score, score_cols=cols )
ensemble_score.to_csv("E:/Machine Learning/Output Data/Bagging & voting scores new " + str(top) +".csv")
ensemble_score

Unnamed: 0,Mean Absolute Error,Mean Square Error,Root Mean Sq Error,R Squared
AdaBoostRegressor_full_cros_0,0.2128,0.0871,0.2952,0.9129
AdaBoostRegressor_full_long_0,0.2728,0.142,0.3768,0.858
AdaBoostRegressor_imp_cros_0,0.2161,0.0834,0.2888,0.9166
AdaBoostRegressor_imp_long_0,0.2452,0.1204,0.347,0.8796
BaggingRegressor_full_cros_0,0.2189,0.0924,0.304,0.9076
BaggingRegressor_full_long_0,0.3041,0.1582,0.3978,0.8418
BaggingRegressor_imp_cros_0,0.1658,0.0599,0.2448,0.9401
BaggingRegressor_imp_long_0,0.249,0.1219,0.3492,0.8781
GradientBoostingRegressor_full_cros_0,0.183,0.0752,0.2742,0.9248
GradientBoostingRegressor_full_long_0,0.2639,0.1204,0.3469,0.8796


In [69]:
# get the scores of bagging, linear reg,svr and voting
sel_models = ['Bagging', 'Linear', 'SVR', 'Voting']
sel_models = ensemble_score.loc[ensemble_score.index.str.contains('|'.join(sel_models)),:]
sel_models.to_csv("E:/Machine Learning/Output Data/Bagging, LR,SVR & voting scores" + str(top)+".csv")
sel_models

Unnamed: 0,Mean Absolute Error,Mean Square Error,Root Mean Sq Error,R Squared
BaggingRegressor_full_cros_0,0.2189,0.0924,0.304,0.9076
BaggingRegressor_full_long_0,0.3041,0.1582,0.3978,0.8418
BaggingRegressor_imp_cros_0,0.1658,0.0599,0.2448,0.9401
BaggingRegressor_imp_long_0,0.249,0.1219,0.3492,0.8781
LinearRegression_full_cros_0,0.464,0.324,0.5692,0.676
LinearRegression_full_long_0,0.6536,0.6888,0.83,0.3112
LinearRegression_imp_cros_0,0.7971,1.1396,1.0675,-0.1396
LinearRegression_imp_long_0,0.5958,0.5455,0.7386,0.4545
SVR_full_cros_0,0.707,0.7426,0.8618,0.2574
SVR_full_long_0,0.7715,0.9185,0.9584,0.0815
