# Modeling - Slim

This Notebook is used to generate the optimal modelparamters in an less explorative matter. 

## Preperation

* Import modules
* Define paths
* Import data

### Import Modules

In [51]:
import sys
import time
import joblib
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.kernel_ridge import KernelRidge as KRR
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer  
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict

import pandas as pd


### Define Paths

In [40]:
# Basepath
basepath="./"
sys.path.append(basepath)

# Modelpath

MODEL_PATH = basepath + "models"

sys.path.append(MODEL_PATH)
sys.path    

['/home/fhwn.ac.at/202375/Thesis',
 '/home/fhwn.ac.at/202375/.conda/envs/ml_course/lib/python312.zip',
 '/home/fhwn.ac.at/202375/.conda/envs/ml_course/lib/python3.12',
 '/home/fhwn.ac.at/202375/.conda/envs/ml_course/lib/python3.12/lib-dynload',
 '',
 '/home/fhwn.ac.at/202375/.conda/envs/ml_course/lib/python3.12/site-packages',
 './',
 './',
 './models',
 './',
 './models',
 './',
 './models',
 './',
 './models']

### Import Data

In [41]:
data_full = pd.read_csv(basepath+"data/PS20191107_gegl.csv", 
                            sep=";", decimal=",", encoding="utf-8")
data_full.shape # for quality control purposes

(2244, 1870)

In [42]:
data_full.head()

Unnamed: 0.1,Unnamed: 0,year,Origin,type,3996,3994,3992,3990,3988,3987,...,417,415,413,411,409,407,405,403,401,399
0,2GOS-18_1955,1955,POL,living,0.016119,0.015972,0.01583,0.015728,0.015734,0.015787,...,-0.027973,-0.02818,-0.028389,-0.028595,-0.029011,-0.029123,-0.029323,-0.02961,-0.029759,-0.029746
1,2GOS-18_1969,1969,POL,living,0.016368,0.016543,0.016663,0.016569,0.016333,0.016217,...,-0.02952,-0.029747,-0.029978,-0.030204,-0.030087,-0.030284,-0.030746,-0.031163,-0.031519,-0.031815
2,2GOS-18_1974,1974,POL,living,0.021364,0.021662,0.021862,0.021573,0.020925,0.020585,...,-0.031046,-0.03127,-0.031483,-0.031701,-0.032089,-0.03239,-0.032609,-0.032653,-0.032627,-0.032784
3,2GOS-18_1976,1976,POL,living,0.019351,0.019246,0.019181,0.018998,0.018926,0.019205,...,-0.029852,-0.030092,-0.030361,-0.030647,-0.031115,-0.031281,-0.031376,-0.031721,-0.032172,-0.032433
4,2GOS-18_1996,1996,POL,living,0.018548,0.018604,0.01867,0.018616,0.018375,0.018266,...,-0.029963,-0.030206,-0.030436,-0.030643,-0.030917,-0.031127,-0.031338,-0.031409,-0.031364,-0.031465


In [77]:
# Define the parameters for the CV

# Switch for testing mode (use only 10% of the data, among others)
testing = True


######################################################
if testing == True:
    nfolds = 2
    NoTrials = 5
    n_jobs = 20
    save_model = False
    print("Testing mode for Cross Validation")
    print("Splitting the data for faster modelling")
    data_testing = data_full.sample(frac=0.1)
else:
    nfolds = 10
    NoTrials = 15
    n_jobs = -1
    save_model = True
    print("Extensive mode for Cross Validation")
######################################################

Testing mode for Cross Validation
Splitting the data for faster modelling


## Preprocessing

To apply the models we need to split the data into the variables and target.

In [78]:
# Split into target and features

random_state = np.random.RandomState(202375)
if testing == True:
    data = data_testing
    random_state = np.random.RandomState(202375)
else:
    data = data_full

X = data.select_dtypes('float')

y = data['year']

In [79]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state)

In [80]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((156, 1866), (68, 1866), (156,), (68,))

## Define CV and Score metrics

We might want to later implement functions an various model metrics, the goal is to implement multiple errror metrics. For Regression models there are various metrics available (see [sklearn](https://scikit-learn.org/stable/modules/model_evaluation.html#common-cases-predefined-values) for details).  

| Scoring | function|
| ------------- | ------------- |
|‘explained_variance’|metrics.explained_variance_score|
|‘max_error’|metrics.max_error|
|**‘neg_mean_absolute_error’**|metrics.mean_absolute_error|
|‘neg_mean_squared_error’|metrics.mean_squared_error|
|**‘neg_root_mean_squared_error’**|metrics.root_mean_squared_error|
...


As sklearn allows for the evaluation of multiple metrics, we will implement The MAE, and RMSE as follows.

In [81]:
from sklearn.metrics import get_scorer_names
get_scorer_names

<function sklearn.metrics._scorer.get_scorer_names()>

In [88]:
mutliple_scoring = {'neg_mean_absolute_error', 'mean_squared_error'}

# scoring = {'neg_MAE': 'neg_mean_absolute_error', 'MSE': 'mean_squared_error'}
scoring = ['neg_mean_absolute_error']

# Modeling with Randomized Search

## Random Forest (RSCV)
Define the parameter space over: 

* bootstrap
* max_depth
* max_features
* n_estimators
* min_samples_split
* min_samples_leaf

In [83]:
rf_param_distribution = {
    'bootstrap': [True, False],
    'max_depth': randint(low=10, high=110),
#     'max_features': ['auto', 'sqrt', 'log2'],
    'n_estimators': randint(low=3, high=100), # for hyperparameter with discrete values 
    'min_samples_split': randint(low=2, high=20), 
    'min_samples_leaf': randint(low=1, high=20),
}

### Define the Randomized Search

In [84]:
# Prepare to store the results

rf_rscv_results = pd.DataFrame(columns=['model', 'MAE', 'RMSE', 'params'])

one_cv_layer=np.zeros((NoTrials, 1))
two_cv_layers=np.zeros((NoTrials, 1))

# Save the predictions
all_predictions=np.zeros((len(y_train),NoTrials))

### fit the RSCV for RF

In [90]:
# scoring = 'neg_mean_absolute_error'

for i in range(0, NoTrials):
    print("Sim: {0}".format(i))

    # Split the data into 'nfolds' number of splits 
    inner_cv = KFold(n_splits=nfolds, shuffle=True, random_state=i)
    outer_cv = KFold(n_splits=nfolds, shuffle=True, random_state=i)

    # Define the Model
    rf_rscv = RandomizedSearchCV(
    estimator = RandomForestRegressor(),
    param_distributions = rf_param_distribution,
    n_iter = NoTrials,
    cv = inner_cv,
    scoring = 'neg_mean_absolute_error', # scoring = scoring
    n_jobs=n_jobs,
    random_state=i,
    )

    # fit the model
    rf_rscv.fit(X_train, y_train)
   
   # Calculate the inner CV results
    one_cv_layer[i] = rf_rscv.best_score_

    # calculate ynl_hat
    y_pred = cross_val_predict(rf_rscv, X=X_train, y=y_train, cv=outer_cv, n_jobs=n_jobs)
    all_predictions[:,i] = y_pred

    #  Calculate the outer CV results

    #perform the outer cross validation
    
    outer_cv_results = cross_val_predict(
        rf_rscv, 
        X=X_test, 
        y=y_test, 
        cv=outer_cv, 
        n_jobs=n_jobs)



Sim: 0
Sim: 1
Sim: 2
Sim: 3
Sim: 4


In [76]:
# Investigate the model performance in regards to the Model parameters
one_cv_layer

array([[-723.18203947],
       [-731.36736257],
       [-740.73284658],
       [-743.26392149],
       [-730.09173807],
       [   0.        ],
       [   0.        ],
       [   0.        ],
       [   0.        ],
       [   0.        ],
       [   0.        ],
       [   0.        ],
       [   0.        ],
       [   0.        ],
       [   0.        ]])

In [92]:
rf_rscv.best_params_

{'bootstrap': True,
 'max_depth': 60,
 'min_samples_leaf': 10,
 'min_samples_split': 9,
 'n_estimators': 97}

In [107]:
rf_rscv_results.loc[i, 'model'] = rf_rscv
rf_rscv_results.loc[i, 'MAE'] = mean_absolute_error(y_test, outer_cv_results)
rf_rscv_results.loc[i, 'RMSE'] = np.sqrt(mean_squared_error(y_test, outer_cv_results))
rf_rscv_results.at[i, 'params'] = rf_rscv.best_params_

In [108]:
rf_rscv.best_estimator_
rf_rscv_results

Unnamed: 0,model,MAE,RMSE,params
4,"RandomizedSearchCV(cv=KFold(n_splits=2, random...",2797.584748,4042.486859,"{'bootstrap': True, 'max_depth': 60, 'min_samp..."


In [109]:
import subprocess
import os

notebook_name = '03_b_modeling_slim.ipynb' 
# get the current date from the system

# Get the current date
import datetime
now = datetime.datetime.now()
date = now.strftime("%Y-%m-%d")

# define the output name 
output_name = f"{notebook_name.split('.')[0]}_{date}.html"

# Convert the notebook to HTML 
subprocess.call(['jupyter', 'nbconvert', '--to', 'html', notebook_name])
# rename the file
os.rename(notebook_name.split('.')[0] + '.html', output_name)

[NbConvertApp] Converting notebook 03_b_modeling_slim.ipynb to html
[NbConvertApp] Writing 314520 bytes to 03_b_modeling_slim.html
