# 6. Model training

### Import needed packages

In [6]:
import json
from pickle import load, dump

import pandas as pd

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from functions.save_load import load_sets

### Loading the dataset

In [3]:
# Read the training and validation splits; each pickle is unpacked into a
# (features, labels) pair.
# NOTE(review): presumably written by the earlier data-preparation step — confirm.
(X_train, y_train), (X_val, y_val) = (
    load_sets(pickle_name)
    for pickle_name in ('dataprep_train.pkl', 'dataprep_validation.pkl')
)

In [4]:
# Sanity check: show the shape of the training and validation feature sets.
print(*(features.shape for features in (X_train, X_val)))

(71274, 36) (11939, 36)


In [7]:
# Exhaustive 10-fold cross-validated search over tree depth and the minimum
# number of samples required to split a node, using all available cores.
param_dist = {
    'max_depth': [3, 5, 6, 7],
    'min_samples_split': [140, 280, 420, 560, 700],
}

dtc = DecisionTreeClassifier(criterion='gini', random_state=20)
dtc_grid = GridSearchCV(estimator=dtc, param_grid=param_dist, cv=10, n_jobs=-1)
dtc_grid.fit(X_train, y_train)

print('Best Parameters using Grid Search: \n', dtc_grid.best_params_)

Best Parameters using Grid Search: 
 {'max_depth': 7, 'min_samples_split': 140}


In [8]:
# Train the final model with the hyperparameters selected by the grid search.
# They are read from dtc_grid.best_params_ rather than copied from the printed
# output, so this cell stays in sync if the grid or the training data change.
# NOTE(review): the grid search estimator used random_state=20 while this fit
# uses 0; the seed is kept unchanged to preserve existing results — confirm
# whether the two should match.
dtc = DecisionTreeClassifier(criterion='gini', random_state=0, **dtc_grid.best_params_)
dtc.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=7, min_samples_split=140, random_state=0)

In [9]:
# Hard class predictions of the fitted tree on the validation features.
y_pred_dtc = dtc.predict(X=X_val)

## Run the metrics for performance comparison

In [None]:
# Validation metrics for the fitted decision tree.
# Accuracy, confusion matrix, classification report, precision and recall are
# computed from the hard class predictions.
acc_dtc = accuracy_score(y_val, y_pred_dtc)
conf_matrix = confusion_matrix(y_val, y_pred_dtc)
clf_report = classification_report(y_val, y_pred_dtc, output_dict=True)

# ROC AUC measures how well the model *ranks* samples, so it needs continuous
# scores. Feeding it hard 0/1 predictions collapses the curve to one threshold
# and understates the AUC. Column 1 of predict_proba is the probability of
# dtc.classes_[1], the class with the greater label, which is what
# roc_auc_score expects for a binary target.
y_score_dtc = dtc.predict_proba(X_val)[:, 1]
roc_auc = roc_auc_score(y_val, y_score_dtc)

# precision_score / recall_score use their default binary averaging here.
precision = precision_score(y_val, y_pred_dtc)
recall = recall_score(y_val, y_pred_dtc)

In [11]:
# Labelled confusion matrix: rows are the true (target) classes and columns are
# the predicted classes. Naming the two columns 'target' and 'predicted' would
# mislabel them, since both columns hold counts for a *predicted* class.
# The matrix is recomputed from the predictions rather than wrapped in place so
# that re-running this cell always yields the same frame.
class_labels = dtc.classes_
conf_matrix = pd.DataFrame(
    confusion_matrix(y_val, y_pred_dtc, labels=class_labels),
    index=pd.Index(class_labels, name='target'),
    columns=pd.Index(class_labels, name='predicted'),
)

### Save model

In [12]:
# Serialize the fitted classifier to model.pkl using pickle.
with open('model.pkl', mode='wb') as model_file:
    dump(dtc, model_file)

## Pipeline output

In [13]:
# Scalar validation metrics to export, keyed by the name written to the file.
# Insertion order is preserved, so the entries appear in this order in the JSON.
metric_values = {
    'accuracy-score': acc_dtc,
    'roc-auc-score': roc_auc,
    'precision-score': precision,
    'recall-score': recall,
}

# One entry per metric, all reported as percentages.
# float() guarantees a plain Python float, so the dump cannot fail on a NumPy
# scalar type that json does not know how to serialize.
# NOTE(review): this layout looks like the Kubeflow Pipelines metrics file — confirm.
metrics = {
    'metrics': [
        {'name': metric_name, 'numberValue': float(metric_value), 'format': 'PERCENTAGE'}
        for metric_name, metric_value in metric_values.items()
    ]
}

with open('mlpipeline-metrics.json', 'w') as f:
    json.dump(metrics, f)