## Tuning our model by introducing a validation set

In [1]:
# Import the essentials

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import random

In [2]:
# Load our classification (heart disease) and regression (California housing) datasets

from sklearn.datasets import fetch_california_housing

heart_disease = pd.read_csv('../Pandas/heart-disease.csv')

housing_dataset = fetch_california_housing()

# Build the feature frame from the fetched data, then attach the target as an extra column
housing_dataframe = pd.DataFrame(
    data=housing_dataset['data'],
    columns=housing_dataset['feature_names'],
)
housing_dataframe['Target'] = housing_dataset['target']

## Tuning hyperparameters by hand

Let's make 3 sets: training, validation and test.

In [3]:
from sklearn.ensemble import RandomForestClassifier

# Instantiate a forest with no arguments so every hyperparameter keeps its default value
clf = RandomForestClassifier()

# Show the full hyperparameter dictionary (deep=True is get_params' default)
clf.get_params(deep=True)

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

We're going to adjust:

* max_depth
* max_features
* min_samples_leaf
* min_samples_split
* n_estimators

In [4]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_preds(y_true, y_preds):
    """
    Compare y_true labels against y_preds labels on a classification problem.

    Computes accuracy, precision, recall and F1 with the scikit-learn metric
    functions, prints each score, and returns them rounded to 2 decimal places.

    Parameters
    ----------
    y_true : the true labels.
    y_preds : the predicted labels, scored against y_true.

    Returns
    -------
    dict
        Keys "accuracy", "precision", "recall" and "f1", each mapped to the
        corresponding score rounded to 2 decimal places.
    """
    # Metric name -> scoring function; insertion order fixes the key order of the result
    scorers = {"accuracy": accuracy_score,
               "precision": precision_score,
               "recall": recall_score,
               "f1": f1_score}
    scores = {name: scorer(y_true, y_preds) for name, scorer in scorers.items()}

    # Accuracy is reported as a percentage, the other scores as raw values
    print(f"Accuracy: {scores['accuracy'] * 100:.2f}%")
    print(f"Precision: {scores['precision']:.2f}")
    print(f"Recall: {scores['recall']:.2f}")
    print(f"F1 Score:{scores['f1']:.2f}")

    return {name: round(value, 2) for name, value in scores.items()}

In [5]:
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

# Shuffle the data (frac=1 returns every row, in random order)
heart_disease_shuffled = heart_disease.sample(frac=1)

# Split into X & y
X = heart_disease_shuffled.drop('target', axis=1)
y = heart_disease_shuffled['target']

# Split the data into train, validation & test sets (70% / 15% / remainder)
train_split = round(0.7 * len(heart_disease_shuffled)) # position where the training rows end
valid_split = round(train_split + 0.15 * len(heart_disease_shuffled)) # position where the validation rows end

# Slice with .iloc so the split is explicitly positional: after shuffling, the
# index labels no longer line up with row positions, and bare [] slicing leaves
# it to pandas to decide between label- and position-based lookup.
X_train, y_train = X.iloc[:train_split], y.iloc[:train_split]
X_valid, y_valid = X.iloc[train_split:valid_split], y.iloc[train_split:valid_split]
X_test, y_test = X.iloc[valid_split:], y.iloc[valid_split:]

# Sanity check: the three splits must partition the shuffled data exactly
assert len(X_train) + len(X_valid) + len(X_test) == len(X)
assert len(y_train) + len(y_valid) + len(y_test) == len(y)

# Initialise Model

clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)

# Make baseline predictions

y_preds = clf.predict(X_valid)

# Evaluate the classifier on validation set

baseline_metrics = evaluate_preds(y_valid, y_preds)

Accuracy: 82.22%
Precision: 0.81
Recall: 0.88
F1 Score:0.85


In [8]:
np.random.seed(42)

# Fit a second classifier on the same training split.
# NOTE(review): n_estimators=100 is the same value the baseline classifier was
# given, so no hyperparameter actually differs here — change one if this cell is
# meant to be a tuning experiment.
clf_2 = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)

# Predict on the validation set with the second classifier
y_preds_2 = clf_2.predict(X_valid)

# Evaluate the 2nd classifier
clf_2_metrics = evaluate_preds(y_valid, y_preds_2)

Accuracy: 82.22%
Precision: 0.84
Recall: 0.84
F1 Score:0.84


In [13]:
np.random.seed(42)

# Create a third classifier: same number of trees, but with max_depth set to 10
clf_3 = RandomForestClassifier(max_depth=10,
                               n_estimators=100).fit(X_train, y_train)

# Predict on the validation set with the third classifier
y_preds_3 = clf_3.predict(X_valid)

# Evaluate the 3rd classifier
clf_3_metrics = evaluate_preds(y_valid, y_preds_3)

Accuracy: 80.00%
Precision: 0.81
Recall: 0.84
F1 Score:0.82
