# MNIST - Ensemble Learning
In this code exercise we are going to practice creating and tuning different models.

In [1]:
import joblib
import matplotlib.pyplot as plt
import numpy as np

from sklearn.base import clone
from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
# Seed NumPy's legacy global RNG so NumPy-driven randomness in this kernel is repeatable.
# NOTE(review): fits dispatched with n_jobs=-1 may run in worker processes that do not
# inherit this seed — confirm before relying on it for estimator reproducibility.
np.random.seed(42)

In [2]:
# Fetch 'mnist_784' (version 1) from OpenML as plain arrays (as_frame=False).
mnist = fetch_openml(
    'mnist_784',
    version=1,
    cache=True,
    as_frame=False,
)

# Feature matrix, and labels converted to uint8.
X, y = mnist["data"], mnist["target"].astype(np.uint8)

# Small subset to keep the randomized search fast: the first 5000 rows for
# training and the following 1000 rows as a validation slice.
X_train_random, y_train_random = X[:5000], y[:5000]
X_val_random, y_val_random = X[5000:6000], y[5000:6000]


  warn(


In [3]:
# Learn the scaling statistics from the training subset only, then apply the
# same transformation to both the training and the validation subset.
scaler = StandardScaler().fit(X_train_random)
X_random_train_scaled = scaler.transform(X_train_random)
X_random_val_scaled = scaler.transform(X_val_random)

In [4]:
# Base estimators shared by every hyperparameter search in this notebook.
knn = KNeighborsClassifier()
svc = SVC(kernel='linear')
# Seed the forest explicitly: the searches fit candidates with n_jobs=-1, so the
# global np.random.seed() call is not a dependable source of determinism for the
# trees. A fixed random_state makes the reported forest scores repeatable.
random_forest = RandomForestClassifier(random_state=42)

# Search space for the KNN randomized search.
param_dist_knn_rcv = dict(
    n_neighbors=range(1, 21),
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    p=[1, 2],
)

# Search space for the linear SVC: 10 log-spaced values of C from 1e-1 to 1e1.
param_dist_svc_rcv = dict(C=np.logspace(-1, 1, num=10))

# Search space for the random forest. The evenly spaced grids are truncated to
# integers; max_depth additionally allows None.
param_dist_random_forest_rcv = dict(
    n_estimators=np.linspace(start=700, stop=1400, num=20).astype(int).tolist(),
    max_features=['sqrt', 'log2', None],
    max_depth=np.linspace(10, 50, num=5).astype(int).tolist() + [None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    bootstrap=[True, False],
)

In [5]:
# Randomized search for KNN: 10 sampled candidates, 5-fold CV, on the scaled subset.
knn_random_search = RandomizedSearchCV(
    knn,
    param_distributions=param_dist_knn_rcv,
    n_iter=10,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
).fit(X_random_train_scaled, y_train_random)

print(
    "Best hyperparameters for KNeighborsClassifier: ",
    knn_random_search.best_params_,
)
print(
    "Best accuracy for KNeighborsClassifier:",
    knn_random_search.best_score_,
)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best hyperparameters for KNeighborsClassifier:  {'weights': 'distance', 'p': 1, 'n_neighbors': 3, 'algorithm': 'auto'}
Best accuracy for KNeighborsClassifier: 0.9168


In [6]:
# Randomized search for the linear SVC: 10 sampled candidates, 5-fold CV, on the scaled subset.
svc_random_search = RandomizedSearchCV(
    svc,
    param_distributions=param_dist_svc_rcv,
    n_iter=10,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
).fit(X_random_train_scaled, y_train_random)

print(
    "Best hyperparameters for SVC: ",
    svc_random_search.best_params_,
)
print(
    "Best accuracy for SVC:",
    svc_random_search.best_score_,
)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best hyperparameters for SVC:  {'C': 0.1}
Best accuracy for SVC: 0.9062000000000001


In [7]:
# Randomized search for the random forest: 10 sampled candidates, 5-fold CV.
# Unlike KNN and SVC, the forest is fitted on the unscaled subset.
rf_random_search = RandomizedSearchCV(
    random_forest,
    param_distributions=param_dist_random_forest_rcv,
    n_iter=10,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
rf_random_search.fit(X_train_random, y_train_random)

# Plain string literals: the previous f-prefix had no placeholders to format.
print("Best hyperparameters for Random forest classifier: ", rf_random_search.best_params_)
print("Best accuracy for Random forest classifier: ", rf_random_search.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best hyperparameters for Random forest classifier:  {'n_estimators': 1031, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 40, 'bootstrap': False}
Best accuracy for Random forest classifier:  0.9436


In [8]:
# First hold out 10000 samples as the test set, then hold out another 10000
# of the remainder as the validation set; what is left is the training set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=10000,
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=10000,
    random_state=42,
)

# Refit the existing scaler on the new training split only, then apply that
# single fitted transformation to train, validation and test alike.
scaler.fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

In [9]:
# Narrowed KNN grid for the exhaustive search.
param_grid_knn = dict(
    n_neighbors=[1, 3, 5, 10],
    weights=['uniform', 'distance'],
    algorithm=['auto'],
    p=[1],
)

# Three candidate values of C for the linear SVC.
param_grid_svc = dict(C=[0.1, 1, 10])

# Narrowed random-forest grid: only max_depth and n_estimators vary.
param_grid_random_forest = dict(
    bootstrap=[False],
    max_depth=[35, 40, 45],
    max_features=['sqrt'],
    min_samples_leaf=[1],
    min_samples_split=[2],
    n_estimators=[800, 1000, 1200],
)

In [10]:
# Exhaustive grid search for KNN with 3-fold CV on the scaled training split.
search_knn = GridSearchCV(
    knn,
    param_grid=param_grid_knn,
    cv=3,
    verbose=2,
    n_jobs=-1,
)
search_knn.fit(X_train_scaled, y_train)

# Plain string literals: the previous f-prefix had no placeholders to format.
print("Best hyperparameters for KNN classifier: ", search_knn.best_params_)
print("Best accuracy for KNN classifier: ", search_knn.best_score_)


Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best hyperparameters for KNN classifier:  {'algorithm': 'auto', 'n_neighbors': 3, 'p': 1, 'weights': 'distance'}
Best accuracy for KNN classifier:  0.9556600491655113


In [11]:
# Exhaustive grid search for the linear SVC with 3-fold CV on the scaled training split.
search_svc = GridSearchCV(
    svc,
    param_grid=param_grid_svc,
    cv=3,
    verbose=2,
    n_jobs=-1,
)
search_svc.fit(X_train_scaled, y_train)

# Plain string literals: the previous f-prefix had no placeholders to format.
print("Best hyperparameters for SVC classifier: ", search_svc.best_params_)
print("Best accuracy for SVC classifier: ", search_svc.best_score_)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
Best hyperparameters for SVC classifier:  {'C': 0.1}
Best accuracy for SVC classifier:  0.9276800135424148


In [12]:
# Exhaustive grid search for the random forest with 3-fold CV.
# The forest is fitted on the unscaled training split.
search_forest = GridSearchCV(
    random_forest,
    param_grid=param_grid_random_forest,
    cv=3,
    verbose=2,
    n_jobs=-1,
)
search_forest.fit(X_train, y_train)

# Plain string literals: the previous f-prefix had no placeholders to format.
print("Best hyperparameters for random forest classifier: ", search_forest.best_params_)
print("Best accuracy for random forest classifier: ", search_forest.best_score_)


Fitting 3 folds for each of 9 candidates, totalling 27 fits
Best hyperparameters for random forest classifier:  {'bootstrap': False, 'max_depth': 40, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
Best accuracy for random forest classifier:  0.9705800423773115


In [13]:
# Predict the validation split with each tuned search. KNN and SVC receive the
# scaled features they were fitted on; the forest receives the unscaled ones.
y_val_pred_knn = search_knn.predict(X_val_scaled)
y_val_pred_svc = search_svc.predict(X_val_scaled)
y_val_pred_forest = search_forest.predict(X_val)

# Score every model against the same validation labels.
accuracy_knn_val = accuracy_score(y_val, y_val_pred_knn)
accuracy_svc_val = accuracy_score(y_val, y_val_pred_svc)
accuracy_forest_val = accuracy_score(y_val, y_val_pred_forest)

# Report in the order KNN, SVC, random forest, each followed by blank lines.
print("Accuracy KNN val:", accuracy_knn_val)
print("\n")
print("Accuracy SVC val:", accuracy_svc_val)
print("\n")
print("Accuracy random forest val:", accuracy_forest_val)
print("\n")

Accuracy KNN val: 0.9597


Accuracy SVC val: 0.9292


Accuracy random forest val: 0.9745




In [14]:
# Final model: the best forest configuration retrained on train + validation.
# clone() gives an unfitted copy with the same hyperparameters, so
# search_forest.best_estimator_ is NOT refitted in place. Refitting it directly
# would silently change what search_forest.predict() returns, and re-running the
# validation cell would then score a model that has already seen the validation data.
test_model = clone(search_forest.best_estimator_)
test_model.fit(X_train_val, y_train_val)

# Single, final evaluation on the held-out test split (unscaled, like the forest's training data).
y_pred_forest = test_model.predict(X_test)
accuracy_forest = accuracy_score(y_test, y_pred_forest)
print("Accuracy random forest model:", accuracy_forest)





Accuracy random forest model: 0.971


In [15]:


#joblib.dump(test_model, 'mnist_rf_model.joblib')