### 1. A classifier on the MNIST dataset with 97% accuracy on the test data.

In [1]:
from sklearn.datasets import fetch_openml
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [2]:
import warnings

warnings.filterwarnings('ignore')

In [3]:
## Loading the MNIST dataset
# Fetch version 1 of the 'mnist_784' dataset from OpenML.
# as_frame=True pins the return type to pandas objects: the train/test split
# further down indexes the features with `.iloc`, which does not exist on a
# NumPy array, so the notebook should not depend on the library default.
mnist = fetch_openml('mnist_784', version=1, as_frame=True)

In [4]:
# Show which fields the fetched dataset bundle exposes (data, target, metadata, ...).
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [5]:
# Pull the feature table and the label column out of the dataset bundle.
X = mnist["data"]
y = mnist["target"]

# Report the dimensions: features first, then labels.
for part in (X, y):
    print(part.shape)

(70000, 784)
(70000,)


In [6]:
## Train test split
# The first TRAIN_SIZE rows become the training set; every row after that is
# held out for testing. Features and labels are both sliced positionally with
# `.iloc` so the two stay aligned regardless of the index labels.
TRAIN_SIZE = 60000
X_train, X_test = X.iloc[:TRAIN_SIZE], X.iloc[TRAIN_SIZE:]
y_train, y_test = y.iloc[:TRAIN_SIZE], y.iloc[TRAIN_SIZE:]

## Baseline KNN classifier on MNIST dataset (default hyperparameters)
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier()

## Computing accuracy via cross-validation
# 3-fold cross-validation on the training set only; verbose=3 logs the score
# and elapsed time of every fold.
from sklearn.model_selection import cross_val_score
scores = cross_val_score(knn_clf, X_train, y_train, cv=3, verbose=3)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END ................................ score: (test=0.968) total time=  55.6s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   55.7s remaining:    0.0s


[CV] END ................................ score: (test=0.967) total time=  46.6s


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.7min remaining:    0.0s


[CV] END ................................ score: (test=0.968) total time=  46.6s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.5min finished


In [7]:
## Mean accuracy
# Collapse the per-fold cross-validation scores into a single average.
np.mean(scores)

0.9674166666666667

=> We're already getting fairly good accuracy (about 96.7%). Let's try to push it higher by tuning a few hyperparameters with GridSearchCV.

In [8]:
## Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV

# Search space: three neighbour counts crossed with two weighting schemes
# (6 candidate combinations in total).
grid_params = dict(
    n_neighbors=[3, 4, 5],
    weights=['uniform', 'distance'],
)

# Exhaustive grid search; each candidate is scored with 3-fold cross-validation
# and verbose=3 logs every individual fit.
grid_search = GridSearchCV(
    estimator=knn_clf,
    param_grid=grid_params,
    cv=3,
    verbose=3,
)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV 1/3] END ....n_neighbors=3, weights=uniform;, score=0.969 total time=  43.6s
[CV 2/3] END ....n_neighbors=3, weights=uniform;, score=0.968 total time=  45.7s
[CV 3/3] END ....n_neighbors=3, weights=uniform;, score=0.968 total time=  48.4s
[CV 1/3] END ...n_neighbors=3, weights=distance;, score=0.970 total time=  44.0s
[CV 2/3] END ...n_neighbors=3, weights=distance;, score=0.969 total time=  45.0s
[CV 3/3] END ...n_neighbors=3, weights=distance;, score=0.969 total time=  44.9s
[CV 1/3] END ....n_neighbors=4, weights=uniform;, score=0.966 total time=  52.6s
[CV 2/3] END ....n_neighbors=4, weights=uniform;, score=0.966 total time=  51.0s
[CV 3/3] END ....n_neighbors=4, weights=uniform;, score=0.967 total time=  51.7s
[CV 1/3] END ...n_neighbors=4, weights=distance;, score=0.971 total time=  49.6s
[CV 2/3] END ...n_neighbors=4, weights=distance;, score=0.970 total time=  50.0s
[CV 3/3] END ...n_neighbors=4, weights=distance;,

GridSearchCV(cv=3, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': [3, 4, 5],
                         'weights': ['uniform', 'distance']},
             verbose=3)

In [9]:
## best params and best estimator, as selected by the grid search
best_params = grid_search.best_params_
best_knn_clf = grid_search.best_estimator_

print("best_params: ", best_params)

# Trailing expression so the notebook renders the selected estimator.
best_knn_clf

best_params:  {'n_neighbors': 4, 'weights': 'distance'}


KNeighborsClassifier(n_neighbors=4, weights='distance')

In [8]:
# KNN Classifier w best params
# The hyperparameters are written out literally and match the best_params
# reported by the grid search; the model is then fit on the whole training set.
best_knn_clf = KNeighborsClassifier(weights="distance", n_neighbors=4).fit(X_train, y_train)

# Trailing expression so the notebook renders the fitted estimator.
best_knn_clf

KNeighborsClassifier(n_neighbors=4, weights='distance')

In [9]:
# Accuracy of the tuned KNN classifier on the held-out test rows,
# which took no part in cross-validation or fitting.
best_knn_clf.score(X=X_test, y=y_test)

0.9714

=> Whoa, we got what we wanted: 97% accuracy on the test data.

### 3. Tackle the `titanic` dataset.