In [9]:
# Imports
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [10]:
# Loading dataset
# Pin the OpenML dataset version so a re-run always fetches the same data
# rather than whichever version is currently marked active.
mnist = fetch_openml('mnist_784', version=1)
# View the shape of the dataset
mnist.data.shape

(70000, 784)

In [11]:
# Features and target are taken straight from the fetched dataset
X, y = mnist.data, mnist.target
# Hold out a test set (default split size); the fixed seed keeps the
# partition the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [23]:
def evaluate(model, X_train, X_test, y_train, y_test):
    """
    Print a classification report and a confusion matrix for the test set.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X_train, y_train : training features and labels. Not used for scoring;
        kept in the signature so existing calls keep working.
    X_test, y_test : held-out features and labels that are scored.

    Returns
    -------
    None. Both summaries are written to stdout.
    """
    # Predict once and reuse the labels for both summaries; repeated
    # predict calls are wasted work and can be slow for models such as KNN.
    test_pred = model.predict(X_test)
    print('Classification report: \n', classification_report(y_test, test_pred))
    print(confusion_matrix(y_test, test_pred))

In [18]:
# Preprocessing steps: standardise the features, then reduce dimensionality.
# A fractional n_components asks PCA to keep as many components as needed
# to explain that share of the variance (see the scikit-learn PCA docs).
scaler = StandardScaler()
pca = PCA(n_components=0.95)
# Chain both steps so they can be used as a single transformer
transformer = make_pipeline(scaler, pca)

In [24]:
# First KNN model: the scaling + PCA transformer feeding the classifier
knn1 = make_pipeline(
    transformer,
    KNeighborsClassifier(),
)
# Fit the whole pipeline on the training split
knn1.fit(X_train, y_train)

# Print the test-set metrics for the PCA-based model
evaluate(
    knn1,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)



ValueError: Expected 2D array, got 1D array instead:
array=[3. 2. 8. ... 1. 0. 0.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

---
---

> For this assignment, I did not use the `%%time` magic in VSCode, because its Jupyter notebook extension displays each cell's execution time by default (time taken to execute: 28.2 sec). I commented it out so that you can run it and see the result on your end.
> 
> - I know that Jupyter Notebook is required, but I have dojo-env and Anaconda loaded. Also, I like that VSCode uses my computer's resources, which tend to be faster than client-server IDEs. So, to be honest, I'm going to be stubborn about switching.

---
---

In [20]:
# Creating second KNN model without PCA pipeline
# Use a fresh StandardScaler rather than the shared `scaler` object: that
# instance is also a step inside `transformer` (and therefore knn1), and
# a pipeline does not copy its steps, so fitting knn2 with it would refit
# the scaler that knn1 relies on.
knn2 = make_pipeline(StandardScaler(), KNeighborsClassifier())
knn2.fit(X_train, y_train)

evaluate(knn2, X_train, X_test, y_train, y_test)

Classification report: 
               precision    recall  f1-score   support

           0       0.96      0.98      0.97      1714
           1       0.95      0.99      0.97      1977
           2       0.95      0.93      0.94      1761
           3       0.93      0.94      0.94      1806
           4       0.94      0.93      0.94      1587
           5       0.94      0.93      0.94      1607
           6       0.96      0.97      0.97      1761
           7       0.94      0.93      0.93      1878
           8       0.97      0.89      0.93      1657
           9       0.90      0.92      0.91      1752

    accuracy                           0.94     17500
   macro avg       0.94      0.94      0.94     17500
weighted avg       0.94      0.94      0.94     17500

[[1681    1    5    2    0    8   14    2    1    0]
 [   0 1962    8    1    2    0    1    1    1    1]
 [  18   23 1646   25    7    6   13   11    8    4]
 [   2    6   21 1703    3   16    2   26   14   13]
 [  

---

#### a. Which model performed the best on the test set?



---

#### b. Which model was the fastest at making predictions?



---