## Basic ML Classification
This notebook is based on `Chapter 3 - Classification` of Hands-On ML. The book uses the standard MNIST dataset; this notebook applies the same workflow to the Kannada-MNIST dataset.

In [None]:
# common imports
import sys
import sklearn
import numpy as np
import os
import pandas as pd
from pathlib import Path

# Fix NumPy's global random seed so the shuffling and sampling below are reproducible
SEED = 42
np.random.seed(SEED)

#figures
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib as mpl
# Default font sizes for axis labels and tick labels (could also come from a style file)
mpl.rcParams.update({
    'axes.labelsize': 12,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
})

In [None]:
# Directory holding the Kannada-MNIST input files; print each entry it contains
path = Path('../input/Kannada-MNIST/')
for entry in path.iterdir():
    print(entry)

In [None]:
# Read the training CSV into a DataFrame and preview the first rows
train_csv = path / 'train.csv'
df = pd.read_csv(train_csv, low_memory=False)
df.head()

In [None]:
# Split the frame into NumPy arrays: column 0 holds the labels,
# every remaining column is a feature (reshaped to 28x28 images further down)
y = df.iloc[:, 0].to_numpy()
x = df.iloc[:, 1:].to_numpy()
y[:5]

In [None]:
# Distinct class labels present in the dataset (the label values themselves, not just how many there are)
np.unique(y)

In [None]:
# Shapes of the feature matrix and the label vector.
# Each row of x is reshaped to a 28x28 image below, i.e. the same image layout as MNIST.
x.shape, y.shape

In [None]:
# View one randomly chosen digit from the first 100 samples
# The digits aren't very clear
sample_idx = np.random.randint(0, 100)
digit = x[sample_idx].reshape(28, 28)
plt.imshow(digit, cmap=mpl.cm.binary, interpolation='nearest')
plt.axis("off")
plt.show()

In [None]:
# Plotting a stack of images as a grid with `row_size` images per row
def plot_digits(images, row_size):
    """Tile equally sized 2-D images into one grid image and draw it on the current axes.

    Parameters
    ----------
    images : array of shape (n_images, height, width)
        Stack of images to display.
    row_size : int
        Number of images placed side by side in each grid row.

    Notes
    -----
    When ``n_images`` is not a multiple of ``row_size`` the last row is
    padded with blank (all-zero) tiles, so no image is silently dropped and
    a stack with fewer than ``row_size`` images no longer fails. An empty
    stack leaves the axes blank instead of raising.
    """
    images = np.asarray(images)
    n_images = images.shape[0]
    if n_images == 0:
        # Nothing to tile (e.g. an empty error subset): show an empty panel.
        plt.axis("off")
        return

    # Ceiling division: a partially filled last row still counts as a row.
    n_rows = -(-n_images // row_size)
    n_missing = n_rows * row_size - n_images
    if n_missing:
        # Blank tiles complete the last row so every row has the same width.
        blank = np.zeros((n_missing,) + images.shape[1:], dtype=images.dtype)
        images = np.concatenate([images, blank], axis=0)

    row_images = []
    for row in range(n_rows):
        rimages = images[row * row_size : (row + 1) * row_size]
        # Join the images of one row horizontally.
        row_images.append(np.concatenate(rimages, axis=1))
    # Stack the rows vertically into the final grid.
    image = np.concatenate(row_images, axis=0)
    plt.imshow(image, cmap=mpl.cm.binary)
    plt.axis("off")

In [None]:
# The plot of images shows that the labels are in sequential order,
# so the data needs to be randomized before it is passed to a model
example_images = x[:100].reshape(-1, 28, 28)
plt.figure(figsize=(9,9))
plot_digits(example_images, 10)
plt.show()


In [None]:
# Shuffle the data: draw one random permutation of the row indices
# and apply it to features and labels alike so they stay aligned
indices = np.random.permutation(len(x))
x, y = x[indices], y[indices]

In [None]:
# After shuffling, the first 100 samples are no longer ordered by label
example_images = x[:100].reshape(-1, 28, 28)
plt.figure(figsize=(9,9))
plot_digits(example_images, 10)
plt.show()

In [None]:
# Train-Test Split
# A stratified split keeps the same class proportions in both the train and the test set
from sklearn.model_selection import StratifiedShuffleSplit
ss = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=42)
# n_splits=1, so the splitter yields a single (train, test) pair of index arrays
train_ind, test_ind = next(ss.split(x, y))
x_train, y_train = x[train_ind], y[train_ind]
x_test, y_test = x[test_ind], y[test_ind]
print("Train Shape:", x_train.shape)
print("Test Shape:", x_test.shape)

In [None]:
# After the stratified train/test split the class proportions should remain the same
counts_all = np.unique(y, return_counts=True)[1]
counts_train = np.unique(y_train, return_counts=True)[1]
print("Proportion of classes in original data:", counts_all / len(y))
print("Proportion of classes in training data:", counts_train / len(y_train))

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from sklearn.linear_model import SGDClassifier
# With hinge loss (the SGDClassifier default, spelled out here) the model is a linear SVM trained by SGD
sgd = SGDClassifier(loss="hinge", n_jobs=-1, random_state=42)

### Confusion Matrix for complete dataset

In [None]:
# 5-fold cross-validated predictions for every sample, used for the confusion matrix on the complete data
from sklearn.model_selection import cross_val_predict
y_pred = cross_val_predict(estimator=sgd, X=x, y=y, cv=5)

In [None]:
def plot_confusion_matrix(df, col=plt.cm.gray):
    """Draw a matrix as a colour-coded image with a colour bar.

    Parameters
    ----------
    df : 2-D array-like
        Matrix to display (e.g. a confusion matrix).
    col : matplotlib colormap, optional
        Colormap used for the cells; defaults to grayscale.
    """
    # New 6x6-inch figure with a single axes
    fig, ax = plt.subplots(figsize=(6, 6))
    heatmap = ax.matshow(df, cmap=col)
    fig.colorbar(heatmap)

In [None]:
# The SGD classifier works well: very few samples end up off the diagonal (wrongly predicted)
conf_mx = confusion_matrix(y, y_pred)
plot_confusion_matrix(conf_mx, col=plt.cm.plasma)
plt.show()

### Binary classifier for Precision Recall Curve

In [None]:
# Set up a binary classifier, since a single precision-recall curve can't be plotted for the multiclass problem
# As in the book, the positive class is the digit 5
y_5 = y == 5
# Cross-validated decision-function scores (not class predictions) for the 5-vs-rest task
y_pred_5 = cross_val_predict(estimator=sgd, X=x, y=y_5, cv=5, method="decision_function")

In [None]:
from sklearn.metrics import precision_recall_curve
# Precision and recall of the 5-vs-rest scores at each candidate decision threshold.
# The plotting code below drops the last element of precisions/recalls to line them up with thresholds.
precisions, recalls, thresholds = precision_recall_curve(y_5, y_pred_5)

In [None]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds, xlim=(-50000, 50000)):
    """Plot precision and recall as functions of the decision threshold.

    Parameters
    ----------
    precisions, recalls : array-like
        Curve values; the last element of each is dropped so the arrays
        line up with `thresholds`.
    thresholds : array-like
        Decision thresholds used for the x axis.
    xlim : tuple of (float, float), optional
        Lower and upper limit of the threshold axis. The default keeps the
        previously hard-coded range; pass e.g.
        ``(thresholds.min(), thresholds.max())`` when the scores of the
        model at hand live on a different scale.
    """
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision", linewidth=2)
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall", linewidth=2)
    plt.legend(loc="center right", fontsize=16)
    plt.xlabel("Threshold", fontsize=16)
    plt.grid(True)
    # y axis is fixed to [0, 1] because both curves are proportions
    plt.axis([xlim[0], xlim[1], 0, 1])

In [None]:
# First index at which precision reaches 90%, looked up once and shared by both lookups.
# NOTE(review): precisions is sliced with [:-1] when plotted against thresholds, so this
# index could in principle point past the end of thresholds - confirm that cannot happen here.
idx_90_precision = np.argmax(precisions >= 0.90)
recall_90_precision = recalls[idx_90_precision]
threshold_90_precision = thresholds[idx_90_precision]

plt.figure(figsize=(8, 4))
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
# Dotted red guides: vertical line at the chosen threshold, then horizontal lines
# at 90% precision and at the recall obtained for that threshold
guide_lines = (
    ([threshold_90_precision, threshold_90_precision], [0., 0.9]),
    ([-50000, threshold_90_precision], [0.9, 0.9]),
    ([-50000, threshold_90_precision], [recall_90_precision, recall_90_precision]),
)
for guide_x, guide_y in guide_lines:
    plt.plot(guide_x, guide_y, "r:")
# Red dots on the precision and recall curves at the chosen threshold
for marker_y in (0.9, recall_90_precision):
    plt.plot([threshold_90_precision], [marker_y], "ro")
plt.show()

In [None]:
# We usually select the point just before the sharp drop, i.e. recall=0.9 and precision=0.95
# Our model works well as the precision-recall curve is steep; the PR curve is also the
# more informative one here because the 5-vs-rest data is imbalanced
def plot_precision_vs_recall(precisions, recalls):
    """Plot precision (y axis) against recall (x axis) on the current axes.

    Both axes are fixed to the range [0, 1] and a grid is drawn.
    """
    ax = plt.gca()
    ax.plot(recalls, precisions, "b-", linewidth=2)
    ax.set_xlabel("Recall", fontsize=16)
    ax.set_ylabel("Precision", fontsize=16)
    ax.axis([0, 1, 0, 1])
    ax.grid(True)

# Operating point to highlight: the point on the curve that still keeps recall >= 90%
# at the highest threshold. precision_recall_curve returns recalls in non-increasing
# order, so that is the last index satisfying the target. Reading the coordinates from
# the arrays (instead of hard-coding them) keeps the marker on the curve.
target_recall = 0.90
idx_target = np.flatnonzero(recalls >= target_recall)[-1]
recall_at_target = recalls[idx_target]
precision_at_target = precisions[idx_target]

plt.figure(figsize=(8, 6))
plot_precision_vs_recall(precisions, recalls)
# Dotted guides from the axes to the highlighted point, then the point itself
plt.plot([recall_at_target, recall_at_target], [0., precision_at_target], "r:")
plt.plot([0.0, recall_at_target], [precision_at_target, precision_at_target], "r:")
plt.plot([recall_at_target], [precision_at_target], "ro")
plt.show()

### ROC curve

In [None]:
from sklearn.metrics import roc_curve
# true positive rate is recall
# The ROC thresholds get their own name so they do not overwrite `thresholds`,
# which belongs to the precisions/recalls arrays of the precision-recall cells;
# re-running those cells after this one would otherwise pair mismatched arrays.
fpr, tpr, roc_thresholds = roc_curve(y_5, y_pred_5)

In [None]:
# Here we want our curve to be as steep as possible initially
def plot_roc_curve(fpr, tpr, label=None):
    """Plot a ROC curve (true positive rate against false positive rate).

    A dashed diagonal from (0, 0) to (1, 1) is drawn as a reference line and
    both axes are fixed to [0, 1]. `label` is passed on as the curve's label.
    """
    ax = plt.gca()
    ax.plot(fpr, tpr, linewidth=2, label=label)
    ax.plot([0, 1], [0, 1], 'k--')  # dashed diagonal
    ax.axis([0, 1, 0, 1])
    ax.set_xlabel('False Positive Rate (Fall-Out)', fontsize=16)
    ax.set_ylabel('True Positive Rate (Recall)', fontsize=16)
    ax.grid(True)

# Highlight the ROC point that corresponds to the 90%-precision operating point
# chosen in the precision/recall section: the first point whose true positive rate
# (= recall) reaches recall_90_precision. The coordinates are read from fpr/tpr so
# the marker lies on this model's curve rather than at values copied from another run.
idx_roc_90 = np.argmax(tpr >= recall_90_precision)
fpr_90, tpr_90 = fpr[idx_roc_90], tpr[idx_roc_90]

plt.figure(figsize=(8, 6))
plot_roc_curve(fpr, tpr)
# Dotted guides from the axes to the highlighted point, then the point itself
plt.plot([fpr_90, fpr_90], [0., tpr_90], "r:")
plt.plot([0.0, fpr_90], [tpr_90, tpr_90], "r:")
plt.plot([fpr_90], [tpr_90], "ro")
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

# Area under the ROC curve for the 5-vs-rest decision scores
roc_auc_score(y_5, y_pred_5)

### Error Analysis for multiclass classification

In [None]:
# Normalise the confusion matrix row by row: every entry is divided by the total
# of its row, turning absolute counts into per-row rates
conf_sum = confusion_matrix(y, y_pred)
row_sums = np.sum(conf_sum, axis=1, keepdims=True)
norm_conf = np.true_divide(conf_sum, row_sums)

In [None]:
# Zero the diagonal in place so that only the off-diagonal entries (the errors) stand out in the plot
norm_conf[np.diag_indices_from(norm_conf)] = 0
plt.matshow(norm_conf, cmap=plt.cm.gray)
plt.show()

In [None]:
# Split the samples of classes 3 and 7 by (true label, predicted label):
# X_ab holds samples whose true class is a and whose predicted class is b
cl_3, cl_7 = 3, 7
true_3, true_7 = y == cl_3, y == cl_7
pred_3, pred_7 = y_pred == cl_3, y_pred == cl_7
X_33 = x[true_3 & pred_3]
X_37 = x[true_3 & pred_7]
X_73 = x[true_7 & pred_3]
X_77 = x[true_7 & pred_7]

In [None]:
# 3 and 7 look similar: 3 has its rounding in the lower part, 7 in the upper part
# 2x2 subplot grid: rows are the true class (3, 7), columns the predicted class (3, 7)
plt.figure(figsize=(8,8))
panels = ((221, X_33), (222, X_37), (223, X_73), (224, X_77))
for position, subset in panels:
    plt.subplot(position)
    # Show up to 25 samples of each group, 5 per row
    plot_digits(subset[:25].reshape(-1, 28, 28), 5)
plt.show()