In [1]:
import numpy as np
import pandas as pd
mnist_test=pd.read_csv("mnist_test.csv")
mnist_train=pd.read_csv("mnist_train.csv")

In [2]:
mnist_test=np.array(mnist_test)
mnist_test=mnist_test[mnist_test[:,0].argsort(kind='quicksort')]
mnist_train=np.array(mnist_train)
mnist_train=mnist_train[mnist_train[:,0].argsort(kind='quicksort')]
X_test=mnist_test[:,1:]
y_test=mnist_test[:,0]
X_train=mnist_train[:,1:]
y_train=mnist_train[:,0]

In [3]:
import matplotlib as mpl
import matplotlib.pyplot as plt
some_digit = X_train[36000]
some_digit_image = some_digit.reshape(28, 28)  #each image is 28x28=784px
plt.imshow(some_digit_image, cmap = mpl.cm.binary,
           interpolation="nearest")
plt.axis("off")

plt.show()

<Figure size 640x480 with 1 Axes>

In [4]:
y_train[36000]

5

In [5]:
#shuffling oh yeah (some algos perform poorly when there are similar instances in a row)
shuffle_index=np.random.permutation(59999)
X_train, y_train=X_train[shuffle_index], y_train[shuffle_index]

In [6]:
#training a binary classifier to only figure out if the digit is 5 or not
y_train_5=(y_train==5)  #true for 5, false for other digits
y_test_5=(y_test==5)

In [7]:
#training a SGD classifier
from sklearn.linear_model import SGDClassifier
sgd_classifier=SGDClassifier(random_state=42)
sgd_classifier.fit(X_train,y_train_5)  #to guess which elements of X_train are 5



SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=42, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [8]:
sgd_classifier.predict([some_digit])

array([ True])

In [9]:
#cross-validation
from sklearn.model_selection import StratifiedKFold  #performs stratified sampling, gives a representative ratio of instances for each stratum
from sklearn.base import clone
skfolds=StratifiedKFold(n_splits=3, random_state=42)
for train_index, test_index in skfolds.split(X_train,y_train_5):
    clone_clf=clone(sgd_classifier)  #deep copy of the model in an estimator
    X_train_folds=X_train[train_index]
    y_train_folds=y_train_5[train_index]
    X_test_fold=X_train[test_index]
    y_test_fold=y_train_5[test_index]
    
    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred=clone_clf.predict(X_test_fold)
    n_correct=sum(y_pred==y_test_fold)
    print(n_correct/len(y_pred))



0.9639




0.9654




0.9621481074053703


#### Kfold trains the model on all k-1 folds (not the one it's making predictions on), happens for each fold



In [10]:
from sklearn.model_selection import cross_val_score  #doing the same as above
cross_val_score(sgd_classifier, X_train, y_train_5, cv=3, scoring="accuracy")
#BUT accuracy is generally not the preferred performance measure for classifiers,
#especially when dealing with skewed datasets
#because only about 10% of the images are 5s, so if
#you always guess that an image is not a 5, you will be right about 90% of the time



array([0.9639    , 0.9654    , 0.96214811])

### Confusion matrix-a much better way to evaluate performance of classifier
##### true negatives | false positives
##### -------------------------------------------
##### false negatives| true positives

#### To compute cm, must have a set of predictions to be compared to targets:

In [11]:
from sklearn.model_selection import cross_val_predict
y_train_pred=cross_val_predict(sgd_classifier, X_train, y_train_5, cv=3)
#cross_val_predict() performs K-fold cross-validation,
#but instead of returning the evaluation scores, 
#it returns the predictions made on each test fold
#clean prediction-made by a model that never saw the data during training



In [12]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train_5, y_train_pred) 
#each row: actual class
#each column: predicted class

array([[54070,   509],
       [ 1662,  3758]], dtype=int64)

#### Accuracy of positive predictions-precision of classifier
## precision=tp/(tp+fp)
#### recall-sensitivity/true positive rate(TPR)-ratio of positive instances that are correctly detected by the classifier
## recall=tp/(tp+fn)

<img src="pic.png">

### precision = how many positives have been well predicted out of all predicted positives
### recall = how many positives have been well predicted out of all actual positives

In [13]:
#takes y_true, y_pred
from sklearn.metrics import precision_score, recall_score
precision_score(y_train_5, y_train_pred)

0.880712444340286

In [14]:
recall_score(y_train_5, y_train_pred)

0.6933579335793358

#### f1 score= precision and recall together
##### it is a harmonic mean of precision and recall. Regural mean treats all values equally, and harmonic mean gives more weight to low values --> a classifier will only get high f1 score if both precision and recall are high

### f1=2/(1/precision+1/recall)=
### = 2* precision*recall/(precision+recall)=
### = tp/(tp+(fn+fp)/2)

In [15]:
from sklearn.metrics import f1_score
f1_score(y_train_5, y_train_pred)
#not always cool to have high f1 score(safe videos for kids, shoplifters)
#increasing precision reduces recall, and vice versa = precision/recall tradeoff

0.7758852069784248

### sklearn APIs:
### https://scikit-learn.org/stable/modules/classes.html#

<img src="ml_map.png">