In [1]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1)
mnist.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'details', 'categories', 'url'])

In [2]:
#datasets loaded by sklearn generally have descr key, data key and target key

X, y = mnist["data"], mnist["target"]
#each picture has 784 features because each picture is 28x28 pixels
print(X.shape)
print(y.shape)

(70000, 784)
(70000,)


In [3]:
import matplotlib as mpl
import matplotlib.pyplot as plt

some_digit = X[0]
some_digit_image = some_digit.reshape(28, 28)
# Reshape feature vector into a 28 by 28 array

plt.imshow(some_digit_image, cmap = mpl.cm.binary, interpolation="nearest")
plt.axis("off")
plt.show()

<Figure size 640x480 with 1 Axes>

In [4]:
import numpy as np
y = y.astype(np.uint8)

In [5]:
#always create a ttest set and set it aside before inspecting the data closely, we might end up doing some bias shi
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

In [6]:
# this is for the simple 5 classifier that checks if it is 5 or not 5

y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)

from sklearn.linear_model import SGDClassifier

sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train_5)

#sgd relies on randomness during training henc the name stochastic, if u want reproducible results
#u should set the random_state param

sgd_clf.predict([some_digit])

# note that shuffling for cross-validation doesnt always work well esp if you are manipulating time series data
# like stock prices 

array([ True])

In [7]:

# implementing a custom cross-validation function that works just like cross_val_score() function of sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

skfolds = StratifiedKFold(n_splits=3, random_state=42)
for train_index, test_index in skfolds.split(X_train, y_train_5):
    clone_clf = clone(sgd_clf)
    X_train_folds = X_train[train_index]
    y_train_folds = y_train_5[train_index]
    X_test_fold = X_train[test_index]
    y_test_fold = y_train[test_index]
    
    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    n_correct = sum(y_pred == y_test_fold)
    print(n_correct / len(y_pred))

0.09925
0.09675
0.10035


In [8]:
from sklearn.model_selection import cross_val_score

cross_val_score(sgd_clf, X_train, y_train_5, cv=3, scoring="accuracy")

#accuracy is generally not the preferred performance measure for classifiers, esp when dealing with
# skewed datasets (some claasses are more frequent than others)
# a much better way to evaluate the performance of a classifier is a confusion matrix

array([0.95035, 0.96035, 0.9604 ])

In [9]:
from sklearn.model_selection import cross_val_predict

#performs a stratifiedkfold except this function returns a clean prediction for each instance in the training
# set
#clean meaning the prediction was made bya model that never saw the data during training

y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3)




In [10]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train_5, y_train_pred)
#each row is an actual class, and each column is a predicted class
#a perfect classifier, which is most likely overfitted, would have onon zero values on the main diag only
# has only true positives and true negatives kasi


array([[53892,   687],
       [ 1891,  3530]], dtype=int64)

In [11]:
#so far we have cross validation, confusion matrix and precision and recall (f1 score) as performance measures
#true positive rate is ratio of positive instances from correctly detected instances, this is also called recall
#precision = tp / (tp + fp)
#recall = tp / (tp+ fn)

from sklearn.metrics import precision_score, recall_score

print(precision_score(y_train_5, y_train_pred))
print(recall_score(y_train_5, y_train_pred))

0.8370879772350012
0.6511713705958311


In [12]:
#the f1 score is the harmonic mean of precision and recall
#f1 gives more weight to low values, thus precision and recall have to be both high
#although f1 favors classifiers w similar precision and recal, this isnt always what u want
from sklearn.metrics import f1_score
f1_score(y_train_5, y_train_pred)


0.7325171197343846

In [16]:
#sklearn does not let you set the threshold directly

#instead of using predict(), use this to return a score for each instance

#raising threshold decreases recall and increases precision. vice versa hehe

y_scores = sgd_clf.decision_function([some_digit])
print(y_scores)
threshold = 0
y_some_digit_pred = (y_scores > threshold)
print(y_some_digit_pred)

[2164.22030239]
[ True]


In [None]:
#which threshold would be best for this use case?

y_scores = cross_val_predict(sgd_clf, X_train, y_train, cv=3, method="decision_function")
#spits out the scores per instance

from sklearn.metrics import precision_recall_curve

precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)

In [None]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
    
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.show()

#you can plot precision and recall against threshold to see which is best
#you may also directly plot precision against recall for custom usefulness on certain projects
#e.g. i have a 90% precision classifier



In [20]:
#the roc curve is another common tool for binary classifiers
#it plots the true positive rate against the false positive rate
#fpr is = 1 - tn
#btw true negative rate is also called specificity and tpr is also called sensitivity

from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)

def plot_roc_curve(fpr, tpr, label=None):
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0,1], [0, 1], 'k--')
    
plot_roc_curve(fpr, tpr)
plt.show()

ValueError: Found input variables with inconsistent numbers of samples: [1, 60000]