# Classification

In [12]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1)

In [13]:
mnist.keys()

dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'DESCR', 'details', 'categories', 'url'])

In [14]:
X, y = mnist['data'], mnist['target']

In [15]:
X.shape

(70000, 784)

In [16]:
y.shape

(70000,)

In [17]:
import matplotlib.pyplot as plt
import numpy as np

y = y.astype(np.uint8)

In [18]:
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

## Binary Classifier

To predict 5 or not

In [19]:
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)

In [20]:
from sklearn.linear_model import SGDClassifier

sgd_classifier = SGDClassifier(random_state=42)
#sgd_classifier.fit(X_train, y_train_5)

In [21]:
sgd_classifier.predict([X[0]])

NotFittedError: This SGDClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

## Performance Measures

In [22]:
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

skfolds = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)

In [23]:
for train_index, test_index, in skfolds.split(X_train, y_train_5):
    clone_classifier = clone(sgd_classifier)
    X_train_folds = X_train[train_index]
    y_train_folds = y_train_5[train_index]
    X_test_fold = X_train[test_index]
    y_test_fold = y_train_5[test_index]
    
    clone_classifier.fit(X_train_folds, y_train_folds)
    y_pred = clone_classifier.predict(X_test_fold)
    n_correct = sum(y_pred == y_test_fold)
    print(n_correct / len(y_pred))

0.9669
0.91625
0.96785


In [24]:
from sklearn.model_selection import cross_val_predict

y_train_pred = cross_val_predict(sgd_classifier, X_train, y_train_5, cv=3)

y_train_pred

array([ True, False, False, ...,  True, False, False])

In [25]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [26]:
precision_score(y_train_5, y_train_pred)

0.8370879772350012

In [27]:
recall_score(y_train_5, y_train_pred)

0.6511713705958311

In [28]:
f1_score(y_train_5, y_train_pred)

0.7325171197343846

In [29]:
y_scores = sgd_classifier.decision_function([X[0]])

NotFittedError: This SGDClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [None]:
y_scores

In [None]:
# y_scores = cross_val_predict(sgd_classifier, X_train, y_train_5, cv=3, method='decision_function')
y_scores

In [None]:
y_scores.shape
y_train_5.shape

In [30]:
from sklearn.metrics import precision_recall_curve

precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)

def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision", linewidth=2)
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall", linewidth=2)
    plt.xlabel("Threshold", fontsize=16)
    plt.legend(loc="upper left", fontsize=16)
    plt.ylim([0, 1])
    
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.show()

NameError: name 'y_scores' is not defined

In [None]:
plt.plot(precisions, recalls)

In [None]:
thresholds_90_precision = thresholds[np.argmax(precisions >= 0.90)]
thresholds_90_precision

In [None]:
y_train_pred_90 = (y_scores >= thresholds_90_precision)

In [None]:
precision_score(y_train_5, y_train_pred_90)

In [None]:
recall_score(y_train_5, y_train_pred_90)

In [None]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_train_5, y_scores)

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(random_state=42)

y_proba_forest = cross_val_predict(forest_clf, X_train, y_train_5, cv=3, method='predict_proba')

y_proba_forest

In [None]:
from sklearn.metrics import roc_curve

y_scores_forest = y_proba_forest[:, 1]
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5, y_scores_forest)

In [None]:
def plot_roc_curve(fpr, tpr, label=None):
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)

In [None]:
# plt.plot(fpr_forest, tpr_forest, 'b:', label='SGD')
plot_roc_curve(fpr_forest, tpr_forest, 'Random Forest')
plt.legend(loc='lower right')
plt.show()

In [None]:
roc_auc_score(y_train_5, y_scores_forest)

In [None]:
y_predict_forest = y_proba_forest[:, 1] > 0.5
precision_score(y_train_5, y_predict_forest)

In [31]:
recall_score(y_train_5, y_predict_forest)

NameError: name 'y_predict_forest' is not defined

In [33]:
from sklearn.svm import SVC

svc_classifier = SVC()
#svc_classifier.fit(X_train, y_train)

In [None]:
svc_classifier.predict([X[0]])

In [None]:
scores = svc_classifier.decision_function([X[0]])
scores

In [None]:
svc_classifier.classes_

In [None]:
from sklearn.multiclass import OneVsRestClassifier
ovr_classifier = OneVsRestClassifier(SVC())
ovr_classifier.fit(X_train, y_train)

In [None]:
ovr_classifier.predict([X[0]])

### len(ovr_classifier.estimators_)