In [55]:
#initialize

import sklearn
from sklearn import tree
from scipy import stats
# import the necessary packages
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# Downloading the dataset
try:
    from sklearn.datasets import fetch_openml
    mnist = fetch_openml('mnist_784', version=1, cache=True)
    mnist.target = mnist.target.astype(np.int8) 
except ImportError:
    from sklearn.datasets import fetch_mldata
    mnist = fetch_mldata('MNIST original')
mnist["data"], mnist["target"]

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([5, 0, 4, ..., 4, 5, 6], dtype=int8))

In [56]:
# 75 / 25 Train-Test Split
(train_data, test_data, train_labels, test_labels) = train_test_split(mnist["data"][:5000], mnist["target"][:5000], test_size=0.2)

In [57]:
# 90 / 10 Train / Validation Split
(train_data, validation_data, train_labels, validation_labels) = train_test_split(train_data, train_labels,
test_size=0.1, random_state=84)

#SPLIT ALLOCATION
print("Split Allocation ---------------------")
print("training: {}".format(len(train_labels)))
print("validation: {}".format(len(validation_labels)))
print("testing: {}".format(len(test_labels)))

Split Allocation ---------------------
training: 3600
validation: 400
testing: 1000


## Finding 9's using K- Nearest Neighbor

In [58]:
# initialize the values of k
# list of accuracies for each value of k
k_val = range(1, 30)
accuracies = []
 
# loop over various values of k
for k in range(1, 4):
# train the k-Nearest Neighbor classifier with k
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(train_data, train_labels)
 
# evaluation of model
    score = model.score(validation_data, validation_labels)
    print("k=%d, accuracy=%.2f%%" % (k, score * 100))
    accuracies.append(score)
 
# highest accuracy
i = int(np.argmax(accuracies))
print("k=%d achieved highest accuracy of %.2f%% on validation data" % (k_val[i],
accuracies[i] * 100))

k=1, accuracy=95.00%
k=2, accuracy=94.50%
k=3, accuracy=93.75%
k=1 achieved highest accuracy of 95.00% on validation data


In [59]:
# test data
model = KNeighborsClassifier(n_neighbors=k_val[i])
model.fit(train_data, train_labels)
k_nearest_predictions = model.predict(test_data)
 

In [60]:
# classification report for each digit
print("K Nearest Score")
print(classification_report(test_labels, k_nearest_predictions))

K Nearest Score
              precision    recall  f1-score   support

           0       0.95      1.00      0.97        94
           1       0.90      0.98      0.94       114
           2       0.98      0.88      0.93       111
           3       0.91      0.93      0.92       100
           4       0.94      0.91      0.93       115
           5       0.95      0.90      0.92        87
           6       0.92      0.96      0.94        83
           7       0.93      0.92      0.93       106
           8       0.95      0.88      0.92        86
           9       0.87      0.90      0.89       104

   micro avg       0.93      0.93      0.93      1000
   macro avg       0.93      0.93      0.93      1000
weighted avg       0.93      0.93      0.93      1000



In [61]:
#how many 9's did I find
est= np.round(sum(x == 9 for x in k_nearest_predictions)*.87,0)
print("correct predictions: {}".format(sum(x == 9 for x in k_nearest_predictions)*.87))
print("actual #: {}".format(sum(x == 9 for x in test_labels)))

print("about "+format(est)+" nines!")

correct predictions: 93.96
actual #: 104
about 94.0 nines!


### Finding 9's Using Decision Tree



In [62]:
# initialize the values of d
# list of accuracies for each value of d
max_i = []

for i in range (0,20):
    depth_val = range(1, 20)
    tree_accuracies = []
    # loop over various values of d
    for k in range(1, 20):
    # train the decision-tree classifier with depth
        model = tree.DecisionTreeClassifier(max_depth=k)
        model.fit(train_data, train_labels)
 
    # evaluation of model
        score = model.score(validation_data, validation_labels)
        #print("k=%d, accuracy=%.2f%%" % (k, score * 100))
        tree_accuracies.append(score)
 
    # highest accuracy
    i = int(np.argmax(tree_accuracies))
    #print("k=%d achieved highest accuracy of %.2f%% on validation data" % (depth_val[i],
    #tree_accuracies[i] * 100))
    max_i.append(i)
depth = max(set(max_i), key=max_i.count)

    



In [63]:
# test data
model = tree.DecisionTreeClassifier(max_depth=depth)
model.fit(train_data, train_labels)
tree_predictions = model.predict(test_data)
 

In [64]:
# classification report for each digit
print("Decision Tree Score")
print(classification_report(test_labels, tree_predictions))
print("Accuracy:",metrics.accuracy_score(test_labels, tree_predictions))


Decision Tree Score
              precision    recall  f1-score   support

           0       0.87      0.95      0.91        94
           1       0.88      0.83      0.86       114
           2       0.72      0.73      0.72       111
           3       0.69      0.72      0.71       100
           4       0.84      0.76      0.80       115
           5       0.78      0.64      0.70        87
           6       0.81      0.84      0.83        83
           7       0.82      0.86      0.84       106
           8       0.70      0.74      0.72        86
           9       0.76      0.81      0.79       104

   micro avg       0.79      0.79      0.79      1000
   macro avg       0.79      0.79      0.79      1000
weighted avg       0.79      0.79      0.79      1000

Accuracy: 0.789


In [65]:
#how many 9's did I find
est_tree = np.round(sum(x == 9 for x in tree_predictions)*.69,0)
print("correct predictions: {}".format(sum(x == 9 for x in tree_predictions)*.69))
print("actual #: {}".format(sum(x == 9 for x in test_labels)))

print("about "+format(est_tree)+" nines!")

correct predictions: 75.89999999999999
actual #: 104
about 76.0 nines!


### Finding 9's Using Random Forest



In [66]:
depth_val = range(1, 20)
forest_accuracies = []
    # loop over various values of d
for k in range(1, 20):
    # train the random forest classifier with depth
    model = RandomForestClassifier(n_estimators=100,max_depth=k)
    model.fit(train_data, train_labels)
 
    # evaluation of model
    score = model.score(validation_data, validation_labels)
    #print("k=%d, accuracy=%.2f%%" % (k, score * 100))
    forest_accuracies.append(score)
 
# highest accuracy
i = int(np.argmax(forest_accuracies))
print("k=%d achieved highest accuracy of %.2f%% on validation data" % (depth_val[i],
forest_accuracies[i] * 100))



k=13 achieved highest accuracy of 95.50% on validation data


In [68]:
#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=100,max_depth=depth_val[i])

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(train_data,train_labels)

random_forest_predictions=clf.predict(test_data)



In [None]:
# classification report for each digit
print("Random Forest Score")
print(classification_report(test_labels, random_forest_predictions))
print("Accuracy:",metrics.accuracy_score(test_labels, random_forest_predictions))


In [None]:
#how many 9's did I find
est= np.round(sum(x == 9 for x in random_forest_predictions)*.88,0)
print("correct predictions: {}".format(sum(x == 9 for x in random_forest_predictions)*.88))
print("actual #: {}".format(sum(x == 9 for x in test_labels)))

print("about "+format(est)+" nines!")