### Digit Recognition Project

In [None]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

import matplotlib.pyplot as plt

### Load data

In [None]:
# Labeled data: the images and the digit shown in each of them.
labeled_images, labeled_digits = (
    np.load(filename)
    for filename in ("labeled_images.npy", "labeled_digits.npy")
)

# Images to be predicted for the autograder submission.
autograder_images = np.load("autograder_images.npy")

In [None]:
# Shape of the labeled image array; per the original note it holds
# 3750 images of 28 x 28 pixels each.
labeled_images.shape

In [None]:
# Peek at the first ten labels; entry k is the digit shown in image k.
labeled_digits[:10]

In [None]:
# How many of the first labeled images to preview
num_images = 10

# A single figure holding one row of thumbnails
plt.figure(figsize=(10, 10))
for position in range(1, num_images + 1):
    # Subplot slots are 1-based, image indices are 0-based
    plt.subplot(1, num_images, position)
    # Render the image in grayscale and hide the axes
    plt.imshow(labeled_images[position - 1], cmap='gray')
    plt.axis('off')
plt.show()

### K-Nearest Neighbors Classifier

### Logistic Regression Classifier

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Flatten every image into one feature vector per sample.
x = labeled_images.reshape(labeled_images.shape[0], -1)
X_train, X_test, y_train, y_test = train_test_split(
    x, labeled_digits, random_state=0)

# Standardise the features, then fit a logistic-regression model with SGD.
# random_state fixes the data shuffling inside SGD so that the
# cross-validation and test scores are reproducible between runs.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    # Alternative estimator that was tried: LogisticRegression(max_iter=150)
    ('logistic', SGDClassifier(loss="log_loss", max_iter=1000, tol=1e-3,
                               random_state=0)),
])
# Quick check without the grid search:
# pipe.fit(X_train, y_train).score(X_test, y_test)

# Hyperparameters to compare; keys are prefixed with the pipeline step name.
param_grid = {
    'logistic__fit_intercept': [True, False],
}

# 5-fold cross-validated search on the training split only; the test split
# is used once at the end to score the refitted best model.
grid = GridSearchCV(pipe, n_jobs=1, param_grid=param_grid, cv=5,
                    scoring='accuracy')
grid.fit(X_train, y_train)
print(grid.best_params_)
grid.score(X_test, y_test)

In [None]:
# NOTE(review): disabled plotting snippet. It refers to C_OPTIONS,
# N_FEATURES_OPTIONS and reducer_labels, none of which are defined in the
# visible part of this notebook, so it cannot be re-enabled as-is.
#
# import pandas as pd
# 
# mean_scores = np.array(grid.cv_results_["mean_test_score"])
# mean_scores = mean_scores.reshape(len(C_OPTIONS), -1, len(N_FEATURES_OPTIONS))
# mean_scores = mean_scores.max(axis=0)
# mean_scores = pd.DataFrame(
#     mean_scores.T, index=N_FEATURES_OPTIONS, columns=reducer_labels
# )
# ax = mean_scores.plot.bar()
# ax.set_title("Comparing feature reduction techniques")
# ax.set_xlabel("Reduced number of features")
# ax.set_ylabel("Digit classification accuracy")
# ax.set_ylim((0, 1))
# ax.legend(loc="upper left")
# plt.show()

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyClassifier

# Baseline that always predicts the most frequent training label.
clf = DummyClassifier(strategy='most_frequent', random_state=0)

# 5-fold cross-validated scores of the baseline on the training split
dummy_cv_scores = cross_val_score(clf, X_train, y_train, cv=5)
print(dummy_cv_scores)

# Fit on the full training split and report the score on the held-out split
clf.fit(X_train, y_train).score(X_test, y_test)

In [None]:
from sklearn import preprocessing

# Small StandardScaler demonstration on a toy matrix. The toy data gets its
# own name so that it does not overwrite the X_train split used by the digit
# models (y_train and X_test would otherwise no longer match it).
X_demo = np.array([[1., -1., 2.],
                   [2., 0., 0.],
                   [0., 1., -1.]])
scaler = preprocessing.StandardScaler().fit(X_demo)
# The fitted statistics can be inspected via scaler.mean_ and scaler.scale_.
X_scaled = scaler.transform(X_demo)
X_scaled

In [None]:
# NOTE(review): disabled evaluation snippet. Its first half scores `clf` on
# `x` / `labeled_digits`, i.e. on every labeled sample rather than only the
# training split, although the message says "training set" - confirm before
# re-enabling.
# y_pred = clf.predict(x)
# accuracy = np.mean(y_pred == labeled_digits)
# print('Accuracy on the training set:', accuracy)
# y_pred = clf.predict(X_test)
# accuracy = np.mean(y_pred == y_test)
# print('Accuracy on the test set:', accuracy)

### Support Vector Machine Classifier

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Preprocess the data: flatten each image to one row and divide by 255
# (assumes 8-bit pixel values, which gives features in the 0-1 range).
# The sample count is taken from the array instead of being hard-coded.
labeled_images_flat = labeled_images.reshape(len(labeled_images), -1) / 255.0
autograder_images_flat = autograder_images.reshape(len(autograder_images), -1) / 255.0

# Split the data: 80% train / 20% test. The seed is fixed so the reported
# accuracy is reproducible, matching the other splits in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    labeled_images_flat, labeled_digits, test_size=0.2, random_state=0)
print(len(X_train))
print(len(X_test))

# Train the SVM classifier
svm_model = SVC(kernel='rbf', C=1.0, gamma='scale')
svm_model.fit(X_train, y_train)

# Test the SVM classifier (accuracy on the held-out split)
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

# Use SVM Classifier on the Autograder dataset (disabled)
#prediction = svm_model.predict(autograder_images_flat)
#result = np.append(accuracy, prediction)
#pd.DataFrame(result).to_csv("autograder.txt", index=False, header=False)

### Decision Tree

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Flatten every image into one feature vector per sample.
x = labeled_images.reshape(labeled_images.shape[0], -1)
X_train, X_test, y_train, y_test = train_test_split(
    x, labeled_digits, random_state=0)

# random_state makes the tree construction (and thus the scores)
# reproducible between runs.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('decision', DecisionTreeClassifier(random_state=0)),
])

# Tree-specific hyperparameters to compare. An empty grid would only ever
# evaluate the default tree, which makes the search pointless.
param_grid = {
    'decision__max_depth': [None, 5, 10, 20],
    'decision__min_samples_leaf': [1, 5, 10],
}

# 5-fold cross-validated search on the training split only; the test split
# is used once at the end to score the refitted best model.
grid = GridSearchCV(pipe, n_jobs=1, param_grid=param_grid, cv=5,
                    scoring='accuracy')
grid.fit(X_train, y_train)
print(grid.best_params_)
grid.score(X_test, y_test)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyClassifier

# Majority-class baseline to compare the classifiers against.
clf = DummyClassifier(strategy='most_frequent', random_state=0)

# Cross-validated (5 folds) baseline scores on the training split
dummy_cv_scores = cross_val_score(clf, X_train, y_train, cv=5)
print(dummy_cv_scores)

# Train on the whole training split, then score on the held-out split
clf.fit(X_train, y_train).score(X_test, y_test)

### Prepare autograder submission

In the autograder you will need to provide two things: 1) an estimate of the accuracy of your model on unseen data, and 2) the predictions on the autograder images. For the autograder images we only provide the images and not the class labels. Thus, you cannot compute the accuracy on this data yourself - you need to estimate it with the labeled data that is provided (labeled_images, labeled_digits). We will calculate the accuracy for you on the autograder data and you will receive an automatic grade based on this.

In [None]:
# TODO Replace this with your estimate of the accuracy on new data
estimate = np.array([0.7])

# TODO Replace this with your predictions of your best model
# For example using something like:
# prediction = my_super_duper_model.predict(autograder_images) 
placeholder_labels = [-1] * len(autograder_images)
prediction = np.array(placeholder_labels)

# The accuracy estimate comes first, followed by one prediction per image
result = np.append(estimate, prediction)

# Write the estimate and predictions to a file named autograder.txt
# You will need to upload this file to the Vocareum autograder
submission = pd.DataFrame(result)
submission.to_csv("autograder.txt", index=False, header=False)