# Imports

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from datetime import datetime

%matplotlib inline 

# Reading and pre-processing

In [6]:
# Load the labelled training table from a path relative to the notebook's working directory.
df = pd.read_csv('data/MNIST/train.csv')

In [7]:
# Summarise the frame (row count, column count, dtypes, memory footprint).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42000 entries, 0 to 41999
Columns: 785 entries, label to pixel783
dtypes: int64(785)
memory usage: 251.5 MB


In [8]:
# Separate the feature columns from the 'label' target, then hold out 30% of the
# rows for validation. The split is seeded so that the scores reported by later
# cells can be reproduced after a kernel restart.
X = df.drop('label',axis=1)
y = df['label']
X_train, X_val, y_train, y_val = train_test_split(
    X, y, shuffle=True, test_size=0.30, random_state=42
)

In [9]:
# Display the shapes of the full data and of both sides of the train/validation split.
tuple(part.shape for part in (X, y, X_train, y_train, X_val, y_val))

((42000, 784), (42000,), (29400, 784), (29400,), (12600, 784), (12600,))

In [10]:
# Number of rows drawn for the slower models, which are benchmarked on a sample
# instead of the full table.
subsample = 4200

# Both the row sampling and the split are seeded so the subset (and therefore
# every score computed on it) is identical after a kernel restart.
df_sub = df.sample(subsample, random_state=42)
X_sub = df_sub.drop('label',axis=1)
y_sub = df_sub['label']

# Hold out 20% of the sampled rows for validation.
X_train_sub, X_val_sub, y_train_sub, y_val_sub = train_test_split(
    X_sub, y_sub, shuffle=True, test_size=0.20, random_state=42
)

In [11]:
# Display the shapes of the sampled data and of both sides of its train/validation split.
tuple(
    part.shape
    for part in (X_sub, y_sub, X_train_sub, y_train_sub, X_val_sub, y_val_sub)
)

((4200, 784), (4200,), (3360, 784), (3360,), (840, 784), (840,))

## Utils

In [12]:
def _timed_call(func, *args):
    """Call ``func(*args)`` and return ``(result, elapsed_seconds)``.

    Uses ``time.perf_counter``, a monotonic high-resolution clock, instead of
    wall-clock ``datetime.now()``, which can jump and has coarse resolution on
    some platforms (short calls were being reported as ``0.0s``). The elapsed
    time is rounded to microseconds to keep the printed format compact.
    """
    from time import perf_counter

    started = perf_counter()
    result = func(*args)
    return result, round(perf_counter() - started, 6)


def fit_predict(m, x_tr, y_tr, x_vl):
    """Fit ``m`` on the training data and predict on both data sets, timing each step.

    Parameters
    ----------
    m : estimator
        Any object exposing ``fit(X, y)`` and ``predict(X)``.
    x_tr, y_tr :
        Training features and targets, passed to ``m.fit``.
    x_vl :
        Validation features, passed to ``m.predict``.

    Returns
    -------
    tuple
        ``(y_tr_p, y_vl_p)``: predictions for ``x_tr`` and for ``x_vl``.

    Side effects
    ------------
    Fits ``m`` in place and prints the three elapsed times in seconds.
    """
    _, fit_seconds = _timed_call(m.fit, x_tr, y_tr)
    y_tr_p, train_pred_seconds = _timed_call(m.predict, x_tr)
    y_vl_p, val_pred_seconds = _timed_call(m.predict, x_vl)

    print(f"Training time: {fit_seconds}s\nTesting time with training data: {train_pred_seconds}s\nTesting time with validation data: {val_pred_seconds}s")
    return y_tr_p, y_vl_p

In [13]:
def print_score(y_tr_p, y_vl_p, y_tr, y_vl):
    """Print the accuracy of the predictions on the training and validation sets.

    Each prediction/target pair is converted to a NumPy array and compared
    position by position, so lists, arrays and pandas Series (whatever their
    index labels) are all handled the same way.

    Parameters
    ----------
    y_tr_p, y_vl_p : array-like
        Predicted labels for the training and validation samples.
    y_tr, y_vl : array-like
        True labels, in the same order as the corresponding predictions.

    Raises
    ------
    ValueError
        If a set of predictions and its targets differ in shape.
    """
    def _accuracy(predicted, actual):
        # Compare positionally; going through ndarray avoids pandas index
        # alignment and makes plain Python lists work too.
        predicted = np.asarray(predicted)
        actual = np.asarray(actual)
        if predicted.shape != actual.shape:
            raise ValueError(
                f"shape mismatch: predictions {predicted.shape} vs targets {actual.shape}"
            )
        return (predicted == actual).mean()

    score_tr = _accuracy(y_tr_p, y_tr)
    score_vl = _accuracy(y_vl_p, y_vl)
    print(f"Training set mean score: {score_tr}")
    print(f"Validation set mean score: {score_vl}")

In [14]:
def train_predict_fullset(m):
    """Fit ``m`` on the full training split, then print timings and accuracies.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``.
    """
    predictions = fit_predict(m, X_train, y_train, X_val)
    truth_train, truth_val = np.array(y_train), np.array(y_val)
    print_score(*predictions, truth_train, truth_val)

In [15]:
def train_predict_subset(m):
    """Fit ``m`` on the sampled training split, then print timings and accuracies.

    Reads the notebook-level ``X_train_sub``, ``y_train_sub``, ``X_val_sub``
    and ``y_val_sub``.
    """
    predictions = fit_predict(m, X_train_sub, y_train_sub, X_val_sub)
    truth_train, truth_val = np.array(y_train_sub), np.array(y_val_sub)
    print_score(*predictions, truth_train, truth_val)

# Models

## 1. Random Forest

In [16]:
from sklearn.ensemble import RandomForestClassifier

In [17]:
# Baseline: random forest with the library's default hyper-parameters, fitted on the full split.
m = RandomForestClassifier()
train_predict_fullset(m)

Training time: 2.058506s
Testing time with training data: 0.185752s
Testing time with validation data: 0.080158s
Training set mean score: 0.9990136054421769
Validation set mean score: 0.935952380952381


In [18]:
# Same model with 200 trees; n_jobs=-1 asks scikit-learn to use all available cores.
m = RandomForestClassifier(n_estimators=200, n_jobs=-1)
train_predict_fullset(m)

Training time: 8.280962s
Testing time with training data: 0.663921s
Testing time with validation data: 0.370071s
Training set mean score: 1.0
Validation set mean score: 0.9647619047619047


In [19]:
# Divide every pixel value by 255 (presumably mapping 8-bit intensities into
# [0, 1]; confirm against the data) and repeat the 200-tree forest on the
# rescaled features to see whether scaling changes the result.
X_train_norm = X_train.div(255)
X_val_norm = X_val.div(255)

m = RandomForestClassifier(n_jobs=-1, n_estimators=200)
y_train_pred, y_val_pred = fit_predict(m, X_train_norm, y_train, X_val_norm)
print_score(
    y_train_pred,
    y_val_pred,
    np.array(y_train),
    np.array(y_val),
)

Training time: 8.763978s
Testing time with training data: 0.866736s
Testing time with validation data: 0.359183s
Training set mean score: 1.0
Validation set mean score: 0.9642857142857143


## 2. SVM

In [15]:
from sklearn.svm import SVC

In [16]:
# Default SVC on the unscaled pixel subsample. In the recorded run it scores 1.0
# on the training rows but only about 0.13 on validation.
m = SVC()
train_predict_subset(m)

Training time: 21.48263s
Testing time with training data: 11.518969s
Testing time with validation data: 2.756833s
Training set mean score: 1.0
Validation set mean score: 0.1261904761904762


#### SVM with Normalization

In [17]:
# Rescale the sampled features by dividing every value by 255.
X_train_sub_norm = X_train_sub.div(255)
X_val_sub_norm = X_val_sub.div(255)

In [18]:
# Fit a fresh default SVC on the rescaled features. Creating the estimator here,
# instead of reusing whatever an earlier cell left in `m`, keeps this cell's
# result independent of execution order.
m = SVC()
y_train_pred, y_val_pred = fit_predict(m, X_train_sub_norm, y_train_sub, X_val_sub_norm)
print_score(y_train_pred, y_val_pred, y_train_sub, y_val_sub)

Training time: 6.449678s
Testing time with training data: 7.918522s
Testing time with validation data: 2.244526s
Training set mean score: 0.9157738095238095
Validation set mean score: 0.9166666666666666


#### SVM with Thresholding

In [19]:
# Binarise the features: every value greater than 0 becomes 1, all other values
# are kept unchanged.
X_train_sub_thres = X_train_sub.where(~(X_train_sub > 0), 1)
X_val_sub_thres = X_val_sub.where(~(X_val_sub > 0), 1)

In [20]:
# Fit a fresh default SVC on the thresholded features. Creating the estimator
# here, instead of reusing whatever an earlier cell left in `m`, keeps this
# cell's result independent of execution order.
m = SVC()
y_train_pred, y_val_pred = fit_predict(m, X_train_sub_thres, y_train_sub, X_val_sub_thres)
print_score(y_train_pred, y_val_pred, y_train_sub, y_val_sub)

Training time: 6.172162s
Testing time with training data: 7.621128s
Testing time with validation data: 1.944358s
Training set mean score: 0.9267857142857143
Validation set mean score: 0.9154761904761904


### With linear and rbf kernels

In [21]:
# Linear-kernel SVC with a small regularisation constant (C=0.025) on the rescaled subsample.
m = SVC(kernel="linear", C=0.025)
y_train_pred, y_val_pred = fit_predict(m, X_train_sub_norm, y_train_sub, X_val_sub_norm)
print_score(y_train_pred, y_val_pred, y_train_sub, y_val_sub)

Training time: 4.158838s
Testing time with training data: 5.683071s
Testing time with validation data: 1.296252s
Training set mean score: 0.9592261904761905
Validation set mean score: 0.9261904761904762


In [22]:
# SVC with the kernel left at the library default and gamma=2, C=1. In the recorded
# run this scores 1.0 on training but only about 0.13 on validation.
m = SVC(gamma=2, C=1)
y_train_pred, y_val_pred = fit_predict(m, X_train_sub_norm, y_train_sub, X_val_sub_norm)
print_score(y_train_pred, y_val_pred, y_train_sub, y_val_sub)

Training time: 22.474027s
Testing time with training data: 11.982576s
Testing time with validation data: 3.129323s
Training set mean score: 1.0
Validation set mean score: 0.1261904761904762


In [23]:
# RBF-kernel SVC with hand-picked C and gamma, fitted on the rescaled subsample.
m = SVC(probability=False, kernel="rbf", C=2.8, gamma=.0073)
y_train_pred, y_val_pred = fit_predict(m, X_train_sub_norm, y_train_sub, X_val_sub_norm)
print_score(y_train_pred, y_val_pred, y_train_sub, y_val_sub)

Training time: 4.555213s
Testing time with training data: 6.188292s
Testing time with validation data: 1.40387s
Training set mean score: 0.9851190476190477
Validation set mean score: 0.9428571428571428


In [24]:
# Same RBF-kernel SVC settings, this time fitted on the thresholded subsample.
m = SVC(probability=False, kernel="rbf", C=2.8, gamma=.0073)
y_train_pred, y_val_pred = fit_predict(m, X_train_sub_thres, y_train_sub, X_val_sub_thres)
print_score(y_train_pred, y_val_pred, y_train_sub, y_val_sub)

Training time: 4.419043s
Testing time with training data: 6.220563s
Testing time with validation data: 1.547248s
Training set mean score: 0.9952380952380953
Validation set mean score: 0.9464285714285714


## 3. Logistic Regression

In [25]:
from sklearn.linear_model import LogisticRegression

In [26]:
# Logistic regression with default settings on the unscaled subsample.
m = LogisticRegression()
train_predict_subset(m)

Training time: 7.134025s
Testing time with training data: 0.015595s
Testing time with validation data: 0.0s
Training set mean score: 1.0
Validation set mean score: 0.8357142857142857


In [27]:
# Rescale the sampled features by dividing every value by 255.
X_train_sub_norm, X_val_sub_norm = X_train_sub / 255, X_val_sub / 255

In [28]:
# Logistic regression with default settings on the rescaled subsample.
m = LogisticRegression()
y_train_pred, y_val_pred = fit_predict(m, X_train_sub_norm, y_train_sub, X_val_sub_norm)
print_score(y_train_pred, y_val_pred, y_train_sub, y_val_sub)

Training time: 1.783512s
Testing time with training data: 0.008025s
Testing time with validation data: 0.002002s
Training set mean score: 0.9764880952380952
Validation set mean score: 0.9035714285714286


## 4. KNN

In [29]:
from sklearn.neighbors import KNeighborsClassifier

In [30]:
# k-nearest-neighbours with default settings on the unscaled subsample.
m = KNeighborsClassifier()
train_predict_subset(m)

Training time: 0.152404s
Testing time with training data: 15.271242s
Testing time with validation data: 3.824789s
Training set mean score: 0.9526785714285714
Validation set mean score: 0.9297619047619048


In [31]:
# Sweep the neighbour count from 1 to 8, printing timings and accuracies for each
# value, with the runs separated by blank lines.
for k in range(1, 9):
    print('k={}'.format(k))
    m = KNeighborsClassifier(n_neighbors=k)
    train_predict_subset(m)
    print(end='\n\n')

k=1
Training time: 0.122672s
Testing time with training data: 8.428626s
Testing time with validation data: 3.818578s
Training set mean score: 1.0
Validation set mean score: 0.9357142857142857


k=2
Training time: 0.127386s
Testing time with training data: 14.986785s
Testing time with validation data: 3.532891s
Training set mean score: 0.9586309523809524
Validation set mean score: 0.9178571428571428


k=3
Training time: 0.127205s
Testing time with training data: 15.138564s
Testing time with validation data: 4.001741s
Training set mean score: 0.9607142857142857
Validation set mean score: 0.9357142857142857


k=4
Training time: 0.112316s
Testing time with training data: 14.785333s
Testing time with validation data: 3.404331s
Training set mean score: 0.9565476190476191
Validation set mean score: 0.9333333333333333


k=5
Training time: 0.109428s
Testing time with training data: 14.846764s
Testing time with validation data: 3.918034s
Training set mean score: 0.9526785714285714
Validation set

## 5. Decision Tree

In [32]:
from sklearn.tree import DecisionTreeClassifier

In [33]:
# Single decision tree with default settings, fitted on the full split.
m = DecisionTreeClassifier()
train_predict_fullset(m)

Training time: 7.864265s
Testing time with training data: 0.087233s
Testing time with validation data: 0.043114s
Training set mean score: 1.0
Validation set mean score: 0.8528571428571429


## 6. Perceptron

In [34]:
# The private module `sklearn.linear_model.perceptron` was deprecated in
# scikit-learn 0.22 and removed in 0.24, so importing it fails on current
# releases. Bind the `perceptron` name to the public `sklearn.linear_model`
# package instead: `perceptron.Perceptron` then resolves on old and new versions.
from sklearn import linear_model as perceptron

In [35]:
# Perceptron with default settings, fitted on the full split.
m = perceptron.Perceptron()
train_predict_fullset(m)



Training time: 2.2225s
Testing time with training data: 0.115554s
Testing time with validation data: 0.05214s
Training set mean score: 0.8469387755102041
Validation set mean score: 0.847936507936508


## 7. Naive Bayes

In [36]:
# Import only the three estimators used below; a wildcard import hides where
# names come from and can silently shadow existing notebook variables.
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB

In [37]:
# Gaussian Naive Bayes with default settings, fitted on the full split.
m = GaussianNB()
train_predict_fullset(m)

Training time: 0.671916s
Testing time with training data: 3.1706s
Testing time with validation data: 1.501178s
Training set mean score: 0.5558843537414966
Validation set mean score: 0.5548412698412698


In [38]:
# Multinomial Naive Bayes with default settings, fitted on the full split.
m = MultinomialNB()
train_predict_fullset(m)

Training time: 0.153928s
Testing time with training data: 0.154614s
Testing time with validation data: 0.057151s
Training set mean score: 0.8245238095238095
Validation set mean score: 0.829920634920635


In [39]:
# Multivariate Bernoulli Naive Bayes with default settings, fitted on the full split.
m = BernoulliNB()
train_predict_fullset(m)

Training time: 0.612477s
Testing time with training data: 0.574241s
Testing time with validation data: 0.22879s
Training set mean score: 0.8322789115646259
Validation set mean score: 0.8386507936507936


# Test Set

In [40]:
# Load the test table from a path relative to the notebook's working directory.
df_test = pd.read_csv('data/MNIST/test.csv')

In [41]:
# Predict with an explicitly chosen model instead of whatever estimator the last
# executed cell happened to leave in `m` (in notebook order, the BernoulliNB
# model). The 200-tree random forest has the best validation accuracy among the
# full-set runs recorded in this notebook, so refit it on every labelled row.
# NOTE(review): assumes df_test has the same feature columns as X; confirm.
m = RandomForestClassifier(n_estimators=200, n_jobs=-1)
m.fit(X, y)
y_pred = m.predict(df_test)

In [None]:
import os

# Create the folder the submission file is written to; exist_ok avoids an error on re-runs.
os.makedirs('output', exist_ok=True)

In [42]:
# Save the submission: a 1-based ImageId column alongside the predicted Label,
# written as CSV without the DataFrame index.
my_submission = pd.DataFrame({
    'ImageId': range(1, len(df_test) + 1),
    'Label': y_pred,
})
my_submission.to_csv('output/submission.csv', header=True, index=False)