In [1]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Names of the CSV files read by this notebook (paths are used as given).
TRAIN_DATA_FILE = "train_data.csv"
TEST_DATA_FILE = "test_data.csv"
ALL_CASES_INPUT_DATA = "all_cases_input_data.csv"


def load_data(file=TRAIN_DATA_FILE, header=True):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file : str
        Path of the CSV file to read. Defaults to ``TRAIN_DATA_FILE``.
    header : bool
        When truthy, pandas' default header handling is used. Otherwise the
        file is read with ``header=None`` so every row is treated as data.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    location = os.path.join("", file)
    # Only override pandas' header handling when the caller opts out of it.
    read_options = {} if header else {"header": None}
    return pd.read_csv(location, **read_options)

In [2]:
# Training split: pull the "DIGIT" target column out of the frame, leaving
# only the feature columns behind in train_data.
train_data = load_data(TRAIN_DATA_FILE)
train_labels = train_data.pop("DIGIT")

# Test split: same separation of target and features.
test_data = load_data(TEST_DATA_FILE)
test_labels = test_data.pop("DIGIT")

# Loaded as-is; no label column is removed from this file here.
all_cases_input_data = load_data(ALL_CASES_INPUT_DATA)

In [3]:
# Fill missing values with the per-column median learned from the training data.
#
# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and removed
# in 0.22, so importing it fails on current releases. `SimpleImputer` is its
# replacement; the fallback keeps the cell working on installs older than 0.20.
try:
    from sklearn.impute import SimpleImputer
except ImportError:  # scikit-learn < 0.20 has no sklearn.impute module
    from sklearn.preprocessing import Imputer as SimpleImputer

imputer = SimpleImputer(strategy="median")
training_features = imputer.fit_transform(train_data)
# Reuse the medians fitted on the training data for the other two inputs.
testing_features = imputer.transform(test_data)
all_cases_features = imputer.transform(all_cases_input_data)

In [4]:
# Standardise the features: the scaler's statistics come from the training
# features only and are then applied unchanged to all three feature matrices.
from sklearn.preprocessing import StandardScaler

scalar = StandardScaler().fit(training_features)
training_features, testing_features, all_cases_features = (
    scalar.transform(features)
    for features in (training_features, testing_features, all_cases_features)
)

In [5]:
# Unwrap the pandas label columns into plain NumPy arrays for the estimators.
training_labels, testing_labels = train_labels.values, test_labels.values

In [16]:
# SGD Classifier
# Linear classifier trained with stochastic gradient descent on degree-2
# polynomial features. The imports below are also relied on by later cells.
from sklearn.base import clone
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import PolynomialFeatures

# Expand the scaled features with all degree-2 terms (no constant column).
poly_features = PolynomialFeatures(degree=2, include_bias=False).fit(training_features)

X_train, Y_train = poly_features.transform(training_features), training_labels
X_test, Y_test = poly_features.transform(testing_features), testing_labels

sgd_clf = SGDClassifier(penalty="l2", random_state=42)

# Cross-validate an unfitted copy so sgd_clf itself is not touched yet.
cross_val_scores = cross_val_score(
    clone(sgd_clf), X_train, Y_train, cv=2, scoring="accuracy"
)
print("Cross Val Scores on training set\n", cross_val_scores)

# Fit on the whole training set, then report the share of correct test predictions.
sgd_clf.fit(X_train, Y_train)
print(
    "\n\nAccuracy on testing data set\n",
    np.mean(sgd_clf.predict(X_test) == Y_test),
)

Cross Val Scores on training set
 [1. 1.]


Accuracy on testing data set
 0.25


In [17]:
# KNeighbors Classifier
from sklearn.neighbors import KNeighborsClassifier

# Degree-2 polynomial expansion of the scaled features (no constant column).
poly_features = PolynomialFeatures(degree=2, include_bias=False).fit(training_features)

X_train, Y_train = poly_features.transform(training_features), training_labels
X_test, Y_test = poly_features.transform(testing_features), testing_labels

# Baseline: a hand-picked 2-neighbour model scored with 2-fold cross-validation.
knn_clf = KNeighborsClassifier(n_neighbors=2, weights='uniform', algorithm='auto')
print(
    "Cross Val Scores on training set\n",
    cross_val_score(clone(knn_clf), X_train, Y_train, cv=2, scoring="accuracy"),
)

# Exhaustive search over search algorithm, neighbour count (2..10) and weighting.
# No cv/scoring is passed, so GridSearchCV's defaults apply.
parameters = {
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'n_neighbors': list(range(2, 11)),
    'weights': ['uniform', 'distance'],
}
clf = GridSearchCV(KNeighborsClassifier(), parameters)
clf.fit(X_train, Y_train)
print("\nBest params: ", clf.best_params_)

# Share of test samples the fitted search object predicts correctly.
print(
    "\n\nAccuracy on testing data set\n",
    np.mean(clf.predict(X_test) == Y_test),
)

Cross Val Scores on training set
 [0.8 1. ]

Best params:  {'algorithm': 'auto', 'n_neighbors': 2, 'weights': 'uniform'}


Accuracy on testing data set
 0.55


In [18]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

# Relies on the poly_features transformer fitted in an earlier cell.
X_train, Y_train = poly_features.transform(training_features), training_labels
X_test, Y_test = poly_features.transform(testing_features), testing_labels

forest_clf = RandomForestClassifier(n_estimators=15, oob_score=True, random_state=42)

# 2-fold cross-validation on an unfitted copy of the forest.
print(
    "Cross Val Scores on training set\n",
    cross_val_score(clone(forest_clf), X_train, Y_train, cv=2, scoring="accuracy"),
)

# Fit on the whole training set, then report the share of correct test predictions.
forest_clf.fit(X_train, Y_train)
print(
    "\n\nAccuracy on testing data set\n",
    np.mean(forest_clf.predict(X_test) == Y_test),
)

Cross Val Scores on training set
 [1. 1.]


Accuracy on testing data set
 0.9


In [22]:
# MLP Classifier
# Requires the `keras` package in the kernel environment; the imports below
# raise ModuleNotFoundError when it is not installed.
import keras
from keras.datasets import mnist
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.optimizers import Adam
from sklearn.preprocessing import LabelBinarizer

# Relies on the poly_features transformer fitted in an earlier cell.
X_train, Y_train = poly_features.transform(training_features), training_labels
X_test, Y_test = poly_features.transform(testing_features), testing_labels

batch_size = 1
num_classes = 10
epochs = 5

# One hidden ReLU layer with dropout, then a softmax over num_classes outputs.
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
])

model.summary()

adam = Adam()
model.compile(optimizer=adam,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# One-hot encode the labels; the encoding is learned from the training labels
# only and then applied to the test labels.
binarizer = LabelBinarizer()
Y_train = binarizer.fit_transform(Y_train)
Y_test = binarizer.transform(Y_test)

# NOTE: the test split is also passed as the validation set during training.
history = model.fit(
    X_train,
    Y_train,
    validation_data=(X_test, Y_test),
    epochs=epochs,
    batch_size=batch_size,
    verbose=1,
)

# evaluate() returns the loss first, followed by the compiled metrics.
score = model.evaluate(X_test, Y_test, verbose=1)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

ModuleNotFoundError: No module named 'keras'

In [19]:
# Number of columns in X_train, i.e. the input width the models are given.
X_train.shape[1]

35

In [21]:
# Display the test labels currently bound to Y_test.
Y_test

array([0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9])