In [1]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# CSV file names; load_data() resolves them relative to the working directory.
TRAIN_DATA_FILE = "train_data.csv"
TEST_DATA_FILE = "test_data.csv"
ALL_CASES_INPUT_DATA = "all_cases_input_data.csv"
ALL_CASES_PREDICTIONS = "all_cases_predictions.csv"
# One prediction column per digit class, named "0" through "9".
ALL_CASES_PREDICTIONS_COLUMS = [str(digit) for digit in range(10)]

def load_data(file=TRAIN_DATA_FILE, header=True):
    """Read a CSV file into a DataFrame.

    Args:
        file: File name or path of the CSV; defaults to the training file.
        header: When truthy, pandas' default header handling is used; when
            falsy, the file is read with ``header=None`` so no row is
            consumed as column names.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv``.
    """
    # Only override pandas' default when the caller opts out of a header row.
    read_options = {} if header else {"header": None}
    return pd.read_csv(os.path.join("", file), **read_options)

In [2]:
# Separate the "DIGIT" target column from the features of each labelled set.
# pop() returns the column and removes it from the frame in one step.
train_data = load_data(TRAIN_DATA_FILE)
train_labels = train_data.pop("DIGIT")

test_data = load_data(TEST_DATA_FILE)
test_labels = test_data.pop("DIGIT")

# Unlabelled inputs; loaded as-is.
all_cases_input_data = load_data(ALL_CASES_INPUT_DATA)

In [3]:
from sklearn.decomposition import PCA

def get_dims_variances(x, minDim, tol=None, thres=0.01):
    """Grow the PCA component count from ``minDim`` until the weakest component is negligible.

    For every candidate dimension a fresh ``PCA(n_components=dim)`` is fitted on
    ``x`` and the smallest entry of ``explained_variance_ratio_`` is recorded.
    Starting with the second candidate, the search stops as soon as either

    * ``tol`` is given and the smallest ratio plus ``tol`` is greater than the
      smallest ratio of the previous candidate, or
    * the smallest ratio is below ``thres``.

    The search also stops once the candidate reaches ``min(x.shape)``, the
    largest component count PCA accepts, instead of running into a
    ``ValueError`` when neither condition ever fires.

    Args:
        x: 2-D data (samples x features) handed to ``PCA.fit``.
        minDim: First number of components to try.
        tol: Optional tolerance for the "stopped decreasing" check; ``None``
            disables that check.
        thres: Lower bound on the smallest explained-variance ratio.

    Returns:
        Tuple ``(dims, variances, optimum_dim)``. ``dims`` and ``variances``
        list every candidate tried (including the one that triggered the stop)
        and its smallest ratio; ``optimum_dim`` is the last candidate that did
        not trigger a stop condition.
    """
    # PCA cannot extract more components than min(n_samples, n_features).
    max_dim = min(np.shape(x))

    dims = []
    variances = []
    optimum_dim = minDim
    prev_min_variance = None
    dim = minDim

    while True:
        # An invalid minDim still surfaces here as scikit-learn's own error.
        pca = PCA(n_components=dim)
        pca.fit(x)
        min_variance = np.min(pca.explained_variance_ratio_)

        dims.append(dim)
        variances.append(min_variance)

        # The first candidate has nothing to compare against and is always accepted.
        if prev_min_variance is not None:
            if tol is not None and min_variance + tol > prev_min_variance:
                break
            if min_variance < thres:
                break

        prev_min_variance = min_variance
        optimum_dim = dim

        if dim >= max_dim:
            # No larger component count is valid for this data; stop cleanly.
            break
        dim = dim + 1

    return dims, variances, optimum_dim

In [4]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler

def process_data(x, y, poly_features=None, pca=None, OPTIMUM_DIMENSION=None, imputer=None, scalar=None):
    """Run two feature sets through imputation, scaling, polynomial expansion and PCA.

    Any transformer left as ``None`` is created here and fitted on ``x`` only;
    ``y`` is never used for fitting. Feeding back the objects returned by an
    earlier call re-applies that already-fitted pipeline to new data.

    Args:
        x: Features used for fitting (when needed); transformed first.
        y: Features that are only transformed.
        poly_features: Fitted ``PolynomialFeatures``, or ``None`` to fit a
            degree-2 expansion without bias column.
        pca: Fitted ``PCA``, or ``None`` to fit one with ``OPTIMUM_DIMENSION``
            components.
        OPTIMUM_DIMENSION: Number of PCA components, or ``None`` to search for
            it with ``get_dims_variances`` (this also plots and prints the
            search trace).
        imputer: Fitted imputer, or ``None`` to fit a median imputer.
        scalar: Fitted ``StandardScaler``, or ``None`` to fit one.

    Returns:
        Tuple ``(x_transformed, y_transformed, poly_features, pca,
        OPTIMUM_DIMENSION, imputer, scalar)``.
    """
    fit_set = x.copy()
    apply_set = y.copy()

    # Stage 1: median imputation.
    if imputer is None:
        imputer = Imputer(strategy="median")
        imputer.fit(fit_set)
    fit_set = imputer.transform(fit_set)
    apply_set = imputer.transform(apply_set)

    # Stage 2: standardisation.
    if scalar is None:
        scalar = StandardScaler()
        scalar.fit(fit_set)
    fit_set = scalar.transform(fit_set)
    apply_set = scalar.transform(apply_set)

    # Stage 3: degree-2 polynomial feature expansion.
    if poly_features is None:
        poly_features = PolynomialFeatures(degree=2, include_bias=False)
        poly_features.fit(fit_set)
    fit_set = poly_features.transform(fit_set)
    apply_set = poly_features.transform(apply_set)

    # Stage 4a: choose the PCA size when the caller did not fix it, and show
    # the search trace as a plot plus a table.
    if OPTIMUM_DIMENSION is None:
        dims, variances, OPTIMUM_DIMENSION = get_dims_variances(x=fit_set, minDim=2, thres=0.005)
        print("Optimum Dimensions: ", OPTIMUM_DIMENSION)
        import matplotlib.pyplot as plt
        plt.plot(dims, variances)
        plt.show()
        dim_df = pd.DataFrame({"DIM": dims, "VAR": variances}, columns=["DIM", "VAR"])
        print(dim_df)

    # Stage 4b: project onto the chosen number of principal components.
    if pca is None:
        pca = PCA(random_state=42, n_components=OPTIMUM_DIMENSION)
        pca.fit(fit_set)
    fit_set = pca.transform(fit_set)
    apply_set = pca.transform(apply_set)

    return fit_set, apply_set, poly_features, pca, OPTIMUM_DIMENSION, imputer, scalar

In [5]:
# This first call fits every preprocessing stage on the training features and
# hands back the fitted objects so later cells can reuse them.
(
    training_features,
    testing_features,
    poly_features,
    pca,
    OPTIMUM_DIMENSION,
    imputer,
    scalar,
) = process_data(x=train_data, y=test_data)

# Plain NumPy label vectors for the model cells.
training_labels, testing_labels = train_labels.values, test_labels.values

Optimum Dimensions:  9


<matplotlib.figure.Figure at 0x113346f98>

   DIM           VAR
0    2  2.007405e-01
1    3  1.802780e-01
2    4  7.318341e-02
3    5  6.323851e-02
4    6  4.993933e-02
5    7  3.180211e-02
6    8  2.395129e-02
7    9  9.933956e-03
8   10  4.527608e-32


In [6]:
# SGD Classifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.base import clone

X_train, Y_train = training_features, training_labels
X_test, Y_test = testing_features, testing_labels

# Logistic-loss linear model with elastic-net regularisation.
sgd_clf = SGDClassifier(loss='log', penalty="elasticnet", random_state=42)

# Cross-validate an unfitted clone so the scores never touch sgd_clf itself.
cross_val_scores = cross_val_score(clone(sgd_clf), X_train, Y_train, cv=3, scoring="accuracy")
print("Cross Val Scores on training set\n", cross_val_scores)

# Fit on the full training set, then report the fraction of test rows predicted correctly.
sgd_clf.fit(X_train, Y_train)
print("\n\nAccuracy on testing data set\n", (Y_test == sgd_clf.predict(X_test)).sum() / len(Y_test))

Cross Val Scores on training set
 [1. 1. 1.]


Accuracy on testing data set
 0.35


In [7]:
# KNeighbors Classifier
from sklearn.neighbors import KNeighborsClassifier 

X_train = training_features
Y_train = training_labels
X_test = testing_features
Y_test = testing_labels

# Hyper-parameter grid explored by the search below.
parameters = {'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute'],
              'n_neighbors' : [2,3,4,5,6,7,8,9,10],
              'weights' : ['uniform', 'distance']
             }
clf = GridSearchCV(KNeighborsClassifier(), parameters)
clf.fit(X_train, Y_train)
print("\nBest params: ", clf.best_params_)

# Build the final model from the search result instead of a hand-copied
# literal, so it cannot drift from what the grid search selected.
knn_clf = KNeighborsClassifier(**clf.best_params_)
print("\nCross Val Scores on training set\n", cross_val_score(clone(knn_clf), X_train, Y_train, cv=3, scoring="accuracy"))

# Score the same estimator that later cells use for predictions (knn_clf),
# not the GridSearchCV wrapper.
knn_clf.fit(X_train, Y_train)
print("\n\nAccuracy on testing data set\n", sum(Y_test == knn_clf.predict(X_test)) / len(Y_test))


Best params:  {'algorithm': 'auto', 'n_neighbors': 2, 'weights': 'uniform'}

Cross Val Scores on training set
 [1. 1. 1.]


Accuracy on testing data set
 0.5


In [8]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier 

X_train, Y_train = training_features, training_labels
X_test, Y_test = testing_features, testing_labels

# Five-tree forest with out-of-bag scoring switched on.
forest_clf = RandomForestClassifier(n_estimators=5, oob_score=True, random_state=42)

# Cross-validate an unfitted clone so forest_clf itself stays untouched.
print("Cross Val Scores on training set\n", cross_val_score(clone(forest_clf), X_train, Y_train, cv=3, scoring="accuracy"))

# Fit on the full training set, then report the fraction of test rows predicted correctly.
forest_clf.fit(X_train, Y_train)
print("\n\nAccuracy on testing data set\n", (Y_test == forest_clf.predict(X_test)).sum() / len(Y_test))

Cross Val Scores on training set
 [1. 1. 1.]


Accuracy on testing data set
 0.35


In [9]:
# MLP Classifier
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from sklearn.preprocessing import LabelBinarizer

X_train, Y_train = training_features, training_labels
X_test, Y_test = testing_features, testing_labels
batch_size, num_classes, epochs = 3, 10, 15

# One hidden ReLU layer with dropout, followed by a softmax over the classes.
model = Sequential([
    Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.8),
    Dense(num_classes, activation='softmax'),
])

model.summary()

adam = Adam()
model.compile(optimizer=adam,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# categorical_crossentropy needs indicator-matrix targets; the binarizer is
# fitted on the training labels only and then applied to the test labels.
binarizer = LabelBinarizer()
Y_train = binarizer.fit_transform(Y_train)
Y_test = binarizer.transform(Y_test)

# The test set is used as the validation set during training.
history = model.fit(X_train, Y_train,
                    validation_data=(X_test, Y_test),
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1)

# score[0] is the loss, score[1] the accuracy metric requested in compile().
score = model.evaluate(X_test, Y_test, verbose=1)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
print('Test accuracy/loss ratio:', score[1] / score[0])

Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 256)               2560      
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 10)                2570      
Total params: 5,130
Trainable params: 5,130
Non-trainable params: 0
_________________________________________________________________
Train on 30 samples, validate on 20 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Test loss: 1.8122583627700806
Test accuracy: 0.44999998807907104
Test accuracy/loss ratio: 0.24830895932036712


In [10]:
# One hand-written sample pushed through the already-fitted preprocessing;
# only the first returned element (the transformed sample) is needed.
x = [[0,0,0,0,0,1,1]]
x_ = process_data(x=x, y=x, poly_features=poly_features, pca=pca,
                  OPTIMUM_DIMENSION=OPTIMUM_DIMENSION, imputer=imputer, scalar=scalar)[0]

# MLP, random forest and KNN print class probabilities; SGD prints the predicted label.
for predict in (model.predict, forest_clf.predict_proba, knn_clf.predict_proba, sgd_clf.predict):
    print(predict(x_.copy()))

[[0.02855825 0.01154422 0.16100365 0.04070703 0.67782676 0.0106171
  0.01573212 0.01646328 0.02018954 0.01735801]]
[[0.  0.  0.  0.  0.8 0.  0.  0.  0.  0.2]]
[[0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]]
[4]


In [11]:
# A second hand-written sample, run through the same fitted preprocessing;
# only the first returned element (the transformed sample) is needed.
x = [[0,1,0,1,0,0,0]]
x_ = process_data(x=x, y=x, poly_features=poly_features, pca=pca,
                  OPTIMUM_DIMENSION=OPTIMUM_DIMENSION, imputer=imputer, scalar=scalar)[0]

# MLP, random forest and KNN print class probabilities; SGD prints the predicted label.
for predict in (model.predict, forest_clf.predict_proba, knn_clf.predict_proba, sgd_clf.predict):
    print(predict(x_.copy()))

[[0.00909319 0.07442424 0.71788096 0.02560439 0.02786103 0.02376629
  0.0212886  0.06476229 0.01873636 0.01658268]]
[[0.2 0.  0.2 0.  0.  0.  0.  0.  0.  0.6]]
[[0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]]
[2]


In [12]:
def get_all_cases_predictions(all_cases_features):
    """Score every input row with the four fitted models and stack the results.

    The inputs are first run through the already-fitted preprocessing objects
    (``poly_features``, ``pca``, ``imputer``, ``scalar``) via ``process_data``.
    Each model's per-class outputs are multiplied by 100 and stored under the
    columns in ``ALL_CASES_PREDICTIONS_COLUMS``, with a ``TYPE`` column naming
    the model ("MLP", "RF", "KNN" or "SGD").

    Args:
        all_cases_features: Raw (unprocessed) feature rows to score.

    Returns:
        A DataFrame holding the MLP rows, then RF, KNN and SGD rows. Each
        model's rows keep their own 0-based index, so index values repeat
        across models.
    """
    all_cases_features, _, _, _, _, _, _ = process_data(
        x=all_cases_features, y=all_cases_features,
        poly_features=poly_features, pca=pca,
        OPTIMUM_DIMENSION=OPTIMUM_DIMENSION, imputer=imputer, scalar=scalar)

    # (TYPE label, callable returning one score per class for each row).
    # NOTE(review): column labelling assumes each model emits exactly the ten
    # classes in 0..9 order - confirm every digit occurs in the training labels.
    predictors = (
        ("MLP", model.predict),
        ("RF", forest_clf.predict_proba),
        ("KNN", knn_clf.predict_proba),
        ("SGD", sgd_clf.predict_proba),
    )

    frames = []
    for model_type, predict in predictors:
        percentages = np.multiply(predict(all_cases_features.copy()), 100)
        model_df = pd.DataFrame(percentages, columns=ALL_CASES_PREDICTIONS_COLUMS)
        model_df["TYPE"] = model_type
        frames.append(model_df)

    # One concat instead of repeated DataFrame.append, which was removed in
    # pandas 2.0 and copied the growing frame on every call.
    return pd.concat(frames)

In [13]:
# Read the all-cases input table and score a copy of it with the fitted models.
all_cases_features = load_data(file=ALL_CASES_INPUT_DATA)
all_cases_predictions_df = get_all_cases_predictions(all_cases_features=all_cases_features.copy())

In [14]:
# First five rows of the stacked predictions table.
all_cases_predictions_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,TYPE
0,0.937354,21.274891,30.506653,3.711892,21.802532,2.402566,1.984207,5.40071,9.636287,2.342908,MLP
1,3.781345,3.509519,44.956905,4.011583,31.728107,1.8457,1.487259,2.777092,3.804633,2.097848,MLP
2,2.3726,6.181278,17.321737,6.730519,50.803001,1.653923,2.755217,3.083224,5.930054,3.168456,MLP
3,2.855825,1.154422,16.100363,4.070703,67.782677,1.06171,1.573211,1.646328,2.018954,1.7358,MLP
4,0.279252,5.779332,71.463989,2.168461,5.687382,1.377366,0.528813,5.861566,1.349989,5.503853,MLP


In [15]:
# Last five rows of the stacked predictions table.
all_cases_predictions_df.tail(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,TYPE
123,1.389167e-40,9.30888e-05,1.9165e-64,3.625981e-47,6.876645e-24,1.111708e-12,4.327368e-68,1.582514e-34,1.4709579999999999e-58,99.99991,SGD
124,100.0,6.570687e-75,5.699023e-76,1.110397e-32,7.362085000000001e-23,2.637263e-34,4.501465e-59,2.66858e-06,8.656108999999999e-41,1.445087e-40,SGD
125,1.4591549999999999e-34,4.7637490000000004e-33,4.196372e-15,100.0,2.160484e-15,9.814836e-20,5.124223e-45,8.720344e-25,3.985954e-08,1.570462e-49,SGD
126,100.0,2.761706e-49,5.5166810000000004e-71,7.779503e-29,2.115436e-33,3.6714980000000004e-33,1.5060229999999998e-50,4.095548e-06,4.250641e-15,1.0434590000000001e-40,SGD
127,1.744366e-40,4.993504e-11,1.751118e-34,2.514804e-25,7.630724e-28,1.0321310000000001e-22,9.535684e-46,1.990829e-11,100.0,1.634272e-33,SGD
