In [1]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

TRAIN_DATA_FILE = "train_data.csv"
TEST_DATA_FILE = "test_data.csv"
ALL_CASES_INPUT_DATA = "all_cases_input_data.csv"
ALL_CASES_PREDICTIONS = "all_cases_predictions.csv"
ALL_CASES_PREDICTIONS_COLUMS = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]

def load_data(file=TRAIN_DATA_FILE, header=True):
    csv_path = os.path.join("", file)
    if header:
        return pd.read_csv(csv_path)
    else:
        return pd.read_csv(csv_path, header=None)

In [2]:
train_data = load_data(TRAIN_DATA_FILE)
train_labels = train_data["DIGIT"]
train_data.drop("DIGIT", axis=1, inplace=True)

test_data = load_data(TEST_DATA_FILE)
test_labels = test_data["DIGIT"]
test_data.drop("DIGIT", axis=1, inplace=True)

all_cases_input_data = load_data(ALL_CASES_INPUT_DATA)

In [3]:
from sklearn.decomposition import PCA

def get_dims_variances(x, minDim, tol=None, thres=0.01):
    dims = []
    variances = []
    optimum_dim = minDim
    prev_min_variance = None
    dim = minDim
    
    while(True):
        pca = PCA(n_components=dim)
        pca.fit(x)
        variance = np.array(pca.explained_variance_ratio_)
        min_variance = variance.min()
        
        dims.append(dim)
        variances.append(min_variance)
        
        if tol != None and prev_min_variance != None and min_variance + tol > prev_min_variance:
            break

        else:
            if prev_min_variance != None and min_variance < thres:
                break
                
        prev_min_variance = min_variance
        optimum_dim = dim
        dim = dim + 1

    return dims, variances, optimum_dim

In [4]:
from sklearn.preprocessing import PolynomialFeatures
def process_data(x, y=None, poly_features=None, pca=None, OPTIMUM_DIMENSION=None):
    training_features = x.copy()
    testing_features = y.copy()
    
    if poly_features == None:
        poly_features = PolynomialFeatures(degree=2, include_bias=False)
        poly_features.fit(training_features)
        
    training_features = poly_features.transform(training_features)
    testing_features = poly_features.transform(testing_features)

    if OPTIMUM_DIMENSION == None:
        dims, variances, OPTIMUM_DIMENSION = get_dims_variances(x=training_features, minDim=2, thres=0.01)
        print("Optimum Dimensions: ", OPTIMUM_DIMENSION)
        import matplotlib.pyplot as plt
        plt.plot(dims, variances)
        plt.show()
        dim_df = pd.DataFrame()
        dim_df["DIM"] = dims
        dim_df["VAR"] = variances
        print(dim_df)

    if pca == None:  
        pca = PCA(random_state=42, n_components=OPTIMUM_DIMENSION)
        pca.fit(training_features)
        
    training_features = pca.transform(training_features)
    testing_features = pca.transform(testing_features)
    
    return training_features, testing_features, poly_features, pca, OPTIMUM_DIMENSION

In [5]:
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler

imputer = Imputer(strategy="median")
training_features = imputer.fit_transform(train_data)
testing_features = imputer.transform(test_data)
all_cases_features = imputer.transform(all_cases_input_data)

scalar = StandardScaler()
training_features = scalar.fit_transform(training_features)
testing_features = scalar.transform(testing_features)
all_cases_features = scalar.transform(all_cases_features)

training_features, testing_features, poly_features, pca, OPTIMUM_DIMENSION = process_data(x=training_features, y=testing_features)

training_labels = train_labels.values
testing_labels = test_labels.values

Optimum Dimensions:  18


<matplotlib.figure.Figure at 0x103a54710>

    DIM       VAR
0     2  0.126065
1     3  0.091955
2     4  0.078825
3     5  0.072726
4     6  0.062582
5     7  0.052270
6     8  0.047362
7     9  0.044440
8    10  0.042660
9    11  0.036638
10   12  0.030894
11   13  0.025990
12   14  0.022621
13   15  0.016864
14   16  0.013965
15   17  0.011354
16   18  0.010331
17   19  0.006118


In [6]:
# SGD Classifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.base import clone

X_train = training_features
Y_train = training_labels
X_test = testing_features
Y_test = testing_labels

sgd_clf = SGDClassifier(random_state=42, penalty="l1")
cross_val_scores = cross_val_score(clone(sgd_clf), X_train, Y_train, cv=2, scoring="accuracy")
print("Cross Val Scores on training set\n", cross_val_scores)

sgd_clf.fit(X_train, Y_train)
print("\n\nAccuracy on testing data set\n", sum(Y_test == sgd_clf.predict(X_test)) / len(Y_test))

Cross Val Scores on training set
 [0.6 0.4]


Accuracy on testing data set
 0.9


In [7]:
# KNeighbors Classifier
from sklearn.neighbors import KNeighborsClassifier 

X_train = training_features
Y_train = training_labels
X_test = testing_features
Y_test = testing_labels

parameters = {'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute'],
              'n_neighbors' : [2,3,4,5,6,7,8,9,10],
              'weights' : ['uniform', 'distance']
             }
clf = GridSearchCV(KNeighborsClassifier(), parameters)
clf.fit(X_train, Y_train)
print("\nBest params: ", clf.best_params_)

knn_clf = KNeighborsClassifier(algorithm='auto', n_neighbors=2, weights='uniform')
print("\nCross Val Scores on training set\n", cross_val_score(clone(knn_clf), X_train, Y_train, cv=2, scoring="accuracy"))

knn_clf.fit(X_train, Y_train)
print("\n\nAccuracy on testing data set\n", sum(Y_test == clf.predict(X_test)) / len(Y_test))


Best params:  {'algorithm': 'auto', 'n_neighbors': 2, 'weights': 'distance'}

Cross Val Scores on training set
 [0.5 0.5]


Accuracy on testing data set
 0.9


In [8]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier 

X_train = training_features
Y_train = training_labels
X_test = testing_features
Y_test = testing_labels

forest_clf = RandomForestClassifier(random_state=42, oob_score=True, n_estimators=5)
print("Cross Val Scores on training set\n", cross_val_score(clone(forest_clf), X_train, Y_train, cv=2, scoring="accuracy"))

forest_clf.fit(X_train, Y_train)
print("\n\nAccuracy on testing data set\n", sum(Y_test == forest_clf.predict(X_test)) / len(Y_test))

Cross Val Scores on training set
 [0.15 0.1 ]


Accuracy on testing data set
 0.95


In [9]:
# MLP Classifier
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from sklearn.preprocessing import LabelBinarizer

X_train = training_features
Y_train = training_labels
X_test = testing_features
Y_test = testing_labels
batch_size = 3
num_classes = 10
epochs = 15

model = Sequential()
model.add(Dense(256, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dropout(0.8))
model.add(Dense(num_classes, activation='softmax'))

model.summary()

adam = Adam()
model.compile(loss='categorical_crossentropy',
              optimizer=adam,
              metrics=['accuracy'])

binarizer = LabelBinarizer()
binarizer.fit(Y_train)
Y_train = binarizer.transform(Y_train)
Y_test = binarizer.transform(Y_test)

history = model.fit(X_train, Y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(X_test, Y_test))

score = model.evaluate(X_test, Y_test, verbose=1)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
print('Test accuracy/loss ratio:', score[1] / score[0])

Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 256)               4864      
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 10)                2570      
Total params: 7,434
Trainable params: 7,434
Non-trainable params: 0
_________________________________________________________________
Train on 30 samples, validate on 20 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Test loss: 0.8972346186637878
Test accuracy: 1.0
Test accuracy/loss ratio: 1.1145356846454009


In [10]:
x = [[0,0,0,1,0,0,0]]
x_,_,_,_,_ = process_data(x=x, y=x, poly_features=poly_features, pca=pca, OPTIMUM_DIMENSION=OPTIMUM_DIMENSION)
print(model.predict(x_.copy()))
print(forest_clf.predict_proba(x_.copy()))
print(knn_clf.predict_proba(x_.copy()))
print(sgd_clf.predict(x_.copy()))

[[0.11031048 0.06208147 0.11678445 0.11123959 0.08184429 0.11866929
  0.1073188  0.06332638 0.09520247 0.13322277]]
[[0.  0.4 0.  0.  0.2 0.  0.  0.  0.2 0.2]]
[[0.  0.  0.  0.  0.  0.5 0.  0.  0.  0.5]]
[5]


In [11]:
x = [[0,1,0,1,0,0,0]]
x_,_,_,_,_ = process_data(x=x, y=x, poly_features=poly_features, pca=pca, OPTIMUM_DIMENSION=OPTIMUM_DIMENSION)
print(model.predict(x_.copy()))
print(forest_clf.predict_proba(x_.copy()))
print(knn_clf.predict_proba(x_.copy()))
print(sgd_clf.predict(x_.copy()))

[[0.1054697  0.0578808  0.10781884 0.13407862 0.09722768 0.10026815
  0.08649302 0.0578361  0.10698884 0.1459383 ]]
[[0.  0.2 0.2 0.  0.  0.  0.  0.  0.2 0.4]]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]
[3]


In [33]:
def get_all_cases_predictions(all_cases_features):
    preds_df = pd.DataFrame(columns=ALL_CASES_PREDICTIONS_COLUMS)
    preds_df["TYPE"] = None
    
    all_cases_features,_,_,_,_ = process_data(x=all_cases_features, y=all_cases_features, 
                                              poly_features=poly_features, pca=pca, 
                                              OPTIMUM_DIMENSION=OPTIMUM_DIMENSION)
    
    mlp_preds = model.predict(all_cases_features.copy())
    mlp_preds = np.multiply(mlp_preds, 100)
    mlp_df = pd.DataFrame(mlp_preds, columns=ALL_CASES_PREDICTIONS_COLUMS)
    print(mlp_preds)
    mlp_df["TYPE"] = "MLP"
    preds_df = preds_df.append(mlp_df)
    
    rf_preds = forest_clf.predict(all_cases_features.copy())
    knn_preds = knn_clf.predict(all_cases_features.copy())
    sgd_preds = sgd_clf.predict(all_cases_features.copy())
    
    return preds_df

In [34]:
all_cases_predictions_df = get_all_cases_predictions(all_cases_features)

[[ 1.5532231 39.33496    6.9147215 ...  9.024629  14.086148   8.52484  ]
 [ 7.5096054 12.28265   13.228414  ...  8.306694   9.907604   6.382473 ]
 [ 3.334197  11.879541   8.093478  ...  7.020385   6.4445457  5.5389566]
 ...
 [10.774941   4.0829086 18.309965  ...  4.056551  13.190492   6.9754896]
 [58.334465   2.5842335  4.2555194 ...  3.2960277  9.138535   3.03973  ]
 [ 8.517955   5.9594374  7.3085656 ...  5.555501  45.96287    5.274289 ]]


In [35]:
all_cases_predictions_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,TYPE
0,1.553223,39.334961,6.914721,4.262697,6.889342,3.366898,6.042534,9.024629,14.086148,8.524840,MLP
1,7.509605,12.282650,13.228414,9.628992,18.349697,8.372885,6.030983,8.306694,9.907604,6.382473,MLP
2,3.334197,11.879541,8.093478,9.054167,38.879452,3.876256,5.879017,7.020385,6.444546,5.538957,MLP
3,4.830506,7.526378,4.747234,10.053899,44.879513,6.374301,5.093182,5.423261,6.215277,4.856456,MLP
4,4.196395,13.264624,6.939872,4.464529,6.486824,6.161917,10.069639,7.112194,6.979593,34.324413,MLP
5,7.192058,7.471269,18.833324,7.330932,11.896738,7.181845,12.511088,7.406135,4.772666,15.403947,MLP
6,8.541879,6.164216,5.498583,15.079559,30.714527,4.132267,7.180537,6.662847,3.341108,12.684469,MLP
7,6.336843,7.559615,5.517073,13.722263,24.713366,8.548863,11.741745,6.569133,4.707669,10.583437,MLP
8,6.267414,16.135756,13.460493,7.860033,6.466064,9.541339,8.281985,10.077622,11.444051,10.465247,MLP
9,9.336064,5.892505,23.692274,12.671237,6.915795,12.022660,7.145582,7.949306,6.817841,7.556728,MLP
