In [1]:
import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# CSV file names used by this notebook; load_data() resolves them against the
# current working directory.
TRAIN_DATA_FILE = "train_data.csv"
TEST_DATA_FILE = "test_data.csv"
ALL_CASES_INPUT_DATA = "all_cases_input_data.csv"
ALL_CASES_PREDICTIONS = "all_cases_predictions.csv"
# Column labels "0".."9" for the per-class prediction frames.
# NOTE(review): "COLUMS" is a typo for "COLUMNS"; the name is kept because
# other cells reference it.
ALL_CASES_PREDICTIONS_COLUMS = [str(digit) for digit in range(10)]

def load_data(file=TRAIN_DATA_FILE, header=True):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file : str
        Path of the CSV file; defaults to ``TRAIN_DATA_FILE``.
    header : bool
        When truthy, ``pd.read_csv`` is called with its default header
        handling. When falsy, the file is read with ``header=None``.

    Returns
    -------
    pandas.DataFrame
    """
    csv_path = os.path.join("", file)
    if not header:
        return pd.read_csv(csv_path, header=None)
    return pd.read_csv(csv_path)

In [2]:
# Training split: pop() removes the "DIGIT" label column from the frame and
# returns it, leaving only the feature columns in train_data.
train_data = load_data(TRAIN_DATA_FILE)
train_labels = train_data.pop("DIGIT")

# Test split, handled the same way.
test_data = load_data(TEST_DATA_FILE)
test_labels = test_data.pop("DIGIT")

# Input rows for the "all cases" prediction table.
all_cases_input_data = load_data(ALL_CASES_INPUT_DATA)

In [3]:
from sklearn.decomposition import PCA

def get_dims_variances(x, minDim, tol=None, thres=0.01):
    """Scan PCA sizes upward from ``minDim`` and pick a component count.

    For each candidate size a fresh PCA is fitted on ``x`` and the smallest
    entry of ``explained_variance_ratio_`` is recorded. The scan stops at the
    first size (after the initial one) for which either

    * ``tol`` is given and the smallest ratio dropped by less than ``tol``
      compared with the previous size, or
    * the smallest ratio fell below ``thres``,

    or once the largest size PCA accepts, ``min(n_samples, n_features)``, has
    been evaluated. Without that cap the loop would run past the limit and
    PCA would raise ``ValueError``.

    Parameters
    ----------
    x : 2-D array-like
        Data the PCA models are fitted on.
    minDim : int
        First number of components to try.
    tol : float or None
        Minimum required drop of the smallest ratio between consecutive
        sizes; ``None`` disables this stop rule.
    thres : float
        Lower bound for the smallest explained-variance ratio.

    Returns
    -------
    (dims, variances, optimum_dim)
        ``dims`` and ``variances`` list every size tried and its smallest
        ratio, including the size that triggered the stop. ``optimum_dim`` is
        the last size that did not trigger a stop rule (``minDim`` if none).
    """
    # PCA cannot extract more components than min(n_samples, n_features).
    max_dim = min(np.shape(x))

    dims = []
    variances = []
    optimum_dim = minDim
    prev_min_variance = None
    dim = minDim

    while True:
        pca = PCA(n_components=dim)
        pca.fit(x)
        min_variance = np.array(pca.explained_variance_ratio_).min()

        dims.append(dim)
        variances.append(min_variance)

        # Stop rules only apply from the second candidate on, so the very
        # first size is always accepted.
        if prev_min_variance is not None:
            if tol is not None and min_variance + tol > prev_min_variance:
                break
            if min_variance < thres:
                break

        prev_min_variance = min_variance
        optimum_dim = dim

        # Largest valid size reached without hitting a stop rule: finish
        # here instead of asking PCA for an impossible component count.
        if dim >= max_dim:
            break
        dim += 1

    return dims, variances, optimum_dim

In [4]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler

def process_data(x, y, poly_features=None, pca=None, OPTIMUM_DIMENSION=None, imputer=None, scalar=None):
    """Run two feature sets through impute -> scale -> polynomial -> PCA.

    Every transformer that is not supplied is created and fitted on ``x``
    only, then applied to both ``x`` and ``y``. Passing previously fitted
    transformers back in applies the same pipeline to new data.

    Parameters
    ----------
    x : array-like with a ``copy()`` method
        Training features; the only data transformers are fitted on.
    y : array-like with a ``copy()`` method
        Second feature set (the test features). Despite its name this is
        not a label vector.
    poly_features : fitted PolynomialFeatures or None
        ``None`` fits a degree-2 expansion without bias column.
    pca : fitted PCA or None
        ``None`` fits a PCA with ``OPTIMUM_DIMENSION`` components.
    OPTIMUM_DIMENSION : int or None
        Number of PCA components. ``None`` triggers a scan with
        ``get_dims_variances`` (starting at 2 components, threshold 0.005)
        and prints/plots the scan results.
    imputer : fitted Imputer or None
        ``None`` fits a median imputer.
    scalar : fitted StandardScaler or None
        ``None`` fits a new StandardScaler.

    Returns
    -------
    tuple
        ``(training_features, testing_features, poly_features, pca,
        OPTIMUM_DIMENSION, imputer, scalar)``, i.e. the transformed inputs
        followed by the transformers and dimension actually used.
    """
    training_features = x.copy()
    testing_features = y.copy()

    # Identity checks (`is None`) are used throughout: `== None` delegates to
    # the argument's own __eq__, which is not guaranteed to return a bool.

    # Step 1: fill missing values.
    if imputer is None:
        imputer = Imputer(strategy="median")
        imputer.fit(training_features)

    training_features = imputer.transform(training_features)
    testing_features = imputer.transform(testing_features)

    # Step 2: standardise.
    if scalar is None:
        scalar = StandardScaler()
        scalar.fit(training_features)

    training_features = scalar.transform(training_features)
    testing_features = scalar.transform(testing_features)

    # Step 3: polynomial feature expansion.
    if poly_features is None:
        poly_features = PolynomialFeatures(degree=2, include_bias=False)
        poly_features.fit(training_features)

    training_features = poly_features.transform(training_features)
    testing_features = poly_features.transform(testing_features)

    # Step 4: choose the PCA size if the caller did not fix it, and show the
    # scan so the choice can be inspected.
    if OPTIMUM_DIMENSION is None:
        import matplotlib.pyplot as plt

        dims, variances, OPTIMUM_DIMENSION = get_dims_variances(x=training_features, minDim=2, thres=0.005)
        print("Optimum Dimensions: ", OPTIMUM_DIMENSION)
        plt.plot(dims, variances)
        plt.show()
        dim_df = pd.DataFrame({"DIM": dims, "VAR": variances}, columns=["DIM", "VAR"])
        print(dim_df)

    # Step 5: project onto the principal components.
    if pca is None:
        pca = PCA(random_state=42, n_components=OPTIMUM_DIMENSION)
        pca.fit(training_features)

    training_features = pca.transform(training_features)
    testing_features = pca.transform(testing_features)

    return training_features, testing_features, poly_features, pca, OPTIMUM_DIMENSION, imputer, scalar

In [5]:
# Fit the whole preprocessing pipeline on the training split and keep the
# fitted transformers so later cells can apply them to new inputs.
(
    training_features,
    testing_features,
    poly_features,
    pca,
    OPTIMUM_DIMENSION,
    imputer,
    scalar,
) = process_data(x=train_data, y=test_data)

# Labels as plain numpy arrays for the estimators below.
training_labels, testing_labels = train_labels.values, test_labels.values

Optimum Dimensions:  22


<matplotlib.figure.Figure at 0x10b6bb9b0>

    DIM       VAR
0     2  0.104665
1     3  0.086615
2     4  0.076949
3     5  0.066440
4     6  0.064644
5     7  0.058735
6     8  0.050645
7     9  0.047481
8    10  0.040086
9    11  0.037579
10   12  0.032728
11   13  0.029205
12   14  0.027545
13   15  0.024633
14   16  0.019721
15   17  0.016955
16   18  0.015201
17   19  0.011630
18   20  0.007355
19   21  0.006022
20   22  0.005319
21   23  0.002975


In [6]:
# SGD Classifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.base import clone

# Local aliases for the processed splits.
X_train, Y_train = training_features, training_labels
X_test, Y_test = testing_features, testing_labels

# Elastic-net regularised classifier with log loss, trained by SGD.
sgd_clf = SGDClassifier(loss='log', penalty="elasticnet", random_state=42)

# 3-fold cross-validation runs on an unfitted clone so sgd_clf stays untouched.
cross_val_scores = cross_val_score(clone(sgd_clf), X_train, Y_train, cv=3, scoring="accuracy")
print("Cross Val Scores on training set\n", cross_val_scores)

# Final fit on the full training split, scored on the held-out split.
sgd_clf.fit(X_train, Y_train)
sgd_test_hits = Y_test == sgd_clf.predict(X_test)
print("\n\nAccuracy on testing data set\n", sgd_test_hits.sum() / len(Y_test))

Cross Val Scores on training set
 [0.66666667 0.9        0.8       ]


Accuracy on testing data set
 0.9642857142857143


In [7]:
# KNeighbors Classifier
from sklearn.neighbors import KNeighborsClassifier 

X_train = training_features
Y_train = training_labels
X_test = testing_features
Y_test = testing_labels

# Exhaustive search over neighbour count, weighting scheme and search algorithm.
parameters = {'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute'],
              'n_neighbors' : [2,3,4,5,6,7,8,9,10],
              'weights' : ['uniform', 'distance']
             }
clf = GridSearchCV(KNeighborsClassifier(), parameters)
clf.fit(X_train, Y_train)
print("\nBest params: ", clf.best_params_)

# Build the stand-alone model from the search result instead of hard-coding
# values copied from a previous run's output; that keeps knn_clf in sync with
# whatever "Best params" reports on this run.
knn_clf = KNeighborsClassifier(**clf.best_params_)
print("\nCross Val Scores on training set\n", cross_val_score(clone(knn_clf), X_train, Y_train, cv=3, scoring="accuracy"))

knn_clf.fit(X_train, Y_train)
# Score knn_clf itself (previously the GridSearchCV object `clf` was scored
# here), since knn_clf is the model the later prediction cells use.
print("\n\nAccuracy on testing data set\n", sum(Y_test == knn_clf.predict(X_test)) / len(Y_test))


Best params:  {'algorithm': 'auto', 'n_neighbors': 2, 'weights': 'uniform'}

Cross Val Scores on training set
 [0.72222222 0.6        0.8       ]


Accuracy on testing data set
 0.7857142857142857


In [8]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier 

# Local aliases for the processed splits.
X_train, Y_train = training_features, training_labels
X_test, Y_test = testing_features, testing_labels

# Small forest (5 trees) with out-of-bag scoring enabled.
forest_clf = RandomForestClassifier(n_estimators=5, oob_score=True, random_state=42)

# 3-fold cross-validation on an unfitted clone.
forest_cv_scores = cross_val_score(clone(forest_clf), X_train, Y_train, cv=3, scoring="accuracy")
print("Cross Val Scores on training set\n", forest_cv_scores)

# Final fit on the full training split, scored on the held-out split.
forest_clf.fit(X_train, Y_train)
forest_test_hits = Y_test == forest_clf.predict(X_test)
print("\n\nAccuracy on testing data set\n", forest_test_hits.sum() / len(Y_test))

Cross Val Scores on training set
 [0.22222222 0.2        0.4       ]


Accuracy on testing data set
 0.9285714285714286


In [9]:
# MLP Classifier
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from sklearn.preprocessing import LabelBinarizer

# Local aliases for the processed splits and the training hyper-parameters.
X_train, Y_train = training_features, training_labels
X_test, Y_test = testing_features, testing_labels
batch_size, num_classes, epochs = 3, 10, 15

# One hidden ReLU layer with dropout, followed by a softmax over the classes.
model = Sequential([
    Dense(256, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
])

model.summary()

adam = Adam()
model.compile(optimizer=adam,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# One-hot encode the labels; the encoder is fitted on the training labels only.
binarizer = LabelBinarizer().fit(Y_train)
Y_train, Y_test = binarizer.transform(Y_train), binarizer.transform(Y_test)

# The held-out split is passed as validation data, so it is evaluated after
# every epoch.
history = model.fit(
    X_train,
    Y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_data=(X_test, Y_test),
)

# score[0] is the loss and score[1] the accuracy metric requested in compile().
score = model.evaluate(X_test, Y_test, verbose=1)
test_loss, test_accuracy = score[0], score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)
print('Test accuracy/loss ratio:', test_accuracy / test_loss)

Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 256)               5888      
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 10)                2570      
Total params: 8,458
Trainable params: 8,458
Non-trainable params: 0
_________________________________________________________________
Train on 38 samples, validate on 28 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Test loss: 0.22933180630207062
Test accuracy: 0.9285714030265808
Test accuracy/loss ratio: 4.04903017160859


In [10]:
# Single hand-written input row, pushed through the already-fitted pipeline.
x = [[0,0,0,0,0,1,1]]
x_, _, _, _, _, _, _ = process_data(
    x=x,
    y=x,
    poly_features=poly_features,
    pca=pca,
    OPTIMUM_DIMENSION=OPTIMUM_DIMENSION,
    imputer=imputer,
    scalar=scalar,
)

# Print each model's answer in turn: MLP, random forest, KNN, SGD.
for predict_fn in (model.predict, forest_clf.predict_proba, knn_clf.predict_proba, sgd_clf.predict):
    print(predict_fn(x_.copy()))

[[0.02388783 0.00991701 0.00654111 0.00775802 0.90135974 0.03303823
  0.00259724 0.00644441 0.00484932 0.00360716]]
[[0.2 0.  0.  0.  0.6 0.  0.2 0.  0.  0. ]]
[[0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]]
[4]


In [11]:
# Second hand-written input row, pushed through the already-fitted pipeline.
x = [[0,1,0,1,1,1,0]]
x_, _, _, _, _, _, _ = process_data(
    x=x,
    y=x,
    poly_features=poly_features,
    pca=pca,
    OPTIMUM_DIMENSION=OPTIMUM_DIMENSION,
    imputer=imputer,
    scalar=scalar,
)

# Print each model's answer in turn: MLP, random forest, KNN, SGD.
for predict_fn in (model.predict, forest_clf.predict_proba, knn_clf.predict_proba, sgd_clf.predict):
    print(predict_fn(x_.copy()))

[[0.518829   0.01169285 0.08506533 0.05207281 0.04648864 0.0260013
  0.10674343 0.03346601 0.08589728 0.03374337]]
[[0.4 0.  0.  0.  0.  0.2 0.  0.  0.2 0.2]]
[[0.5 0.  0.  0.5 0.  0.  0.  0.  0.  0. ]]
[0]


In [12]:
def get_all_cases_predictions(all_cases_features):
    """Collect per-class prediction percentages from all four models.

    The raw features are run through the already-fitted preprocessing
    pipeline (module-level ``poly_features``, ``pca``, ``OPTIMUM_DIMENSION``,
    ``imputer``, ``scalar``) and then scored by the module-level ``model``
    (MLP), ``forest_clf``, ``knn_clf`` and ``sgd_clf``.

    Parameters
    ----------
    all_cases_features : array-like with a ``copy()`` method
        Raw (unprocessed) feature rows.

    Returns
    -------
    pandas.DataFrame
        One row per (model, input row). The columns are
        ``ALL_CASES_PREDICTIONS_COLUMS`` holding each model's output
        multiplied by 100, plus ``"TYPE"`` naming the model ("MLP", "RF",
        "KNN", "SGD"). Blocks are stacked in that order and each keeps its
        own 0..n-1 index, so index values repeat once per model.
    """
    all_cases_features, _, _, _, _, _, _ = process_data(
        x=all_cases_features,
        y=all_cases_features,
        poly_features=poly_features,
        pca=pca,
        OPTIMUM_DIMENSION=OPTIMUM_DIMENSION,
        imputer=imputer,
        scalar=scalar,
    )

    # (TYPE label, scoring function), in output order.
    predictors = [
        ("MLP", model.predict),
        ("RF", forest_clf.predict_proba),
        ("KNN", knn_clf.predict_proba),
        ("SGD", sgd_clf.predict_proba),
    ]

    frames = []
    for type_label, predict in predictors:
        # Each model gets its own copy of the features.
        percentages = np.multiply(predict(all_cases_features.copy()), 100)
        frame = pd.DataFrame(percentages, columns=ALL_CASES_PREDICTIONS_COLUMS)
        frame["TYPE"] = type_label
        frames.append(frame)

    # A single concat replaces the repeated DataFrame.append calls, which
    # re-copied the growing frame each time; DataFrame.append was removed in
    # pandas 2.0.
    return pd.concat(frames)

In [13]:
# Load the raw "all cases" rows and score a copy of them with every model.
all_cases_features = load_data(file=ALL_CASES_INPUT_DATA)
all_cases_predictions_df = get_all_cases_predictions(
    all_cases_features=all_cases_features.copy()
)

In [14]:
# First five rows of the stacked prediction table.
all_cases_predictions_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,TYPE
0,0.381089,88.689598,0.595055,0.722388,3.782138,0.533339,0.327221,1.754858,2.289423,0.924888,MLP
1,17.474035,29.048544,4.820099,3.918885,25.696648,4.417932,1.682335,2.958868,6.509285,3.473367,MLP
2,0.428024,2.643808,0.489288,0.864877,92.94455,1.276228,0.128378,0.501736,0.375435,0.347664,MLP
3,2.388781,0.991701,0.65411,0.775801,90.135971,3.303822,0.259724,0.644441,0.484932,0.360716,MLP
4,2.50738,39.200539,5.189125,4.01441,10.353764,3.840787,0.860542,2.556046,2.367646,29.10976,MLP


In [15]:
# Last five rows of the stacked prediction table.
all_cases_predictions_df.tail(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,TYPE
123,4.305213e-59,6.961097e-49,4.6681779999999996e-26,1.0326240000000001e-25,2.840106e-75,5.172385e-40,1.7080200000000001e-62,1.412779e-75,7.6997289999999995e-34,100.0,SGD
124,100.0,2.639525e-148,9.811718e-56,1.358361e-22,6.992827e-75,6.295976e-94,4.8314490000000006e-175,4.517859e-32,1.566642e-87,3.894994e-146,SGD
125,2.17489e-126,7.982211999999999e-136,50.0,50.0,1.5449520000000002e-23,1.1382549999999999e-100,8.984591e-136,1.20519e-33,3.959178e-06,1.142351e-120,SGD
126,100.0,9.840145e-107,3.590375e-82,3.0402499999999996e-19,1.803035e-50,2.556247e-90,5.883163e-152,4.7772139999999995e-102,6.542462e-72,5.270384e-83,SGD
127,4.364533e-68,1.7843620000000002e-66,2.894008e-34,2.647241e-27,5.902222999999999e-26,4.175965e-69,1.1061779999999999e-58,4.560587e-68,100.0,2.618063e-24,SGD
