In [1]:
import os
import pandas as pd
import numpy as np
import math
import warnings
warnings.filterwarnings("ignore")

# Seven-segment blueprint: row i describes digit i, one column per segment
# (1 = lit). The rows follow the standard a, b, c, d, e, f, g segment order.
# Fix: digit 4 lights segments b, c, f and g; the previous row omitted b, so a
# perfectly displayed 4 was treated as incompatible with the digit 4.
LED_BLUE_PRINT = np.array([[1,1,1,1,1,1,0],  # 0
                     [0,1,1,0,0,0,0],  # 1
                     [1,1,0,1,1,0,1],  # 2
                     [1,1,1,1,0,0,1],  # 3
                     [0,1,1,0,0,1,1],  # 4
                     [1,0,1,1,0,1,1],  # 5
                     [1,0,1,1,1,1,1],  # 6
                     [1,1,1,0,0,0,0],  # 7
                     [1,1,1,1,1,1,1],  # 8
                     [1,1,1,1,0,1,1]   # 9
                    ])

# Input CSV files read by load_data().
TRAIN_DATA_FILE = "train_data.csv"
TEST_DATA_FILE = "test_data.csv"
ALL_CASES_INPUT_DATA = "all_cases_input_data.csv"

# Output CSV written at the end of the notebook.
ALL_CASES_PREDICTIONS = "all_cases_predictions.csv"

# One prediction column per digit, named "0" through "9".
ALL_CASES_PREDICTIONS_COLUMS = [str(digit) for digit in range(10)]

def load_data(file=TRAIN_DATA_FILE, header=True):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    file : str
        Path of the CSV file to read.
    header : bool
        When truthy the header row is inferred by pandas; otherwise the file
        is read as having no header row.

    Returns
    -------
    pandas.DataFrame
    """
    csv_path = os.path.join("", file)
    # "infer" is read_csv's own default, i.e. what a call without `header` uses.
    header_row = "infer" if header else None
    return pd.read_csv(csv_path, header=header_row)

In [2]:
# Load the training set and split the "DIGIT" label column off the features.
# pop() returns the column and removes it from the frame in one step.
train_data = load_data(TRAIN_DATA_FILE)
train_labels = train_data.pop("DIGIT")

# Same split for the held-out test set.
test_data = load_data(TEST_DATA_FILE)
test_labels = test_data.pop("DIGIT")

all_cases_input_data = load_data(ALL_CASES_INPUT_DATA)

In [3]:
from sklearn.decomposition import PCA

def get_dims_variances(x, minDim, tol=None, thres=0.01):
    """Scan PCA sizes upward from ``minDim`` and pick a component count.

    For each candidate size a fresh PCA is fitted on ``x`` and the smallest
    explained-variance ratio among its components is recorded. The scan stops
    at the first size (after the initial one) for which either

    * ``tol`` is given and the smallest ratio dropped by less than ``tol``
      compared with the previous size, or
    * the smallest ratio fell below ``thres``.

    The scan also stops once the size reaches ``min(x.shape)``, the largest
    number of components PCA can extract, instead of running into an error.

    Parameters
    ----------
    x : 2-D array-like
        Feature matrix the PCA models are fitted on.
    minDim : int
        First number of components to try.
    tol : float or None
        Minimum drop in the smallest ratio required to keep scanning;
        None disables this check.
    thres : float
        Smallest acceptable explained-variance ratio for a component.

    Returns
    -------
    dims : list of int
        Every size that was evaluated (including the one that stopped the scan).
    variances : list of float
        Smallest explained-variance ratio for each entry of ``dims``.
    optimum_dim : int
        Last evaluated size that did not trigger a stop condition.

    Raises
    ------
    ValueError
        If ``minDim`` is larger than ``min(x.shape)``.
    """
    # PCA cannot produce more components than min(n_samples, n_features).
    max_dim = min(np.shape(x))
    if minDim > max_dim:
        raise ValueError(
            "minDim=%r exceeds the maximum number of PCA components (%r)" % (minDim, max_dim)
        )

    dims = []
    variances = []
    optimum_dim = minDim
    prev_min_variance = None

    for dim in range(minDim, max_dim + 1):
        pca = PCA(n_components=dim)
        pca.fit(x)
        min_variance = np.array(pca.explained_variance_ratio_).min()

        dims.append(dim)
        variances.append(min_variance)

        # The very first size is always accepted: there is nothing to compare to.
        if prev_min_variance is not None:
            if tol is not None and min_variance + tol > prev_min_variance:
                break
            if min_variance < thres:
                break

        prev_min_variance = min_variance
        optimum_dim = dim

    return dims, variances, optimum_dim

In [4]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler

def process_data(x, y, poly_features=None, pca=None, OPTIMUM_DIMENSION=None, imputer=None, scalar=None):
    """Apply the shared preprocessing pipeline to a pair of feature sets.

    Pipeline order: median imputation -> standard scaling -> degree-2
    polynomial expansion (no bias column) -> PCA. Every transformer that is
    passed in is reused as-is (only ``transform`` is called on it); every
    transformer left as None is created here and fitted on ``x`` only, then
    applied to both ``x`` and ``y``.

    Parameters
    ----------
    x : array-like or DataFrame
        Features the missing transformers are fitted on.
    y : array-like or DataFrame
        Second feature set, transformed with the same transformers
        (despite the name these are features, not labels).
    poly_features, pca, imputer, scalar : fitted transformer or None
        Previously fitted pipeline stages to reuse.
    OPTIMUM_DIMENSION : int or None
        Number of PCA components. When None it is chosen with
        ``get_dims_variances`` (starting at 2, threshold 0.005) and the scan
        is printed and plotted.

    Returns
    -------
    tuple
        ``(training_features, testing_features, poly_features, pca,
        OPTIMUM_DIMENSION, imputer, scalar)`` - the two transformed feature
        sets followed by the pipeline stages that were used.
    """
    training_features = x.copy()
    testing_features = y.copy()

    # Identity checks (`is None`) are used throughout: `== None` would be
    # delegated to the argument's own __eq__.
    if imputer is None:
        imputer = Imputer(strategy="median")
        imputer.fit(training_features)

    training_features = imputer.transform(training_features)
    testing_features = imputer.transform(testing_features)

    if scalar is None:
        scalar = StandardScaler()
        scalar.fit(training_features)

    training_features = scalar.transform(training_features)
    testing_features = scalar.transform(testing_features)

    if poly_features is None:
        poly_features = PolynomialFeatures(degree=2, include_bias=False)
        poly_features.fit(training_features)

    training_features = poly_features.transform(training_features)
    testing_features = poly_features.transform(testing_features)

    if OPTIMUM_DIMENSION is None:
        dims, variances, OPTIMUM_DIMENSION = get_dims_variances(x=training_features, minDim=2, thres=0.005)
        print("Optimum Dimensions: ", OPTIMUM_DIMENSION)
        # Imported lazily: plotting is only needed when the dimension is searched.
        import matplotlib.pyplot as plt
        plt.plot(dims, variances)
        plt.show()
        dim_df = pd.DataFrame()
        dim_df["DIM"] = dims
        dim_df["VAR"] = variances
        print(dim_df)

    if pca is None:
        pca = PCA(random_state=42, n_components=OPTIMUM_DIMENSION)
        pca.fit(training_features)

    training_features = pca.transform(training_features)
    testing_features = pca.transform(testing_features)

    return training_features, testing_features, poly_features, pca, OPTIMUM_DIMENSION, imputer, scalar

In [5]:
# Fit every preprocessing stage on the training features and keep the fitted
# stages so later cells can reuse them on other inputs.
(training_features, testing_features,
 poly_features, pca, OPTIMUM_DIMENSION,
 imputer, scalar) = process_data(x=train_data, y=test_data)

# Label columns as plain NumPy arrays.
training_labels = train_labels.values
testing_labels = test_labels.values

Optimum Dimensions:  9


<matplotlib.figure.Figure at 0x10d592748>

   DIM           VAR
0    2  2.007405e-01
1    3  1.802780e-01
2    4  7.318341e-02
3    5  6.323851e-02
4    6  4.993933e-02
5    7  3.180211e-02
6    8  2.395129e-02
7    9  9.933956e-03
8   10  4.527608e-32


In [6]:
# SGD Classifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.base import clone

# Inputs for this classifier cell.
X_train, Y_train = training_features, training_labels
X_test, Y_test = testing_features, testing_labels

# Optional hyper-parameter search; switched off by default.
gridSearch = False
if gridSearch:
    parameters = {'n_iter': list(range(2, 11))}
    clf = GridSearchCV(SGDClassifier(random_state=42), parameters)
    clf.fit(X_train, Y_train)
    print("\nBest params: ", clf.best_params_)

sgd_clf = SGDClassifier(random_state=42, penalty="elasticnet", loss='log', n_iter=3)

# Cross-validate an unfitted clone so sgd_clf itself is only fitted once below.
cross_val_scores = cross_val_score(clone(sgd_clf), X_train, Y_train, cv=3, scoring="accuracy")
print("Cross Val Scores on training set\n", cross_val_scores)

sgd_clf.fit(X_train, Y_train)
# Fraction of test samples whose predicted digit equals the true digit.
print("\n\nAccuracy on testing data set\n", np.mean(Y_test == sgd_clf.predict(X_test)))

Cross Val Scores on training set
 [1. 1. 1.]


Accuracy on testing data set
 0.35714285714285715




In [7]:
# KNeighbors Classifier
from sklearn.neighbors import KNeighborsClassifier 

# Inputs for this classifier cell.
X_train, Y_train = training_features, training_labels
X_test, Y_test = testing_features, testing_labels

# Optional hyper-parameter search; switched off by default.
gridSearch = False
if gridSearch:
    parameters = {
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'n_neighbors': list(range(2, 11)),
        'weights': ['uniform', 'distance'],
    }
    clf = GridSearchCV(KNeighborsClassifier(), parameters)
    clf.fit(X_train, Y_train)
    print("\nBest params: ", clf.best_params_)

knn_clf = KNeighborsClassifier(algorithm='auto', n_neighbors=2, weights='uniform')

# Cross-validate an unfitted clone so knn_clf itself is only fitted once below.
knn_cv_scores = cross_val_score(clone(knn_clf), X_train, Y_train, cv=3, scoring="accuracy")
print("\nCross Val Scores on training set\n", knn_cv_scores)

knn_clf.fit(X_train, Y_train)
# Fraction of test samples whose predicted digit equals the true digit.
print("\n\nAccuracy on testing data set\n", np.mean(Y_test == knn_clf.predict(X_test)))


Cross Val Scores on training set
 [1. 1. 1.]


Accuracy on testing data set
 0.5


In [8]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier 

# Inputs for this classifier cell.
X_train, Y_train = training_features, training_labels
X_test, Y_test = testing_features, testing_labels

# Hyper-parameter search over the forest size; enabled for this model.
gridSearch = True
if gridSearch:
    parameters = {'n_estimators': list(range(2, 11))}
    clf = GridSearchCV(RandomForestClassifier(random_state=42), parameters)
    clf.fit(X_train, Y_train)
    print("\nBest params: ", clf.best_params_)

# NOTE(review): the forest size below is a fixed literal; the search result
# printed above is not fed into it - confirm this is intended.
forest_clf = RandomForestClassifier(random_state=42, oob_score=True, n_estimators=7)

# Cross-validate an unfitted clone so forest_clf itself is only fitted once below.
forest_cv_scores = cross_val_score(clone(forest_clf), X_train, Y_train, cv=3, scoring="accuracy")
print("Cross Val Scores on training set\n", forest_cv_scores)

forest_clf.fit(X_train, Y_train)
# Fraction of test samples whose predicted digit equals the true digit.
print("\n\nAccuracy on testing data set\n", np.mean(Y_test == forest_clf.predict(X_test)))



Best params:  {'n_estimators': 3}
Cross Val Scores on training set
 [1. 1. 1.]


Accuracy on testing data set
 0.42857142857142855


In [9]:
# MLP Classifier
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from sklearn.preprocessing import LabelBinarizer

# Inputs and training settings for the neural network.
X_train, Y_train = training_features, training_labels
X_test, Y_test = testing_features, testing_labels
batch_size = 1
num_classes = 10
epochs = 20

# One hidden ReLU layer followed by a softmax layer with one unit per digit.
model = Sequential([
    Dense(512, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(num_classes, activation='softmax'),
])

model.summary()

model.compile(loss='categorical_crossentropy',
              optimizer=Adam(lr=0.0005),
              metrics=['accuracy'])

# One-hot encode the labels; the encoder is fitted on the training labels only.
binarizer = LabelBinarizer()
Y_train = binarizer.fit_transform(Y_train)
Y_test = binarizer.transform(Y_test)

# The test split is passed as validation data, so it is scored after each epoch.
history = model.fit(X_train, Y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(X_test, Y_test))

# With a single metric, evaluate() returns [loss, accuracy].
score = model.evaluate(X_test, Y_test, verbose=1)
test_loss, test_accuracy = score[0], score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)
print('Test accuracy/loss ratio:', test_accuracy / test_loss)

Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               5120      
_________________________________________________________________
dense_2 (Dense)              (None, 10)                5130      
Total params: 10,250
Trainable params: 10,250
Non-trainable params: 0
_________________________________________________________________
Train on 30 samples, validate on 28 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test loss: 1.9057906866073608
Test accuracy: 0.3928571343421936
Test accuracy/loss ratio: 0.20613865788249164


In [10]:
def inverse_leds(leds):
    """Return a flipped copy of ``leds``: each value v becomes (v + 1) mod 2.

    For 0/1 segment states this turns every lit segment off and every dark
    segment on. The input object is copied first and is not modified.
    """
    shifted = np.add(leds.copy(), 1)
    return np.fmod(shifted, 2)

In [11]:
from sklearn.utils.extmath import softmax
def boost_by_inv_logic(preds, preds_inv, X):
    """Suppress digits that cannot produce the observed LEDs, then softmax.

    A digit is treated as impossible for a sample when the sample has a lit
    segment that the digit's row in ``LED_BLUE_PRINT`` does not light (i.e.
    ``blueprint - sample`` has a negative entry). The scores of impossible
    digits are set to 0 and the resulting rows are passed through softmax.

    Parameters
    ----------
    preds : 2-D array-like, one row per sample, one column per digit
        Raw class scores. The caller's array is left untouched; the masking
        is done on a copy.
    preds_inv : array-like
        Scores for the inverted LED patterns. Currently unused; kept so the
        call signature stays stable.
    X : 2-D array-like, one row per sample
        Raw segment states matching the column layout of ``LED_BLUE_PRINT``.

    Returns
    -------
    numpy.ndarray
        Row-wise softmax of the masked scores.

    Notes
    -----
    NOTE(review): because softmax is applied after masking, a masked digit
    still receives a non-zero share (exp(0) = 1) - confirm this is intended.
    """
    boosted = np.array(preds)  # copy, so the caller's predictions are not mutated

    for scores, segments in zip(boosted, X):
        # One entry per digit: True when some lit segment is absent from that
        # digit's blueprint.
        impossible = (LED_BLUE_PRINT - segments).min(axis=1) < 0
        scores[impossible] = 0  # `scores` is a view into `boosted`

    return softmax(boosted)

In [12]:
def get_all_cases_predictions(all_cases_features):
    """Score every row of ``all_cases_features`` with all four trained models.

    The raw features (and their LED-inverted counterpart) are pushed through
    the already-fitted preprocessing stages, each model produces per-digit
    scores, the scores are post-processed by ``boost_by_inv_logic`` and
    scaled to percentages.

    Relies on names defined by earlier notebook cells: the fitted
    ``poly_features``, ``pca``, ``OPTIMUM_DIMENSION``, ``imputer`` and
    ``scalar``, plus the trained ``model``, ``forest_clf``, ``knn_clf`` and
    ``sgd_clf``.

    Parameters
    ----------
    all_cases_features : pandas.DataFrame
        Raw segment states, one row per case.

    Returns
    -------
    pandas.DataFrame
        Columns "0".."9" (percentages) plus "TYPE" ("MLP", "RF", "KNN" or
        "SGD"). The per-model blocks are stacked in that order and each block
        keeps its own 0..n-1 index.
    """
    all_cases_features_inv = inverse_leds(all_cases_features)

    def _preprocess(features):
        # Only the first returned item (the transformed features) is needed;
        # all stages are supplied, so nothing is re-fitted.
        transformed, _, _, _, _, _, _ = process_data(x=features, y=features,
                                                     poly_features=poly_features, pca=pca,
                                                     OPTIMUM_DIMENSION=OPTIMUM_DIMENSION,
                                                     imputer=imputer, scalar=scalar)
        return transformed

    all_cases_features_ = _preprocess(all_cases_features)
    all_cases_features_inv_ = _preprocess(all_cases_features_inv)

    # (label, scoring function) per model; the Keras model exposes its class
    # scores through predict(), the scikit-learn models through predict_proba().
    scorers = [
        ("MLP", model.predict),
        ("RF", forest_clf.predict_proba),
        ("KNN", knn_clf.predict_proba),
        ("SGD", sgd_clf.predict_proba),
    ]

    frames = []
    for label, score in scorers:
        preds = score(all_cases_features_.copy())
        preds_inv = score(all_cases_features_inv_.copy())
        boosted = boost_by_inv_logic(preds, preds_inv, all_cases_features.copy().values)
        frame = pd.DataFrame(np.multiply(boosted, 100), columns=ALL_CASES_PREDICTIONS_COLUMS)
        frame["TYPE"] = label
        frames.append(frame)

    # A single concat replaces the repeated DataFrame.append calls.
    return pd.concat(frames)

In [13]:
# Load the all-cases input file and score a copy of it with every model.
all_cases_features = load_data(file=ALL_CASES_INPUT_DATA)
all_cases_predictions_df = get_all_cases_predictions(all_cases_features.copy())
 

In [14]:
# Preview the first five prediction rows.
all_cases_predictions_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,TYPE
0,9.078181,12.630632,12.191221,9.00882,9.368115,8.993168,9.027301,9.073927,11.523726,9.104903,MLP
1,9.202247,9.202247,13.469519,9.299434,12.144199,9.251817,9.396681,9.202247,9.515109,9.316499,MLP
2,9.348959,9.14826,9.14826,9.14826,16.448711,9.264447,9.310493,9.14826,9.795173,9.239171,MLP
3,8.703809,8.703809,8.703809,8.703809,21.494379,8.734945,8.768975,8.703809,8.755574,8.727082,MLP
4,9.46217,9.38308,15.319827,9.38308,9.38308,9.38308,9.466035,9.38308,9.453486,9.38308,MLP


In [15]:
# Preview the last five prediction rows.
all_cases_predictions_df.tail(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,TYPE
123,8.533724,8.533724,8.533724,8.533724,8.533724,8.533724,8.533724,8.533724,8.533724,23.196481,SGD
124,23.196905,8.533677,8.533677,8.533677,8.533677,8.533677,8.533677,8.533677,8.533677,8.533677,SGD
125,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,SGD
126,23.19689,8.533679,8.533679,8.533679,8.533679,8.533679,8.533679,8.533679,8.533679,8.533679,SGD
127,8.533674,8.533674,8.533674,8.533674,8.533674,8.533674,8.533674,8.533674,23.196932,8.533674,SGD


In [16]:
# Write the predictions as comma-separated text (the to_csv default separator);
# index_label=False leaves the index column without a header field.
all_cases_predictions_df.to_csv(ALL_CASES_PREDICTIONS, index_label=False)