In [1]:
import os
import pandas as pd
import numpy as np
import math
import json
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# Seven-segment LED patterns for the digits 0 to 9.
# Row i is the pattern of digit i; the seven columns are the segments in
# LEDS_ARR order (A, B, C, D, E, F, G) and 1 means the segment is lit.
LED_BLUE_PRINT = np.array([[1,1,1,1,1,1,0],  # 0
                     [0,1,1,0,0,0,0],  # 1
                     [1,1,0,1,1,0,1],  # 2
                     [1,1,1,1,0,0,1],  # 3
                     [0,1,1,0,0,1,1],  # 4 -> segments B, C, F, G (B was missing before)
                     [1,0,1,1,0,1,1],  # 5
                     [1,0,1,1,1,1,1],  # 6
                     [1,1,1,0,0,0,0],  # 7
                     [1,1,1,1,1,1,1],  # 8
                     [1,1,1,1,0,1,1]   # 9
                    ])

# CSV files read by the notebook, and the CSV file the predictions are written to
TRAIN_DATA_FILE = "train_data.csv"
TEST_DATA_FILE = "test_data.csv"
ALL_CASES_INPUT_DATA = "all_cases_input_data.csv"
ALL_CASES_PREDICTIONS = "all_cases_predictions.csv"
# One prediction column per digit, named "0" .. "9"
ALL_CASES_PREDICTIONS_COLUMS = [str(digit) for digit in range(10)]

# LED segment labels and digit names used as lookup keys
LEDS_ARR = list("ABCDEFG")
DIGITS = [
    "Zero", "One", "Two", "Three", "Four",
    "Five", "Six", "Seven", "Eight", "Nine",
]


In [2]:
# Generic data loading function
def load_data(file=TRAIN_DATA_FILE, header=True):
    """Read a CSV file into a pandas DataFrame.

    file: path of the CSV file (defaults to TRAIN_DATA_FILE).
    header: when truthy, pandas infers the header row (its default behaviour);
        otherwise the file is read as having no header row.
    """
    csv_path = os.path.join("", file)
    # "infer" is the pandas default, i.e. what read_csv does when header is not passed
    header_option = "infer" if header else None
    return pd.read_csv(csv_path, header=header_option)

In [3]:
# Load the train and test data.
# The DIGIT column is removed from each frame and kept separately as the labels.
train_data = load_data(TRAIN_DATA_FILE)
train_labels = train_data.pop("DIGIT")

test_data = load_data(TEST_DATA_FILE)
test_labels = test_data.pop("DIGIT")

In [4]:
# Load all cases input data
# NOTE(review): `all_cases_input_data` is not referenced again in the visible notebook;
# the same file is loaded a second time later as `all_cases_features` — confirm this cell is still needed.
all_cases_input_data = load_data(ALL_CASES_INPUT_DATA)

In [5]:
# Find the optimal number of dimensions using PCA
# Returns the number of components PCA keeps so that at least (1 - thres) of the variance is explained
from sklearn.decomposition import PCA

def get_optimum_dimensions(x, thres=0.01):
    """Return the number of components a PCA fitted on ``x`` selects.

    The PCA is created with ``n_components=1.0 - thres`` and the fitted
    ``n_components_`` attribute is returned.
    """
    variance_to_keep = 1.0 - thres
    return PCA(n_components=variance_to_keep).fit(x).n_components_

In [6]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

# Data processing routine

def process_data(x, y, poly_features=None, pca=None, OPTIMUM_DIMENSION=None, scalar=None):
    """Run two feature sets through the polynomial -> scaling -> PCA pipeline.

    Every transformer that is not supplied is created and fitted on ``x`` only,
    then applied to both ``x`` and ``y``. Supplied transformers are used as-is.

    Parameters:
        x: features the missing transformers are fitted on (the training features).
        y: second feature set pushed through the same transformers. Despite the
           name these are features (the testing features), not labels.
        poly_features: object with ``transform``; when None a degree-2
            ``PolynomialFeatures`` without bias column is fitted on ``x``.
        pca: object with ``transform``; when None a ``PCA`` with
            ``OPTIMUM_DIMENSION`` components (random_state=42) is fitted.
        OPTIMUM_DIMENSION: number of PCA components; when None it is computed
            with ``get_optimum_dimensions(thres=0.0001)`` on the scaled features
            and printed.
        scalar: object with ``transform``; when None a ``StandardScaler`` is
            fitted on the polynomial features of ``x``.

    Returns:
        Tuple ``(training_features, testing_features, poly_features, pca,
        OPTIMUM_DIMENSION, scalar)`` so the fitted objects can be reused.
    """
    training_features = x.copy()
    testing_features = y.copy()

    # `is None` instead of `== None`: equality is delegated to the argument's
    # __eq__, which is wrong for array-likes or objects overriding equality.
    if poly_features is None:
        poly_features = PolynomialFeatures(degree=2, include_bias=False)
        poly_features.fit(training_features)

    training_features = poly_features.transform(training_features)
    testing_features = poly_features.transform(testing_features)

    if scalar is None:
        scalar = StandardScaler()
        scalar.fit(training_features)

    training_features = scalar.transform(training_features)
    testing_features = scalar.transform(testing_features)

    if OPTIMUM_DIMENSION is None:
        OPTIMUM_DIMENSION = get_optimum_dimensions(x=training_features, thres=0.0001)
        print("Optimum Dimensions: ", OPTIMUM_DIMENSION)

    if pca is None:
        pca = PCA(random_state=42, n_components=OPTIMUM_DIMENSION)
        pca.fit(training_features)

    training_features = pca.transform(training_features)
    testing_features = pca.transform(testing_features)

    return training_features, testing_features, poly_features, pca, OPTIMUM_DIMENSION, scalar

In [7]:
# Fit the preprocessing pipeline on the training data and apply it to both sets;
# the fitted transformers are kept so later cells can reuse them.
(training_features, testing_features,
 poly_features, pca, OPTIMUM_DIMENSION, scalar) = process_data(x=train_data, y=test_data)

# Labels as plain arrays
training_labels = train_labels.values
testing_labels = test_labels.values

Optimum Dimensions:  9


In [8]:
# SGD Classifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.base import clone

# Processed features / labels used by this classifier cell
X_train, Y_train = training_features, training_labels
X_test, Y_test = testing_features, testing_labels

# Optional hyper-parameter search (switched off)
gridSearch = False
if gridSearch:
    parameters = {'n_iter': list(range(2, 11))}
    clf = GridSearchCV(SGDClassifier(random_state=42), parameters).fit(X_train, Y_train)
    print("\nBest params: ", clf.best_params_)

# NOTE(review): `n_iter` was replaced by `max_iter` in later scikit-learn releases —
# confirm the installed version before re-running this cell.
sgd_clf = SGDClassifier(loss='log', penalty="elasticnet", n_iter=3, random_state=42)
cross_val_scores = cross_val_score(clone(sgd_clf), X_train, Y_train, scoring="accuracy", cv=3)
print("Cross Val Scores on training set\n", cross_val_scores)

# Fit on the full training set and report the fraction of correct test predictions.
# NOTE(review): the recorded output shows perfect cross-validation scores but a test
# accuracy of about 0.18 — worth investigating.
sgd_clf.fit(X_train, Y_train)
print("\n\nAccuracy on testing data set\n", (Y_test == sgd_clf.predict(X_test)).sum() / len(Y_test))

Cross Val Scores on training set
 [1. 1. 1.]


Accuracy on testing data set
 0.17857142857142858




In [9]:
# KNeighbors Classifier
from sklearn.neighbors import KNeighborsClassifier 

# Processed features / labels used by this classifier cell
X_train, Y_train = training_features, training_labels
X_test, Y_test = testing_features, testing_labels

# Optional hyper-parameter search (switched off)
gridSearch = False
if gridSearch:
    parameters = {
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'n_neighbors': list(range(2, 11)),
        'weights': ['uniform', 'distance'],
    }
    clf = GridSearchCV(KNeighborsClassifier(), parameters).fit(X_train, Y_train)
    print("\nBest params: ", clf.best_params_)

knn_clf = KNeighborsClassifier(n_neighbors=2, weights='uniform', algorithm='auto')
print("\nCross Val Scores on training set\n",
      cross_val_score(clone(knn_clf), X_train, Y_train, scoring="accuracy", cv=3))

# Fit on the full training set and report the fraction of correct test predictions
knn_clf.fit(X_train, Y_train)
print("\n\nAccuracy on testing data set\n", (Y_test == knn_clf.predict(X_test)).sum() / len(Y_test))


Cross Val Scores on training set
 [1. 1. 1.]


Accuracy on testing data set
 0.6071428571428571


In [10]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier 

# Processed features / labels used by this classifier cell
X_train, Y_train = training_features, training_labels
X_test, Y_test = testing_features, testing_labels

# Hyper-parameter search over the number of trees (switched on)
gridSearch = True
if gridSearch:
    parameters = {'n_estimators': list(range(2, 11))}
    clf = GridSearchCV(RandomForestClassifier(random_state=42), parameters).fit(X_train, Y_train)
    print("\nBest params: ", clf.best_params_)


# NOTE(review): the classifier below uses a fixed n_estimators=7; the grid-search
# result is only printed (the recorded run reports 3) — confirm this is intended.
forest_clf = RandomForestClassifier(n_estimators=7, oob_score=True, random_state=42)
print("Cross Val Scores on training set\n",
      cross_val_score(clone(forest_clf), X_train, Y_train, scoring="accuracy", cv=3))

# Fit on the full training set and report the fraction of correct test predictions
forest_clf.fit(X_train, Y_train)
print("\n\nAccuracy on testing data set\n", (Y_test == forest_clf.predict(X_test)).sum() / len(Y_test))



Best params:  {'n_estimators': 3}
Cross Val Scores on training set
 [1. 1. 1.]


Accuracy on testing data set
 0.35714285714285715


In [11]:
# MLP Classifier
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from sklearn.preprocessing import LabelBinarizer

# Processed features / labels and training settings for the network
X_train, Y_train = training_features, training_labels
X_test, Y_test = testing_features, testing_labels
batch_size = 1
num_classes = 10
epochs = 20

# One hidden ReLU layer of 512 units followed by a softmax output layer
model = Sequential([
    Dense(512, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(num_classes, activation='softmax'),
])

model.summary()

model.compile(optimizer=Adam(lr=0.0005),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# One-hot encode the labels; the encoder is fitted on the training labels only
binarizer = LabelBinarizer().fit(Y_train)
Y_train, Y_test = binarizer.transform(Y_train), binarizer.transform(Y_test)

# The test set doubles as the validation data during training
history = model.fit(
    X_train,
    Y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_data=(X_test, Y_test),
)

# score[0] is the loss, score[1] the accuracy metric requested in compile()
score = model.evaluate(X_test, Y_test, verbose=1)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
print('Test accuracy/loss ratio:', score[1] / score[0])

Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               5120      
_________________________________________________________________
dense_2 (Dense)              (None, 10)                5130      
Total params: 10,250
Trainable params: 10,250
Non-trainable params: 0
_________________________________________________________________
Train on 30 samples, validate on 28 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test loss: 1.036602258682251
Test accuracy: 0.6428571343421936
Test accuracy/loss ratio: 0.6201579525394881


In [12]:
def inverse_leds(leds):
    """Return a flipped copy of ``leds``.

    Every value v becomes fmod(v + 1, 2), so 0 turns into 1 and 1 into 0.
    The input object itself is left unchanged.
    """
    return np.fmod(np.add(leds.copy(), 1), 2)

In [13]:
from sklearn.utils.extmath import softmax
def boost_by_inv_logic(preds, preds_inv, X):
    """Rescale per-digit scores using the LED patterns and the inverse-input scores.

    For every row of ``X``:
      * each digit whose LED_BLUE_PRINT pattern has a 0 where the row has a larger
        value (an LED that is lit but not part of the digit) gets a score of 0;
      * the scores are then divided element-wise by ``preds_inv`` offset by 0.01.

    ``preds`` is modified in place. The softmax of the updated ``preds`` is returned.
    """
    for row in range(len(X)):
        scores, inv_scores, lit = preds[row], preds_inv[row], X[row]

        # A negative difference means the row lights an LED the digit does not use
        for digit, pattern in enumerate(LED_BLUE_PRINT):
            if np.min(pattern - lit) < 0:
                scores[digit] = 0

        preds[row] = np.divide(scores, np.add(inv_scores, 0.01))

    return softmax(preds)

In [14]:
def subsume_entity_affinity(led_label, affinity_calculation_result, pred):
    """Add the per-digit affinities stored under ``led_label`` to ``pred``.

    The affinities are read from
    ``affinity_calculation_result['data'][led_label]['entityAffinity']``, keyed by
    the names in DIGITS. ``pred`` is updated in place and also returned.
    """
    affinities = affinity_calculation_result['data'][led_label]['entityAffinity']
    for position, digit_name in enumerate(DIGITS):
        pred[position] += affinities[digit_name]
    return pred

In [15]:
def get_all_cases_predictions_affinity(all_cases_features, affinity_calculation_result):
    """Compute affinity-based scores for every row of ``all_cases_features``.

    For each row, every non-zero LED contributes its own affinity entry and one
    entry for each pair it forms with a later non-zero LED (label = both letters
    concatenated). Scores start at 1.0 per digit and are finally divided by
    their sum plus 0.1.

    Returns a list with one array of ten scores per row.
    """
    affinity_preds = []
    for led_states in all_cases_features.values:
        # Labels of the LEDs that are non-zero in this row
        lit_leds = [LEDS_ARR[pos] for pos, state in enumerate(led_states) if state != 0]

        pred = [1.0] * 10
        for pos, led_label in enumerate(lit_leds):
            pred = subsume_entity_affinity(led_label, affinity_calculation_result, pred)

            # Pair this LED with every lit LED that comes after it
            for partner in lit_leds[pos + 1:]:
                pred = subsume_entity_affinity(led_label + partner, affinity_calculation_result, pred)

        affinity_preds.append(np.divide(pred, sum(pred) + 0.1))

    return affinity_preds

In [16]:
def _boosted_predictions_frame(preds, preds_inv, led_states, type_label):
    """Boost ``preds`` with ``preds_inv``, scale to percent and tag rows with ``type_label``."""
    boosted = np.multiply(boost_by_inv_logic(preds, preds_inv, led_states), 100)
    frame = pd.DataFrame(boosted, columns=ALL_CASES_PREDICTIONS_COLUMS)
    frame["TYPE"] = type_label
    return frame


def get_all_cases_predictions(all_cases_features,
                              affinity_file='affinity-session-data/Led7-777/AFFINITY_CALCULATION.txt'):
    """Collect the boosted predictions of every model for ``all_cases_features``.

    The features and their inverse (see ``inverse_leds``) are preprocessed with the
    already fitted notebook-level ``poly_features``, ``scalar`` and ``pca``. Each of
    the notebook-level models ``model`` (MLP), ``forest_clf`` (RF), ``knn_clf`` (KNN)
    and ``sgd_clf`` (SGD) then scores both versions, and the scores are combined by
    ``boost_by_inv_logic``. The same is done with the affinity scores computed from
    the JSON content of ``affinity_file``.

    Parameters:
        all_cases_features: DataFrame of raw LED states, one row per case.
        affinity_file: path of the affinity calculation result (JSON text).

    Returns:
        DataFrame with the columns of ALL_CASES_PREDICTIONS_COLUMS (scores times 100)
        plus a "TYPE" column, in the order MLP, RF, KNN, SGD, AFFINITY. Each model's
        rows keep their own 0-based index.
    """
    all_cases_features_inv = inverse_leds(all_cases_features)

    # Only the first returned element (the transformed `x`) is needed here
    all_cases_features_ = process_data(x=all_cases_features, y=all_cases_features,
                                       poly_features=poly_features, pca=pca,
                                       OPTIMUM_DIMENSION=OPTIMUM_DIMENSION, scalar=scalar)[0]
    all_cases_features_inv_ = process_data(x=all_cases_features_inv, y=all_cases_features_inv,
                                           poly_features=poly_features, pca=pca,
                                           OPTIMUM_DIMENSION=OPTIMUM_DIMENSION, scalar=scalar)[0]

    # (TYPE label, function returning per-digit scores) for each trained model
    predictors = [
        ("MLP", model.predict),
        ("RF", forest_clf.predict_proba),
        ("KNN", knn_clf.predict_proba),
        ("SGD", sgd_clf.predict_proba),
    ]

    frames = []
    for type_label, predict in predictors:
        preds = predict(all_cases_features_.copy())
        preds_inv = predict(all_cases_features_inv_.copy())
        frames.append(_boosted_predictions_frame(preds, preds_inv,
                                                 all_cases_features.copy().values, type_label))

    with open(affinity_file, 'r') as myfile:
        affinity_calculation_result = json.loads(myfile.read().replace('\n', ''))

    preds = get_all_cases_predictions_affinity(all_cases_features.copy(), affinity_calculation_result)
    preds_inv = get_all_cases_predictions_affinity(all_cases_features_inv.copy(), affinity_calculation_result)
    frames.append(_boosted_predictions_frame(preds, preds_inv,
                                             all_cases_features.copy().values, "AFFINITY"))

    # DataFrame.append no longer exists in pandas 2.x; a single concat keeps the
    # same row order, columns and per-model index values.
    return pd.concat(frames)

In [17]:
# Load the all-cases input data and compute the predictions of every model for it.
# A copy of the frame is handed to the prediction routine.
all_cases_features = load_data(ALL_CASES_INPUT_DATA)
all_cases_predictions_df = get_all_cases_predictions(all_cases_features.copy())

In [18]:
# Preview the first rows of the combined predictions frame
all_cases_predictions_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,TYPE
0,5.864296e-21,100.0,3.287959e-18,6.973196e-21,2.025627e-09,1.5980319999999998e-20,4.508334e-21,3.8269659999999995e-19,3.6613509999999996e-21,4.348592e-21,MLP
1,2.4993700000000003e-28,2.4993700000000003e-28,8.943535e-25,9.24747e-28,100.0,1.057537e-27,3.0856260000000003e-28,2.4993700000000003e-28,2.7123780000000002e-28,3.175448e-28,MLP
2,5.799716e-20,4.243875e-20,4.243875e-20,4.243875e-20,100.0,1.6624079999999999e-19,4.6588519999999996e-20,4.243875e-20,4.266552e-20,5.509062e-20,MLP
3,1.222153e-30,1.222153e-30,1.222153e-30,1.222153e-30,100.0,1.9877039999999998e-30,1.2871629999999999e-30,1.222153e-30,1.253333e-30,1.450258e-30,MLP
4,2.828594e-05,6.431762e-06,99.99989,6.431762e-06,6.431762e-06,6.431762e-06,1.453442e-05,6.431762e-06,7.279723e-06,6.431762e-06,MLP


In [19]:
# Preview the last rows of the combined predictions frame
all_cases_predictions_df.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,TYPE
123,2.574348,2.574348,2.574348,2.574348,2.574348,2.574348,2.574348,2.574348,4.333474,75.071744,AFFINITY
124,30.817069,5.586174,5.586174,5.586174,5.586174,5.586174,5.586174,5.586174,24.493539,5.586174,AFFINITY
125,7.737979,7.737979,7.737979,7.737979,7.737979,7.737979,7.737979,7.737979,30.358192,7.737979,AFFINITY
126,83.289149,1.264547,1.264547,1.264547,1.264547,1.264547,1.264547,1.264547,6.594476,1.264547,AFFINITY
127,7.52666,7.52666,7.52666,7.52666,7.52666,7.52666,7.52666,7.52666,32.260062,7.52666,AFFINITY


In [20]:
# Write the combined predictions to the ALL_CASES_PREDICTIONS file; the index is written as a column labelled "id".
# NOTE(review): the recorded previews show index values restarting for each TYPE, so "id" alone
# may not identify a row uniquely — confirm consumers also key on TYPE.
all_cases_predictions_df.to_csv(ALL_CASES_PREDICTIONS, sep=',', index_label="id")