### Import statements

In [1]:
import pickle
# from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.neural_network import MLPClassifier
# from sklearn.svm import SVC
# from sklearn.svm import LinearSVC
import numpy as np
import random
import csv
# import cv2
# import copy

### Script for reading data from pickle files

<b>Train_data:</b> A list of 8000 images of size 28x28 pixels. <br>
<b>Train_labels:</b> A list of 8000 labels each corresponding to its image present in Train_data. <br>
<b>Test_data:</b> A list of test images.

In [2]:
# Declare all three dataset names up front so they exist (as None) even if
# one of the loads below raises part-way through the cell.
Train_data = Train_labels = Test_data = None

# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only run this cell against .pkl files that come from a trusted source.
with open("train_image.pkl", "rb") as reader:
    Train_data = pickle.load(reader)
with open("train_label.pkl", "rb") as reader:
    Train_labels = pickle.load(reader)
with open("test_image.pkl", "rb") as reader:
    Test_data = pickle.load(reader)

### Image Augmentation

<div style="text-align: justify"> The following augmentation sequence was used to increase the number of images for application of neural networks. However, there was no significant improvement over other methods. Therefore, this approach was disregarded from the final classification procedure.<br>
The transformation sequence included cropping, affine transformation, flipping, addition of noise and blur and changes in contrast and brightness. This script generated 8000 new images.</div>

In [3]:
# import imgaug
# from imgaug import augmenters as iaa

In [4]:
# Train_data = np.array(Train_data)
# Train_data = np.reshape(Train_data, (len(Train_data), 28, 28, 1))
# Train_data = Train_data.astype('float32')

In [5]:
# imgaug.seed(1)
# seq = iaa.Sequential([iaa.Fliplr(0.5), iaa.Crop(percent=(0, 0.1)), iaa.Sometimes(0.5,iaa.GaussianBlur(sigma=(0,0.5))), iaa.ContrastNormalization((0.75, 1.5)), iaa.AdditiveGaussianNoise(loc=0, scale=(0.0, 0.05*255), per_channel=0.5), iaa.Multiply((0.8, 1.2), per_channel=0.2), iaa.Affine(scale={"x": (0.8, 1.2), "y": (0.8, 1.2)}, translate_percent={"x": (-0.2, 0.2), "y": (-0.2, 0.2)}, rotate=(-25, 25), shear=(-8, 8))], random_order=True)
# Train_data_aug = seq.augment_images(Train_data)

In [6]:
# Train_data = np.append(Train_data, Train_data_aug, axis=0)
# Train_labels = np.append(Train_labels, Train_labels)

In [7]:
# Train_data = np.reshape(Train_data, (len(Train_data), 784))
# Train_data = Train_data.tolist()
# Train_labels = Train_labels.tolist()

### Script to convert serialized images to .png format for visualization of dataset

In [8]:
# for x in range(len(Train_data)):
#     img = []
#     l = Train_data[x]
#     j=0
#     k=28
#     for i in range(28):
#         img.append(l[j:k])
#         j += 28
#         k += 28
#     cv2.imwrite('image'+str(x+1)+'.png', np.array(img))

### Function to binarize grayscale images

<b>Explanation:</b> This was an attempt to normalize pixel values by mapping values < 127 to 0 and values >= 127 to 1 (the threshold used in the code below). However, this lowered the accuracy and hence was not included in the final pre-processing.

In [9]:
# def binarize(BTrain_data, BTest_data):
#     for i in range(len(BTrain_data)):
#         for j in range(len(BTrain_data[0])):
#             if BTrain_data[i][j] < 127:
#                 BTrain_data[i][j] = 0
#             else:
#                 BTrain_data[i][j] = 1

#     for i in range(len(BTest_data)):
#         for j in range(len(BTest_data[0])):
#             if BTest_data[i][j] < 127:
#                 BTest_data[i][j] = 0
#             else:
#                 BTest_data[i][j] = 1

# BTrain_data = copy.deepcopy(Train_data)
# BTest_data = copy.deepcopy(Test_data)

### Function to train the classifier

<b>Parameters: </b> train_data => numpy array which contains training data, train_labels => numpy array containing labels of train data, test_data => numpy array containing test data <br>
<b>Return Value: </b> pred_labels => numpy array of predicted labels for test data, classifier => classifier object trained on train_data

<b>Explanation:</b> Different types of in-built classifiers were tried on the given data. _Logistic Regression_ performed the best for both training and validation data and hence was chosen as the final classifier.

In [10]:
def trainClassifier(train_data, train_labels, test_data):
    """Fit a multinomial logistic-regression model and label the test samples.

    Parameters
    ----------
    train_data : array-like
        Training samples handed to ``fit``.
    train_labels : array-like
        Labels corresponding to ``train_data``.
    test_data : array-like
        Samples handed to ``predict``.

    Returns
    -------
    tuple
        ``(pred_labels, classifier)``: the predictions for ``test_data`` and
        the fitted ``LogisticRegression`` object.
    """
    # Alternatives that were tried here in place of LogisticRegression
    # (their imports are commented out at the top of the notebook):
    #   GaussianNB(), AdaBoostClassifier(), MLPClassifier(),
    #   AdaBoostClassifier(DecisionTreeClassifier(max_depth=5)),
    #   DecisionTreeClassifier(), SVC(gamma='auto'), LinearSVC()
    model = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
    model.fit(train_data, train_labels)
    # Class probabilities are available via model.predict_proba(test_data) if needed.
    return model.predict(test_data), model

### Function to calculate accuracy of predictions

<b>Parameters:</b> test_labels => actual labels, pred_labels => predicted labels <br>
<b>Return Value: </b> accuracy value

In [11]:
def calcAccuracy(test_labels, pred_labels):
    """Return the fraction of positions where the prediction equals the true label.

    Parameters
    ----------
    test_labels : sequence
        Actual labels; its length defines how many positions are compared.
    pred_labels : sequence
        Predicted labels, indexed in parallel with ``test_labels``.

    Returns
    -------
    float
        Number of matching positions divided by ``len(test_labels)``.

    Raises
    ------
    ZeroDivisionError
        If ``test_labels`` is empty.
    IndexError
        If ``pred_labels`` is shorter than ``test_labels``.
    """
    total = len(test_labels)
    # Count by index (not zip) so a too-short pred_labels still raises.
    matches = sum(1 for idx in range(total) if test_labels[idx] == pred_labels[idx])
    return matches / total

### Function to split training data

<b>Parameters:</b> D => training images, D_labels => labels of images, trainp => fraction of the data to use for training, expressed as a decimal (e.g. 0.7 for a 70:30 split), rseed (default parameter) => random seed for pseudo-randomization <br>
<b>Return Value: </b> train_data => training portion of the split, train_labels => labels of train_data, validation_data => remaining part of the original data to be used as validation set, validation_labels => labels for validation_data.

In [12]:
def splitData(D, D_labels, trainp, rseed=42):
    """Shuffle the samples and labels together and split them into two parts.

    The inputs are not modified: shuffling is done on copies, so calling this
    function again with the same arguments returns the same split.

    Parameters
    ----------
    D : sequence
        Training samples.
    D_labels : sequence
        Labels, one per sample in ``D``.
    trainp : float
        Fraction of the samples placed in the training part; the cut index is
        ``int(trainp * len(D))``.
    rseed : int, optional
        Seed for the pseudo-random shuffle (default 42).

    Returns
    -------
    tuple
        ``(train_data, train_labels, validation_data, validation_labels)``,
        each a list.

    Raises
    ------
    ValueError
        If ``D`` and ``D_labels`` have different lengths.
    """
    if len(D) != len(D_labels):
        # Unequal lengths would be shuffled with different permutations and
        # silently pair samples with the wrong labels.
        raise ValueError(
            "D and D_labels must have the same length "
            "(got %d and %d)" % (len(D), len(D_labels))
        )

    # Work on shallow copies so the caller's sequences keep their order.
    shuffled_data = list(D)
    shuffled_labels = list(D_labels)

    # Two generators seeded identically apply the same permutation to two
    # equal-length lists, which keeps every sample aligned with its label.
    random.Random(rseed).shuffle(shuffled_data)
    random.Random(rseed).shuffle(shuffled_labels)

    cut = int(trainp * len(shuffled_data))
    return (
        shuffled_data[:cut],
        shuffled_labels[:cut],
        shuffled_data[cut:],
        shuffled_labels[cut:],
    )

### Function to perform Cross Validation on Training Data

<b>Parameters:</b> pdata => training data, train_labels => labels of pdata, q (default parameter) => No. of splits of training data<br>
<b>Return Value: </b> acc => list of accuracies across all folds, all_classifiers => list of the classifiers trained in each fold, in fold order (the average accuracy is computed separately from acc)

In [13]:
def performCrossValidation(pdata, train_labels, q=5):
    """Run q-fold cross validation with trainClassifier and calcAccuracy.

    The data is cut into ``q`` consecutive folds of ``len(pdata) // q``
    samples; the last fold is extended to the end of the data so that the
    remainder samples are still used for testing. Each fold is used once as
    the test set while a classifier is trained on all remaining samples.

    Parameters
    ----------
    pdata : sequence
        Training samples.
    train_labels : sequence
        Labels, indexed in parallel with ``pdata``.
    q : int, optional
        Number of folds (default 5).

    Returns
    -------
    tuple
        ``(acc, all_classifiers)``: the accuracy of each fold and the
        classifier trained for each fold, both in fold order.

    Raises
    ------
    ValueError
        If ``q`` is smaller than 2 or larger than ``len(pdata)``; either
        would leave a fold with no training or no test samples.
    """
    n_samples = len(pdata)
    if q < 2 or q > n_samples:
        raise ValueError(
            "q must be between 2 and len(pdata) (got q=%r, len(pdata)=%d)"
            % (q, n_samples)
        )

    fold = n_samples // q
    acc = []
    all_classifiers = []
    for i in range(q):
        start = i * fold
        # The last fold absorbs the remainder for any q (previously this was
        # only correct for q == 5).
        stop = n_samples if i == q - 1 else start + fold

        Xs_test = pdata[start:stop]
        ys_test = train_labels[start:stop]
        # Build the training part as fresh lists; list() keeps this a
        # concatenation even when the inputs are numpy arrays.
        Xs_train = list(pdata[:start]) + list(pdata[stop:])
        ys_train = list(train_labels[:start]) + list(train_labels[stop:])

        pred_labels, classifier = trainClassifier(np.array(Xs_train), np.array(ys_train), np.array(Xs_test))
        all_classifiers.append(classifier)
        acc.append(calcAccuracy(ys_test, pred_labels))
    return acc, all_classifiers

### Script to split training data into 70:30

In [14]:
# 0.7 is the training fraction, so 30% of the labelled data is held out for
# validation; the shuffle uses splitData's default seed (rseed=42).
train_data, train_labels, validation_data, validation_labels = splitData(Train_data, Train_labels, 0.7)

### Script to calculate accuracy for validation set

In [15]:
# Fit on the training split and predict the validation split.
# l holds the predicted validation labels; c is the fitted classifier.
l, c = trainClassifier(train_data, train_labels, validation_data)
# Bare last expression: the notebook displays the validation accuracy.
calcAccuracy(validation_labels, l)



0.8025

### Script for 5-fold cross validation function call

In [16]:
# Cross-validate on the training split only, with the default q=5 folds.
# all_acc: accuracy per fold; all_classifiers: classifier trained per fold.
all_acc, all_classifiers = performCrossValidation(train_data, train_labels)



In [17]:
# Per-fold accuracies, followed by their mean across folds.
print(all_acc)
print(sum(all_acc)/len(all_acc))

[0.7660714285714286, 0.7821428571428571, 0.7857142857142857, 0.7741071428571429, 0.7946428571428571]
0.7805357142857143


### Script to select the best classifier and calculate accuracy for validation set

In [18]:
# Index of the fold with the highest cross-validation accuracy.
pos = np.argmax(all_acc)
best_classifier = all_classifiers[pos]

# Score that fold's classifier on the held-out validation split; the bare
# last expression makes the notebook display the accuracy.
pred_labels = best_classifier.predict(validation_data)
calcAccuracy(validation_labels, pred_labels)

0.78875

### Script for obtaining predicted labels for test data

In [19]:
# Predict labels for the test images with the classifier selected above.
final_labels = best_classifier.predict(Test_data)

### Script to write the predicted labels into .csv file

In [20]:
# Write a header row, then one row per prediction: the position of the image
# in final_labels and its predicted class.
# newline='' is what the csv module requires of the file object; without it,
# platforms that translate '\n' on write (e.g. Windows) end up with an extra
# blank line after every row.
with open('Udit_Pant.csv', 'w', newline='') as csvobj:
    cwriter = csv.writer(csvobj, delimiter=',')
    cwriter.writerow(['image_index', 'class'])
    cwriter.writerows(enumerate(final_labels))

### References

1. Image augmentation - https://imgaug.readthedocs.io/en/latest/source/examples_basics.html#a-simple-and-common-augmentation-sequence
2. scikit-learn documentation -  https://scikit-learn.org/stable/index.html
3. Stack Overflow - https://stackoverflow.com