In [1]:
import os
import sys
import pickle
import numpy as np
import pandas as pd
from PIL import Image, ImageFilter
from tqdm import tqdm_notebook
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, mean_squared_error, log_loss, confusion_matrix
#from padhai import MPNeuron, Perceptron, PerceptronWithSigmoid
import matplotlib.pyplot as plt
import operator
import json
import warnings
warnings.filterwarnings("ignore")

# Seed NumPy's global random number generator.
np.random.seed(100)
# Dataset level; interpolated into the input folder paths in the loading cell.
LEVEL = 'level_2'

In [2]:
def read_all(folder_path, key_prefix=""):
    '''
    Load every file in a folder as a flattened grayscale image array.

    Parameters
    ----------
    folder_path : str
        Directory to read; every entry returned by ``os.listdir`` is opened
        as an image.
    key_prefix : str, optional
        Text prepended to each dictionary key.

    Returns
    -------
    dict
        Maps ``key_prefix + <file name without its extension>`` to the 1-D
        numpy array obtained from the image after ``convert("L")``.
    '''
    print("Reading:")
    images = {}
    files = os.listdir(folder_path)
    for file_name in tqdm_notebook(files, total=len(files)):
        file_path = os.path.join(folder_path, file_name)
        # splitext (rather than slicing off the last 4 characters) keeps the
        # key correct for extensions of any length, e.g. ".jpeg".
        image_index = key_prefix + os.path.splitext(file_name)[0]
        # The context manager closes the handle returned by Image.open itself,
        # including when the conversion below raises.
        with Image.open(file_path) as image:
            images[image_index] = np.array(image.convert("L")).flatten()
    return images

In [3]:
languages = ['ta', 'hi', 'en']

# Background images are read first; each language folder is then merged into
# the same dictionary, with the key prefix recording the source folder.
images_train = read_all("../input/level_2_train/{}/background".format(LEVEL), key_prefix='bgr_') # change the path
for language in languages:
    language_images = read_all("../input/level_2_train/{}/{}".format(LEVEL, language), key_prefix="{}_".format(language))
    images_train.update(language_images)
print(len(images_train))

images_test = read_all("../input/level_2_test/kaggle_{}".format(LEVEL), key_prefix='') # change the path
print(len(images_test))

Reading:


HBox(children=(IntProgress(value=0, max=450), HTML(value='')))


Reading:


HBox(children=(IntProgress(value=0, max=150), HTML(value='')))


Reading:


HBox(children=(IntProgress(value=0, max=150), HTML(value='')))


Reading:


HBox(children=(IntProgress(value=0, max=150), HTML(value='')))


900
Reading:


HBox(children=(IntProgress(value=0, max=300), HTML(value='')))


300


In [4]:
# Peek at the first few test keys (iterating a dict yields its keys).
list(images_test)[:5]

['145', '34', '90', '261', '48']

In [5]:
# Features are the flattened images; the label is 0 for keys carrying the
# "bgr_" prefix and 1 for everything else.
X_train = np.array(list(images_train.values()))
Y_train = np.array([0 if key.startswith("bgr_") else 1 for key in images_train])

# Test keys are the numeric image ids used in the submission file.
ID_test = [int(key) for key in images_test]
X_test = np.array(list(images_test.values()))

print(X_train.shape, Y_train.shape)
print(X_test.shape)

(900, 256) (900,)
(300, 256)


In [6]:
# Learn the per-feature min/max on the training set only, then apply the same
# scaling to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_scaled_train = scaler.transform(X_train)
X_scaled_test = scaler.transform(X_test)

In [7]:
def binarise_data(X):
    '''
    Binarise a 2-D collection of values.

    Parameters
    ----------
    X : array-like
        Rows of numeric values (anything ``np.asarray`` accepts).

    Returns
    -------
    numpy.ndarray
        Integer array with the same shape as ``X``: 1 where the value is
        exactly equal to 1, 0 everywhere else.
    '''
    # Vectorised equality test instead of a nested Python loop; the comparison
    # is exact, so values merely close to 1 still map to 0.
    return (np.asarray(X) == 1).astype(int)

In [8]:
# Apply the same binarisation to the scaled train and test features.
X_binarised_train, X_binarised_test = map(binarise_data, (X_scaled_train, X_scaled_test))

## Since the data is linearly separable, we use the MP Neuron model.

In [9]:
class MPNeuron:
    '''
    Threshold neuron over summed inputs.

    A sample is classified as 1 when the sum of its features is greater than
    or equal to ``theta`` and as 0 otherwise. ``theta`` is the only learned
    parameter and is set by ``fit_brute_force`` or ``fit``.
    '''

    def __init__(self):
        # Threshold; remains None until one of the fit methods assigns it.
        self.theta = None

    def mp_neuron(self, x):
        '''Return 1 if the sum of the single sample ``x`` reaches ``self.theta``, else 0.'''
        if np.sum(x) >= self.theta:
            return 1
        return 0

    def fit_brute_force(self, X, Y):
        '''
        Try every integer threshold from 0 to ``X.shape[1]`` inclusive and keep
        the one with the highest training accuracy.

        Parameters
        ----------
        X : numpy.ndarray
            2-D feature array (``X.shape[1]`` is read as the feature count).
        Y : array-like
            True labels for the rows of ``X``.
        '''
        accuracy = {}
        for theta in tqdm_notebook(range(0, X.shape[1]+1), total=X.shape[1]+1):
            self.theta = theta
            accuracy[theta] = accuracy_score(Y, self.predict(X))

        # max() keeps the first maximal entry, i.e. the smallest theta among
        # ties -- the same winner a stable descending sort would put first.
        best_theta, _ = max(accuracy.items(), key=operator.itemgetter(1))
        self.theta = best_theta

    def fit(self, X, Y, epochs=10, log=False, display_plot=False):
        '''
        Adjust ``theta`` one step per epoch, starting from ``(X.shape[1]+1)//2``.

        Each epoch raises ``theta`` by one when false positives outnumber false
        negatives and lowers it by one in the opposite case. Training stops as
        soon as neither move applies, because later epochs could not change
        anything.

        Parameters
        ----------
        X : numpy.ndarray
            2-D feature array.
        Y : array-like
            True labels (0 or 1) for the rows of ``X``.
        epochs : int, optional
            Maximum number of update steps.
        log : bool, optional
            When true, write the recorded accuracies to
            'mp_neuron_accuracy.json'.
        display_plot : bool, optional
            When true, plot training accuracy against epoch for the epochs in
            which ``theta`` was updated.
        '''
        self.theta = (X.shape[1]+1)//2
        track_accuracy = log or display_plot
        accuracy = {}
        for i in tqdm_notebook(range(epochs), total=epochs, unit="epoch"):
            Y_pred = self.predict(X)
            # labels=[0, 1] forces a 2x2 matrix even when only one class shows
            # up, so the four-way unpacking cannot fail.
            tn, fp, fn, tp = confusion_matrix(Y, Y_pred, labels=[0, 1]).ravel()
            if fp > fn and self.theta <= X.shape[1]:
                self.theta += 1
            elif fp < fn and self.theta >= 1:
                self.theta -= 1
            else:
                # theta is unchanged, so every remaining epoch would repeat
                # this exact state.
                break

            if track_accuracy:
                accuracy[i] = accuracy_score(Y, self.predict(X))
        if log:
            # Named log_file so the false-positive count `fp` is not shadowed.
            with open('mp_neuron_accuracy.json', 'w') as log_file:
                json.dump(accuracy, log_file)
        if display_plot and accuracy:
            # Skipped when no epoch updated theta: there is nothing to plot and
            # unpacking an empty zip() would raise.
            epochs_, accuracy_ = zip(*accuracy.items())
            plt.plot(epochs_, accuracy_)
            plt.xlabel("Epochs")
            plt.ylabel("Train Accuracy")
            plt.show()

    def predict(self, X):
        '''
        Classify every row of ``X`` with the current threshold.

        Computed in one vectorised step (row sums compared against
        ``self.theta``) rather than by calling ``mp_neuron`` per row.

        Parameters
        ----------
        X : array-like
            2-D collection of samples, one per row.

        Returns
        -------
        numpy.ndarray
            1-D integer array of 0/1 predictions, one per row; empty when
            ``X`` has no elements.
        '''
        X = np.asarray(X)
        if X.size == 0:
            return np.zeros(0, dtype=int)
        return (X.sum(axis=1) >= self.theta).astype(int)


In [10]:
# Pick the threshold by exhaustive search over the binarised training data.
mpneuron = MPNeuron()
mpneuron.fit_brute_force(X=X_binarised_train, Y=Y_train)

HBox(children=(IntProgress(value=0, max=257), HTML(value='')))




In [11]:
# scikit-learn's signature is accuracy_score(y_true, y_pred): true labels first.
accuracy_score(Y_train, mpneuron.predict(X_binarised_train))

1.0

In [12]:
# confusion_matrix(y_true, y_pred): rows are true labels, columns are
# predictions. Passing the arguments the other way round transposes the matrix.
confusion_matrix(Y_train, mpneuron.predict(X_binarised_train))

array([[450,   0],
       [  0, 450]])

## Sample Submission

In [13]:
Y_pred_test = mpneuron.predict(X_binarised_test)

# One row per test image, ordered by image id.
submission = pd.DataFrame({'ImageId': ID_test, 'Class': Y_pred_test})
submission = submission.sort_values(['ImageId'])

# File name corrected from the misspelled "submisision.csv".
submission.to_csv("submission.csv", index=False)