# Assumptions -
##### I assume the dataset files are located in the same directory as this code.

# Import Packages 

In [99]:
import pandas as pd
import pickle
import copy
import os
import cv2
import numpy as np
from numpy import linalg as LA
from numpy import array
from collections import defaultdict
import random
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import seaborn
from sklearn.utils import shuffle

# Read files

In [100]:
# Load the pickled training images, training labels and test images from the
# working directory.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
with open("train_image.pkl", 'rb') as train_image_file:
    train_data = pickle.load(train_image_file, encoding='bytes')

with open("train_label.pkl", 'rb') as train_label_file:
    train_label = pickle.load(train_label_file, encoding='bytes')

with open("test_image.pkl", 'rb') as test_image_file:
    test_data = pickle.load(test_image_file, encoding='bytes')

# PCA with logistic 
##### The idea behind using PCA is to keep only the most informative features: I retain the eigenvectors that account for 95% of the eigen energy.
##### After PCA I apply logistic regression, which performed well in comparison to the other classifiers.

# PCA function for dimensionality reduction

In [101]:
def PCA(data,dimension,eigen_energy):
    """Project ``data`` onto its leading principal components.

    Parameters
    ----------
    data : 2-D numpy array
        One sample per row, one feature per column.
    dimension : int
        Number of leading components to keep. Only used when
        ``eigen_energy`` is not positive.
    eigen_energy : float
        Percentage of the total eigenvalue sum ("eigen energy") to retain.
        When positive, the smallest number of leading components whose
        cumulative eigenvalue sum reaches this share is kept (always at
        least one) and ``dimension`` is ignored.

    Returns
    -------
    result : numpy array, shape (samples, k)
        ``data`` projected onto the selected eigenvectors.
    new_evec : numpy array, shape (k, features)
        The selected eigenvectors, one per row, ordered by decreasing
        eigenvalue. Reuse it to project other data the same way.

    Notes
    -----
    ``data`` is projected as given; it is not mean-centred here.
    """
    s = np.cov(data.T)                        # feature-by-feature covariance
    eigen_val, eigen_vec = LA.eigh(s)         # eigh returns ascending eigenvalues

    # Reorder eigenvalues and the matching eigenvector columns, largest first.
    index = np.argsort(eigen_val)[::-1]
    eigen_val = eigen_val[index]
    eigen_vec = eigen_vec[:, index]

    if eigen_energy > 0:
        target = eigen_val.sum() * (eigen_energy / 100)
        cumulative = np.cumsum(eigen_val)
        # First position where the running energy reaches the target. The
        # component that crosses the threshold is included, so the retained
        # energy is never below the request and never zero components.
        reached = np.nonzero(cumulative >= target)[0]
        if reached.size:
            count = int(reached[0]) + 1
        else:
            # Rounding can leave the running sum just short of the target
            # (e.g. a 100% request); keep every component in that case.
            count = len(eigen_val)
    else:
        count = dimension

    new_evec = eigen_vec[:, :count].T
    result = np.dot(new_evec, data.T).T
    return result, new_evec

# Perform PCA over Train and Test dataset

In [102]:
# Fit PCA on the training images (keeping 95% eigen energy), then project the
# test images with the same eigenvectors so both sets share one feature space.
train_matrix = np.array(train_data)
train_projected, pca_evec = PCA(train_matrix, 0, 95)
train_data = train_projected.tolist()

test_matrix = np.array(test_data)
test_projected = np.dot(pca_evec, test_matrix.T).T
test_data = test_projected.tolist()

# Classifier method of Logistic regression

In [103]:
def classifier(train_data,train_label,test_data,test_label):
    """Fit a multinomial logistic regression and score it on held-out data.

    Parameters
    ----------
    train_data, train_label : samples and labels used to fit the model.
    test_data, test_label : samples and labels used to measure accuracy.

    Returns
    -------
    tuple
        ``(accuracy_percent, params, clf)`` where ``accuracy_percent`` is the
        mean accuracy on the test split scaled to 0-100, ``params`` is the
        estimator's ``get_params()`` dictionary and ``clf`` is the fitted
        ``LogisticRegression`` instance.
    """
    # Only logistic regression is trained here; the earlier GaussianNB fit was
    # discarded immediately and referenced a name this file never imports.
    clf = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
    clf.fit(train_data, train_label)
    acc = clf.score(test_data, test_label)
    parameter = clf.get_params()
    return acc*100, parameter, clf

# Perform cross validation
##### The reason for cross-validation is to train the model more precisely. I perform 5-fold cross-validation, and whichever fold gives the best accuracy, I take its parameters and use them to predict the test data.

In [107]:
def fold(temp_data,temp_label,fold_time):
    """Run k-fold cross-validation and return the best fold's model.

    The data is shuffled, split into ``fold_time`` contiguous folds of
    ``len(temp_data) // fold_time`` samples each, and ``classifier`` is
    trained once per fold with that fold held out for scoring. Samples left
    over when the length is not divisible by ``fold_time`` are never held
    out; they are part of every training split.

    Parameters
    ----------
    temp_data : sequence of samples.
    temp_label : sequence of labels aligned with ``temp_data``.
    fold_time : int
        Number of folds.

    Returns
    -------
    tuple
        ``(params, clf)`` - the parameter dictionary and fitted classifier
        from the fold with the highest accuracy (first one on ties).
    """
    temp_data, temp_label = shuffle(temp_data, temp_label)
    fold_size = len(temp_data) // fold_time

    para = []
    accuracy = []
    clf_obj = []
    for i in range(fold_time):
        start = i * fold_size
        stop = start + fold_size

        test_data = temp_data[start:stop]
        test_label = temp_label[start:stop]

        # Split by position, not by value: a value-based "not in test_data"
        # check is quadratic and also drops training samples that merely
        # equal a held-out sample.
        train_data = list(temp_data[:start]) + list(temp_data[stop:])
        train_label = list(temp_label[:start]) + list(temp_label[stop:])

        acc, pa, clf_obj_1 = classifier(train_data, train_label, test_data, test_label)
        print("Accuracy after fold ",i+1,"is =",acc)
        para.append(pa)
        accuracy.append(acc)
        clf_obj.append(clf_obj_1)

    best_accuracy = max(accuracy)
    index = accuracy.index(best_accuracy)
    # Report the actual number of folds instead of a hard-coded 5.
    print("MAX Accuracy across {}-fold :".format(fold_time), best_accuracy)
    return para[index], clf_obj[index]

In [108]:
# Run 5-fold cross-validation; p is the parameter dictionary and c the fitted
# classifier of the fold with the highest accuracy.
p,c=fold(train_data,train_label,5)
# Re-apply the returned parameters to the classifier they were read from.
c.set_params(**p)
# Predict a class for every PCA-projected test sample.
prediction = c.predict(test_data)

Accuracy after fold  1 is = 81.375
Accuracy after fold  2 is = 79.1875
Accuracy after fold  3 is = 82.625
Accuracy after fold  4 is = 80.25
Accuracy after fold  5 is = 80.25
MAX Accuracy across 5-fold : 82.625


# Write out into csv file 

In [109]:
# Build the submission table: a header row followed by one
# [image_index, class] row per prediction, numbered from 0.
rows = [['image_index', 'class']]
rows.extend([position, label] for position, label in enumerate(prediction))

# The header is already the first data row, so pandas' own header and index
# are both suppressed when writing.
df = pd.DataFrame(rows)
df.to_csv("Vikash_kumar_pandey.csv", encoding='utf-8', index=False, header=False)