In [1]:
import pandas as pd
import random
import numpy as np
import os
import glob
import cv2

In [2]:
def resize(path, img_height, img_width):
    """Read an image from disk and resize it to ``img_height`` x ``img_width``.

    Parameters
    ----------
    path : str
        Image file location, passed unchanged to ``cv2.imread``.
    img_height : int
        Number of rows of the returned image.
    img_width : int
        Number of columns of the returned image.

    Returns
    -------
    The resized image exactly as returned by ``cv2.resize``.

    Raises
    ------
    OSError
        If ``cv2.imread`` returns ``None`` for ``path`` (it signals an
        unreadable file that way instead of raising).
    """
    img = cv2.imread(path)
    if img is None:
        raise OSError("Could not read image: {}".format(path))
    # cv2.resize takes dsize as (width, height); passing (height, width)
    # silently transposes the target size for non-square requests.
    return cv2.resize(img, (img_width, img_height))

In [3]:
def get_data(train_dir="./data/train/", img_height=64, img_width=64, n_classes=10):
    """Load and resize every ``*.jpg`` found in the class folders of ``train_dir``.

    Class ``j`` is expected in the sub-folder ``c<j>`` (``c0``, ``c1``, ...).

    Parameters
    ----------
    train_dir : str
        Directory that contains the ``c<j>`` class folders.
    img_height, img_width : int
        Target size handed to ``resize`` for every image.
    n_classes : int
        Number of class folders to scan, starting at ``c0``.

    Returns
    -------
    (list, list)
        The resized images and, in the same order, their integer class index.
    """
    X_train = []
    y_train = []

    for j in range(n_classes):
        print('Load folder c{}'.format(j))
        pattern = os.path.join(train_dir, 'c' + str(j), '*.jpg')
        # glob returns matches in arbitrary, OS-dependent order; sorting keeps
        # the sample order identical from run to run.
        for fl in sorted(glob.glob(pattern)):
            X_train.append(resize(fl, img_height, img_width))
            y_train.append(j)
    return X_train, y_train

In [4]:
# Load every training image (already resized by get_data) and its class index.
X, y = get_data()

Load folder c0
Load folder c1
Load folder c2
Load folder c3
Load folder c4
Load folder c5
Load folder c6
Load folder c7
Load folder c8
Load folder c9


In [5]:
# Turn the Python lists returned by get_data() into NumPy arrays.
X, y = np.asarray(X), np.asarray(y)
print(X.shape)


(2200, 64, 64, 3)


In [6]:
#converted images into numpy array
#X = np.asarray(X)
#y = np.asarray(y)
#print(X.shape,y.shape)

In [7]:
# Flatten every image into a single row vector, so X becomes a 2-D matrix
# with one row per image.
X = X.reshape(len(X), -1)

In [8]:
# Sanity check: after flattening X should be (n_images, pixels_per_image).
print(X.shape)

(2200, 12288)


In [9]:
# Make the labels a column vector so they can be appended to X as one extra column.
y = np.reshape(y, (-1, 1))

In [10]:
# y should now be a column vector: (n_images, 1).
print(y.shape)

(2200, 1)


In [11]:
# Append the label column to the pixel columns so that shuffling the rows
# keeps each image together with its label.
# NOTE: re-running this cell appends another label column each time.
X = np.concatenate((X, y), axis=1)

In [12]:
# One column more than the flattened images: the appended label.
print(X.shape)

(2200, 12289)


In [13]:
# Shuffle the data points in place (each row is one image plus its label).
# A fixed seed keeps the row order - and therefore every split and score
# derived from X below - identical on every Restart & Run All.
np.random.RandomState(42).shuffle(X)

In [14]:
#X=X/255

In [15]:
# Preview the first 20 shuffled rows; the last column holds the class label.
pd.DataFrame(X[:20])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12279,12280,12281,12282,12283,12284,12285,12286,12287,12288
0,38,46,39,37,44,37,71,83,72,0,...,17,16,20,15,16,20,13,14,18,3
1,53,65,50,50,59,49,171,188,176,1,...,33,31,31,32,28,33,43,41,40,8
2,26,37,29,31,42,34,32,43,33,35,...,25,22,18,92,85,68,9,13,14,4
3,54,64,46,54,64,46,55,66,46,56,...,18,15,24,17,14,23,16,15,17,4
4,42,59,48,48,65,54,64,81,70,57,...,10,11,22,16,13,15,12,11,13,6
5,22,36,30,33,46,39,41,50,42,0,...,12,12,12,12,12,12,16,16,16,3
6,25,33,22,25,33,22,28,36,25,29,...,4,8,9,4,8,9,3,7,8,1
7,35,42,37,35,42,37,35,42,37,36,...,49,34,31,55,44,36,27,21,13,3
8,63,86,74,68,91,80,72,101,88,79,...,27,20,24,18,16,18,16,15,17,0
9,31,45,34,34,46,36,38,47,37,40,...,17,20,25,20,21,22,25,27,28,8


In [16]:
from sklearn.linear_model import LogisticRegression

In [17]:
# One-vs-rest logistic regression fitted with the liblinear solver.
model = LogisticRegression(solver='liblinear', multi_class='ovr')

In [18]:
# Feature matrix: every column except the last one (the appended label).
X_total=X[:,:-1]

In [19]:
# Label vector: the last column of the merged matrix.
y_total=X[:,-1]

In [20]:
# Features should be 2-D and labels 1-D, with the same number of rows.
print(X_total.shape,y_total.shape)

(2200, 12288) (2200,)


In [21]:
# Peek at the first ten labels in their shuffled order.
pd.DataFrame(y_total).head(10)

Unnamed: 0,0
0,3
1,8
2,4
3,4
4,6
5,3
6,1
7,3
8,0
9,8


In [22]:
from sklearn.model_selection import KFold
# Five contiguous folds. KFold itself does not shuffle (shuffle=False), so the
# fold contents depend entirely on the row order of X_total.
# The former bare `kf.get_n_splits(X_total)` call was dropped: it only returns
# the fold count and its result was discarded.
kf = KFold(n_splits=5)
print(kf)

KFold(n_splits=5, random_state=None, shuffle=False)


In [23]:
# Fit on four folds, score on the held-out fold, and collect one accuracy per fold.
acc = []
for train_index, test_index in kf.split(X_total):
    X_train = X_total[train_index]
    X_test = X_total[test_index]
    y_train = y_total[train_index]
    y_test = y_total[test_index]
    model.fit(X_train, y_train)
    print(X_train.shape, X_test.shape)
    fold_score = model.score(X_test, y_test)
    acc.append(fold_score)
    

(1760, 12288) (440, 12288)
(1760, 12288) (440, 12288)
(1760, 12288) (440, 12288)
(1760, 12288) (440, 12288)
(1760, 12288) (440, 12288)


In [24]:
# Derive the fold count from the collected scores instead of hard-coding 5, so
# the label and the average stay correct if the number of folds changes.
print("{}-Fold Cross_val Accuracy:".format(len(acc)),acc)
print("Avg Accuracy: ",(sum(acc)/len(acc))*100)

5-Fold Cross_val Accuracy: [0.95, 0.9681818181818181, 0.9522727272727273, 0.9386363636363636, 0.9454545454545454]
Avg Accuracy:  95.09090909090908


In [25]:
# Mean fold accuracy as a fraction; divide by the real number of folds, not a literal 5.
print(np.sum(acc)/len(acc))

0.9509090909090908


In [26]:
# Row index separating the first 80% (training) from the last 20% (validation).
split = int(0.8 * len(X))

In [27]:
# Training features: rows before `split`, label column dropped.
X_train=X[:split,:-1]

In [28]:
# Shape check for the training features.
print(X_train.shape)

(1760, 12288)


In [29]:
# Validation features: rows from `split` onward, label column dropped.
X_validate=X[split:,:-1]

In [30]:
# Shape check for the validation features.
print(X_validate.shape)

(440, 12288)


In [31]:
# Training labels: last column of the rows before `split`.
y_train=X[:split,-1]

In [32]:
# Shape check for the training labels (1-D, one entry per training row).
print(y_train.shape)

(1760,)


In [33]:
# Validation labels: last column of the rows from `split` onward.
y_validate=X[split:,-1]

In [34]:
# Shape check for the validation labels (1-D, one entry per validation row).
print(y_validate.shape)

(440,)


In [35]:
#X_train=X_train/255.0
#X_validate=X_validate/255.0

In [36]:
from sklearn.linear_model import LogisticRegression

In [37]:
# Fresh one-vs-rest logistic regression (liblinear solver) for the hold-out experiment.
model = LogisticRegression(solver='liblinear', multi_class='ovr')

In [38]:
#model=LogisticRegression(multi_class='multinomial', solver='lbfgs')

In [39]:
# Train on the first-80% split.
model.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='ovr', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [40]:
# Accuracy (as a fraction) on the held-out validation rows.
model.score(X_validate,y_validate)

0.9454545454545454

In [41]:
# Predicted class for every validation row; reused by the confusion matrix below.
y_predict=model.predict(X_validate)

In [42]:
# First ten predictions, to compare by eye with the first ten true labels.
pd.DataFrame(y_predict).head(10)

Unnamed: 0,0
0,5
1,4
2,1
3,1
4,6
5,4
6,8
7,8
8,7
9,7


In [43]:
# First ten true validation labels, to compare by eye with the predictions.
pd.DataFrame(y_validate).head(10)

Unnamed: 0,0
0,5
1,4
2,1
3,1
4,6
5,4
6,8
7,8
8,7
9,7


In [44]:
#Find the Accuracy
def Accuracy(actual,predict):
    """Return the percentage of positions where ``predict`` equals ``actual``.

    Both inputs may be any array-like (lists, 1-D arrays, column vectors).
    They are flattened before comparison so that an ``(n, 1)`` vector paired
    with an ``(n,)`` vector is compared element-wise instead of being
    broadcast into an ``n x n`` grid (which would inflate the count).

    Parameters
    ----------
    actual : array-like
        True labels.
    predict : array-like
        Predicted labels, with the same number of elements as ``actual``.

    Returns
    -------
    Accuracy as a percentage in the range 0-100.

    Raises
    ------
    ValueError
        If the inputs have different numbers of elements or are empty.
    """
    actual = np.asarray(actual).ravel()
    predict = np.asarray(predict).ravel()
    if actual.shape != predict.shape:
        raise ValueError(
            "actual and predict must have the same number of elements, got {} and {}".format(
                actual.shape[0], predict.shape[0]
            )
        )
    if actual.shape[0] == 0:
        raise ValueError("cannot compute accuracy of empty inputs")
    acc = np.sum(actual == predict) / actual.shape[0]
    return acc * 100

In [45]:
#print("Training Accuracy",Accuracy(y_train,model.predict(X_train)))
# Score the fitted model on the validation rows with the hand-written metric.
validation_predictions = model.predict(X_validate)
print("Validation set Accuracy of LR", Accuracy(y_validate, validation_predictions))

Validation set Accuracy of LR 94.54545454545455


In [46]:
#Confusion Matrix

In [47]:
from sklearn.metrics import confusion_matrix
# Cross-tabulate the true validation labels against the predicted ones.
cm = confusion_matrix(y_true=y_validate, y_pred=y_predict)

In [48]:
# Rows are the true classes, columns the predicted classes.
print(cm)

[[34  0  0  0  0  0  0  0  0  1]
 [ 1 37  2  0  0  0  0  0  0  1]
 [ 0  0 44  0  0  0  0  0  0  0]
 [ 0  0  0 38  1  0  0  0  0  1]
 [ 0  0  0  1 49  0  0  0  0  0]
 [ 0  0  0  0  0 45  0  0  0  0]
 [ 0  2  0  0  2  0 40  0  0  0]
 [ 1  1  2  0  0  0  0 39  3  0]
 [ 0  0  1  0  0  0  1  1 42  0]
 [ 1  0  0  0  0  0  0  0  1 48]]


In [49]:
import seaborn as sn
import matplotlib.pyplot as plt
# Draw the confusion matrix as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 7))
sn.heatmap(cm, annot=True, fmt="d", ax=ax)
ax.set_title("LR Confusion Matrix")
ax.set_xlabel("Predict")
ax.set_ylabel("Truth")
plt.show()

<Figure size 1000x700 with 2 Axes>

In [50]:
from sklearn import metrics
# Per-class precision, recall and F1 on the validation rows.
print("LR performance matrix")
report = metrics.classification_report(y_validate, model.predict(X_validate))
print(report)

LR performance matrix
              precision    recall  f1-score   support

           0       0.92      0.97      0.94        35
           1       0.93      0.90      0.91        41
           2       0.90      1.00      0.95        44
           3       0.97      0.95      0.96        40
           4       0.94      0.98      0.96        50
           5       1.00      1.00      1.00        45
           6       0.98      0.91      0.94        44
           7       0.97      0.85      0.91        46
           8       0.91      0.93      0.92        45
           9       0.94      0.96      0.95        50

    accuracy                           0.95       440
   macro avg       0.95      0.95      0.94       440
weighted avg       0.95      0.95      0.95       440

