# Import to use script

In [1]:
from pathlib import Path
import sys
root = Path.cwd().parent.parent
sys.path.insert(1, str(root))
%load_ext autoreload
%autoreload 2

In [2]:
from appconfig.config import PROCESSED_DATA_DIR, MODEL_DIR

# Import library

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from PIL import Image
from tqdm import tqdm
from tensorflow.keras.models import load_model
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import LabelEncoder
import pickle

# Load model

In [4]:
model = load_model(MODEL_DIR)



In [5]:
model.summary()

Model: "inception_resnet_v1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 160, 160, 3) 0                                            
__________________________________________________________________________________________________
Conv2d_1a_3x3 (Conv2D)          (None, 79, 79, 32)   864         input_1[0][0]                    
__________________________________________________________________________________________________
Conv2d_1a_3x3_BatchNorm (BatchN (None, 79, 79, 32)   96          Conv2d_1a_3x3[0][0]              
__________________________________________________________________________________________________
Conv2d_1a_3x3_Activation (Activ (None, 79, 79, 32)   0           Conv2d_1a_3x3_BatchNorm[0][0]    
________________________________________________________________________________

Mixed_7a_Branch_1_Conv2d_0a_1x1 (None, 8, 8, 256)    768         Mixed_7a_Branch_1_Conv2d_0a_1x1[0
__________________________________________________________________________________________________
Mixed_7a_Branch_2_Conv2d_0b_3x3 (None, 8, 8, 256)    768         Mixed_7a_Branch_2_Conv2d_0b_3x3[0
__________________________________________________________________________________________________
Mixed_7a_Branch_0_Conv2d_0a_1x1 (None, 8, 8, 256)    0           Mixed_7a_Branch_0_Conv2d_0a_1x1_B
__________________________________________________________________________________________________
Mixed_7a_Branch_1_Conv2d_0a_1x1 (None, 8, 8, 256)    0           Mixed_7a_Branch_1_Conv2d_0a_1x1_B
__________________________________________________________________________________________________
Mixed_7a_Branch_2_Conv2d_0b_3x3 (None, 8, 8, 256)    0           Mixed_7a_Branch_2_Conv2d_0b_3x3_B
__________________________________________________________________________________________________
Mixed_7a_B

Total params: 22,808,144
Trainable params: 22,779,312
Non-trainable params: 28,832
__________________________________________________________________________________________________


In [16]:
print(model.input)
print(model.output)

Tensor("input_1:0", shape=(None, 160, 160, 3), dtype=float32)
Tensor("Bottleneck_BatchNorm/Identity:0", shape=(None, 128), dtype=float32)


# Prepare data

In [7]:
def create_data_dictionary(DATA_DIR):
    '''
    Loop through Image directory has many subfolders, subfolders's name is label

    Args: 
    DATA_DIR: Image directory. type Poxis_Path
    limit_image: Maximum images in dictionary

    return:
    Dict {'label': [images's filename]}
    '''
    data_dict = {}
    for path in DATA_DIR.glob('*'):
        label = str(path).split('/')[-1]
        if label in ['tien', 'be']:
            data_dict.setdefault(label, [])

            for image_path in (DATA_DIR/label).glob('*'):
                filename = str(image_path).split('/')[-1]
                data_dict.get(label).append(filename)
            
    return data_dict

train_data_dict = create_data_dictionary(PROCESSED_DATA_DIR / 'train')
test_data_dict = create_data_dictionary(PROCESSED_DATA_DIR / 'test')

#---------------------------------------------------------------------
print("Labels in train_data_dict: ", list(train_data_dict.keys()))
print("Images in each label: ", len(train_data_dict['tien']))
print('-' * 60)
print("Labels in test_data_dict: ", list(test_data_dict.keys()))
print("Images in each label: ", len(test_data_dict['tien']))

Labels in train_data_dict:  ['be', 'tien']
Images in each label:  800
------------------------------------------------------------
Labels in test_data_dict:  ['be', 'tien']
Images in each label:  200


In [11]:
def standardize(pixels):
    mean, std = pixels.mean(), pixels.std()
    pixels_std = (pixels - mean) / std
    return pixels_std

def expand_dims(pixels):
    return np.expand_dims(pixels, axis=0)

def prepare_sample_to_get_embedding(image):
    pixels = np.asarray(image, dtype=np.float32)
    pixels_std = standardize(pixels)
    pixels_std = expand_dims(pixels_std)
    return pixels_std

def get_embedding(model, sample):
    yhat = model.predict(sample)
    return yhat[0]
    

In [1]:
def prepare_data_and_label(model, data_dict, type_data_dict):
    embedding_list = []
    labels = []
    for label, image_names in data_dict.items():
        for image_name in tqdm(image_names):
            image_path = str(PROCESSED_DATA_DIR / type_data_dict / label / image_name)
            image = Image.open(image_path)
            sample = prepare_sample_to_get_embedding(image)
            embedding = get_embedding(model, sample)

            embedding_list.append(embedding)
            labels.append(label)
            
    return np.asarray(embedding_list), np.asarray(labels)

x_train, y_train = prepare_data_and_label(model, train_data_dict, 'train')
x_test, y_test = prepare_data_and_label(model, test_data_dict, 'test')

In [13]:
print("x_train shape: ", x_train.shape)
print("y_train shape: ", y_train.shape)
print('-' * 60)
print("x_test shape: ", x_test.shape)
print("y_test shape: ", y_test.shape)

x_train shape:  (1600, 128)
y_train shape:  (1600,)
------------------------------------------------------------
x_test shape:  (400, 128)
y_test shape:  (400,)


In [47]:
# Save prepared images and label
with open('x_train.pickle', 'wb') as f:
    pickle.dump(x_train, f)
    
with open('x_test.pickle', 'wb') as f:
    pickle.dump(x_test, f)

with open('y_train.pickle', 'wb') as f:
    pickle.dump(y_train, f)

with open('y_test.pickle', 'wb') as f:
    pickle.dump(y_test, f)

# Preprocess x_train, y_train, x_test, y_test before put in SVM

In [14]:
#Normalize it
normalizer = Normalizer()

train_data = normalizer.transform(x_train)
test_data = normalizer.transform(x_test)

In [15]:
# Encode label, convert from string to number
encoder = LabelEncoder()
encoder.fit(y_train)

train_labels = encoder.transform(y_train)
test_labels = encoder.transform(y_test)

# Test with 2 models, SVM and KNN

In [16]:
clf = SVC(kernel='linear', probability=True)

In [92]:
knn = KNeighborsClassifier(n_neighbors=2)

In [17]:
clf.fit(train_data, train_labels)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [93]:
knn.fit(train_data, train_labels)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=2, p=2,
                     weights='uniform')

# Prepare test

In [132]:
def prepare_for_predict(test_img_path):
    test_img = Image.open(test_img_path)
    test_img_embedding = get_embedding(model, test_img)
    test = expand_dims(test_img_embedding)
    test = normalizer.transform(test)
    return test

def calculate_probability_for_each_label(y_proba):
    res = ''
    for i, j in enumerate(y_proba[0]):
        arr = [i]
        name = encoder.inverse_transform(arr)
        probability = j * 100
        res += f"({name[0]}: {probability}) "
    return res.strip()

In [133]:
img_test = PROCESSED_DATA_DIR / 'test.jpg'
test = prepare_for_predict(img_test)

# Test SVM

In [230]:
y_pred = clf.predict(test)
y_proba = clf.predict_proba(test)
print(y_pred, y_proba)

[9 9] [[1.81229367e-03 2.84182395e-04 1.87307186e-03 8.55294743e-04
  4.68518874e-04 7.83600647e-04 1.19140107e-04 1.05472355e-02
  3.58349040e-04 9.81454204e-01 1.44410951e-03]]


In [175]:
face_name = encoder.inverse_transform(y_pred)
class_index = y_pred[0]
class_probability = y_proba[0,class_index] * 100
print(face_name[0], class_probability)

tien 98.14542036440774


In [148]:
print("Image of: ", face_name[0])
print("Confidence: ", class_probability)
print("Each label's probability: ", calculate_probability_for_each_label(y_proba))

Image of:  tien
Confidence:  98.56631320337118
Each label's probability:  (cuong: 0.11316819034336166) (felix: 0.020039680643087806) (khoa: 0.11869606531175673) (ly: 0.07544888717454425) (nam: 0.03251680158371052) (natalie: 0.08117485756246058) (sean: 0.010999682840925597) (tam: 0.8231625462817822) (tan: 0.02491840263129579) (tien: 98.56631320337118) (vu: 0.13356168225594356)


In [89]:
# Mean score 
clf.score(test_data, test_labels)

0.9990601503759399

# TEST KNN

In [111]:
y_pred_knn = knn.predict(test)
y_proba_knn = knn.predict_proba(test)
print(y_pred_knn, y_proba_knn)

[4] [[0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]]


In [112]:
face_name_knn = encoder.inverse_transform(y_pred_knn)
class_index_knn = y_pred_knn[0]
class_probability_knn = y_proba_knn[0,class_index] * 100
print(face_name_knn[0], class_probability_knn)

nam 100.0


In [113]:
print("Image of: ", face_name_knn[0])
print("Confidence: ", class_probability_knn)
print("Each label's probability: ", calculate_probability_for_each_label(y_proba_knn))

Image of:  nam
Confidence:  100.0
Each label's probability:  (cuong: 0.0) (felix: 0.0) (khoa: 0.0) (ly: 0.0) (nam: 100.0) (natalie: 0.0) (sean: 0.0) (tam: 0.0) (tan: 0.0) (tien: 0.0) (vu: 0.0)


In [106]:
knn.score(test_data, test_labels)

0.9985902255639098