In [462]:
import pandas as pd
import numpy as np

import os
import glob
from multiprocessing import cpu_count
from multiprocessing.pool import Pool
from tqdm.notebook import tqdm

import cv2
import dlib
import h5py

from matplotlib import pyplot as plt
%matplotlib inline
from scipy import ndimage

In [463]:
from sklearn.svm import LinearSVC, SVC
from sklearn.preprocessing import Normalizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

In [464]:
# Paths to the serialized dlib models, resolved relative to the working directory.
predictor_path, face_rec_model_path = (
    "shape_predictor_5_face_landmarks.dat",
    "dlib_face_recognition_resnet_model_v1.dat",
)

# Module-level model handles; `extract` reads both of them as globals.
facerec = dlib.face_recognition_model_v1(face_rec_model_path)
sp = dlib.shape_predictor(predictor_path)

In [465]:
def extract(img_path: str) -> np.ndarray:
    """Compute a face descriptor for the image stored at ``img_path``.

    No face detector is run: the rectangle handed to the landmark predictor
    spans the whole image, so the file is treated as an already-cropped face.

    Args:
        img_path: Path of the image file to load with ``dlib.load_rgb_image``.

    Returns:
        The descriptor produced by ``facerec`` as a flattened 1-D array.
    """
    image = dlib.load_rgb_image(img_path)
    height, width = image.shape[0], image.shape[1]

    # Bounding box covering the full frame.
    full_frame = dlib.rectangle(left=0, top=0, right=width, bottom=height)
    landmarks = sp(image, full_frame)

    # NOTE(review): the positional 1 and 0.4 are presumably num_jitters and
    # padding -- confirm against the dlib API reference.
    descriptor = facerec.compute_face_descriptor(image, landmarks, 1, 0.4)
    return np.array(descriptor).flatten()

In [466]:
# Encode every image listed in augmented_train.csv / augmented_test.csv with
# `extract` and store one descriptor matrix per split in dataset.h5.
#
# The file is opened through a context manager so it is flushed and closed even
# if reading a CSV or encoding an image raises part-way through.
with h5py.File('dataset.h5', 'w') as hf:
    # Leave one core free for the notebook itself, but never request fewer than
    # one worker: Pool(processes=0) raises ValueError on a single-core machine.
    n_workers = max(1, cpu_count() - 1)

    for name in ['train', 'test']:
        df = pd.read_csv('augmented_{}.csv'.format(name))
        paths = df['path'].tolist()

        with Pool(processes=n_workers) as p:
            # imap yields results in input order, so row i of the stored matrix
            # corresponds to row i of the CSV. A single progress bar with a
            # known total replaces the previous nested pair of bars.
            res = list(tqdm(p.imap(extract, paths),
                            total=len(paths),
                            desc='Encode {}'.format(name)))

        res = np.array(res)
        print(res.shape)
        hf.create_dataset(name, data=res)

HBox(children=(IntProgress(value=0, description='Encode train', max=744, style=ProgressStyle(description_width…

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))



(744, 128)


HBox(children=(IntProgress(value=0, description='Encode test', max=57, style=ProgressStyle(description_width='…

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))



(57, 128)


In [467]:
# Reload the split CSVs and pull out the target column of each one.
train_df, test_df = pd.read_csv('augmented_train.csv'), pd.read_csv('augmented_test.csv')
y_train, y_test = train_df['label'], test_df['label']

In [468]:
# Load the descriptor matrices that the encoding cell stored in dataset.h5.
#
# The context manager closes the file even if a read fails. Indexing with
# hf[name] raises KeyError when a dataset is missing, whereas hf.get(name)
# would return None and np.array(None) would silently become a 0-d object array.
with h5py.File('dataset.h5', 'r') as hf:
    # [...] reads the whole dataset into an in-memory NumPy array, so the data
    # remains usable after the file is closed.
    x_train = hf['train'][...]
    x_test = hf['test'][...]

In [469]:
# Baseline: linear SVM on the descriptors. `fit` returns the estimator itself,
# so construction and training can be chained. The last expression displays the
# mean accuracy on the test split.
clf = LinearSVC(random_state=42).fit(x_train, y_train)
clf.score(x_test, y_test)

0.8245614035087719

In [458]:
from lightgbm import LGBMClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

In [459]:
# Encode the class labels as integers.
#
# The encoder is fitted on the untouched DataFrame columns rather than on
# y_train / y_test themselves, which keeps this cell idempotent: re-running it
# used to re-fit the encoder on the already-encoded integers, so lb.classes_
# (and therefore lb.inverse_transform) lost the original label names.
lb = LabelEncoder()
y_train = lb.fit_transform(train_df['label'])
y_test = lb.transform(test_df['label'])

In [460]:
# Gradient-boosted trees on the same descriptors. `fit` returns the estimator,
# and the trailing bare name keeps the fitted model's repr as the cell output.
clf = LGBMClassifier(random_state=42).fit(x_train, y_train)
clf

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=42, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [461]:
# Mean accuracy of the fitted classifier on the test split.
clf.score(X=x_test, y=y_test)

0.6491228070175439