In [None]:
pip install face_recognition

In [None]:
# Mount Google Drive (Colab only) so the dataset archive under MyDrive is readable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Unpack the colorferet archive from the mounted Drive into the working directory.
!tar -xvf /content/drive/MyDrive/colorferet.tar

In [None]:
import face_recognition
from email.mime import base
from utils import IMAGE_PATH
import numpy as np
import random as rd
import os
import shutil
import bz2
import time
import pickle
import utils
import json
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Make sure the output folder for the decompressed images exists.
os.makedirs('images/', exist_ok=True)

### Putting all files in a single folder

In [None]:
# Step 1: decompress every .bz2 image of both gray FERET CDs into images/.
for cd_number in range(1, 3):
    source_dir = f'colorferet/dvd2/gray_feret_cd{cd_number}/data/images'
    for archive_name in os.listdir(source_dir):
        if not archive_name.endswith('.bz2'):
            continue
        # Strip the ".bz2" suffix to get the name of the decompressed file.
        target_path = os.path.join('images', archive_name[:-len('.bz2')])
        with bz2.open(os.path.join(source_dir, archive_name)) as fh, \
                open(target_path, 'wb') as fw:
            shutil.copyfileobj(fh, fw)

# Step 2: group the .tif files into one sub-folder per individual. The folder
# name is the first five characters of the file name, which is the zero-padded
# id that generate_set() later uses to look folders up.
# This pass only needs to run once, after all CDs have been extracted.
for img in os.listdir('images'):
    if not img.endswith('.tif'):
        continue
    code = img[:5]
    os.makedirs(os.path.join('images', code), exist_ok=True)
    shutil.move(os.path.join('images', img),
                os.path.join('images', code, img))

### Generate train and test sets

In [None]:
from random import randint
import os
import json

# Number of individuals (image triples) that generate_set() draws per set.
NSAMPLES = 100

def _fa_fb_images(idx):
    """Return the sorted paths of the 'fa'/'fb' .tif images stored for individual ``idx``."""
    folder = 'images/' + str(idx).zfill(5)
    if not os.path.isdir(folder):
        return []
    return [folder + '/' + img
            for img in sorted(os.listdir(folder))
            if img.endswith('.tif') and ('fa' in img or 'fb' in img)]


def generate_set(output_path, set, n_samples=None, max_id=1209):
    """Draw random image triples and save them as JSON.

    Each triple is ``[image_a, image_b, impostor_image]``: two distinct
    'fa'/'fb' images of one individual, followed by one 'fa'/'fb' image of a
    different individual.

    Parameters
    ----------
    output_path : str
        JSON file to write the mapping to.
    set : mapping or iterable of individual ids (int or numeric str)
        Individuals that must NOT be drawn as the main individual of a
        triple. Pass the mapping returned for the training set when
        generating the test set so the two sets do not share individuals.
        The name shadows the builtin; it is kept so positional and keyword
        callers keep working.
    n_samples : int, optional
        Number of triples to draw; defaults to the module-level ``NSAMPLES``.
    max_id : int, optional
        Highest individual id to consider (folders ``images/00000`` up to
        ``images/<max_id>``).

    Returns
    -------
    dict
        ``{individual_id (int): [image_a, image_b, impostor_image]}``.

    Raises
    ------
    ValueError
        If there are not enough eligible individuals. The function raises
        instead of looping forever.
    """
    if n_samples is None:
        n_samples = NSAMPLES

    # Ids may arrive as ints (fresh dict) or strings (dict reloaded from JSON).
    excluded = {int(key) for key in (set or ())}

    images_by_id = {idx: _fa_fb_images(idx) for idx in range(max_id + 1)}
    # The main individual needs two images of their own, so both images of
    # the "same person" pair are guaranteed to come from one folder.
    genuine_pool = [idx for idx, paths in images_by_id.items()
                    if len(paths) >= 2 and idx not in excluded]
    impostor_pool = [idx for idx, paths in images_by_id.items() if paths]

    if len(genuine_pool) < n_samples:
        raise ValueError(
            "Requested {} individuals but only {} eligible ones were found".format(
                n_samples, len(genuine_pool)))
    if n_samples > 0 and len(impostor_pool) < 2:
        raise ValueError("Need at least two individuals with images to pick an impostor")

    d = {}
    for _ in range(n_samples):
        # Popping from the pool gives sampling without replacement.
        idx = genuine_pool.pop(randint(0, len(genuine_pool) - 1))
        impostor = idx
        while impostor == idx:
            impostor = impostor_pool[randint(0, len(impostor_pool) - 1)]
        own_images = images_by_id[idx]
        d[idx] = [own_images[0], own_images[1], images_by_id[impostor][0]]

    with open(output_path, 'w') as f:
        json.dump(d, f, indent=2)
    return d

# Draw the training individuals first, then pass them to the second call so
# the test set can exclude them.
# The result is not stored under the name `train` because a function called
# `train` is defined further down the notebook.
train_individuals = generate_set('train_set.json', {})
# Assigning the result keeps the cell from echoing the whole mapping.
test_individuals = generate_set('test_set.json', train_individuals)

In [None]:
# Default hyper-parameters for the PCA + KNN model.
N_SAMPLES = 10  # NOTE(review): not referenced by any code visible in this notebook (generate_set uses NSAMPLES)
PCA_N_COMPONENTS=25  # default `n_components` of new_train_knn_pca
N_NEIGHBORS=10  # default `k` of new_train_knn_pca

In [None]:
def get_image(path):
    """Load the image at ``path`` and return the encoding of the first face found.

    Parameters
    ----------
    path : str
        Path of the image file to load.

    Returns
    -------
    The first element of ``face_recognition.face_encodings`` for the image.

    Raises
    ------
    IndexError
        If no face is detected. The exception type is the same one the bare
        ``[0]`` lookup raised, but the message now names the offending file.
    """
    picture = face_recognition.load_image_file(path)
    encodings = face_recognition.face_encodings(picture)
    if len(encodings) == 0:
        raise IndexError("No face detected in image: {}".format(path))
    return encodings[0]

def generate_C1(set):
    """Return the encoding of the first image minus the encoding of the second.

    ``set`` is an indexable of image paths; only positions 0 and 1 are read.
    """
    anchor_encoding = get_image(set[0])
    second_encoding = get_image(set[1])
    return anchor_encoding - second_encoding

def generate_C2(set):
    """Return the encoding of the first image minus the encoding of the third.

    ``set`` is an indexable of image paths; only positions 0 and 2 are read.
    """
    anchor_encoding = get_image(set[0])
    third_encoding = get_image(set[2])
    return anchor_encoding - third_encoding

In [None]:
def get_sets(dataset='train', limit=None):
    """Build the feature vectors and labels for one of the generated sets.

    Reads ``"<dataset>_set.json"`` and, for every image triple in it, computes
    the C1 difference (``generate_C1``, labelled 1) and the C2 difference
    (``generate_C2``, labelled 0).

    Parameters
    ----------
    dataset : str
        Prefix of the JSON file to load (``'train'`` or ``'test'``).
    limit : int, optional
        Use only the first ``limit`` triples of the file. ``None`` (the
        default) uses all of them. This replaces a counter-based cap that
        never triggered because the counter was never incremented.

    Returns
    -------
    (X, y) : tuple of two lists
        ``X`` holds all C1 vectors followed by all C2 vectors. ``y`` holds
        the matching labels, all the 1s followed by all the 0s.
    """
    with open("{}_set.json".format(dataset)) as f:
        individuals = json.load(f)

    triples = list(individuals.values())
    if limit is not None:
        triples = triples[:limit]

    C1 = [generate_C1(images) for images in triples]
    C2 = [generate_C2(images) for images in triples]

    return [*C1, *C2], [1] * len(C1) + [0] * len(C2)

In [None]:
def new_train_svm(X, y, kernel='rbf', C=0.5):
    """Fit a support-vector classifier on ``X``/``y`` and return it.

    ``kernel`` and ``C`` are forwarded unchanged to ``svm.SVC``.
    """
    classifier = svm.SVC(kernel=kernel, C=C)
    classifier.fit(X, y)
    return classifier

In [None]:
def new_train_knn_pca(X=None, y=None, n_components=PCA_N_COMPONENTS, k=N_NEIGHBORS,
                      return_transformers=False):
    """Train a StandardScaler -> PCA -> KNN model.

    Parameters
    ----------
    X, y : sequences, optional
        Training vectors and labels. When either is omitted the training set
        is built with ``get_sets('train')``, i.e. from ``train_set.json``.
        Supplied data used to be ignored and the file was always re-read.
    n_components : int
        Number of principal components kept by the PCA.
    k : int
        Number of neighbours used by the KNN classifier.
    return_transformers : bool
        When True, also return the fitted scaler and PCA so callers can
        project new data with exactly the transform used for training.

    Returns
    -------
    KNeighborsClassifier, or ``(knn, scaler, pca)`` when
    ``return_transformers`` is True. The classifier is fitted on the scaled
    and PCA-reduced vectors, so inputs to ``knn.predict`` must go through the
    same scaler and PCA first.
    """
    if X is None or y is None:
        X, y = get_sets('train')

    scaler = StandardScaler()
    scaler.fit(X)
    X_scaled = scaler.transform(X)

    pca = PCA(n_components=n_components)
    pca.fit(X_scaled)
    X_reduced = pca.transform(X_scaled)

    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_reduced, y)

    if return_transformers:
        return knn, scaler, pca
    return knn

In [None]:
def test_svm(clf):
    with open("test_set.json") as f:
            individuals = json.load(f)

    switch = 0
    correct = 0
    for dir, set in individuals.items():
        if (switch):
            res = clf.predict([get_image(set[0]) - get_image(set[1])])
            if (res[0] == np.int64(1)): correct +=1
            # print(type(res[0]))
            switch = 0
        else:
            res = clf.predict([get_image(set[0]) - get_image(set[2])])
            if (res[0] == np.int64(0)): correct +=1
            # print(res[0])
            switch = 1
    accuracy = correct / len(individuals)
    # print("Accuracy = {}".format(accuracy))
    return accuracy


In [None]:
# `clf` is not defined at notebook level before this cell, so train an SVM
# with new_train_svm's default hyper-parameters before scoring it.
clf = new_train_svm(*get_sets('train'))
test_svm(clf)

### K-fold cross validation

In [None]:
from sklearn.model_selection import KFold

In [None]:
def cross_validate_params_svm(X, y, C, kernel):
    """Return the mean 5-fold cross-validation accuracy of an SVM.

    Parameters
    ----------
    X, y : sequences
        Feature vectors and their labels.
    C, kernel :
        Forwarded to ``svm.SVC``.

    Returns
    -------
    Mean of the per-fold accuracies.
    """
    # The data arrives ordered by class (all 1s, then all 0s). Without
    # shuffling, every test fold would contain a single class. The fixed
    # seed keeps the folds identical across hyper-parameter settings.
    kf = KFold(n_splits=5, shuffle=True, random_state=0)

    accuracies = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = [X[i] for i in train_index], [X[i] for i in test_index]
        y_train, y_test = [y[i] for i in train_index], [y[i] for i in test_index]

        clf = svm.SVC(C=C, kernel=kernel)
        clf.fit(X_train, y_train)
        y_hat = clf.predict(X_test)

        # Element-wise comparison of each prediction with its own target.
        accuracies.append(np.mean(np.asarray(y_hat) == np.asarray(y_test)))

    return np.mean(accuracies)


def cross_validate_params_knn_pca(X, y, K, n_components):
    """Return the mean 5-fold cross-validation accuracy of scaler -> PCA -> KNN.

    Parameters
    ----------
    X, y : sequences
        Feature vectors and their labels.
    K : int
        Number of neighbours for the KNN classifier.
    n_components : int
        Requested number of principal components. It is capped at the number
        of training samples and features of each fold.

    Returns
    -------
    Mean of the per-fold accuracies.
    """
    # Shuffle because the data arrives ordered by class. The fixed seed
    # keeps the folds identical across hyper-parameter settings.
    kf = KFold(n_splits=5, shuffle=True, random_state=0)

    accuracies = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = [X[i] for i in train_index], [X[i] for i in test_index]
        y_train, y_test = [y[i] for i in train_index], [y[i] for i in test_index]

        # Scaler and PCA are fitted on the training fold only.
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train_scaled = scaler.transform(X_train)

        n_features = np.asarray(X_train).shape[1]
        pca = PCA(n_components=min(n_components, len(X_train), n_features))
        pca.fit(X_train_scaled)
        X_train_reduced = pca.transform(X_train_scaled)

        knn = KNeighborsClassifier(n_neighbors=K)
        knn.fit(X_train_reduced, y_train)

        # The test fold is projected with the transforms learned on the
        # training fold, and predictions are made on the test fold itself.
        X_test_reduced = pca.transform(scaler.transform(X_test))
        y_hat = knn.predict(X_test_reduced)

        accuracies.append(np.mean(np.asarray(y_hat) == np.asarray(y_test)))
    return np.mean(accuracies)


def k_fold_cross_validation_svm(C_values, kernels):
    """Print the cross-validated accuracy of every (kernel, C) combination.

    The training set is loaded once with ``get_sets('train')``. Kernels are
    iterated in the outer position and C values in the inner one.
    """
    features, labels = get_sets('train')
    grid = ((kernel, C) for kernel in kernels for C in C_values)
    for kernel, C in grid:
        accuracy = cross_validate_params_svm(features, labels, C, kernel)
        print("C: {}, kernel: {}, accuracy: {}".format(C, kernel, accuracy))

def k_fold_cross_validation_pca_knn(K_values, n_components_values):
    """Print the cross-validated accuracy of every (K, n_components) combination.

    The training set is loaded once with ``get_sets('train')``. K values are
    iterated in the outer position and component counts in the inner one.
    """
    features, labels = get_sets('train')
    grid = ((K, n_components) for K in K_values for n_components in n_components_values)
    for K, n_components in grid:
        accuracy = cross_validate_params_knn_pca(features, labels, K, n_components)
        print("k:{}, N components: {}, accuracy: {}".format(K, n_components, accuracy))

In [None]:
# Hyper-parameter grid for the SVM: every kernel is cross-validated with every C.
kernels = ['poly', 'rbf', 'sigmoid', 'linear']
C_values = [0.01, 0.1, 1, 10, 100, 1000]
k_fold_cross_validation_svm(C_values=C_values, kernels=kernels)

In [None]:
# Hyper-parameter grid for the PCA + KNN model.
# NOTE(review): the first K_values list is overwritten on the very next line,
# so only K=8 is actually evaluated. Confirm whether that is intentional.
K_values = [2, 4, 8, 16, 32]
K_values = [8]
n_components = [10, 20, 30, 40, 50, 60, 70, 80]
k_fold_cross_validation_pca_knn(K_values=K_values, n_components_values=n_components)

### Test

In [None]:
def train(kernel, C, X, y):
    """Fit an SVM on ``X``/``y``, score it with ``test_svm`` and print a short report.

    Prints the accuracy, ``C`` and the kernel name. Returns nothing.
    """
    clf = new_train_svm(X, y, kernel, C)
    report = (
        "Accuracy = {}".format(test_svm(clf)),
        "C = {}".format(C),
        "Kernel = {}".format(kernel),
    )
    print("\n".join(report))

In [None]:
# Build the training vectors (X) and labels (y) from train_set.json.
X, y = get_sets('train')

In [None]:
# Fit an RBF SVM with C=1 on the training set and print its accuracy on the test set.
train(kernel='rbf', C=1, X=X, y=y)

In [None]:
# new_train_knn_pca declares X and y as parameters, so pass the training set
# built by the `X, y = get_sets('train')` cell.
knn = new_train_knn_pca(X, y)

In [None]:
# Evaluate the PCA + KNN model on the held-out test pairs.
X_test, y_test = get_sets('test')

# The preprocessing must be learned from the TRAINING vectors (`X`, built by
# the `X, y = get_sets('train')` cell), never from the test data. Re-fitting
# it on the same data and settings reproduces the transform that
# new_train_knn_pca applied before fitting `knn`.
scaler = StandardScaler()
scaler.fit(X)
pca = PCA(n_components=PCA_N_COMPONENTS)
pca.fit(scaler.transform(X))
X_test_reduced = pca.transform(scaler.transform(X_test))

# Compare every prediction with its own label instead of assuming that a
# fixed number of leading samples belongs to class 1.
y_pred = knn.predict(X_test_reduced)
correct = int(np.sum(np.asarray(y_pred) == np.asarray(y_test)))

print("Accuracy = {}".format(correct / len(y_test)))


In [None]:
# Spot-check one pair of images of two different individuals.
# `knn` was fitted on scaled, PCA-reduced vectors, so the raw encoding
# difference has to go through `scaler` and `pca` (fitted in the evaluation
# cell above) before it can be classified.
pair_difference = get_image("images/00911/00911fa010_960530.tif") - get_image("images/00510/00510fa010h_940519.tif")
knn.predict(pca.transform(scaler.transform([pair_difference])))