Imports

In [None]:
from time import time
import logging
import matplotlib.pyplot as plt
import numpy as np
import os
import cv2

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import fetch_lfw_people
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA
from sklearn.svm import SVC

Loading the images

In [None]:
# Root folder of the LFW dataset; the loading cell expects one sub-folder per person in it.
datapath = 'C:/Users/Thomas/Documents/Datasets/lfw(250x250)/'

In [None]:
#Some people in the dataset have only a few photos, so only people whose folder
#contains at least min_faces images are used.
min_faces=30

In [None]:
# Accumulators filled by the loading cell: grayscale images and their class ids.
faces, labels = [], []

In [None]:
target_names=[]  #all the names are saved here
total_photos_seen=0 #keeping track of the total images
n_classes=0  #number of classes/different people
# os.listdir returns entries in arbitrary order; sorting makes the class ids
# assigned below identical on every run and platform.
folders = sorted(os.listdir(datapath))
for folder in folders:
    training_images_path = os.path.join(datapath, folder)
    if not os.path.isdir(training_images_path):
        continue  # stray files in the dataset root are not person folders
    image_names = sorted(os.listdir(training_images_path))
    if len(image_names) < min_faces:
        continue  # people with a low number of faces are skipped
    target_names.append(folder)
    n_classes += 1
    for image_name in image_names:
        total_photos_seen += 1
        image_path = os.path.join(training_images_path, image_name)
        training_image = cv2.imread(image_path)
        if training_image is None:
            # cv2.imread returns None instead of raising for unreadable files
            logging.warning("Skipping unreadable image: %s", image_path)
            continue
        # making the images black and white
        faces.append(cv2.cvtColor(training_image, cv2.COLOR_BGR2GRAY))
        # n_classes was already incremented, so class ids start at 1:
        # the name for label k is target_names[k - 1]
        labels.append(n_classes)

In [None]:
# The eigenfaces method works on vectors, so every image is "flattened"
# into a one-dimensional array.
flat_faces = [face.reshape(-1) for face in faces]

In [None]:
#Convert the list of flattened images and the list of class ids to numpy arrays.
#Assumes every image has the same number of pixels, so the rows can be stacked - TODO confirm
flat_faces = np.array(flat_faces)
labels=np.array(labels)

Training

In [None]:
#Hold out 20% of the samples as a test set; random_state=42 fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(flat_faces, labels, test_size=0.20, random_state=42)

In [None]:
#NOTE(review): flat_faces was already converted with np.array before the split, and
#train_test_split returns arrays for array input, so these conversions appear to be
#redundant copies rather than list-to-array conversions - confirm
X_train = np.array(X_train)
X_test = np.array(X_test)

In [None]:
#First, PCA is fitted without the n_components parameter (all components are kept),
#so we can look for a suitable number of components
pca = PCA().fit(X_train)

#Plot the cumulative explained variance ratio against the (zero-based) component index
plt.figure(figsize=(18, 7))
plt.plot(pca.explained_variance_ratio_.cumsum(), lw=3)

In [None]:
#np.where returns the zero-based indices at which the cumulative explained variance
#exceeds the threshold, so the number of components needed is the smallest index + 1
np.where(pca.explained_variance_ratio_.cumsum() > 0.95) #thresholds of 90%-95% were tried

Initializations

In [None]:
import pandas as pd

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import cross_val_score,cross_validate

In [None]:
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process.kernels import DotProduct
from sklearn.gaussian_process.kernels import Matern
from sklearn.gaussian_process.kernels import RationalQuadratic
from sklearn.gaussian_process.kernels import WhiteKernel

In [None]:
#Split again with a 15% test set; this overwrites the X_train/X_test/y_train/y_test
#produced by the earlier 20% split
X_train, X_test, y_train, y_test = train_test_split(flat_faces, labels, test_size=0.15, random_state=42)

In [None]:
#Module-level PCA step (374 components, randomized SVD, whitened output); it replaces the
#exploratory pca fitted earlier and is used by the GaussianNB pipeline further down
pca = PCA(n_components=374, svd_solver='randomized',whiten=True)

In [None]:
#Module-level LDA step with default settings, used by the GaussianNB pipeline further down
lda = LDA()

In [None]:
#cv=KFold(n_splits=3)
#10-fold stratified splitter, passed to cross_validate in the GaussianNB cell
cv=StratifiedKFold(n_splits=10)

In [None]:
# Metrics reported by cross_validate; each result key is the scikit-learn scorer name itself.
# The micro-averaged variants (precision_micro, recall_micro, f1_micro) are deliberately left out.
scoring = {
    scorer: scorer
    for scorer in (
        'accuracy',
        'precision_macro',
        'precision_weighted',
        'recall_macro',
        'recall_weighted',
        'f1_macro',
        'f1_weighted',
    )
}

In [None]:
from functools import partial
from hyperopt import hp, fmin, tpe, Trials
from hyperopt.pyll.base import scope
import optuna
from sklearn.metrics import accuracy_score

In [None]:
#Now, for each machine learning algorithm, optuna will run and give the best parameters

SVC

In [None]:
def optimize_svm(trial,x,y):
    """Optuna objective for a PCA -> LDA -> SVC pipeline.

    Samples C, gamma, kernel and degree from ``trial`` and scores the
    pipeline with 5-fold stratified cross-validation.

    Parameters
    ----------
    trial : Optuna trial used to draw the hyper-parameters.
    x : feature matrix, one sample per row (callers in this file pass flat_faces).
    y : class labels aligned with the rows of x.

    Returns
    -------
    The mean fold accuracy multiplied by -1, so that a study created with
    direction="minimize" ends up maximising accuracy.
    """
    # suggest_float replaces the deprecated suggest_uniform (same uniform sampling)
    svc__C = trial.suggest_float("C", 0.001, 100)
    svc__gamma = trial.suggest_float("gamma", 0.0001, 100)
    svc__kernel = trial.suggest_categorical("kernel", ["rbf", "linear", "poly", "sigmoid"])
    # degree is sampled on every trial, although SVC only uses it for the "poly" kernel
    svc__degree = trial.suggest_int("degree", 1, 6)

    # random_state makes the randomized SVD repeatable, so equal
    # hyper-parameters give equal scores
    pca = PCA(n_components=42, svd_solver='randomized', whiten=True, random_state=0)
    lda = LDA()
    svc = SVC(class_weight='balanced',
              C=svc__C,
              gamma=svc__gamma,
              kernel=svc__kernel,
              degree=svc__degree)
    model = Pipeline([('pca', pca),
                      ('lda', lda),
                      ('svc', svc)])

    kf = StratifiedKFold(n_splits=5)
    # error_score="raise" keeps fit/predict failures visible instead of turning them into NaN
    fold_accuracies = cross_val_score(model, x, y, scoring="accuracy", cv=kf,
                                      error_score="raise")
    return -1.0 * np.mean(fold_accuracies)

In [None]:
#Bind the data so Optuna can call the objective with only the trial argument.
#NOTE(review): the full dataset (flat_faces, labels) is used here, not the X_train/y_train
#split created earlier - confirm that tuning on all the data is intended
optimization_function_svm=partial(optimize_svm,x=flat_faces,y=labels)

In [None]:
#Minimise, because optimize_svm returns the negated mean accuracy
study_svm=optuna.create_study(direction="minimize")

In [None]:
#Run 150 trials; n_jobs=-1 sets the number of parallel jobs to the CPU count
study_svm.optimize(optimization_function_svm,n_trials=150,n_jobs=-1)

In [None]:
#The reported value is the negated accuracy, so the best trial is the one with the lowest value
print(study_svm.best_trial)

K Nearest Neighbors

In [None]:
def optimize_knn(trial,x,y):
    """Optuna objective for a PCA -> LDA -> k-nearest-neighbours pipeline.

    Samples n_neighbors (1-15), the Minkowski power p (1-5) and leaf_size
    (10-50) from ``trial`` and scores the pipeline with 5-fold stratified
    cross-validation.

    Parameters
    ----------
    trial : Optuna trial used to draw the hyper-parameters.
    x : feature matrix, one sample per row (callers in this file pass flat_faces).
    y : class labels aligned with the rows of x.

    Returns
    -------
    The mean fold accuracy multiplied by -1, so that a study created with
    direction="minimize" ends up maximising accuracy.
    """
    n_neighbors = trial.suggest_int("n_neighbors", 1, 15)
    p = trial.suggest_int("p", 1, 5)
    leaf_size = trial.suggest_int("leaf_size", 10, 50)

    # random_state makes the randomized SVD repeatable, so equal
    # hyper-parameters give equal scores
    pca = PCA(n_components=42, svd_solver='randomized', whiten=True, random_state=0)
    lda = LDA()
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, p=p, leaf_size=leaf_size)
    model = Pipeline([('pca', pca),
                      ('lda', lda),
                      ('knn', knn)])

    kf = StratifiedKFold(n_splits=5)
    # error_score="raise" keeps fit/predict failures visible instead of turning them into NaN
    fold_accuracies = cross_val_score(model, x, y, scoring="accuracy", cv=kf,
                                      error_score="raise")
    return -1.0 * np.mean(fold_accuracies)

In [None]:
#Bind the data so Optuna can call the objective with only the trial argument.
#NOTE(review): the full dataset (flat_faces, labels) is used here, not the X_train/y_train
#split created earlier - confirm that tuning on all the data is intended
optimization_function_knn=partial(optimize_knn,x=flat_faces,y=labels)

In [None]:
#Minimise, because optimize_knn returns the negated mean accuracy
study_knn=optuna.create_study(direction="minimize")

In [None]:
#Run 150 trials; n_jobs=-1 sets the number of parallel jobs to the CPU count
study_knn.optimize(optimization_function_knn,n_trials=150,n_jobs=-1)

In [None]:
#The reported value is the negated accuracy, so the best trial is the one with the lowest value
print(study_knn.best_trial)

MLP

In [None]:
def optimize_mlp(trial,x,y):
    """Optuna objective for a PCA -> LDA -> MLPClassifier pipeline.

    Samples the number of hidden layers (1-10), the width of each layer
    (1-500), alpha, learning_rate, activation, solver and momentum from
    ``trial`` and scores the pipeline with 5-fold stratified cross-validation.

    Parameters
    ----------
    trial : Optuna trial used to draw the hyper-parameters.
    x : feature matrix, one sample per row (callers in this file pass flat_faces).
    y : class labels aligned with the rows of x.

    Returns
    -------
    The mean fold accuracy multiplied by -1, so that a study created with
    direction="minimize" ends up maximising accuracy.
    """
    # one "n_units_<i>" parameter per hidden layer
    n_layers = trial.suggest_int('n_layers', 1, 10)
    layers = [trial.suggest_int(f'n_units_{i}', 1, 500) for i in range(n_layers)]
    # suggest_float replaces the deprecated suggest_uniform (same uniform sampling)
    alpha = trial.suggest_float("alpha", 0.0001, 0.05)
    learning_rate = trial.suggest_categorical("learning_rate", ["constant", "adaptive"])
    activation = trial.suggest_categorical("activation", ["tanh", "relu"])
    solver = trial.suggest_categorical("solver", ["sgd", "adam"])
    momentum = trial.suggest_float('momentum', 0.0, 1.0)

    # random_state on PCA and the MLP makes a trial repeatable, so equal
    # hyper-parameters give equal scores
    pca = PCA(n_components=42, svd_solver='randomized', whiten=True, random_state=0)
    lda = LDA()
    mlp = MLPClassifier(solver=solver,
                        activation=activation,
                        hidden_layer_sizes=tuple(layers),
                        learning_rate=learning_rate,
                        alpha=alpha,
                        momentum=momentum,
                        verbose=0,
                        early_stopping=True,
                        random_state=0)
    model = Pipeline([('pca', pca),
                      ('lda', lda),
                      ('mlp', mlp)])

    kf = StratifiedKFold(n_splits=5)
    # error_score="raise" keeps fit/predict failures visible instead of turning them into NaN
    fold_accuracies = cross_val_score(model, x, y, scoring="accuracy", cv=kf,
                                      error_score="raise")
    return -1.0 * np.mean(fold_accuracies)

In [None]:
#Bind the data so Optuna can call the objective with only the trial argument.
#NOTE(review): the full dataset (flat_faces, labels) is used here, not the X_train/y_train
#split created earlier - confirm that tuning on all the data is intended
optimization_function_mlp=partial(optimize_mlp,x=flat_faces,y=labels)

In [None]:
#Minimise, because optimize_mlp returns the negated mean accuracy
study_mlp=optuna.create_study(direction="minimize")

In [None]:
#Run 200 trials; n_jobs=-1 sets the number of parallel jobs to the CPU count
study_mlp.optimize(optimization_function_mlp,n_trials=200,n_jobs=-1)

In [None]:
#The reported value is the negated accuracy, so the best trial is the one with the lowest value
print(study_mlp.best_trial)

Logistic Regression

In [None]:
def optimize_logreg(trial,x,y):
    """Optuna objective for a PCA -> LDA -> LogisticRegression pipeline.

    Samples class_weight, C and solver from ``trial`` and scores the
    pipeline with 5-fold stratified cross-validation.

    Parameters
    ----------
    trial : Optuna trial used to draw the hyper-parameters.
    x : feature matrix, one sample per row (callers in this file pass flat_faces).
    y : class labels aligned with the rows of x.

    Returns
    -------
    The mean fold accuracy multiplied by -1, so that a study created with
    direction="minimize" ends up maximising accuracy.
    """
    class_weight = trial.suggest_categorical("class_weight", [None, "balanced"])
    # suggest_float replaces the deprecated suggest_uniform (same uniform sampling)
    C = trial.suggest_float("C", 0.001, 100)
    solver = trial.suggest_categorical("solver", ['newton-cg', 'lbfgs', 'sag', 'saga'])

    # random_state on PCA and on the classifier (used by the sag/saga solvers)
    # makes a trial repeatable, so equal hyper-parameters give equal scores
    pca = PCA(n_components=42, svd_solver='randomized', whiten=True, random_state=0)
    lda = LDA()
    logreg = LogisticRegression(C=C, class_weight=class_weight, solver=solver,
                                random_state=0)
    model = Pipeline([('pca', pca),
                      ('lda', lda),
                      ('logreg', logreg)])

    kf = StratifiedKFold(n_splits=5)
    # error_score="raise" keeps fit/predict failures visible instead of turning them into NaN
    fold_accuracies = cross_val_score(model, x, y, scoring="accuracy", cv=kf,
                                      error_score="raise")
    return -1.0 * np.mean(fold_accuracies)

In [None]:
#Bind the data so Optuna can call the objective with only the trial argument.
#NOTE(review): the full dataset (flat_faces, labels) is used here, not the X_train/y_train
#split created earlier - confirm that tuning on all the data is intended
optimization_function_logreg=partial(optimize_logreg,x=flat_faces,y=labels)

In [None]:
#Minimise, because optimize_logreg returns the negated mean accuracy
study_logreg=optuna.create_study(direction="minimize")

In [None]:
#Run 150 trials; n_jobs=-1 sets the number of parallel jobs to the CPU count
study_logreg.optimize(optimization_function_logreg,n_trials=150,n_jobs=-1)

In [None]:
#The reported value is the negated accuracy, so the best trial is the one with the lowest value
print(study_logreg.best_trial)

Linear SVC

In [None]:
def optimize_linsvc(trial,x,y):
    """Optuna objective for a PCA -> LDA -> LinearSVC pipeline.

    Samples C, loss, class_weight and fit_intercept from ``trial`` and
    scores the pipeline with 5-fold stratified cross-validation.

    Parameters
    ----------
    trial : Optuna trial used to draw the hyper-parameters.
    x : feature matrix, one sample per row (callers in this file pass flat_faces).
    y : class labels aligned with the rows of x.

    Returns
    -------
    The mean fold accuracy multiplied by -1, so that a study created with
    direction="minimize" ends up maximising accuracy.
    """
    # suggest_float replaces the deprecated suggest_uniform (same uniform sampling)
    C = trial.suggest_float("C", 0.001, 100)
    loss = trial.suggest_categorical("loss", ['hinge', 'squared_hinge'])
    class_weight = trial.suggest_categorical("class_weight", [None, 'balanced'])
    fit_intercept = trial.suggest_categorical("fit_intercept", [True, False])

    # random_state on PCA and on the classifier makes a trial repeatable,
    # so equal hyper-parameters give equal scores
    pca = PCA(n_components=42, svd_solver='randomized', whiten=True, random_state=0)
    lda = LDA()
    linear_svm = LinearSVC(C=C, loss=loss, fit_intercept=fit_intercept,
                           class_weight=class_weight, random_state=0)
    model = Pipeline([('pca', pca),
                      ('lda', lda),
                      ('linear_svm', linear_svm)])

    kf = StratifiedKFold(n_splits=5)
    # error_score="raise" keeps fit/predict failures visible instead of turning them into NaN
    fold_accuracies = cross_val_score(model, x, y, scoring="accuracy", cv=kf,
                                      error_score="raise")
    return -1.0 * np.mean(fold_accuracies)

In [None]:
#Bind the data so Optuna can call the objective with only the trial argument.
#NOTE(review): the full dataset (flat_faces, labels) is used here, not the X_train/y_train
#split created earlier - confirm that tuning on all the data is intended
optimization_function_linsvc=partial(optimize_linsvc,x=flat_faces,y=labels)

In [None]:
#Minimise, because optimize_linsvc returns the negated mean accuracy
study_linsvc=optuna.create_study(direction="minimize")

In [None]:
#Run 150 trials; n_jobs=-1 sets the number of parallel jobs to the CPU count
study_linsvc.optimize(optimization_function_linsvc,n_trials=150,n_jobs=-1)

In [None]:
#The reported value is the negated accuracy, so the best trial is the one with the lowest value
print(study_linsvc.best_trial)

Decision Trees

In [None]:
def optimize_trees(trial,x,y):
    """Optuna objective for a PCA -> LDA -> DecisionTreeClassifier pipeline.

    Samples criterion, min_impurity_decrease, max_depth, min_samples_leaf,
    max_leaf_nodes and max_features from ``trial`` and scores the pipeline
    with 5-fold stratified cross-validation.

    Parameters
    ----------
    trial : Optuna trial used to draw the hyper-parameters.
    x : feature matrix, one sample per row (callers in this file pass flat_faces).
    y : class labels aligned with the rows of x.

    Returns
    -------
    The mean fold accuracy multiplied by -1, so that a study created with
    direction="minimize" ends up maximising accuracy.
    """
    # None (the estimator's "no limit" setting) is offered next to explicit integer limits
    list_max_depth = [None] + list(range(2, 21))
    list_max_leaf_nodes = [None] + list(range(2, 8))
    list_max_features = [None] + list(range(1, 11))

    criterion = trial.suggest_categorical("criterion", ["gini", "entropy"])
    # suggest_float replaces the deprecated suggest_uniform (same uniform sampling)
    min_impurity_decrease = trial.suggest_float("min_impurity_decrease", 0.0, 0.3)
    max_depth = trial.suggest_categorical("max_depth", list_max_depth)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)
    max_leaf_nodes = trial.suggest_categorical("max_leaf_nodes", list_max_leaf_nodes)
    max_features = trial.suggest_categorical("max_features", list_max_features)

    # random_state makes the randomized SVD repeatable (the tree is already seeded),
    # so equal hyper-parameters give equal scores
    pca = PCA(n_components=42, svd_solver='randomized', whiten=True, random_state=0)
    lda = LDA()
    tree = DecisionTreeClassifier(random_state=0,
                                  min_impurity_decrease=min_impurity_decrease,
                                  max_depth=max_depth,
                                  min_samples_leaf=min_samples_leaf,
                                  max_leaf_nodes=max_leaf_nodes,
                                  max_features=max_features,
                                  criterion=criterion)
    model = Pipeline([('pca', pca),
                      ('lda', lda),
                      ('tree', tree)])

    kf = StratifiedKFold(n_splits=5)
    # error_score="raise" keeps fit/predict failures visible instead of turning them into NaN
    fold_accuracies = cross_val_score(model, x, y, scoring="accuracy", cv=kf,
                                      error_score="raise")
    return -1.0 * np.mean(fold_accuracies)

In [None]:
#Bind the data so Optuna can call the objective with only the trial argument.
#NOTE(review): the full dataset (flat_faces, labels) is used here, not the X_train/y_train
#split created earlier - confirm that tuning on all the data is intended
optimization_function_trees=partial(optimize_trees,x=flat_faces,y=labels)

In [None]:
#Minimise, because optimize_trees returns the negated mean accuracy
study_trees=optuna.create_study(direction="minimize")

In [None]:
#Run 150 trials; n_jobs=-1 sets the number of parallel jobs to the CPU count
study_trees.optimize(optimization_function_trees,n_trials=150,n_jobs=-1)

In [None]:
#The reported value is the negated accuracy, so the best trial is the one with the lowest value
print(study_trees.best_trial)

Random Forest Classifier 

In [None]:
def optimize_forrest(trial,x,y):
    """Optuna objective for a PCA -> LDA -> RandomForestClassifier pipeline.

    Samples max_depth, min_samples_leaf, n_estimators, class_weight and
    bootstrap from ``trial`` and scores the pipeline with 5-fold stratified
    cross-validation.

    Parameters
    ----------
    trial : Optuna trial used to draw the hyper-parameters.
    x : feature matrix, one sample per row (callers in this file pass flat_faces).
    y : class labels aligned with the rows of x.

    Returns
    -------
    The mean fold accuracy multiplied by -1, so that a study created with
    direction="minimize" ends up maximising accuracy.
    """
    # None (the estimator's "no limit" setting) is offered next to depths 2..10
    list_max_depth = [None] + list(range(2, 11))
    max_depth = trial.suggest_categorical("max_depth", list_max_depth)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 4)
    n_estimators = trial.suggest_int("n_estimators", 10, 600)
    class_weight = trial.suggest_categorical("class_weight", [None, 'balanced'])
    bootstrap = trial.suggest_categorical("bootstrap", [False, True])

    # random_state makes the randomized SVD repeatable (the forest is already seeded),
    # so equal hyper-parameters give equal scores
    pca = PCA(n_components=42, svd_solver='randomized', whiten=True, random_state=0)
    lda = LDA()
    forrest = RandomForestClassifier(random_state=0,
                                     max_depth=max_depth,
                                     bootstrap=bootstrap,
                                     class_weight=class_weight,
                                     min_samples_leaf=min_samples_leaf,
                                     n_estimators=n_estimators)
    model = Pipeline([('pca', pca),
                      ('lda', lda),
                      ('forrest', forrest)])

    kf = StratifiedKFold(n_splits=5)
    # error_score="raise" keeps fit/predict failures visible instead of turning them into NaN
    fold_accuracies = cross_val_score(model, x, y, scoring="accuracy", cv=kf,
                                      error_score="raise")
    return -1.0 * np.mean(fold_accuracies)

In [None]:
#Bind the data so Optuna can call the objective with only the trial argument.
#NOTE(review): the full dataset (flat_faces, labels) is used here, not the X_train/y_train
#split created earlier - confirm that tuning on all the data is intended
optimization_function_forrest=partial(optimize_forrest,x=flat_faces,y=labels)

In [None]:
#Minimise, because optimize_forrest returns the negated mean accuracy
study_forrest=optuna.create_study(direction="minimize")

In [None]:
#Run 150 trials; n_jobs=-1 sets the number of parallel jobs to the CPU count
study_forrest.optimize(optimization_function_forrest,n_trials=150,n_jobs=-1)

In [None]:
#The reported value is the negated accuracy, so the best trial is the one with the lowest value
print(study_forrest.best_trial)

AdaBoost Classifier

In [None]:
def optimize_ada(trial,x,y):
    """Optuna objective for a PCA -> LDA -> AdaBoostClassifier pipeline.

    Samples n_estimators (50-300) and learning_rate (0.001-1.0) from
    ``trial`` and scores the pipeline with 5-fold stratified cross-validation.

    Parameters
    ----------
    trial : Optuna trial used to draw the hyper-parameters.
    x : feature matrix, one sample per row (callers in this file pass flat_faces).
    y : class labels aligned with the rows of x.

    Returns
    -------
    The mean fold accuracy multiplied by -1, so that a study created with
    direction="minimize" ends up maximising accuracy.
    """
    n_estimators = trial.suggest_int("n_estimators", 50, 300)
    # suggest_float replaces the deprecated suggest_uniform (same uniform sampling)
    learning_rate = trial.suggest_float("learning_rate", 0.001, 1.0)

    # random_state makes the randomized SVD repeatable (AdaBoost is already seeded),
    # so equal hyper-parameters give equal scores
    pca = PCA(n_components=42, svd_solver='randomized', whiten=True, random_state=0)
    lda = LDA()
    adaboost = AdaBoostClassifier(random_state=0,
                                  n_estimators=n_estimators,
                                  learning_rate=learning_rate)
    model = Pipeline([('pca', pca),
                      ('lda', lda),
                      ('adaboost', adaboost)])

    kf = StratifiedKFold(n_splits=5)
    # error_score="raise" keeps fit/predict failures visible instead of turning them into NaN
    fold_accuracies = cross_val_score(model, x, y, scoring="accuracy", cv=kf,
                                      error_score="raise")
    return -1.0 * np.mean(fold_accuracies)

In [None]:
#Bind the data so Optuna can call the objective with only the trial argument.
#NOTE(review): the full dataset (flat_faces, labels) is used here, not the X_train/y_train
#split created earlier - confirm that tuning on all the data is intended
optimization_function_ada=partial(optimize_ada,x=flat_faces,y=labels)

In [None]:
#Minimise, because optimize_ada returns the negated mean accuracy
study_ada=optuna.create_study(direction="minimize")

In [None]:
#Run 150 trials; n_jobs=-1 sets the number of parallel jobs to the CPU count
study_ada.optimize(optimization_function_ada,n_trials=150,n_jobs=-1)

In [None]:
#The reported value is the negated accuracy, so the best trial is the one with the lowest value
print(study_ada.best_trial)

GaussianNB (no tuning needed)

In [None]:
#Time the whole cross-validation run
t0 = time()

gnb = GaussianNB()

#Reuses the module-level pca and lda objects defined in earlier cells.
#NOTE(review): that pca keeps 374 components, while the pipelines inside the
#optimize_* functions use 42 - confirm the difference is intended
pipe = Pipeline([('pca', pca),
                 ('lda', lda),
                ('gnb', gnb)])

#Cross-validate on the full dataset with the cv splitter and the metrics in scoring.
#NOTE(review): scores is computed but never displayed in this cell
scores=cross_validate(pipe, flat_faces, labels, cv=cv,scoring=scoring,return_train_score=True,n_jobs=-1)

print("done in %0.3fs" % (time() - t0))

Gaussian Process Classifier

In [None]:
def optimize_gauss(trial,x,y):
    """Optuna objective for a PCA -> LDA -> GaussianProcessClassifier pipeline.

    Samples the RBF kernel's length_scale (0.2-3.0) from ``trial`` and
    scores the pipeline with 5-fold stratified cross-validation.

    Parameters
    ----------
    trial : Optuna trial used to draw the hyper-parameters.
    x : feature matrix, one sample per row (callers in this file pass flat_faces).
    y : class labels aligned with the rows of x.

    Returns
    -------
    The mean fold accuracy multiplied by -1, so that a study created with
    direction="minimize" ends up maximising accuracy.
    """
    # Only the RBF kernel is tuned. Earlier experiments with the DotProduct, Matern,
    # RationalQuadratic and WhiteKernel kernels, and with warm_start,
    # n_restarts_optimizer and copy_X_train, were disabled in the original code.
    # suggest_float replaces the deprecated suggest_uniform (same uniform sampling)
    length_scale = trial.suggest_float("length_scale", 0.2, 3.0)
    # NOTE(review): with its default optimizer GaussianProcessClassifier tunes kernel
    # hyper-parameters during fit, so length_scale presumably only acts as a
    # starting value - confirm
    gpc = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=length_scale),
                                    random_state=0)

    # random_state makes the randomized SVD repeatable, so equal
    # hyper-parameters give equal scores
    pca = PCA(n_components=42, svd_solver='randomized', whiten=True, random_state=0)
    lda = LDA()
    model = Pipeline([('pca', pca),
                      ('lda', lda),
                      ('gpc', gpc)])

    kf = StratifiedKFold(n_splits=5)
    # error_score="raise" keeps fit/predict failures visible instead of turning them into NaN
    fold_accuracies = cross_val_score(model, x, y, scoring="accuracy", cv=kf,
                                      error_score="raise")
    return -1.0 * np.mean(fold_accuracies)

In [None]:
#Bind the data so Optuna can call the objective with only the trial argument.
#NOTE(review): the full dataset (flat_faces, labels) is used here, not the X_train/y_train
#split created earlier - confirm that tuning on all the data is intended
optimization_function_gauss=partial(optimize_gauss,x=flat_faces,y=labels)

In [None]:
#Minimise, because optimize_gauss returns the negated mean accuracy
study_gauss=optuna.create_study(direction="minimize")

In [None]:
#Run 150 trials; n_jobs=-1 sets the number of parallel jobs to the CPU count
study_gauss.optimize(optimization_function_gauss,n_trials=150,n_jobs=-1)

In [None]:
#The reported value is the negated accuracy, so the best trial is the one with the lowest value
print(study_gauss.best_trial)

Ridge Classifier

In [None]:
def optimize_ridge(trial,x,y):
    """Optuna objective for a PCA -> LDA -> RidgeClassifier pipeline.

    Samples alpha (0.1-1.0 in steps of 0.1), solver and class_weight from
    ``trial`` and scores the pipeline with 5-fold stratified cross-validation.

    Parameters
    ----------
    trial : Optuna trial used to draw the hyper-parameters.
    x : feature matrix, one sample per row (callers in this file pass flat_faces).
    y : class labels aligned with the rows of x.

    Returns
    -------
    The mean fold accuracy multiplied by -1, so that a study created with
    direction="minimize" ends up maximising accuracy.
    """
    # alpha is drawn from a fixed grid rather than a continuous range
    alpha_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    alpha = trial.suggest_categorical("alpha", alpha_list)
    solver = trial.suggest_categorical(
        "solver", ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'])
    class_weight = trial.suggest_categorical("class_weight", [None, 'balanced'])

    # random_state on PCA and on the classifier (used by the sag/saga solvers)
    # makes a trial repeatable, so equal hyper-parameters give equal scores
    pca = PCA(n_components=42, svd_solver='randomized', whiten=True, random_state=0)
    lda = LDA()
    ridge = RidgeClassifier(alpha=alpha, solver=solver, class_weight=class_weight,
                            random_state=0)
    model = Pipeline([('pca', pca),
                      ('lda', lda),
                      ('ridge', ridge)])

    kf = StratifiedKFold(n_splits=5)
    # error_score="raise" keeps fit/predict failures visible instead of turning them into NaN
    fold_accuracies = cross_val_score(model, x, y, scoring="accuracy", cv=kf,
                                      error_score="raise")
    return -1.0 * np.mean(fold_accuracies)

In [None]:
#Bind the data so Optuna can call the objective with only the trial argument.
#NOTE(review): the full dataset (flat_faces, labels) is used here, not the X_train/y_train
#split created earlier - confirm that tuning on all the data is intended
optimization_function_ridge=partial(optimize_ridge,x=flat_faces,y=labels)

In [None]:
#Minimise, because optimize_ridge returns the negated mean accuracy
study_ridge=optuna.create_study(direction="minimize")

In [None]:
#Run 100 trials with 2 parallel jobs
study_ridge.optimize(optimization_function_ridge,n_trials=100,n_jobs=2)

In [None]:
#The reported value is the negated accuracy, so the best trial is the one with the lowest value
print(study_ridge.best_trial)