In [10]:
# Pipeline cache switches. True means the stage's artifacts are loaded from
# their joblib files; False means the stage is recomputed and saved again.
already_loaded = True   # dataset cell        -> lfw_dataset.joblib
oversampled = True      # SMOTE / ADASYN cell -> *_train_resampled_*.joblib
transformed = True      # PCA cell            -> X_*_pca_*.joblib
trained_knn = True      # kNN cell            -> best_knn_model.joblib
trained_svm = True      # SVM cell            -> best_svm_model.joblib

# Not read by any cell in the visible part of this notebook.
augmented = False

# Set to True to draw the PCA explained-variance figure.
plot = False

#### **Imports**

In [11]:
from joblib import dump, load
import numpy as np
from collections import Counter
import os
import cv2
import random

from sklearn.utils import shuffle
from sklearn.datasets import fetch_lfw_people

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

from imblearn.over_sampling import SMOTE, ADASYN

import matplotlib.pyplot as plt
import seaborn as sns

#### **Loading Dataset**

In [12]:
# On-disk copy of the fetched dataset, reused between runs.
lfw_cache_path = 'lfw_dataset.joblib'

if already_loaded and os.path.exists(lfw_cache_path):
    # Reuse the dataset saved by an earlier run.
    # NOTE: joblib.load unpickles arbitrary objects -- only load cache files
    # that were produced by this notebook.
    lfw_people = load(lfw_cache_path)
else:
    # Either a fresh fetch was requested (already_loaded = False) or the cache
    # file is missing, e.g. on a fresh checkout. Load the LFW dataset in color
    # with original image size, keeping people with at least 20 images, and
    # cache it for the next run.
    lfw_people = fetch_lfw_people(color=True, resize=None, min_faces_per_person=20)
    dump(lfw_people, lfw_cache_path)

# Everything below is identical for both paths, so it is done once.
n_samples, h, w, c = lfw_people.images.shape

# Flattened pixel vectors (one row per image) and integer identity labels.
X = lfw_people.data
n_features = X.shape[1]

y = lfw_people.target
target_names = lfw_people.target_names
n_classes = target_names.shape[0]

print("Total dataset size:")
print("n_samples: %d" % n_samples)
print(f"Image dimensions: {h} x {w} x {c}")
print("n_features: %d" % n_features)
print("n_classes: %d" % n_classes)

Total dataset size:
n_samples: 2489
Image dimensions: 125 x 94 x 3
n_features: 35250
n_classes: 43


##### **Train, val, test split**

In [13]:
# Stratified three-way split with a fixed seed.
# Step 1: hold out 20% of all samples as the test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Step 2: carve the validation set out of the remaining 80%.
# 0.125 * 0.8 = 0.1, so the overall proportions are roughly 70/10/20.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.125,
    random_state=42,
    stratify=y_train_val,
)

#### **SMOTE & ADASYN Oversampling**

In [14]:
if oversampled:

    # Reuse the oversampled training sets cached by an earlier run
    X_train_resampled_smote = load('X_train_resampled_smote.joblib')
    y_train_resampled_smote = load('y_train_resampled_smote.joblib')

    X_train_resampled_adasyn = load('X_train_resampled_adasyn.joblib')
    y_train_resampled_adasyn = load('y_train_resampled_adasyn.joblib')

else:

    # Only the training split is oversampled; validation and test data are
    # left untouched.
    smote = SMOTE(random_state=42)
    X_train_resampled_smote, y_train_resampled_smote = smote.fit_resample(X_train, y_train)

    # Per-class sample counts after SMOTE
    print("Class distribution after SMOTE oversampling:", np.bincount(y_train_resampled_smote))

    adasyn = ADASYN(random_state=42)
    X_train_resampled_adasyn, y_train_resampled_adasyn = adasyn.fit_resample(X_train, y_train)

    # Per-class sample counts after ADASYN
    print("Class distribution after ADASYN oversampling:", np.bincount(y_train_resampled_adasyn))

    # Cache all four arrays so later runs can skip the resampling
    for _cache_file, _array in (
        ('X_train_resampled_smote.joblib', X_train_resampled_smote),
        ('y_train_resampled_smote.joblib', y_train_resampled_smote),
        ('X_train_resampled_adasyn.joblib', X_train_resampled_adasyn),
        ('y_train_resampled_adasyn.joblib', y_train_resampled_adasyn),
    ):
        dump(_array, _cache_file)

#### **PCA & LDA Transform**

In [15]:
# Cache files for the fitted PCA models themselves. They are stored next to
# the projected arrays so that cells needing the models (e.g. the
# explained-variance plot) also work when the projections come from cache.
pca_smote_path = 'pca_smote.joblib'
pca_adasyn_path = 'pca_adasyn.joblib'

if not transformed:

    # Fit one 100-component PCA per oversampled training set. random_state is
    # pinned (like every other stochastic step in this notebook) because PCA
    # may select a randomized SVD solver for inputs of this size, which would
    # otherwise make the cached projections irreproducible.
    pca_smote = PCA(n_components=100, random_state=42)
    pca_adasyn = PCA(n_components=100, random_state=42)

    X_train_pca_smote = pca_smote.fit_transform(X_train_resampled_smote)
    X_train_pca_adasyn = pca_adasyn.fit_transform(X_train_resampled_adasyn)

    # Validation and test data are only projected, never used for fitting.
    X_val_pca_smote = pca_smote.transform(X_val)
    X_test_pca_smote = pca_smote.transform(X_test)

    X_val_pca_adasyn = pca_adasyn.transform(X_val)
    X_test_pca_adasyn = pca_adasyn.transform(X_test)

    # Save the fitted models
    dump(pca_smote, pca_smote_path)
    dump(pca_adasyn, pca_adasyn_path)

    # Save the transformed data
    dump(X_train_pca_smote, 'X_train_pca_smote.joblib')
    dump(X_val_pca_smote, 'X_val_pca_smote.joblib')
    dump(X_test_pca_smote, 'X_test_pca_smote.joblib')

    dump(X_train_pca_adasyn, 'X_train_pca_adasyn.joblib')
    dump(X_val_pca_adasyn, 'X_val_pca_adasyn.joblib')
    dump(X_test_pca_adasyn, 'X_test_pca_adasyn.joblib')

else:

    # Load the previously transformed data
    X_train_pca_smote = load('X_train_pca_smote.joblib')
    X_val_pca_smote = load('X_val_pca_smote.joblib')
    X_test_pca_smote = load('X_test_pca_smote.joblib')

    X_train_pca_adasyn = load('X_train_pca_adasyn.joblib')
    X_val_pca_adasyn = load('X_val_pca_adasyn.joblib')
    X_test_pca_adasyn = load('X_test_pca_adasyn.joblib')

    # Load the fitted models when available. Caches written before the models
    # were persisted do not have these files; the names are then set to None
    # (re-run with transformed = False to regenerate everything).
    pca_smote = load(pca_smote_path) if os.path.exists(pca_smote_path) else None
    pca_adasyn = load(pca_adasyn_path) if os.path.exists(pca_adasyn_path) else None

# NOTE: an LDA variant of this step (LDA(n_components=None) fit on the
# oversampled sets and cached as X_{train,val,test}_lda_{smote,adasyn}.joblib)
# was previously sketched here as commented-out code; it is disabled.


#### **Explained Variance Plot**

In [16]:
if plot:
    # The fitted PCA objects are only present in the kernel if the PCA cell
    # provided them; look them up defensively instead of failing with a
    # NameError when the projections were loaded from cache.
    _pca_models = {
        'SMOTE': globals().get('pca_smote'),
        'ADASYN': globals().get('pca_adasyn'),
    }
    _missing = [label for label, model in _pca_models.items() if model is None]

    if _missing:
        print(
            "Skipping explained-variance plot: no fitted PCA model for "
            f"{', '.join(_missing)}. Re-run the PCA cell with transformed = False."
        )
    else:
        # Cumulative explained variance ratio for the PCA fitted on the SMOTE
        # and on the ADASYN oversampled data, side by side.
        fig, axes = plt.subplots(1, 2, figsize=(16, 6))

        for ax, (label, model) in zip(axes, _pca_models.items()):
            ax.plot(np.cumsum(model.explained_variance_ratio_), linestyle='-')
            ax.set_xlabel('Number of Components')
            ax.set_ylabel('Cumulative Explained Variance Ratio')
            ax.set_title(f'PCA - Explained Variance Ratio ({label})')
            ax.grid(True)

        fig.tight_layout()
        plt.show()

#### **KNN**

In [17]:
if trained_knn:

    # Reuse the kNN model saved by an earlier run
    best_knn = load('best_knn_model.joblib')

else:

    # 5-fold grid search over the number of neighbours, on the PCA-projected
    # SMOTE training set.
    # NOTE(review): cross-validation runs on data that is already oversampled,
    # so synthetic points can share their source samples with other folds and
    # inflate the CV score -- consider resampling inside each fold.
    knn = KNeighborsClassifier()
    knn_param_grid = {'n_neighbors': [1, 3, 5, 7]}

    knn_grid_search = GridSearchCV(knn, knn_param_grid, cv=5)
    knn_grid_search.fit(X_train_pca_smote, y_train_resampled_smote)

    best_knn_params = knn_grid_search.best_params_

    # Fit a fresh classifier with the winning hyperparameters and cache it
    best_knn = KNeighborsClassifier(**best_knn_params)
    best_knn.fit(X_train_pca_smote, y_train_resampled_smote)

    dump(best_knn, 'best_knn_model.joblib')

# Evaluation is the same whether the model was just trained or loaded.
train_pred_knn = best_knn.predict(X_train_pca_smote)
val_pred_knn = best_knn.predict(X_val_pca_smote)
test_pred_knn = best_knn.predict(X_test_pca_smote)

# Train accuracy is measured against the oversampled labels; validation and
# test accuracy against the original, non-resampled labels.
train_accuracy_knn = accuracy_score(y_train_resampled_smote, train_pred_knn)
val_accuracy_knn = accuracy_score(y_val, val_pred_knn)
test_accuracy_knn = accuracy_score(y_test, test_pred_knn)

print("kNN:")
if not trained_knn:
    # The search result only exists when the grid search ran in this session
    print(f"Best params: {best_knn_params}")
print(f"Train Accuracy: {train_accuracy_knn}")
print(f"Validation Accuracy: {val_accuracy_knn}")
print(f"Test Accuracy: {test_accuracy_knn}")

kNN:
Train Accuracy: 1.0
Validation Accuracy: 0.3453815261044177
Test Accuracy: 0.3293172690763052


#### **SVM**

In [18]:
if trained_svm:

    # Reuse the SVM model saved by an earlier run
    best_svm = load('best_svm_model.joblib')

else:

    # 5-fold grid search over kernel type and regularisation strength C, on
    # the PCA-projected SMOTE training set.
    # NOTE(review): cross-validation runs on data that is already oversampled,
    # so synthetic points can share their source samples with other folds and
    # inflate the CV score -- consider resampling inside each fold.
    svm = SVC()
    svm_param_grid = {'kernel': ['linear', 'rbf'], 'C': [0.1, 1, 10]}

    svm_grid_search = GridSearchCV(svm, svm_param_grid, cv=5)
    svm_grid_search.fit(X_train_pca_smote, y_train_resampled_smote)

    best_svm_params = svm_grid_search.best_params_

    # Fit a fresh classifier with the winning hyperparameters and cache it
    best_svm = SVC(**best_svm_params)
    best_svm.fit(X_train_pca_smote, y_train_resampled_smote)

    dump(best_svm, 'best_svm_model.joblib')

# Evaluation is the same whether the model was just trained or loaded.
train_pred_svm = best_svm.predict(X_train_pca_smote)
val_pred_svm = best_svm.predict(X_val_pca_smote)
test_pred_svm = best_svm.predict(X_test_pca_smote)

# Train accuracy is measured against the oversampled labels; validation and
# test accuracy against the original, non-resampled labels.
train_accuracy_svm = accuracy_score(y_train_resampled_smote, train_pred_svm)
val_accuracy_svm = accuracy_score(y_val, val_pred_svm)
test_accuracy_svm = accuracy_score(y_test, test_pred_svm)

print("SVM:")
if not trained_svm:
    # The search result only exists when the grid search ran in this session
    print(f"Best params: {best_svm_params}")
print(f"Train Accuracy: {train_accuracy_svm}")
print(f"Validation Accuracy: {val_accuracy_svm}")
print(f"Test Accuracy: {test_accuracy_svm}")

SVM:
Train Accuracy: 1.0
Validation Accuracy: 0.5863453815261044
Test Accuracy: 0.5943775100401606
