# ELEC0134 - Applied Machine Learning Systems Assignment 22-23
## B1: Face Shape Detection

This notebook creates a random forest model to classify face shape on the cartoon_set dataset. The model is built in the following steps:

1. Image preprocessing to extract facial landmarks.
2. Normalization of the features.
3. Tuning hyperparameters of the random forest model
4. Training the random forests model 
5. Evaluation of the model

## 1) Preprocessing
### Facial landmark extraction



In [8]:
# Imports 

import os.path
import numpy as np
from keras.preprocessing import image
import cv2
import dlib
import imutils
from imutils import face_utils

Uses the dlib library's pretrained shape predictor to predict the locations of 68 landmarks on detected faces.

In [9]:
# Face detector used to locate faces before landmark prediction
detector = dlib.get_frontal_face_detector()

# Uses landmark predictor from parent directory
# (the model file path is relative to the notebook's working directory)
predictor = dlib.shape_predictor('../shape_predictor_68_face_landmarks.dat')

NameError: name 'dlib' is not defined

In [None]:
def shape_to_np(shape, dtype="int"):
    """Convert a landmark detection into an array of (x, y) coordinates.

    :param shape: object exposing ``num_parts`` and ``part(i)``, where every
        part carries ``x`` and ``y`` attributes
    :param dtype: dtype of the returned array
    :return: array of shape ``(shape.num_parts, 2)``; row ``i`` holds the
        (x, y) coordinates of landmark ``i``
    """
    n_points = shape.num_parts
    points = (shape.part(idx) for idx in range(n_points))
    xy_pairs = [(pt.x, pt.y) for pt in points]

    # the explicit reshape keeps the (0, 2) shape when there are no landmarks
    return np.array(xy_pairs, dtype=dtype).reshape(n_points, 2)

In [None]:
def rect_to_bb(rect):
    """Translate a dlib-style rectangle into an OpenCV-style bounding box.

    :param rect: object exposing ``left()``, ``top()``, ``right()`` and ``bottom()``
    :return: tuple ``(x, y, w, h)``: the top-left corner followed by the
        width (right - left) and the height (bottom - top)
    """
    left, top = rect.left(), rect.top()
    width = rect.right() - left
    height = rect.bottom() - top
    return (left, top, width, height)

In [None]:
def run_dlib_shape(image):
    """Detect the largest face in an image and return its 68 facial landmarks.

    :param image: image array with three colour channels; it is cast to uint8
        and converted to grayscale before detection
    :return: tuple ``(landmarks, image_uint8)`` where ``landmarks`` is a
        ``(68, 2)`` int64 array of (x, y) coordinates for the face whose
        bounding box has the largest area, or ``None`` when no face is
        detected, and ``image_uint8`` is the input cast to uint8
    """
    image_uint8 = image.astype('uint8')

    # NOTE(review): COLOR_BGR2GRAY assumes BGR channel order, while the caller in
    # this notebook loads images through keras, which presumably yields RGB - confirm.
    gray = cv2.cvtColor(image_uint8, cv2.COLOR_BGR2GRAY)

    # detect faces in the grayscale image
    # (the second argument is presumably dlib's upsampling count - see the dlib docs)
    rects = detector(gray, 1)
    if len(rects) == 0:
        return None, image_uint8

    def _area(rect):
        # bounding-box area (w * h) of one detection
        (_, _, w, h) = rect_to_bb(rect)
        return w * h

    # Pick the largest face first so the landmark predictor runs once instead of
    # once per detection; max() keeps the first of any tied areas.
    largest = max(rects, key=_area)
    landmarks = shape_to_np(predictor(gray, largest), dtype=np.int64)

    # reshape fails loudly if the predictor does not produce exactly 68 landmarks
    dlibout = np.reshape(landmarks, [68, 2])

    return dlibout, image_uint8

In [None]:
def extract_features_labels(images_dir, labels_dir, name):
    """
    Extract the facial landmark features and the matching label for every image in a folder.

    The resulting arrays are also saved to ``all_features_<name>.npy`` and
    ``shape_labels_<name>.npy`` in the current working directory.

    :param images_dir: directory holding the image files; each file's base name
                       (without extension) must appear in column 0 of the labels file
    :param labels_dir: path to the tab-separated labels file; the first line is treated
                       as a header, column 0 is the image key and column 2 the integer
                       label (presumably the face-shape class - confirm against the CSV)
    :param name:       suffix used in the names of the saved ``.npy`` files (e.g. 'train')
    :return:
        landmark_features:  array of shape (n_detected, 136): the 68 (x, y) landmark points,
                            flattened, for each image in which a face was detected
        shape_labels:       array containing the label for each image in which a face
                            was detected
        fails:              list of image keys for which no face was detected
    """
    # sorted() makes the row order of the saved arrays reproducible across runs
    image_paths = [os.path.join(images_dir, l) for l in sorted(os.listdir(images_dir))]
    target_size = None

    # read the labels inside a context manager so the file handle is always closed
    with open(labels_dir, 'r') as labels_file:
        print('labels_file ', labels_file)
        lines = labels_file.readlines()

    label_by_key = {}
    for line in lines[1:]:
        if not line.strip():
            # tolerate blank lines (e.g. a trailing newline at the end of the file)
            continue
        fields = line.split('\t')
        label_by_key[fields[0]] = int(fields[2])

    counter = 0
    all_features = []
    all_labels = []
    fails = []

    for img_path in image_paths:
        # basename/splitext stays correct when the directory path itself contains
        # dots (e.g. '../Dataset/...') or uses a different path separator
        file_name = os.path.splitext(os.path.basename(img_path))[0]

        # load image
        img = image.img_to_array(
            image.load_img(img_path,
                           target_size=target_size,
                           interpolation='bicubic'))
        features, _ = run_dlib_shape(img)
        if features is not None:
            all_features.append(features)
            all_labels.append(label_by_key[file_name])
        else:
            fails.append(file_name)
        counter += 1
        print(counter)

    landmark_features = np.array(all_features).reshape(len(all_features), 136)
    shape_labels = np.array(all_labels)

    np.save("all_features_{}.npy".format(name), landmark_features)
    np.save("shape_labels_{}.npy".format(name), shape_labels)

    return landmark_features, shape_labels, fails


In [None]:
# Extract landmarks and labels for the training split; the arrays are saved as .npy files.
# NOTE(review): these are absolute paths on the author's machine - adjust before running elsewhere.
train_img = '/Users/yash/Documents/ELEC0134/AMLS_22-23_SN19076187/Dataset/dataset_AMLS_22-23/cartoon_set/img'
train_label = '/Users/yash/Documents/ELEC0134/AMLS_22-23_SN19076187/Dataset/dataset_AMLS_22-23/cartoon_set/labels.csv'
extract_features_labels(train_img, train_label, 'train')

# Same extraction for the test split.
test_img = '/Users/yash/Documents/ELEC0134/AMLS_22-23_SN19076187/Dataset/dataset_AMLS_22-23_test/cartoon_set_test/img'
test_label = '/Users/yash/Documents/ELEC0134/AMLS_22-23_SN19076187/Dataset/dataset_AMLS_22-23_test/cartoon_set_test/labels.csv'
extract_features_labels(test_img, test_label, 'test')


In [None]:
# Loading in training and test data from npy files
# Loading in training and test data from the npy files written by extract_features_labels
# (file names are relative to the current working directory)
features_train = np.load('all_features_train.npy')
labels_train = np.load('shape_labels_train.npy')

features_test = np.load('all_features_test.npy')
labels_test = np.load('shape_labels_test.npy')


FileNotFoundError: [Errno 2] No such file or directory: 'shape_labels_train.npy'

In [None]:
# Report the shape of each loaded array as a quick sanity check
for _caption, _array in (('features_train: ', features_train),
                         ('features_test: ', features_test),
                         ('labels_train: ', labels_train),
                         ('labels_test: ', labels_test)):
    print(_caption, _array.shape)

Standardize data function

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
def preprocessing(features, test_features):
    """Standardise the training and test features with a scaler fitted on the training set.

    :param features: training feature matrix, used to fit the scaler
    :param test_features: test feature matrix, transformed with the training statistics
    :return: tuple ``(scaled_features, scaled_test_features)``
    """
    # Fit on the training data only, then apply the same transform to both sets
    standardiser = StandardScaler().fit(features)
    scaled_train = standardiser.transform(features)
    scaled_test = standardiser.transform(test_features)
    return scaled_train, scaled_test

In [None]:
# Replace the raw landmark features with their standardised versions
# (the scaler is fitted on the training features only)
features_train, features_test = preprocessing(features_train, features_test)

In [None]:
# Inspect the standardised training features
print(features_train)

Checking features to decide ranges of hyperparameters to check for tuning

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV


In [None]:
# Candidate hyperparameter values for the random forest search
param_grid = {'n_estimators': [50, 100, 150],
              'max_depth': [2, 4, 6, 8, 10, 12]}


# rfm = random forest model
rfm = RandomForestClassifier()
# The randomized search below is disabled; while it stays commented out,
# `grid_search` is not defined anywhere in this notebook.
#grid_search = RandomizedSearchCV(rfm, param_grid, verbose = 3, cv = 10)
#grid_search.fit(features_train, labels_train)

In [None]:
#print("Best parameters: ", grid_search.best_params_)
#print("Best score: ", grid_search.best_score_)
# Hard-coded hyperparameters (presumably recorded from an earlier search run - TODO confirm)
best_params = {'n_estimators': 50, 'max_depth': 12}

In [None]:
# Build the final model from the recorded hyperparameters. `grid_search` only exists
# in commented-out code, so referencing it here raised a NameError on a fresh kernel.
# A fixed random_state makes the fitted forest reproducible across runs.
rfm = RandomForestClassifier(**best_params, random_state=0)

In [None]:
# Train on the standardised training set, then predict labels for the test set
rfm.fit(features_train, labels_train)
y_pred = rfm.predict(features_test)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:


# Evaluate the model's performance using different metrics.
# Every metric compares the true test labels (labels_test) with the predictions (y_pred).
# average='macro' works for both binary and multiclass targets; the default
# average='binary' raises an error when the labels contain more than two classes.
acc = accuracy_score(labels_test, y_pred)
prec = precision_score(labels_test, y_pred, average='macro')
recall = recall_score(labels_test, y_pred, average='macro')
f1 = f1_score(labels_test, y_pred, average='macro')

print(f'Accuracy: {acc:.3f}')
print(f'Precision: {prec:.3f}')
print(f'Recall: {recall:.3f}')
print(f'F1-score: {f1:.3f}')

In [None]:
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
import matplotlib.pyplot as plt

In [None]:
def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None):
    """Plot the learning curve and the fit-time scalability of an estimator.

    :param estimator: estimator passed to ``learning_curve``
    :param title: title of the learning-curve panel
    :param X: training feature matrix
    :param y: training labels
    :param axes: sequence of at least two matplotlib axes; a new 1x2 figure is
        created when omitted
    :param ylim: optional ``(ymin, ymax)`` applied to the y-axis of the
        learning-curve panel
    :param cv: cross-validation strategy forwarded to ``learning_curve``
    :return: the ``matplotlib.pyplot`` module
    """
    if axes is None:
        # the default previously crashed on axes[0]; create the two panels instead
        _, axes = plt.subplots(1, 2, figsize=(10, 5))

    axes[0].set_title(title)
    if ylim is not None:
        # honour the caller's y-range (this argument used to be silently ignored)
        axes[0].set_ylim(*ylim)
    axes[0].set_xlabel("Training examples")

    train_sizes, train_scores, test_scores, fit_times, _ = learning_curve(
        estimator, X, y, cv=cv, n_jobs=-1,
        train_sizes=np.linspace(.1, 1.0, 5),
        return_times=True)

    # average over the cross-validation splits (axis 1)
    train_scores_mean = np.mean(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)

    # Plot learning curve
    axes[0].grid()
    axes[0].plot(train_sizes, train_scores_mean, 'o-', color="r",
                 label="Training score")
    axes[0].plot(train_sizes, test_scores_mean, 'o-', color="g",
                 label="Cross-validation score")
    axes[0].legend(loc="best")
    axes[0].set_ylabel("Accuracy Score")

    # Plot n_samples vs fit_times
    axes[1].grid()
    axes[1].plot(train_sizes, fit_times_mean, 'o-')
    axes[1].set_xlabel("Training examples")
    axes[1].set_ylabel("fit times (s)")
    axes[1].set_title("Scalability of the model")
    return plt

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(10, 5))

# Plot learning curve for Random Forests
title = "Learning Curve for Random Forests for Face Shape Detection"

# 50 random 80/20 splits of the training data; seeded for reproducibility
cv = ShuffleSplit(n_splits=50, test_size=0.2, random_state=0)

# Use the training arrays defined in this notebook (X_train / y_train were never defined);
# the trailing semicolon suppresses the repr of the returned pyplot module.
plot_learning_curve(rfm, title, features_train, labels_train, axes=axes,
                    ylim=(0.7, 1.01), cv=cv);

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

In [None]:
def plot_confusion_matrix(labels_test, y_pred, normalize = False, title = None):
    """Plot the confusion matrix of the predictions as an annotated heat map.

    :param labels_test: true labels
    :param y_pred: predicted labels
    :param normalize: when True, each row is divided by its total so that cells
        show the fraction of each true class; rows with no samples stay at 0
    :param title: optional plot title
    """
    conf_matrix = confusion_matrix(labels_test, y_pred)
    if normalize:
        row_totals = conf_matrix.sum(axis=1)[:, np.newaxis]
        # guard against empty rows (a class that only occurs in y_pred) to avoid NaNs
        conf_matrix = np.divide(conf_matrix.astype('float'), row_totals,
                                out=np.zeros(conf_matrix.shape, dtype=float),
                                where=row_totals != 0)
    plt.imshow(conf_matrix, cmap='Blues')

    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')

    if title:
        plt.title(title)

    n_classes = conf_matrix.shape[0]
    if n_classes == 2:
        # keep the original binary tick labels
        tick_labels = ['0 (Negative)', '1 (Positive)']
    else:
        # confusion_matrix orders rows/columns by the sorted union of the labels
        # found in both arguments, so the same union provides the tick labels
        classes = np.unique(np.concatenate([np.ravel(labels_test), np.ravel(y_pred)]))
        tick_labels = [str(cls) for cls in classes]
    tick_positions = list(range(len(tick_labels)))
    plt.xticks(tick_positions, tick_labels)
    plt.yticks(tick_positions, tick_labels)

    # white text on dark cells, black text on light cells
    thresh = conf_matrix.max() / 2.
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            plt.text(j, i, round(conf_matrix[i, j], 4),
                     horizontalalignment="center",
                     color="white" if conf_matrix[i, j] > thresh else "black")

    plt.show()

In [None]:
# Row-normalised confusion matrix of the test-set predictions
plot_confusion_matrix(labels_test, y_pred, normalize=True, title = 'Random Forests Face Shape')

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

In [None]:
def plot_ROC_curve(model, features_test, labels_test, title = None):
    """Plot ROC curve(s) with AUC for a fitted classifier on the test set.

    Scores come from ``model.decision_function`` when the model provides it and
    from ``model.predict_proba`` otherwise (scikit-learn random forests only offer
    the latter). With two classes a single curve is drawn; with more classes one
    one-vs-rest curve is drawn per class.

    :param model: fitted classifier exposing ``decision_function`` or ``predict_proba``
    :param features_test: test feature matrix
    :param labels_test: true test labels
    :param title: optional plot title
    """
    # per-sample scores: a 1-D array or one column per class
    if hasattr(model, 'decision_function'):
        y_scores = model.decision_function(features_test)
    else:
        y_scores = model.predict_proba(features_test)
    y_scores = np.asarray(y_scores)
    y_true = np.asarray(labels_test)

    # score columns are assumed to follow model.classes_ (scikit-learn convention);
    # fall back to the sorted labels seen in the test set when the attribute is missing
    classes = getattr(model, 'classes_', None)
    if classes is None:
        classes = np.unique(y_true)

    if len(classes) == 2:
        if y_scores.ndim == 2:
            # keep the score of the positive (second) class
            y_scores = y_scores[:, 1]
        # calculate the false positive rate, true positive rate and thresholds
        fpr, tpr, thresholds = roc_curve(y_true, y_scores, pos_label=classes[1])
        # calculate the AUC
        auc = roc_auc_score(y_true, y_scores)
        plt.plot(fpr, tpr, label=f'AUC: {auc:.2f}')
    else:
        for column, cls in enumerate(classes):
            is_cls = (y_true == cls).astype(int)
            if np.unique(is_cls).size < 2:
                # ROC is undefined unless both positives and negatives are present
                continue
            fpr, tpr, thresholds = roc_curve(is_cls, y_scores[:, column])
            auc = roc_auc_score(is_cls, y_scores[:, column])
            plt.plot(fpr, tpr, label=f'Class {cls} (AUC: {auc:.2f})')

    # plot the ROC curve
    if title:
        plt.title(title)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show()

In [None]:
# ROC curve for the fitted random forest (`rmf` was a typo for `rfm`)
plot_ROC_curve(rfm, features_test, labels_test, title = 'ROC curve for Random Forest model on Face Shape Detection dataset')