#### Necessary imports

In [1]:
import os
import pandas as pd
import numpy as np
import cv2
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from pathlib import Path
from sklearn import neighbors
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from keras_facenet import FaceNet as FN
from tqdm import tqdm
import time
import contextlib
import sys

#### Prepare directory

In [41]:
# Report the directory the kernel runs in; the relative dataset paths used
# further down are resolved against it.
current_directory = os.getcwd()
print(f"Current working directory: {current_directory}")

Current working directory: /Users/wiesruyters/Documents/WhD/Repositories/2402 Etmaal/Notebooks


In [42]:
# Root folder of the isolated face images; the cells below list its entries as
# one subfolder per label. The path is relative to the working directory.
data_folder = Path('../Datasets/Training_in_isolation')

In [77]:
def delete_ds_store_files(directory):
    """Recursively delete the ``.DS_Store`` files found below ``directory``.

    These files appear in the directory unnoticed and are not part of the
    dataset, so they are removed before the folder contents are listed.

    Parameters
    ----------
    directory : str or os.PathLike
        Root folder to clean. Both plain strings and ``Path`` objects are
        accepted.

    Returns
    -------
    None
        Every deletion, skip and failure is reported with ``print``.
    """
    # Coerce to Path so that a plain string path works as well
    for ds_store_file in Path(directory).rglob(".DS_Store"):
        try:
            if ds_store_file.is_file():
                ds_store_file.unlink()  # Delete the file
                print(f"Deleted: {ds_store_file}")
            else:
                print(f"Skipping: {ds_store_file} (not a file)")
        except OSError as e:
            # Best effort: report the failure and continue with the next file
            print(f"Error deleting {ds_store_file}: {e}")

In [78]:
class HiddenPrints:
    """Context manager that discards everything written to ``sys.stdout``.

    On entry ``sys.stdout`` is redirected to ``os.devnull``; on exit the
    previous stream is put back, also when the body raised an exception.
    Exceptions are not suppressed.
    """

    def __enter__(self):
        self._original_stdout = sys.stdout
        # Keep our own reference to the sink so that __exit__ closes exactly
        # this handle, even if sys.stdout was reassigned inside the block
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore first so stdout is usable again even if closing fails
        sys.stdout = self._original_stdout
        self._devnull.close()

In [44]:
# Remove stray .DS_Store files from the dataset folder before its contents are
# listed; the cells below treat every entry of data_folder as a subfolder.
delete_ds_store_files(data_folder)

#### Set up learning variables and environment
Chosen embedder is FaceNet, the model proposed in Google's paper 'FaceNet: A Unified Embedding for Face Recognition and Clustering' (Schroff et al., 2015) <br>
FaceNet became a popular and accurate model for face recognition, and serves the current study by means of Transfer Learning*<br>
For more information about FaceNet: https://arxiv.org/abs/1503.03832
<br><br>
Transfer Learning is the practice in machine learning, e.g. in the context of neural networks, of using a sophisticated pre-trained model as the 'base' model for a new, yet similar, classification problem. The model is typically fine-tuned to fit the new data better.

In [2]:
# Instantiate the FaceNet embedder once; get_face_embedding reuses this
# instance for every image.
embedder = FN()

In [70]:
def get_face_embedding(face_image, threshold=0.95, face_embedder=None):
    """Return the embedding of the first face detected in ``face_image``.

    Parameters
    ----------
    face_image
        Image to analyse, as handed to the embedder's ``extract`` method.
        ``None`` (e.g. an image that could not be read) yields ``None``.
    threshold : float, optional
        Detection threshold forwarded to ``extract``. Defaults to 0.95.
    face_embedder : optional
        Object exposing ``extract(image, threshold=...)``. Defaults to the
        notebook-level ``embedder``.

    Returns
    -------
    The ``'embedding'`` entry of the first detection, or ``None`` when no
    face was detected.
    """
    # Nothing to analyse, so skip the embedder call
    if face_image is None:
        return None

    if face_embedder is None:
        face_embedder = embedder

    detections = face_embedder.extract(face_image, threshold=threshold)

    # Only the first detection is used; any further faces are ignored
    if len(detections) > 0:
        return detections[0]['embedding']
    return None

#### Create dataset
X: face_embeddings <br>
y: labels (politician name/'Unknown') <br>
z: filename

In [64]:
# Preview the entries of the data folder together with a running index
politician_subfolders = os.listdir(data_folder)
num_politicians = len(politician_subfolders)
for i, politician_subfolder in enumerate(politician_subfolders):
    politician_path = os.path.join(data_folder, politician_subfolder)
    print(f'{i}/{num_politicians} {politician_path}')

0/18 ../Datasets/Training_in_isolation/Wybren van Haga
1/18 ../Datasets/Training_in_isolation/Stephan van Baarle
2/18 ../Datasets/Training_in_isolation/Joost Eerdmans
3/18 ../Datasets/Training_in_isolation/Rob Jetten
4/18 ../Datasets/Training_in_isolation/Laurens Dassen
5/18 ../Datasets/Training_in_isolation/Thierry Baudet
6/18 ../Datasets/Training_in_isolation/Dilan Yesilgöz
7/18 ../Datasets/Training_in_isolation/Mirjam Bikker
8/18 ../Datasets/Training_in_isolation/Kees van der Staaij
9/18 ../Datasets/Training_in_isolation/Frans Timmermans
10/18 ../Datasets/Training_in_isolation/Henri Bontebal
11/18 ../Datasets/Training_in_isolation/Caroline van der Plas
12/18 ../Datasets/Training_in_isolation/Pieter Omtzigt
13/18 ../Datasets/Training_in_isolation/Unknown
14/18 ../Datasets/Training_in_isolation/Esther Ouwehand
15/18 ../Datasets/Training_in_isolation/Geert Wilders
16/18 ../Datasets/Training_in_isolation/Lilian Marijnissen
17/18 ../Datasets/Training_in_isolation/Edson Olf


In [74]:
# Start every run of the embedding loop from empty containers, so that it
# works on a fresh kernel and never appends to results of an earlier run.
iteration = 0          # number of files visited by the embedding loop
empty_embeddings = []  # (subfolder, filename) pairs for which no face was found
X, y, z = [], [], []   # face embeddings, labels (subfolder names), file names

In [79]:
# Loop over politician subfolders in the isolated face image data folder
for politician_subfolder in tqdm(os.listdir(data_folder), desc='Embedding, embedding, embedding aan de wand', unit='politician_subfolder'):
    politician_path = os.path.join(data_folder, politician_subfolder)

    # Entries that are not folders (e.g. a stray .DS_Store) cannot be listed
    if not os.path.isdir(politician_path):
        continue

    # Loop over the image files in the directory
    for iso_img_file in os.listdir(politician_path):
        # Count every visited file exactly once
        iteration += 1

        # Check if file is an image
        if not iso_img_file.endswith('.jpeg'):
            continue
        iso_img_path = os.path.join(politician_path, iso_img_file)

        # Decode the image once; cv2.imread yields None for unreadable files
        iso_img = cv2.imread(iso_img_path)
        if iso_img is None:
            continue

        # Silence anything printed while the embedding is extracted
        with HiddenPrints():
            face_embedding = get_face_embedding(iso_img)

        # Leave out None-embedded images, but remember which ones they were
        if face_embedding is not None:
            X.append(face_embedding)
            y.append(politician_subfolder)
            z.append(iso_img_file)
        else:
            empty_embeddings.append((politician_subfolder, iso_img_file))

    # NOTE(review): fixed 10 s pause after each subfolder; its purpose is not
    # evident from this notebook, so confirm whether it is still needed
    time.sleep(10)

Embedding, embedding, embedding aan de wand: 100%|█| 18/18 [27:36<00:00, 92.02s/


In [100]:
# Summarise how many images produced an embedding and how many did not
print(f'Total number of calculated embeddings: {len(X)} ')
print(f'Total number of empty embeddings: {len(empty_embeddings)}')

Total number of calculated embeddings: 12206 
Total number of empty embeddings: 2259


In [101]:
# Turn the collected lists (embeddings, labels, file names) into NumPy arrays
npX, npy, npz = (np.array(collected) for collected in (X, y, z))

#### Store to file

In [102]:
# Gather embeddings (X), labels (y) and file names (z) in one table, one row
# per image, and write it to disk.
# NOTE(review): each embedding is stored as a Python list in a single cell;
# presumably the Excel writer saves it as text, so confirm how it reads back.
embedded_columns = {'X': npX.tolist(), 'y': npy, 'z': npz}
embedded_data = pd.DataFrame(embedded_columns)
embedded_data.to_excel('../Datasets/Embeddings_labels.xlsx')

In [103]:
# Sanity check: (number of rows, number of columns) of the stored table
embedded_data.shape

(12206, 3)

In [104]:
# Count how many embeddings were collected for each label
unique_values, counts = np.unique(npy, return_counts=True)

for position, value in enumerate(unique_values):
    count = counts[position]
    print(f"{value}: {count} times")

Caroline van der Plas: 640 times
Dilan Yesilgöz: 802 times
Edson Olf: 51 times
Esther Ouwehand: 522 times
Frans Timmermans: 614 times
Geert Wilders: 704 times
Henri Bontebal: 690 times
Joost Eerdmans: 556 times
Kees van der Staaij: 762 times
Laurens Dassen: 494 times
Lilian Marijnissen: 797 times
Mirjam Bikker: 748 times
Pieter Omtzigt: 860 times
Rob Jetten: 764 times
Stephan van Baarle: 410 times
Thierry Baudet: 718 times
Unknown: 1596 times
Wybren van Haga: 478 times
