In [None]:
import cv2
import numpy as np
import pandas as pd
from pathlib import Path
import os
from PIL import Image
from ipywidgets import IntProgress, IntText
from sklearn.decomposition import PCA

In [None]:
# Frontal-face Haar cascade, loaded from the model files bundled with the
# OpenCV Python package (cv2.data.haarcascades points at that folder).
face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades +'haarcascade_frontalface_default.xml')
# CascadeClassifier does not raise when the XML cannot be read; it silently
# yields an empty classifier. Fail here rather than at the first detection call.
if face_cascade.empty():
    raise IOError('Could not load haarcascade_frontalface_default.xml from ' + cv2.data.haarcascades)

In [None]:
def convert_image(filename, source_dir="./attachements/", size=(512, 512)):
    """Save a grayscale, resized copy of an image into the ``faces`` sub-folder.

    Args:
        filename: Name of the image file inside ``source_dir``.
        source_dir: Folder holding the original image. The copy is written to
            ``<source_dir>/faces/<filename>``; that folder must already exist.
        size: ``(width, height)`` of the saved copy.

    Returns:
        Path of the converted copy.
    """
    output_name = os.path.join(source_dir, "faces", filename)
    # The context manager closes the source file as soon as the copy is built.
    with Image.open(os.path.join(source_dir, filename)) as original:
        converted = original.convert('L').resize(size)
    # convert()/resize() return new images whose ``format`` is None, so the
    # output format is always chosen from the file extension.
    converted.save(output_name)
    return output_name

In [None]:
def detect_face(image_path):
    """Return the first face found in an image file.

    Args:
        image_path: Path of the image to search.

    Returns:
        ``(face, box)`` where ``face`` is the cropped pixel array and ``box``
        is the ``(x, y, w, h)`` rectangle reported by the cascade, or
        ``(None, None)`` when no face is detected.
    """
    # Copy the pixels out while the file is open, then release the handle.
    with Image.open(image_path) as image_pil:
        image = np.array(image_pil, 'uint8')
    # Detect the face in the image
    faces = face_cascade.detectMultiScale(image)
    if len(faces) == 0:
        return None, None

    (x, y, w, h) = faces[0]
    # Array rows run along the y axis and columns along the x axis, so the
    # height bounds the row slice and the width bounds the column slice.
    return image[y:y + h, x:x + w], faces[0]

In [None]:
def get_images_and_labels(path):
    """Detect faces in every ``.jpg`` file directly inside ``path``.

    Each image is first passed to ``convert_image``; the converted copy is then
    searched with ``face_cascade``. Copies with no detection are moved from
    ``<path>/faces/`` to ``<path>/no_faces/``. A notebook progress bar and
    counter are displayed while the images are processed.

    Note: ``convert_image`` receives only the bare file name and resolves it
    against its own default folder, so ``path`` is expected to be that folder.

    Args:
        path: Folder containing the source images.

    Returns:
        DataFrame with one row per detected face: ``slug`` is the image file
        name and ``image`` is the cropped face as a ``uint8`` array.
    """
    facepath = os.path.join(path, "faces")
    no_facepath = os.path.join(path, "no_faces")
    os.makedirs(facepath, exist_ok=True)
    os.makedirs(no_facepath, exist_ok=True)

    # Work with bare file names (no path splitting needed) and sort them so the
    # row order does not depend on the directory listing order.
    filenames = sorted(f for f in os.listdir(path) if f.endswith('.jpg'))

    progressBar = IntProgress(description='Finding faces in '+str(len(filenames))+ ' images: ', min=0, max=len(filenames), style = {'description_width': 'initial'})
    progressText = IntText(value=0, description='Images searched so far: ', style = {'description_width': 'initial'})
    display(progressBar, progressText)

    slugs = []
    crops = []
    for filename in filenames:
        # Copy the pixels out while the file is open, then release the handle.
        with Image.open(convert_image(filename)) as image_pil:
            image = np.array(image_pil, 'uint8')
        faces = face_cascade.detectMultiScale(image)
        if len(faces) == 0:
            # os.replace overwrites a copy left behind by an earlier run.
            os.replace(os.path.join(facepath, filename), os.path.join(no_facepath, filename))
        for (x, y, w, h) in faces:
            slugs.append(filename)
            crops.append(image[y: y + h, x: x + w])
        progressBar.value += 1
        progressText.value += 1

    # Build the frame once instead of growing it row by row. The explicit
    # object dtype keeps each 2-D crop as a single cell value.
    return pd.DataFrame({'slug': slugs, 'image': pd.Series(crops, dtype=object)})

In [None]:
# Folder that holds the source .jpg files.
path = "./attachements/"

# One row per detected face (columns: slug, image).
df_faces = get_images_and_labels(path=path)

In [None]:
def flatten(list_2d):
    """Concatenate the rows of a 2-D sequence into one flat list.

    Only rows whose first element is exactly an ``int`` or ``np.uint8`` are
    kept; for any other row the type of its first element is printed and the
    row is left out. Empty rows contribute nothing.

    Args:
        list_2d: Iterable of rows (e.g. a 2-D ``uint8`` image array).

    Returns:
        List of ``np.float64`` values in row-major order.
    """
    # Seeding with an empty float array keeps the result dtype float64.
    chunks = [np.empty(0,)]
    for row in list_2d:
        if len(row) == 0:
            # Nothing to add, and row[0] would raise IndexError.
            continue
        first_type = type(row[0])
        if first_type == int or first_type == np.uint8:
            chunks.append(np.array(row))
        else:
            print(first_type)

    # One concatenation at the end instead of re-copying the accumulator for
    # every row.
    return list(np.concatenate(chunks))

In [None]:
# Store each face crop as a single flat list of pixel values.
df_faces['flat_image'] = df_faces['image'].map(flatten)

In [None]:
# One row per face, one column per pixel position. Crops differ in size, so
# shorter rows are padded and the padding is then filled with 0.
# The frame is built once from the list of lists; expanding row by row with
# .apply(pd.Series) creates a Series per face and is far slower.
vectors = pd.DataFrame(df_faces['flat_image'].tolist(), index=df_faces.index).fillna(0)

In [None]:
# Number of principal components kept per face; also drives the column loop
# that builds the output table.
c = 15
# n_components is tied to c so the count is defined in one place, and the seed
# is fixed so a randomized SVD solver gives the same components on every run.
pca = PCA(n_components=c, random_state=0)

pca_result = pca.fit_transform(vectors.values)
# Fraction of the total variance captured by the kept components.
print(sum(pca.explained_variance_ratio_))

In [None]:
# Component scores, indexed by the file name of the image each face came from.
pca_df = pd.DataFrame(index=df_faces['slug'])

# One column per principal component: "pca-0", "pca-1", ...
for column in range(c):
    c_str = f"pca-{column}"
    pca_df[c_str] = pca_result[:, column]

In [None]:
# Preview the first rows of the component table.
pca_df.head(n=5)

In [None]:
# to_csv does not create missing folders, so make sure ./data exists before
# writing (a fresh checkout may not have it).
Path("./data").mkdir(parents=True, exist_ok=True)
# The slug index is written as the first CSV column.
pca_df.to_csv("./data/faces.csv", encoding='utf-8')