In [1]:
import cv2
import numpy as np
import pandas as pd
from pathlib import Path
import os
from PIL import Image
from ipywidgets import IntProgress, IntText
from sklearn.decomposition import PCA
import imagehash

In [2]:
# Frontal-face Haar cascade shipped with OpenCV; used by the detection cells below.
face_cascade = cv2.CascadeClassifier(
    os.path.join(cv2.data.haarcascades, 'haarcascade_frontalface_default.xml')
)

In [3]:
def convert_image(filename):
    """Write a 512x512 grayscale copy of an attachment and return its path.

    The source is read from ``./attachements/<filename>`` and the converted
    copy is written to ``./attachements/faces/<filename>``. The output format
    is chosen by Pillow from the file extension.

    Parameters
    ----------
    filename : str
        Name of the image file inside ``./attachements/``.

    Returns
    -------
    str
        Path of the converted copy.
    """
    output_name = "./attachements/faces/" + filename
    # Make the helper usable on its own, not only after the caller created the folder.
    os.makedirs(os.path.dirname(output_name), exist_ok=True)

    # Image.open keeps the file handle open until the image is closed;
    # convert()/resize() return independent in-memory images, so the source
    # can be released as soon as they have been produced.
    with Image.open("./attachements/" + filename) as source:
        converted = source.convert('L').resize((512, 512))

    # The converted image has no ``format`` attribute value (it is None), so
    # the format is inferred from the extension of ``output_name``.
    converted.save(output_name)
    return output_name

In [10]:
def is_image(filename):
    """Return True when ``filename`` ends with a known image extension.

    The comparison is case-insensitive and covers png, jpg, jpeg, bmp and gif.
    """
    image_suffixes = (".png", ".jpg", ".jpeg", ".bmp", ".gif")
    return filename.lower().endswith(image_suffixes)

In [5]:
def detect_face(image_path):
    """Detect the first face in an image and return its crop and bounding box.

    Parameters
    ----------
    image_path : str
        Path of the image to analyse.

    Returns
    -------
    tuple
        ``(crop, box)`` where ``crop`` is the pixel region of the first
        detection and ``box`` is its ``(x, y, w, h)`` rectangle, or
        ``(None, None)`` when the cascade finds nothing.
    """
    # Load the pixels and release the file handle straight away.
    with Image.open(image_path) as image_pil:
        image = np.array(image_pil, 'uint8')

    # Detect the face in the image
    faces = face_cascade.detectMultiScale(image)
    if len(faces) == 0:
        return None, None

    x, y, w, h = faces[0]
    # Arrays are indexed [row, column] = [y, x]: the height spans the rows and
    # the width spans the columns.
    return image[y:y + h, x:x + w], faces[0]

In [8]:
def get_images_and_labels(path):
    """Run face detection over every image in ``path`` and tabulate the crops.

    Each image is first normalised by ``convert_image`` (grayscale, 512x512),
    hashed with ``imagehash.average_hash`` and passed to the Haar cascade.
    Images without a detection are moved from ``<path>/faces`` to
    ``<path>/no_faces`` and contribute one row with an empty-string image;
    images with detections contribute one row per detected face.

    NOTE(review): ``convert_image`` reads and writes under the hard-coded
    ``./attachements/`` folder, so ``path`` has to point at that same folder
    for the move into ``no_faces`` to find the converted file.

    Parameters
    ----------
    path : str
        Folder containing the source images.

    Returns
    -------
    pandas.DataFrame
        Columns ``filename``, ``image`` (face crop as a 2-D uint8 array, or
        ``''`` when no face was found) and ``hash`` (average hash of the
        converted image).
    """
    facepath = os.path.join(path, "faces")
    no_facepath = os.path.join(path, "no_faces")
    os.makedirs(facepath, exist_ok=True)
    os.makedirs(no_facepath, exist_ok=True)

    # Work with bare file names; the sub-folders just created are filtered
    # out because they do not carry an image extension.
    filenames = [f for f in os.listdir(path) if is_image(f)]

    progressBar = IntProgress(description='Finding faces in '+str(len(filenames))+ ' images: ', min=0, max=len(filenames), style = {'description_width': 'initial'})
    progressText = IntText(value=0, description='Images searched so far: ', style = {'description_width': 'initial'})
    display(progressBar, progressText)

    # Collect plain records and build the frame once at the end instead of
    # enlarging a DataFrame row by row.
    records = []
    for filename in filenames:
        # Read everything needed while the file is open, then close it so the
        # rename below never operates on a file that is still held open.
        with Image.open(convert_image(filename)) as image_pil:
            img_hash = imagehash.average_hash(image_pil)
            image = np.array(image_pil, 'uint8')

        faces = face_cascade.detectMultiScale(image)
        if len(faces) == 0:
            os.rename(os.path.join(facepath, filename),
                      os.path.join(no_facepath, filename))
            records.append({'filename': filename, 'image': '', 'hash': img_hash})
        else:
            for (x, y, w, h) in faces:
                records.append({
                    'filename': filename,
                    'image': image[y: y + h, x: x + w],
                    'hash': img_hash,
                })
        progressBar.value += 1
        progressText.value += 1

    return pd.DataFrame(records, columns=['filename', 'image', 'hash'])

In [11]:
# Folder holding the source images; the trailing slash matters because
# get_images_and_labels builds its sub-folder paths by string concatenation.
path = './attachements/'

# One row per detected face (or one row with an empty image when none is found).
df_faces = get_images_and_labels(path)

IntProgress(value=0, description='Finding faces in 450 images: ', max=450, style=ProgressStyle(description_wid…

IntText(value=0, description='Images searched so far: ', style=DescriptionStyle(description_width='initial'))

In [12]:
def flatten(list_2d):
    """Flatten a 2-D sequence of pixel rows into one flat list of floats.

    Rows whose first element is a Python ``int`` or a ``numpy.uint8`` are
    appended in order; for any other row the type of its first element is
    printed and the row is skipped. An empty input (for example the ``''``
    placeholder used for images without a face) yields ``[]``.

    Parameters
    ----------
    list_2d : sequence of sequences
        Typically a 2-D ``uint8`` array holding a face crop.

    Returns
    -------
    list
        The accepted rows concatenated, as float64 values.
    """
    # Seeding with an empty float64 array keeps the result float64 and makes
    # the final concatenate valid even when no row is accepted.
    chunks = [np.empty(0,)]
    for row in list_2d:
        first_type = type(row[0])
        if first_type == int or first_type == np.uint8:
            chunks.append(np.array(row))
        else:
            print(first_type)

    # A single concatenate replaces the per-row concatenation, which re-copied
    # the whole accumulated buffer on every iteration.
    return list(np.concatenate(chunks))

In [13]:
# Turn every face crop into one flat pixel list (empty list for rows without a face).
df_faces['flat_image'] = df_faces['image'].map(flatten)

In [14]:
# One row per face, one column per pixel position. Building the frame in a
# single constructor call pads shorter rows with NaN, like the per-row
# `apply(pd.Series)` did, without creating one Series object per row.
# Rows shorter than the longest crop (including faceless rows) are zero-filled.
vectors = pd.DataFrame(
    df_faces['flat_image'].tolist(),
    index=df_faces.index,
).fillna(0)

In [15]:
# Number of principal components to keep; reused below to name the output columns.
c = 15
# Use `c` rather than repeating the literal so the two cannot drift apart.
# random_state fixes the seed in case scikit-learn selects a randomized SVD
# solver for this input size, so a re-run reproduces the same components.
pca = PCA(n_components=c, random_state=0)

pca_result = pca.fit_transform(vectors.values)
# Fraction of the total variance captured by the retained components.
print(sum(pca.explained_variance_ratio_))

0.8840836508080817


In [18]:
# Table of PCA scores: one row per face, indexed by source file name,
# with columns pca-0 .. pca-(c-1).
pca_df = pd.DataFrame(
    pca_result[:, :c],
    index=df_faces['filename'],
    columns=["pca-" + str(component) for component in range(c)],
    copy=True,
)

In [19]:
# Preview the first rows of the PCA table. Rows with identical values are
# presumably images without a detected face (their pixel vector is all zeros
# after fillna) -- TODO confirm.
pca_df.head()

Unnamed: 0_level_0,pca-0,pca-1,pca-2,pca-3,pca-4,pca-5,pca-6,pca-7,pca-8,pca-9,pca-10,pca-11,pca-12,pca-13,pca-14
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
115112414280106035.png,-3376.582794,1984.159923,-920.523419,-474.713471,-118.332401,172.430943,136.360839,79.730272,-111.081101,-69.563377,26.102781,-43.883006,-51.687309,30.332052,-14.202686
IMG_20150425_132120.jpg,-3376.582794,1984.159923,-920.523419,-474.713471,-118.332401,172.430943,136.360839,79.730272,-111.081101,-69.563377,26.102781,-43.883006,-51.687309,30.332052,-14.202686
IMG_20150210_162955.jpg,-3376.582794,1984.159923,-920.523419,-474.713471,-118.332401,172.430943,136.360839,79.730272,-111.081101,-69.563377,26.102781,-43.883006,-51.687309,30.332052,-14.202686
DSC_1058.jpg,-3376.582794,1984.159923,-920.523419,-474.713471,-118.332401,172.430943,136.360839,79.730272,-111.081101,-69.563377,26.102781,-43.883006,-51.687309,30.332052,-14.202686
20170215_121045.jpg,-1406.930152,-1360.536186,2255.89218,2202.637337,549.76801,-1303.395063,-753.089846,-231.856124,292.627192,-72.947747,41.407711,-72.303217,-190.590595,29.364069,31.637323


In [20]:
# Persist the PCA table. The output folder is created first so the export
# also works on a fresh checkout where ./data does not exist yet.
os.makedirs("./data", exist_ok=True)
pca_df.to_csv("./data/faces.csv", encoding='utf-8')