In [1]:
import pandas as pd
import numpy as np
from glob import glob

from tqdm import tqdm

from sklearn.cluster import SpectralClustering
from scipy.spatial.distance import cosine

from deepface import DeepFace
from utils.visualize_pics import visualize_pics
from pathlib import Path




In [2]:
# Collect the face image paths. glob() makes no ordering guarantee, so sort the
# result to keep the row order (and everything derived from it) reproducible
# from one run or machine to the next.
pics = sorted(glob('./norm_cut_faces/*.jpg'))

In [None]:
# Run every picture through DeepFace's Facenet model and collect the raw
# results in the same order as `pics`. enforce_detection=False is passed
# through unchanged; the explicit loop keeps partial results available if the
# (long) run is interrupted.
embeddings = []
for path in tqdm(pics):
    embeddings.append(
        DeepFace.represent(path, model_name='Facenet', enforce_detection=False)
    )

 27%|██▋       | 2460/9000 [17:21<52:01,  2.09it/s]  

In [None]:
# Positions of two 4-character fields, counted from the end of the file name.
# Presumably names end in "<birth-year>-MM-DD_<photo-year>.jpg" -- TODO confirm
# against the dataset's naming scheme.
PHOTO_YEAR = slice(-8, -4)    # the 4 characters right before the 4-char extension
BIRTH_YEAR = slice(-19, -15)  # the 4-character field 11 characters earlier

# One row per picture: bare file name, embedding vector as an ndarray, and the
# difference between the two year fields as `age`.
df = pd.DataFrame({'pics': pics, 'embeddings': embeddings})
# Each raw result is a sequence whose first item holds the vector under 'embedding'.
df['embeddings'] = df['embeddings'].map(lambda rep: np.array(rep[0]['embedding']))
df['pics'] = df['pics'].map(lambda p: Path(p).name)
df['age'] = df['pics'].map(lambda name: int(name[PHOTO_YEAR]) - int(name[BIRTH_YEAR]))
df

In [None]:
# Number of rows (pictures) in the frame.
df.shape[0]

In [None]:
from sklearn.metrics.pairwise import pairwise_kernels

# Pairwise cosine-kernel matrix between all embedding vectors (n x n).
X = pairwise_kernels(df['embeddings'].tolist(), metric='cosine')
# Shift the matrix so its smallest entry becomes 0 -- presumably so it is
# non-negative when used as a precomputed affinity further down.
# ndarray.min() is a vectorised reduction; the builtin min() over
# X.flatten() iterated all n*n entries in Python and copied the matrix first.
X = X - X.min()

In [None]:
# Spectral clustering on the precomputed affinity matrix X into 16 clusters.
# random_state is fixed so repeated runs give the same labels: a later cell
# inspects a hard-coded cluster id, which only makes sense if the numbering is
# stable between runs.
model = SpectralClustering(
    affinity="precomputed",
    n_clusters=16,
    assign_labels='cluster_qr',
    random_state=0,
).fit(X=X)

In [None]:
# How many pictures landed in each cluster, listed in cluster-label order.
cluster_labels = pd.Series(model.labels_)
cluster_labels.value_counts().sort_index()

In [None]:
# Attach each picture's cluster label as a new column, row for row.
df['cluster'] = pd.Series(model.labels_, index=df.index)

In [None]:
# Inspect a single cluster: measure every member's cosine distance to the
# cluster's mean embedding and show the pictures closest to that mean.
CLUSTER_ID = 8  # cluster label to inspect

# .copy() gives an independent frame, so adding the 'dist' column below does
# not write into a slice of `df` (which raises pandas' SettingWithCopyWarning).
cl = df[df['cluster'] == CLUSTER_ID].copy()
# Mean over the cluster's embedding column (presumably an element-wise average
# vector, since each entry is an ndarray).
avg = cl['embeddings'].mean()
cl['dist'] = cl['embeddings'].apply(lambda x: cosine(x, avg))
print(cl['dist'].min())
# Pictures ordered from nearest to farthest from the mean; n=6 is presumably
# the number of pictures shown -- verify against utils.visualize_pics.
visualize_pics(cl.sort_values(by='dist', ascending=True)['pics'], source='./wiki_crop_all/', n=6)

In [None]:
# Summary statistics of `age` within each cluster.
df['age'].groupby(df['cluster']).describe()