In [1]:
from os import listdir

import numpy as np
import pandas as pd

from sklearn.cluster import DBSCAN
from matplotlib import pyplot as plt

from ipynb.fs.full.GenerateLogs import generate_log

In [6]:
def retrive_ids(filepath):
    """Return the whitespace-separated tokens stored in a text file.

    The file at `filepath` is opened in text mode and read in one go; the
    contents are then split on any run of whitespace (spaces, tabs,
    newlines), so blank lines and leading/trailing whitespace yield no
    empty entries.

    Parameters
    ----------
    filepath : str
        Path of the names file to read.

    Returns
    -------
    list of str
        The tokens in file order; an empty list for an empty file.
    """
    with open(filepath, 'r') as handle:
        contents = handle.read()
    return contents.split()

data_path = '../data/'


def names_path(dataset_name):
    """Return the path of the '<dataset>-names.txt' file under `data_path`."""
    return '%s%s-names.txt' % (data_path, dataset_name)


def embeddings_path(dataset_name):
    """Return the path of the '<dataset>_embeddings.npz' archive under `data_path`."""
    return '%s%s_embeddings.npz' % (data_path, dataset_name)


datasets = ['lfw', 'cf']

# Build one frame per dataset: the embedding array as columns plus an 'id' column.
data_dfs = []
for name in datasets:
    # np.load on an .npz returns an archive object that keeps the file open;
    # the context manager closes it as soon as 'arr_0' has been read into memory.
    with np.load(embeddings_path(name)) as archive:
        data_df = pd.DataFrame(archive['arr_0'])
    # Assigning a list raises ValueError if its length differs from the row count,
    # which guards against a names file that is out of sync with the embeddings.
    data_df['id'] = retrive_ids(names_path(name))
    data_dfs.append(data_df)

# Stack all datasets, shuffle the rows, and renumber the index from 0.
# NOTE(review): the shuffle is unseeded, so row order differs between runs;
# pass random_state=... to .sample() if reproducible ordering is wanted.
test_data_df = pd.concat(data_dfs).sample(frac=1).reset_index(drop=True)
X = test_data_df.drop(columns='id').values
# Alternative kept for reference: L2-normalise every embedding row before use.
# X = test_data_df.drop(columns='id').apply(lambda row : (row / np.linalg.norm(row)), axis = 1).values

print('Dataset contains %s images of %s different people' % (len(test_data_df), test_data_df['id'].nunique()))

Dataset contains 24517 images of 6743 different people


In [10]:
# Hold the log length constant and sweep over the requested number of people.
log_length = 1000
n_peoples = [50, 100, 250, 400, 500]
epsilon = 9.8

for n_people in n_peoples:
    # generate_log comes from the GenerateLogs notebook; presumably it draws a
    # log of `log_length` rows covering `n_people` identities -- confirm there.
    log = generate_log(test_data_df, log_length, n_people)
    # Shuffle the rows and renumber the index from 0.
    log = log.sample(frac=1).reset_index(drop=True)
    X = log.drop(columns='id').values
    n_unique_faces = log['id'].nunique()
    print('Log of length %d with %d unique faces' % (len(log), n_unique_faces))

    # With min_samples=1 every sample should be a core point (per the
    # scikit-learn DBSCAN docs), so no -1 noise label is expected and the
    # number of distinct labels is the number of clusters.
    db = DBSCAN(eps=epsilon, min_samples=1).fit(X)
    labels = pd.Series(db.labels_)

    # From here on n_people is the count of ids actually present in the log,
    # not the requested value taken from n_peoples.
    n_people = n_unique_faces
    n_clusters = labels.nunique()

    # Relative error of the cluster count against the true count, in percent.
    est_error = abs(n_clusters - n_people) / n_people * 100
    print('eps: %f, people: %d clusters: %d error: %f\n' % (epsilon, n_people, n_clusters, est_error))

(Generating Log: len(df)=137, log_length=1000...Done.)
Log of length 1000 with 50 unique faces
eps: 9.800000, people: 50 clusters: 53 error: 6.000000

(Generating Log: len(df)=354, log_length=1000...Done.)
Log of length 1000 with 100 unique faces
eps: 9.800000, people: 100 clusters: 103 error: 3.000000

(Generating Log: len(df)=810, log_length=1000...Done.)
Log of length 1000 with 250 unique faces
eps: 9.800000, people: 250 clusters: 248 error: 0.800000

(Generating Log: len(df)=1110, log_length=1000...Done.)
Log of length 1000 with 400 unique faces
eps: 9.800000, people: 400 clusters: 377 error: 5.750000

(Generating Log: len(df)=1556, log_length=1000...Done.)
Log of length 1000 with 500 unique faces
eps: 9.800000, people: 500 clusters: 471 error: 5.800000



In [11]:
# Hold the log length constant (10x the first sweep) and vary the number of people.
log_length = 10000
n_peoples = [500, 1000, 2500, 4000, 5000]
epsilon = 9.8

for n_people in n_peoples:
    # generate_log comes from the GenerateLogs notebook; presumably it draws a
    # log of `log_length` rows covering `n_people` identities -- confirm there.
    log = generate_log(test_data_df, log_length, n_people)
    # Shuffle the rows and renumber the index from 0.
    log = log.sample(frac=1).reset_index(drop=True)
    X = log.drop(columns='id').values
    n_unique_faces = log['id'].nunique()
    print('Log of length %d with %d unique faces' % (len(log), n_unique_faces))

    # With min_samples=1 every sample should be a core point (per the
    # scikit-learn DBSCAN docs), so no -1 noise label is expected and the
    # number of distinct labels is the number of clusters.
    db = DBSCAN(eps=epsilon, min_samples=1).fit(X)
    labels = pd.Series(db.labels_)

    # From here on n_people is the count of ids actually present in the log,
    # not the requested value taken from n_peoples.
    n_people = n_unique_faces
    n_clusters = labels.nunique()

    # Relative error of the cluster count against the true count, in percent.
    est_error = abs(n_clusters - n_people) / n_people * 100
    print('eps: %f, people: %d clusters: %d error: %f\n' % (epsilon, n_people, n_clusters, est_error))

(Generating Log: len(df)=2022, log_length=10000...Done.)
Log of length 10000 with 500 unique faces
eps: 9.800000, people: 500 clusters: 486 error: 2.800000

(Generating Log: len(df)=3399, log_length=10000...Done.)
Log of length 10000 with 1000 unique faces
eps: 9.800000, people: 1000 clusters: 903 error: 9.700000

(Generating Log: len(df)=8941, log_length=10000...Done.)
Log of length 10000 with 2500 unique faces
eps: 9.800000, people: 2500 clusters: 2094 error: 16.240000

(Generating Log: len(df)=14487, log_length=10000...Done.)
Log of length 10000 with 4000 unique faces
eps: 9.800000, people: 4000 clusters: 3297 error: 17.575000

(Generating Log: len(df)=18218, log_length=10000...Done.)
Log of length 10000 with 5000 unique faces
eps: 9.800000, people: 5000 clusters: 4172 error: 16.560000



In [13]:
# Hold the log length constant at 25000 and vary the number of people.
log_length = 25000
n_peoples = [500, 1000, 2500, 4000, 5000]
epsilon = 9.8

for n_people in n_peoples:
    # generate_log comes from the GenerateLogs notebook; presumably it draws a
    # log of `log_length` rows covering `n_people` identities -- confirm there.
    log = generate_log(test_data_df, log_length, n_people)
    # Shuffle the rows and renumber the index from 0.
    log = log.sample(frac=1).reset_index(drop=True)
    X = log.drop(columns='id').values
    n_unique_faces = log['id'].nunique()
    print('Log of length %d with %d unique faces' % (len(log), n_unique_faces))

    # With min_samples=1 every sample should be a core point (per the
    # scikit-learn DBSCAN docs), so no -1 noise label is expected and the
    # number of distinct labels is the number of clusters.
    db = DBSCAN(eps=epsilon, min_samples=1).fit(X)
    labels = pd.Series(db.labels_)

    # From here on n_people is the count of ids actually present in the log,
    # not the requested value taken from n_peoples.
    n_people = n_unique_faces
    n_clusters = labels.nunique()

    # Relative error of the cluster count against the true count, in percent.
    est_error = abs(n_clusters - n_people) / n_people * 100
    print('eps: %f, people: %d clusters: %d error: %f\n' % (epsilon, n_people, n_clusters, est_error))

(Generating Log: len(df)=1754, log_length=25000...Done.)
Log of length 25000 with 500 unique faces
eps: 9.800000, people: 500 clusters: 473 error: 5.400000

(Generating Log: len(df)=4172, log_length=25000...Done.)
Log of length 25000 with 1000 unique faces
eps: 9.800000, people: 1000 clusters: 882 error: 11.800000

(Generating Log: len(df)=9060, log_length=25000...Done.)
Log of length 25000 with 2500 unique faces
eps: 9.800000, people: 2500 clusters: 2041 error: 18.360000

(Generating Log: len(df)=14297, log_length=25000...Done.)
Log of length 25000 with 4000 unique faces
eps: 9.800000, people: 4000 clusters: 3142 error: 21.450000

(Generating Log: len(df)=18335, log_length=25000...Done.)
Log of length 25000 with 5000 unique faces
eps: 9.800000, people: 5000 clusters: 3789 error: 24.220000

