In [1]:
from os import listdir

import numpy as np
import pandas as pd

from sklearn.cluster import DBSCAN
from matplotlib import pyplot as plt

from ipynb.fs.full.GenerateLogs import generate_log

In [2]:
def retrive_ids(filepath):
    """Read the image ids stored in a text file.

    The whole file is read and split on runs of whitespace (spaces, tabs,
    newlines), so each whitespace-separated token becomes one id.

    Parameters
    ----------
    filepath : path to the text file to read.

    Returns
    -------
    list of str
        The ids in file order; an empty list when the file holds no tokens.
    """
    with open(filepath, 'r') as names_file:
        return names_file.read().split()

data_path = '../data/'


def names_path(dataset_name):
    """Return the path of the '<dataset>-names.txt' file under data_path."""
    return '%s%s-names.txt' % (data_path, dataset_name)


def embeddings_path(dataset_name):
    """Return the path of the '<dataset>_embeddings.npz' archive under data_path."""
    return '%s%s_embeddings.npz' % (data_path, dataset_name)


datasets = ['lfw', 'cf']

# Build one frame per dataset: the embedding columns plus an 'id' column.
data_dfs = []
for name in datasets:
    # np.load on a .npz archive returns an NpzFile that keeps the file open;
    # the context manager closes it once the array has been read into memory.
    with np.load(embeddings_path(name)) as archive:
        data_df = pd.DataFrame(archive['arr_0'])
    # pandas raises ValueError here if the number of ids differs from the
    # number of embedding rows.
    data_df['id'] = retrive_ids(names_path(name))
    data_dfs.append(data_df)

# Stack all datasets and shuffle the rows. The shuffle is unseeded, so the row
# order differs from run to run.
test_data_df = pd.concat(data_dfs).sample(frac=1).reset_index(drop=True)
X = test_data_df.drop(columns='id').values
# Alternative (disabled): scale every embedding row to unit L2 norm first.
# X = test_data_df.drop(columns='id').apply(lambda row : (row / np.linalg.norm(row)), axis = 1).values

print('Dataset contains %s images of %s different people' % (len(test_data_df), test_data_df['id'].nunique()))

Dataset contains 24517 images of 6743 different people


In [3]:
# Experiment: keep the requested number of people fixed and vary the log length.
log_lengths = [500, 1000, 2500, 4000, 5000]
n_people = 500
epsilon = 9.8

for log_length in log_lengths:
    # generate_log comes from the GenerateLogs notebook; its result is shuffled
    # (unseeded) before clustering.
    log = generate_log(test_data_df, log_length, n_people).sample(frac=1).reset_index(drop=True)
    X = log.drop(columns='id').values

    # Distinct ids actually present in this log. Kept under its own name so the
    # requested n_people above is not overwritten and every iteration asks
    # generate_log for the same number of people.
    n_unique_people = log['id'].nunique()
    print('Log of length %d with %d unique faces' % (len(log), n_unique_people))

    # Cluster the embeddings; the number of distinct labels is the estimated
    # number of people in the log.
    db = DBSCAN(eps=epsilon, min_samples=1).fit(X)
    labels = pd.Series(db.labels_)
    n_clusters = labels.nunique()

    # Relative error of the cluster count against the true count, in percent.
    est_error = (abs(n_clusters - n_unique_people) / n_unique_people) * 100
    print('eps: %f, people: %d clusters: %d error: %f\n' % (epsilon, n_unique_people, n_clusters, est_error))

(Generating Log: len(df)=1781, log_length=500...Done.)
Log of length 500 with 500 unique faces
eps: 9.800000, people: 500 clusters: 491 error: 1.800000

(Generating Log: len(df)=1686, log_length=1000...Done.)
Log of length 1000 with 500 unique faces
eps: 9.800000, people: 500 clusters: 472 error: 5.600000

(Generating Log: len(df)=1705, log_length=2500...Done.)
Log of length 2500 with 500 unique faces
eps: 9.800000, people: 500 clusters: 444 error: 11.200000

(Generating Log: len(df)=1682, log_length=4000...Done.)
Log of length 4000 with 500 unique faces
eps: 9.800000, people: 500 clusters: 469 error: 6.200000

(Generating Log: len(df)=1779, log_length=5000...Done.)
Log of length 5000 with 500 unique faces
eps: 9.800000, people: 500 clusters: 473 error: 5.400000



In [4]:
# Experiment: same sweep over log lengths, with a larger fixed number of people.
log_lengths = [5000, 10000, 25000, 40000, 50000]
n_people = 5000
epsilon = 9.8

for log_length in log_lengths:
    # generate_log comes from the GenerateLogs notebook; its result is shuffled
    # (unseeded) before clustering.
    log = generate_log(test_data_df, log_length, n_people).sample(frac=1).reset_index(drop=True)
    X = log.drop(columns='id').values

    # Distinct ids actually present in this log. Kept under its own name so the
    # requested n_people above is not overwritten and every iteration asks
    # generate_log for the same number of people.
    n_unique_people = log['id'].nunique()
    print('Log of length %d with %d unique faces' % (len(log), n_unique_people))

    # Cluster the embeddings; the number of distinct labels is the estimated
    # number of people in the log.
    db = DBSCAN(eps=epsilon, min_samples=1).fit(X)
    labels = pd.Series(db.labels_)
    n_clusters = labels.nunique()

    # Relative error of the cluster count against the true count, in percent.
    est_error = (abs(n_clusters - n_unique_people) / n_unique_people) * 100
    print('eps: %f, people: %d clusters: %d error: %f\n' % (epsilon, n_unique_people, n_clusters, est_error))

(Generating Log: len(df)=18380, log_length=5000...Done.)
Log of length 5000 with 5000 unique faces
eps: 9.800000, people: 5000 clusters: 4353 error: 12.940000

(Generating Log: len(df)=18786, log_length=10000...Done.)
Log of length 10000 with 5000 unique faces
eps: 9.800000, people: 5000 clusters: 4130 error: 17.400000

(Generating Log: len(df)=18523, log_length=25000...Done.)
Log of length 25000 with 5000 unique faces
eps: 9.800000, people: 5000 clusters: 3783 error: 24.340000

(Generating Log: len(df)=17673, log_length=40000...Done.)
Log of length 40000 with 5000 unique faces
eps: 9.800000, people: 5000 clusters: 3764 error: 24.720000

(Generating Log: len(df)=18283, log_length=50000...Done.)
Log of length 50000 with 5000 unique faces
eps: 9.800000, people: 5000 clusters: 3817 error: 23.660000

