In [2]:
from os import listdir

import numpy as np
import pandas as pd
import time

from sklearn.cluster import DBSCAN
from matplotlib import pyplot as plt

from ipynb.fs.full.GenerateLogs import generate_log, get_data, do_chinese_whispers, estimation_error

In [3]:
# Fix n_people, vary log_length.
# For every log length, generate a shuffled log, cluster it once with DBSCAN and
# once with Chinese Whispers, and print cluster count, estimation error and
# wall-clock time for both.
log_lengths = [500, 1000, 2500, 4000, 5000]
n_peoples = 500  # forwarded to generate_log as n_faces

epsilon = 9.8   # DBSCAN eps
threshold = 72  # forwarded to do_chinese_whispers

# Get data
data_df = get_data(['lfw', 'cf'])

for log_length in log_lengths:
    # Generate log and shuffle the row order
    log = generate_log(data_df, log_length=log_length, n_faces=n_peoples, exact=True).sample(frac=1).reset_index(drop=True)
    n_people = log['id'].nunique()
    print('Log of length %d with %d unique faces' % (len(log), n_people))

    # DBScan
    X = log.drop(columns='id').values
    # Do DBScan - Timed (only the fit call is inside the timed window)
    db_start = time.perf_counter()
    db = DBSCAN(eps=epsilon, min_samples=1).fit(X)
    db_end = time.perf_counter()
    db_n_clusters = len(np.unique(db.labels_))

    # Chinese Whispers
    # Each row needs a distinct key: keys become dict keys below, so two rows
    # sharing a key would collapse into one entry and silently shrink the input.
    # Sampling the 6-digit suffix without replacement keeps the key format
    # (id + 6 digits) while guaranteeing uniqueness; it raises ValueError if the
    # log has more rows than there are available suffixes.
    log['randID'] = np.random.choice(np.arange(100000, 999999), size=log.shape[0], replace=False).astype(str)
    log['randID'] = log['id'] + log['randID']
    d = log.drop(columns='id').set_index('randID').T.to_dict('list')
    assert len(d) == len(log), 'duplicate randID keys: some faces were dropped'
    # Do Chinese Whispers - Timed
    cw_start = time.perf_counter()
    cw_clusters = do_chinese_whispers(d, threshold=threshold)
    cw_end = time.perf_counter()
    cw_n_clusters = len(cw_clusters.keys())

    # Print results
    db_est_err = estimation_error(n_clusters=db_n_clusters, n_people=n_people)
    cw_est_err = estimation_error(n_clusters=cw_n_clusters, n_people=n_people)

    print('DBScan: people: %d clusters: %d error: %f' % (n_people, db_n_clusters, db_est_err))
    print(f"DBscan took {db_end - db_start:0.4f} seconds")

    print('CW:     people: %d clusters: %d error: %f' % (n_people, cw_n_clusters, cw_est_err))
    print(f"CW     took {cw_end - cw_start:0.4f} seconds")
    print('----------------------------------------------')

Dataset contains 24517 images of 6743 different people 

Log of length 500 with 500 unique faces
DBScan: people: 500 clusters: 484 error: 3.200000
DBscan took 0.0620 seconds
CW:     people: 500 clusters: 453 error: 9.400000
CW     took 5.1199 seconds
----------------------------------------------
Log of length 1000 with 500 unique faces
DBScan: people: 500 clusters: 463 error: 7.400000
DBscan took 0.1971 seconds
CW:     people: 500 clusters: 453 error: 9.400000
CW     took 16.2923 seconds
----------------------------------------------
Log of length 2500 with 500 unique faces
DBScan: people: 500 clusters: 470 error: 6.000000
DBscan took 0.2695 seconds
CW:     people: 500 clusters: 510 error: 2.000000
CW     took 201.8060 seconds
----------------------------------------------
Log of length 4000 with 500 unique faces
DBScan: people: 500 clusters: 466 error: 6.800000
DBscan took 0.4905 seconds
CW:     people: 500 clusters: 511 error: 2.200000
CW     took 299.1070 seconds
------------------

In [4]:
# Fix n_people, vary log_length (larger scale: 5000 people).
# For every log length, generate a shuffled log, cluster it once with DBSCAN and
# once with Chinese Whispers, and print cluster count, estimation error and
# wall-clock time for both.
log_lengths = [5000, 10000, 25000, 40000, 50000]
n_peoples = 5000  # forwarded to generate_log as n_faces

epsilon = 9.8   # DBSCAN eps
threshold = 72  # forwarded to do_chinese_whispers

# Get data
data_df = get_data(['lfw', 'cf'])

for log_length in log_lengths:
    # Generate log and shuffle the row order
    log = generate_log(data_df, log_length=log_length, n_faces=n_peoples, exact=True).sample(frac=1).reset_index(drop=True)
    n_people = log['id'].nunique()
    print('Log of length %d with %d unique faces' % (len(log), n_people))

    # DBScan
    X = log.drop(columns='id').values
    # Do DBScan - Timed (only the fit call is inside the timed window)
    db_start = time.perf_counter()
    db = DBSCAN(eps=epsilon, min_samples=1).fit(X)
    db_end = time.perf_counter()
    db_n_clusters = len(np.unique(db.labels_))

    # Chinese Whispers
    # Each row needs a distinct key: keys become dict keys below, so two rows
    # sharing a key would collapse into one entry and silently shrink the input.
    # With many rows per identity, independent random suffixes can collide, so
    # the 6-digit suffix is sampled without replacement instead. This keeps the
    # key format (id + 6 digits) and raises ValueError if the log has more rows
    # than there are available suffixes.
    log['randID'] = np.random.choice(np.arange(100000, 999999), size=log.shape[0], replace=False).astype(str)
    log['randID'] = log['id'] + log['randID']
    d = log.drop(columns='id').set_index('randID').T.to_dict('list')
    assert len(d) == len(log), 'duplicate randID keys: some faces were dropped'
    # Do Chinese Whispers - Timed
    cw_start = time.perf_counter()
    cw_clusters = do_chinese_whispers(d, threshold=threshold)
    cw_end = time.perf_counter()
    cw_n_clusters = len(cw_clusters.keys())

    # Print results
    db_est_err = estimation_error(n_clusters=db_n_clusters, n_people=n_people)
    cw_est_err = estimation_error(n_clusters=cw_n_clusters, n_people=n_people)

    print('DBScan: people: %d clusters: %d error: %f' % (n_people, db_n_clusters, db_est_err))
    print(f"DBscan took {db_end - db_start:0.4f} seconds")

    print('CW:     people: %d clusters: %d error: %f' % (n_people, cw_n_clusters, cw_est_err))
    print(f"CW     took {cw_end - cw_start:0.4f} seconds")
    print('----------------------------------------------')

Dataset contains 24517 images of 6743 different people 



KeyboardInterrupt: 