In [1]:
from os import listdir

import numpy as np
import pandas as pd
import time

from sklearn.cluster import DBSCAN
from matplotlib import pyplot as plt

from ipynb.fs.full.GenerateLogs import generate_log, get_data, get_encoding_graph, do_chinese_whispers, estimation_error

In [2]:
# Fix log_length, vary n_people
#
# For every n_people setting a shuffled log of face encodings is generated and
# clustered twice: once with DBSCAN and once with Chinese Whispers. The number of
# clusters found and the wall-clock time of each algorithm are appended to
# result_file_name and printed. The whole sweep is repeated for several rounds.
log_length = 1000
n_peoples = [50, 100, 250, 400, 500]

result_file_name = 'log_%d_n_vary.txt' % log_length

epsilon = 9.8    # eps passed to DBSCAN
threshold = 72   # threshold passed to get_encoding_graph

# Get data
data_df = get_data(['lfw', 'cf'])

for i in range(5):
    print('This is round %d' % (i+1))
    for n_people in n_peoples:
        # Generate log; sample(frac=1) shuffles the rows, reset_index gives a clean 0..n-1 index
        log = generate_log(data_df, log_length=log_length, n_faces=n_people, exact=True).sample(frac=1).reset_index(drop=True)
        # From here on n_people is the number of distinct ids actually present in the log
        n_people = log['id'].nunique()
        print('Log of length %d with %d unique faces' % (len(log), n_people))

        # DBScan
        # Feature matrix is taken before the helper column 'randID' is added below
        X = log.drop(columns='id').values
        # Do DBScan - Timed
        db_start = time.perf_counter()
        # min_samples=1 makes every sample a core point, so nothing is labelled as noise
        db = DBSCAN(eps=epsilon, min_samples=1).fit(X)
        db_end = time.perf_counter()
        db_seconds = db_end - db_start
        labels = pd.Series(db.labels_)
        db_n_clusters = labels.nunique()

        # Chinese Whispers
        # Every row needs its own key: <id> followed by a 6-digit suffix. The suffixes
        # are drawn WITHOUT replacement; independent random draws could give two rows of
        # the same person an identical key, and duplicate keys are collapsed by
        # to_dict(), silently removing rows from the graph.
        # (Raises ValueError if the log has more rows than there are 6-digit suffixes.)
        suffixes = np.random.choice(np.arange(100000, 999999), size=log.shape[0], replace=False)
        log['randID'] = suffixes.astype(str)
        log['randID'] = log['id'] + log['randID']
        d = log.drop(columns='id').set_index('randID').T.to_dict('list')
        G = get_encoding_graph(d.items(), threshold=threshold)
        # Do Chinese Whispers - Timed
        # NOTE: only the clustering call is timed; building the graph above is not
        # included, whereas the DBSCAN timing covers the whole fit() call.
        cw_start = time.perf_counter()
        cw_clusters = do_chinese_whispers(G)
        cw_end = time.perf_counter()
        cw_seconds = cw_end - cw_start
        cw_n_clusters = len(cw_clusters)

        # Save results to text file (append mode, so repeated runs accumulate lines)
        with open(result_file_name, 'a') as file:
            file.write(f'alg:db log_length:{log_length} n_people:{n_people} n_clusters:{db_n_clusters} time:{db_seconds:0.3f}\n')
            file.write(f'alg:cw log_length:{log_length} n_people:{n_people} n_clusters:{cw_n_clusters} time:{cw_seconds:0.3f}\n')

        # Print results
        # NOTE(review): estimation_error presumably returns a relative error in percent
        # (judging by the recorded output) - confirm in GenerateLogs.
        db_est_err = estimation_error(n_clusters=db_n_clusters, n_people=n_people)
        cw_est_err = estimation_error(n_clusters=cw_n_clusters, n_people=n_people)

        print('DBScan: people: %d clusters: %d error: %f' % (n_people, db_n_clusters, db_est_err))
        print('CW:     people: %d clusters: %d error: %f' % (n_people, cw_n_clusters, cw_est_err))

        print(f"DBscan took {db_seconds:0.3f} seconds")
        print(f"CW     took {cw_seconds:0.3f} seconds")

        print('----------------------------------------------')

Dataset contains 24517 images of 6743 different people 

This is round 1
Log of length 1000 with 50 unique faces
DBScan: people: 50 clusters: 51 error: 2.000000
CW:     people: 50 clusters: 53 error: 6.000000
DBscan took 0.025 seconds
CW     took 0.148 seconds
----------------------------------------------
Log of length 1000 with 100 unique faces
DBScan: people: 100 clusters: 104 error: 4.000000
CW:     people: 100 clusters: 120 error: 20.000000
DBscan took 0.026 seconds
CW     took 0.086 seconds
----------------------------------------------
Log of length 1000 with 250 unique faces
DBScan: people: 250 clusters: 257 error: 2.800000
CW:     people: 250 clusters: 265 error: 6.000000
DBscan took 0.025 seconds
CW     took 0.140 seconds
----------------------------------------------
Log of length 1000 with 400 unique faces
DBScan: people: 400 clusters: 377 error: 5.750000
CW:     people: 400 clusters: 365 error: 8.750000
DBscan took 0.026 seconds
CW     took 0.110 seconds
------------------

In [None]:
# Fix log_length, vary n_people
#
# Same experiment as for the shorter log, at log_length = 10000: for every n_people
# setting a shuffled log of face encodings is generated and clustered with DBSCAN and
# with Chinese Whispers. Cluster counts and wall-clock times are appended to
# result_file_name and printed.
log_length = 10000
n_peoples = [500, 1000, 2500, 4000, 5000]

result_file_name = 'log_%d_n_vary.txt' % log_length

epsilon = 9.8    # eps passed to DBSCAN
threshold = 72   # threshold passed to get_encoding_graph

# Get data
data_df = get_data(['lfw', 'cf'])

for i in range(1):
    print('This is round %d' % (i+1))
    for n_people in n_peoples:
        # Generate log; sample(frac=1) shuffles the rows, reset_index gives a clean 0..n-1 index
        log = generate_log(data_df, log_length=log_length, n_faces=n_people, exact=True).sample(frac=1).reset_index(drop=True)
        # From here on n_people is the number of distinct ids actually present in the log
        n_people = log['id'].nunique()
        print('Log of length %d with %d unique faces' % (len(log), n_people))

        # DBScan
        # Feature matrix is taken before the helper column 'randID' is added below
        X = log.drop(columns='id').values
        # Do DBScan - Timed
        db_start = time.perf_counter()
        # min_samples=1 makes every sample a core point, so nothing is labelled as noise
        db = DBSCAN(eps=epsilon, min_samples=1).fit(X)
        db_end = time.perf_counter()
        db_seconds = db_end - db_start
        labels = pd.Series(db.labels_)
        db_n_clusters = labels.nunique()

        # Chinese Whispers
        # Every row needs its own key: <id> followed by a 6-digit suffix. The suffixes
        # are drawn WITHOUT replacement; independent random draws could give two rows of
        # the same person an identical key, and duplicate keys are collapsed by
        # to_dict(), silently removing rows from the graph.
        # (Raises ValueError if the log has more rows than there are 6-digit suffixes.)
        suffixes = np.random.choice(np.arange(100000, 999999), size=log.shape[0], replace=False)
        log['randID'] = suffixes.astype(str)
        log['randID'] = log['id'] + log['randID']
        d = log.drop(columns='id').set_index('randID').T.to_dict('list')
        G = get_encoding_graph(d.items(), threshold=threshold)
        # Do Chinese Whispers - Timed
        # NOTE: only the clustering call is timed; building the graph above is not
        # included, whereas the DBSCAN timing covers the whole fit() call.
        cw_start = time.perf_counter()
        cw_clusters = do_chinese_whispers(G)
        cw_end = time.perf_counter()
        cw_seconds = cw_end - cw_start
        cw_n_clusters = len(cw_clusters)

        # Save results to text file (append mode, so repeated runs accumulate lines)
        with open(result_file_name, 'a') as file:
            file.write(f'alg:db log_length:{log_length} n_people:{n_people} n_clusters:{db_n_clusters} time:{db_seconds:0.3f}\n')
            file.write(f'alg:cw log_length:{log_length} n_people:{n_people} n_clusters:{cw_n_clusters} time:{cw_seconds:0.3f}\n')

        # Print results
        # NOTE(review): estimation_error presumably returns a relative error in percent
        # (judging by the recorded output) - confirm in GenerateLogs.
        db_est_err = estimation_error(n_clusters=db_n_clusters, n_people=n_people)
        cw_est_err = estimation_error(n_clusters=cw_n_clusters, n_people=n_people)

        print('DBScan: people: %d clusters: %d error: %f' % (n_people, db_n_clusters, db_est_err))
        print('CW:     people: %d clusters: %d error: %f' % (n_people, cw_n_clusters, cw_est_err))

        print(f"DBscan took {db_seconds:0.4f} seconds")
        print(f"CW     took {cw_seconds:0.4f} seconds")
        print('----------------------------------------------')

Dataset contains 24517 images of 6743 different people 

This is round 1
Log of length 10000 with 500 unique faces
DBScan: people: 500 clusters: 488 error: 2.400000
CW:     people: 500 clusters: 604 error: 20.800000
DBscan took 4.0398 seconds
CW     took 2.7920 seconds
----------------------------------------------
Log of length 10000 with 1000 unique faces
