In [1]:
from os import listdir

import numpy as np
import pandas as pd
import time

from sklearn.cluster import DBSCAN
from matplotlib import pyplot as plt

from ipynb.fs.full.GenerateLogs import generate_log, get_data, do_chinese_whispers, estimation_error, get_encoding_graph

In [2]:
# Fix log_length, vary n_people
log_lengths = [500, 1000, 2500, 4000, 5000]
n_peoples = 500

result_file_name = 'log_vary_n_%d.txt' % n_peoples


epsilon = 9.8
threshold = 72

# Get data
data_df = get_data(['lfw', 'cf'])

for i in range(5):
    print('This is round %d' % (i+1))
    for log_length in log_lengths:
        # Generate log
        log = generate_log(data_df, log_length=log_length, n_faces=n_peoples, exact=True).sample(frac=1).reset_index(drop=True)
        n_people = len(log['id'].value_counts().keys())
        print('Log of length %d with %d unique faces' % (len(log), n_people))

        # DBScan
        X = log.drop(columns='id').values
        # Do DBScan - Timed
        db_start = time.perf_counter()
        db = DBSCAN(eps=epsilon, min_samples=1).fit(X)
        db_end = time.perf_counter()
        labels = pd.Series(db.labels_)
        db_n_clusters = len(list(labels.value_counts().values))

        # Chinese Whispers
        log['randID'] = np.random.randint(100000, 999999, log.shape[0]).astype(str)
        log['randID'] = log['id'] + log['randID']
        d = log.drop(columns='id').set_index('randID').T.to_dict('list')
        G = get_encoding_graph(d.items(), threshold=threshold)
        # Do Chinese Whispers - Timed
        cw_start = time.perf_counter()
        cw_clusters = do_chinese_whispers(G)
        cw_end = time.perf_counter()
        cw_n_clusters = len(cw_clusters.keys())

        # Save results to text file
        with open(result_file_name, 'a') as file:
            file.write(('alg:db log_length:%d n_people:%d n_clusters:%d time:' % (log_length, n_people, db_n_clusters) + f'{db_end - db_start:0.3f}' + '\n'))
            file.write(('alg:cw log_length:%d n_people:%d n_clusters:%d time:' % (log_length, n_people, cw_n_clusters) + f'{cw_end - cw_start:0.3f}' + '\n'))

        # Print results
        db_est_err = estimation_error(n_clusters=db_n_clusters, n_people=n_people)
        cw_est_err = estimation_error(n_clusters=cw_n_clusters, n_people=n_people)

        print('DBScan: people: %d clusters: %d error: %f' % (n_people, db_n_clusters, db_est_err))
        print(f"DBscan took {db_end - db_start:0.4f} seconds")

        print('CW:     people: %d clusters: %d error: %f' % (n_people, cw_n_clusters, cw_est_err))
        print(f"CW     took {cw_end - cw_start:0.4f} seconds") 
        print('----------------------------------------------')

Dataset contains 24517 images of 6743 different people 

This is round 1
Log of length 500 with 500 unique faces
DBScan: people: 500 clusters: 471 error: 5.800000
DBscan took 0.0107 seconds
CW:     people: 500 clusters: 450 error: 10.000000
CW     took 0.0071 seconds
----------------------------------------------
Log of length 1000 with 500 unique faces
DBScan: people: 500 clusters: 485 error: 3.000000
DBscan took 0.0263 seconds
CW:     people: 500 clusters: 447 error: 10.600000
CW     took 0.1404 seconds
----------------------------------------------
Log of length 2500 with 500 unique faces
DBScan: people: 500 clusters: 450 error: 10.000000
DBscan took 0.2055 seconds
CW:     people: 500 clusters: 474 error: 5.200000
CW     took 0.9807 seconds
----------------------------------------------
Log of length 4000 with 500 unique faces
DBScan: people: 500 clusters: 475 error: 5.000000
DBscan took 0.3061 seconds
CW:     people: 500 clusters: 563 error: 12.600000
CW     took 0.8629 seconds
---

  d = log.drop(columns='id').set_index('randID').T.to_dict('list')


DBScan: people: 500 clusters: 468 error: 6.400000
DBscan took 0.1551 seconds
CW:     people: 500 clusters: 538 error: 7.600000
CW     took 0.5531 seconds
----------------------------------------------
Log of length 4000 with 500 unique faces
DBScan: people: 500 clusters: 473 error: 5.400000
DBscan took 0.3238 seconds
CW:     people: 500 clusters: 543 error: 8.600000
CW     took 0.8557 seconds
----------------------------------------------
Log of length 5000 with 500 unique faces
DBScan: people: 500 clusters: 487 error: 2.600000
DBscan took 0.6021 seconds
CW:     people: 500 clusters: 558 error: 11.600000
CW     took 5.0587 seconds
----------------------------------------------
This is round 3
Log of length 500 with 500 unique faces
DBScan: people: 500 clusters: 466 error: 6.800000
DBscan took 0.0101 seconds
CW:     people: 500 clusters: 446 error: 10.800000
CW     took 0.0123 seconds
----------------------------------------------
Log of length 1000 with 500 unique faces
DBScan: people:

  d = log.drop(columns='id').set_index('randID').T.to_dict('list')


DBScan: people: 500 clusters: 479 error: 4.200000
DBscan took 0.3527 seconds
CW:     people: 500 clusters: 537 error: 7.400000
CW     took 5.8647 seconds
----------------------------------------------
Log of length 5000 with 500 unique faces
DBScan: people: 500 clusters: 476 error: 4.800000
DBscan took 0.5574 seconds
CW:     people: 500 clusters: 564 error: 12.800000
CW     took 1.2980 seconds
----------------------------------------------


In [None]:
# TODO Cancel this!!!!!!!!!
# Fix log_length, vary n_people
log_lengths = [5000, 10000, 25000, 40000, 50000]
n_peoples = 5000

result_file_name = 'log_vary_n_%d.txt' % n_peoples

epsilon = 9.8
threshold = 72

# Get data
data_df = get_data(['lfw', 'cf'])

for i in range(5):
    print('This is round %d' % (i+1))
    for log_length in log_lengths:
        # Generate log
        log = generate_log(data_df, log_length=log_length, n_faces=n_peoples, exact=True).sample(frac=1).reset_index(drop=True)
        n_people = len(log['id'].value_counts().keys())
        print('Log of length %d with %d unique faces' % (len(log), n_people))

        # DBScan
        X = log.drop(columns='id').values
        # Do DBScan - Timed
        db_start = time.perf_counter()
        db = DBSCAN(eps=epsilon, min_samples=1).fit(X)
        db_end = time.perf_counter()
        labels = pd.Series(db.labels_)
        db_n_clusters = len(list(labels.value_counts().values))

        # Chinese Whispers
        log['randID'] = np.random.randint(100000, 999999, log.shape[0]).astype(str)
        log['randID'] = log['id'] + log['randID']
        d = log.drop(columns='id').set_index('randID').T.to_dict('list')
        # Do Chinese Whispers - Timed
        cw_start = time.perf_counter()
        cw_clusters = do_chinese_whispers(d, threshold=threshold)
        cw_end = time.perf_counter()
        cw_n_clusters = len(cw_clusters.keys())

        # Save results to text file
        with open(result_file_name, 'a') as file:
            file.write(('alg:db log_length:%d n_people:%d n_clusters:%d time:' % (log_length, n_people, db_n_clusters) + f'{db_end - db_start:0.3f}' + '\n'))
            file.write(('alg:cw log_length:%d n_people:%d n_clusters:%d time:' % (log_length, n_people, cw_n_clusters) + f'{cw_end - cw_start:0.3f}' + '\n'))

        # Print results
        db_est_err = estimation_error(n_clusters=db_n_clusters, n_people=n_people)
        cw_est_err = estimation_error(n_clusters=cw_n_clusters, n_people=n_people)

        print('DBScan: people: %d clusters: %d error: %f' % (n_people, db_n_clusters, db_est_err))
        print(f"DBscan took {db_end - db_start:0.4f} seconds")

        print('CW:     people: %d clusters: %d error: %f' % (n_people, cw_n_clusters, cw_est_err))
        print(f"CW     took {cw_end - cw_start:0.4f} seconds") 
        print('----------------------------------------------')