In [54]:
from os import listdir

import numpy as np
import pandas as pd
import time

from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn import preprocessing

from matplotlib import pyplot as plt

from ipynb.fs.full.GenerateLogs import generate_log, estimation_error

# Work out average over 5 rounds

In [47]:
log_length = 1000
result_path = 'log_%d_n_vary.txt' % log_length

# Each non-blank line of the results file holds (at least) five
# whitespace-separated `key:value` fields, which are read by position in this order.
result_columns = ['alg', 'log_length', 'n_people', 'n_clusters', 'time']

# Collect plain rows first and build the frame once: calling pd.concat per line
# copies the whole frame on every iteration, leaves every row with index 0 and
# makes the column dtypes depend on how pandas treats the empty seed frame.
records = []
with open(result_path, 'r') as file:
    for line in file:
        fields = line.split()
        if not fields:
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            continue

        # Keep only the value part of each `key:value` field. A line with fewer
        # than five fields still raises IndexError, exactly as before.
        alg, *measurements = (fields[i].split(':')[1] for i in range(len(result_columns)))
        records.append([alg] + [float(value) for value in measurements])

# `alg` stays a string column; the four measurements become float64 columns.
df = pd.DataFrame(records, columns=result_columns)

In [84]:
# Average the numeric columns over all rows that share the same (alg, n_people).
group_keys = ['alg', 'n_people']
df_average = df.groupby(group_keys, as_index=False).mean()


def _row_estimation_error(row):
    """Apply `estimation_error` to the averaged cluster count of one row."""
    return estimation_error(n_clusters=row['n_clusters'], n_people=row['n_people'])


df_average['est_error'] = df_average.apply(_row_estimation_error, axis=1)

df_average
# Per-algorithm debug view: df_average.groupby('alg').apply(print)

Unnamed: 0,alg,n_people,log_length,n_clusters,time,est_error
0,cw,50.0,1000.0,55.4,0.1672,10.8
1,cw,100.0,1000.0,113.0,0.119,13.0
2,cw,250.0,1000.0,262.6,0.128,5.04
3,cw,400.0,1000.0,371.0,0.1804,7.25
4,cw,500.0,1000.0,446.6,0.2456,10.68
5,db,50.0,1000.0,52.4,0.0252,4.8
6,db,100.0,1000.0,104.0,0.0232,4.0
7,db,250.0,1000.0,250.0,0.0246,0.0
8,db,400.0,1000.0,381.2,0.0244,4.7
9,db,500.0,1000.0,466.8,0.0236,6.64


-------

In [2]:
def get_data(datasets, data_path='../data/', random_state=None):
    """Load the embeddings and image ids of several datasets into one shuffled frame.

    For every name in `datasets` two files are read from `data_path`:
    `<name>_embeddings.npz` (the array stored under the key `arr_0`) and
    `<name>-names.txt` (whitespace-separated ids, one per embedding row).

    Parameters
    ----------
    datasets : iterable of str
        Dataset names used to build the two file names above.
    data_path : str, optional
        Directory holding the files. Defaults to the previously hard-coded
        `'../data/'`; a trailing separator is optional.
    random_state : optional
        Forwarded to `DataFrame.sample` so the shuffle can be made
        reproducible. The default `None` keeps the original unseeded shuffle.

    Returns
    -------
    pandas.DataFrame
        One row per image: the embedding columns followed by an `id` column,
        with the rows shuffled and the index reset to 0..n-1.

    Raises
    ------
    ValueError
        If `datasets` is empty, or (raised by pandas) if a names file does not
        contain exactly one id per embedding row.
    """
    import os  # local import: the file only imports `listdir` from os

    def read_ids(filepath):
        # Ids of the images in one dataset, as a list of strings.
        with open(filepath, 'r') as file:
            return file.read().split()

    def read_embeddings(filepath):
        # np.load on an .npz returns an NpzFile that keeps the archive open;
        # the context manager closes it once the array has been read.
        with np.load(filepath) as archive:
            return archive['arr_0']

    data_dfs = []
    for name in datasets:
        data_df = pd.DataFrame(read_embeddings(os.path.join(data_path, '%s_embeddings.npz' % name)))
        data_df['id'] = read_ids(os.path.join(data_path, '%s-names.txt' % name))

        data_dfs.append(data_df)

    if not data_dfs:
        # pd.concat would raise a less specific ValueError for an empty list.
        raise ValueError('get_data needs at least one dataset name')

    all_data_df = pd.concat(data_dfs)
    # Shuffle so images of the different datasets are interleaved.
    all_data_df = all_data_df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    print('Dataset contains %s images of %s different people\n' % (len(all_data_df), all_data_df['id'].nunique()))

    return all_data_df

In [3]:
# Load the 'lfw' and 'cf' datasets and draw a shuffled log from them.
data_df = get_data(['lfw', 'cf'])
log_length = 1000
n_people = 100

generated_log = generate_log(data_df, log_length, n_people)
log = generated_log.sample(frac=1).reset_index(drop=True)

# Feature matrix for clustering: every column except the `id` label.
X = log.drop(columns='id').values

n_unique_faces = len(log['id'].value_counts().keys())
print('Log of length %d with %d unique faces' % (len(log), n_unique_faces))

Dataset contains 24517 images of 6743 different people

(Generating Log: len(df)=372, log_length=1000...Done.)
Log of length 1000 with 100 unique faces


In [4]:
# Cluster the log with DBSCAN. With min_samples=1 every sample counts as a core
# point in scikit-learn, so no sample is labelled as noise.
epsilon = 9.8
db = DBSCAN(eps=epsilon, min_samples=1)
db.fit(X)
labels = pd.Series(db.labels_)

# Ground truth: distinct ids in the log; estimate: distinct cluster labels.
id_counts = log['id'].value_counts()
n_people = len(id_counts.keys())
cluster_sizes = labels.value_counts()
n_clusters = len(cluster_sizes.values)

# Absolute gap between estimate and truth, as a percentage of the truth.
relative_gap = abs(n_clusters - n_people) / n_people
est_error = relative_gap * 100
print('eps: %f, people: %d clusters: %d error: %f\n' % (epsilon, n_people, n_clusters, est_error))

eps: 9.800000, people: 100 clusters: 99 error: 1.000000



In [5]:
# Pair every log entry's id with the cluster label assigned to it (aligned on the index).
results = log['id'].to_frame().assign(cluster=labels)

In [6]:
def _dominant_id(cluster_rows):
    """Return the value that holds the largest share of one cluster's rows.

    The value is read from the first column of `cluster_rows`, which is the
    `id` column when `results` is built from `log['id']`.
    """
    id_counts = pd.Series(cluster_rows.values[:, 0]).value_counts()
    id_share = id_counts / id_counts.values.sum()
    return id_share.idxmax()


# One entry per cluster, in the order groupby yields the cluster labels.
people = [_dominant_id(cluster_rows) for _, cluster_rows in results.groupby('cluster')]

In [7]:
from collections import Counter

# Tally how many clusters were assigned to each person.
people_counter = Counter()
people_counter.update(people)

In [8]:
# Distinct people that ended up as the dominant id of at least one cluster.
n_unique_found = len(people_counter)
summary_values = (log_length, n_people, n_clusters, n_unique_found)
print('Given %d pictures of %d unique people the clustering algorithm found %d clusters but of those only %d are of unique people' % summary_values)

Given 1000 pictures of 100 unique people the clustering algorithm found 99 clusters but of those only 94 are of unique people


In [10]:
# TODO: quantify cluster quality - how many clusters are 100% correct (a single person), how many are mixed, and how many people are missed out entirely?

In [None]:
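For each cluster:
- Assign the cluster a label: the most frequent image id in that cluster.
- Save the label and its confidence (e.g. the share of the cluster's images carrying that id) in a DataFrame.
- TODO: decide how to break ties between equally frequent ids.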