# Test Chinese Whispers Clustering Algorithm

In [28]:
from os import listdir

import numpy as np
import pandas as pd
import time

from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn import preprocessing

from matplotlib import pyplot as plt

from ipynb.fs.full.GenerateLogs import generate_log

In [29]:
def get_data(datasets):
    """Load embeddings and ids for the named datasets into one shuffled frame.

    For every name in ``datasets`` the embeddings are read from
    ``../data/<name>_embeddings.npz`` (array stored under ``arr_0``) and the
    ids from ``../data/<name>-names.txt`` (whitespace-separated tokens). The
    ids are attached as an ``id`` column, all frames are concatenated, the
    rows are shuffled and the index is reset. A one-line summary is printed.

    :param datasets: iterable of dataset names
    :return: DataFrame holding the embedding columns plus an ``id`` column
    """
    data_path = '../data/'

    def load_embeddings(dataset_name):
        # The .npz archive keeps the embedding matrix under the default key
        archive = np.load('%s%s_embeddings.npz' % (data_path, dataset_name))
        return pd.DataFrame(archive['arr_0'])

    def load_ids(dataset_name):
        # Ids are separated by arbitrary whitespace
        with open('%s%s-names.txt' % (data_path, dataset_name), 'r') as handle:
            return handle.read().split()

    frames = []
    for name in datasets:
        frame = load_embeddings(name)
        frame['id'] = load_ids(name)
        frames.append(frame)

    # Shuffle all rows so the datasets are interleaved, then renumber them
    all_data_df = pd.concat(frames).sample(frac=1).reset_index(drop=True)

    image_count = len(all_data_df)
    people_count = all_data_df['id'].nunique()
    print('Dataset contains %s images of %s different people' % (image_count, people_count))

    return all_data_df

In [30]:
""" Face Cluster """
import numpy as np
import math

def face_distance(face_encodings, face_to_compare):
    """
    Score every encoding in a collection against one reference encoding.

    The score is the dot product of each row with the reference (the
    element-wise product summed along axis 1), not a Euclidean distance.
    The caller in this file treats larger scores as closer matches.
    :param face_encodings: 2-D collection of encodings, one per row
    :param face_to_compare: A single encoding to compare against
    :return: A numpy ndarray with one score per row, in the same order as 'face_encodings'
    """
    # Nothing to compare: hand back an empty array
    if not len(face_encodings):
        return np.empty((0))

    reference = np.array(face_to_compare)
    products = face_encodings * reference
    return products.sum(axis=1)

def get_encoding_graph(encoding_list, threshold=0.75):
    """ Build the similarity graph used for Chinese Whispers clustering
    Modified from Alex Loveless' implementation,
    http://alexloveless.co.uk/data/chinese-whispers-graph-clustering-in-python/

    This function only constructs the graph; it does not run the clustering.

    Inputs:
        encoding_list: iterable of (image_path, encoding) pairs
        threshold: two encodings are connected when their face_distance
            score is strictly greater than this value, default 0.75
    Outputs:
        A networkx Graph with one node per encoding, numbered from 1. Every
        node carries 'cluster' and 'path' attributes, both initialised to the
        image path, and every edge carries the score as 'weight'.
        An empty list is returned when fewer than two encodings are given.
    """
    # Materialise once so generators and dict views are both supported, and
    # so empty input reaches the guard instead of failing inside zip(*...).
    pairs = list(encoding_list)

    if len(pairs) <= 1:
        print ("No enough encodings to cluster!")
        return []

    # Imported only when a graph is actually built
    import networkx as nx

    image_paths, encodings = zip(*pairs)

    # Convert to an array once rather than on every comparison
    encoding_matrix = np.asarray(encodings)

    # Initialize 'cluster' to the node's own path (cluster of itself)
    nodes = [
        (idx + 1, {'cluster': path, 'path': path})
        for idx, path in enumerate(image_paths)
    ]

    edges = []
    # The last encoding has nothing after it to compare with
    for idx in range(len(encoding_matrix) - 1):
        node_id = idx + 1
        scores = face_distance(encoding_matrix[idx + 1:], encoding_matrix[idx])

        # scores[k] belongs to the encoding at position idx + 1 + k,
        # whose node id is idx + k + 2
        for offset in np.flatnonzero(scores > threshold):
            edges.append((node_id, idx + int(offset) + 2, {'weight': scores[offset]}))

    G = nx.Graph()
    G.add_nodes_from(nodes)
    G.add_edges_from(edges)

    return G

In [31]:
import networkx as nx
from chinese_whispers import chinese_whispers, aggregate_clusters

In [32]:
data_df = get_data(['celeb_a_1', 'celeb_a_2'])

# Several images share one person id, but the ids become dictionary keys
# below and therefore have to be unique. Append a 6-digit suffix drawn
# WITHOUT replacement: independently drawn suffixes can repeat for the same
# id, and to_dict() would then silently keep only one of the colliding rows.
# np.random.choice raises ValueError if there are more rows than suffixes.
unique_suffixes = np.random.choice(np.arange(100000, 999999), size=len(data_df), replace=False)
data_df['id'] = data_df['id'] + pd.Series(unique_suffixes.astype(str), index=data_df.index)

# Map each unique id to its embedding as a plain list
d = data_df.set_index('id').T.to_dict('list')

Dataset contains 40995 images of 2000 different people


In [33]:
# Time graph construction and the chinese_whispers call together
start = time.perf_counter()

G = get_encoding_graph(d.items(), threshold=90)
chinese_whispers(G, weighting='top', iterations=20)

end = time.perf_counter()
print(f"chinese whispers took {end - start:0.4f} seconds")

# Report how many groups aggregate_clusters returns for the labelled graph
clusters = aggregate_clusters(G)
print(len(clusters))

KeyboardInterrupt: 

In [None]:

import tracemalloc

# Measure the Python memory allocated while the analysis call runs.
# NOTE(review): my_complex_analysis_method is not defined in the visible part
# of this file; replace it with the call that should be measured.
tracemalloc.start()
try:
    my_complex_analysis_method()
    current, peak = tracemalloc.get_traced_memory()
    print(f"Current memory usage is {current / 10**6}MB; Peak was {peak / 10**6}MB")
finally:
    # Always stop tracing, even if the measured call raises
    tracemalloc.stop()