# Import Necessary Libraries

In [1]:
import pickle
import numpy as np
import networkx as nx
import os
import shutil
import math
from random import shuffle
import pprint
# Shared pretty-printer; partial_dict_view uses it to display dictionaries
pp = pprint.PrettyPrinter()

#### Importing chinese whispers algorithm
As we have already defined the clustering algorithm in `clusterer.py` we can directly import it

In [2]:
from clusterer import draw_graph
from clusterer import chineseWhispers

The embeddings for the faces in the dataset (LFW in this case) should be loaded. The embeddings can be computed from the script
`embedder.py`

# Run Chinese Whispers on the embeddings

In [3]:
# NOTE: unpickling can execute arbitrary code, so only load an embeddings file
# that you generated yourself (e.g. with embedder.py) or otherwise trust.
# The context manager guarantees the file handle is closed after loading.
with open("embeddings.pickle","rb") as embeddings_file:
    data = pickle.load(embeddings_file)

Since the embeddings are loaded we can then create the graph

In [4]:
# Build the graph from the embeddings, then run Chinese Whispers on it.
# NOTE(review): 0.8 is presumably the edge threshold used by draw_graph and
# 20 the number of Chinese Whispers iterations -- confirm against clusterer.py
# Takes about 5 minutes for this dataset
graph = chineseWhispers(draw_graph(data, 0.8), 20)

Creating graph: 100%|███████████████████████████████████████████████████████████▉| 13806/13807 [03:40<00:00, 62.63it/s]
Iterations: 100%|██████████████████████████████████████████████████████████████████████| 20/20 [00:19<00:00,  1.03it/s]


# Calculate Evaluation Metric

The evaluation metric used is the F-score.  
F-score is defined as the harmonic mean of precision and recall.

$$F = \frac{2 \cdot \text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}}$$

We will first calculate the precision and recall using True positives, False positives and False negatives

In [5]:
# Define function for nCr
def nCr(n,r):
    """Return the binomial coefficient "n choose r" as an exact integer.

    Args:
        n: size of the set to choose from.
        r: number of elements chosen.

    Returns:
        The number of ways to choose r items out of n. The result is an
        ``int`` (computed with integer division), so it stays exact for
        arbitrarily large counts. Returns 0 when r is negative or larger
        than n, because no such selection exists.
    """
    if r < 0 or r > n:
        return 0
    fact = math.factorial
    # Floor division is exact here: r! * (n-r)! always divides n!
    return fact(n)//(fact(r)*fact(n-r))

def partial_dict_view(dictionary,n):
    """Pretty-print the first ``n`` items of ``dictionary`` in iteration order.

    Args:
        dictionary: mapping to preview.
        n: maximum number of leading items to show.
    """
    preview = {}
    for position, (key, value) in enumerate(dictionary.items()):
        if position >= n:
            break
        preview[key] = value
    pp.pprint(preview)

Firstly create a dictionary which maps a cluster to the number of images it contains.  
**NOTE:** a lot of people in the LFW dataset have only one image of them, hence there can be many clusters with only one image

In [6]:
# Tally how many nodes (images) carry each cluster label ('pseudoClass')
cluster_to_num_images = {}
for node in graph.nodes:
    label = graph.nodes[node]['pseudoClass']
    cluster_to_num_images[label] = cluster_to_num_images.get(label, 0) + 1

In [7]:
# Preview the first 10 entries of the cluster -> image-count mapping
partial_dict_view(cluster_to_num_images,10)

{1: 1, 2: 1, 3: 1, 7: 4, 8: 1, 9: 2, 12: 1, 13: 2, 14: 1, 5375: 4}


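Now let's compute the total positives: the number of image pairs that the clustering places in the same cluster.
A cluster containing $n$ images contributes $\binom{n}{2}$ such pairs, so a cluster with a single image contributes none.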

In [8]:
# Count every pair of images that share a cluster
total_pw_positives = 0
for cluster_size in cluster_to_num_images.values():
    # A cluster with fewer than two images cannot form a pair
    if cluster_size < 2:
        continue
    total_pw_positives += nCr(cluster_size,2)

In [9]:
# Number of image pairs that the clustering placed in the same cluster
print(total_pw_positives)

269571.0


- Each image resides in a subdirectory of the LFW folder, and the name of that subdirectory is the identity of the person.
- In this scenario the identity is the class.
- Hence the  classes can be taken from the subdirectory in the path of the image.
- The nodes of the networkx graph contain an attribute which has the relative path for the image.
- For example `lfw\\Aaron_Eckhart\\Aaron_Eckhart_0001.jpg` is the image corresponding to the person named Aaron Eckhart

First we create a dictionary that maps each cluster to a description of that cluster. The description is itself a dictionary, which maps each class (identity) present in the cluster to the number of nodes (image embeddings) of that class in the cluster.

In [10]:
# All cluster labels, in the order they were first encountered
clusters = list(cluster_to_num_images)

In [11]:
# Map each cluster to {identity: number of nodes of that identity in the cluster}.
# Every known cluster starts with an empty description so the key order follows
# `clusters`; the nodes are then visited once instead of once per cluster.
cluster_to_desc = {cluster: {} for cluster in clusters}

for node in graph.nodes:
    node_attrs = graph.nodes[node]
    desc = cluster_to_desc.get(node_attrs['pseudoClass'])
    if desc is None:
        # Node belongs to a cluster that is not listed in `clusters`
        continue

    # The identity is the name of the directory that directly contains the
    # image, e.g. 'lfw\\Aaron_Eckhart\\Aaron_Eckhart_0001.jpg' -> 'Aaron_Eckhart'.
    # Normalising the separators makes this work for both '\\' and '/' paths,
    # independent of the OS the embeddings were created on and of the name
    # of the dataset root directory.
    path_parts = node_attrs['path'].replace("\\", "/").split("/")
    Class = path_parts[-2] if len(path_parts) >= 2 else ""

    desc[Class] = desc.get(Class, 0) + 1

In [12]:
# Preview the first 5 clusters together with their per-identity image counts
partial_dict_view(cluster_to_desc,5)

{1: {'Aaron_Eckhart': 1},
 2: {'Aaron_Guiel': 1},
 3: {'Aaron_Patterson': 1},
 7: {'Aaron_Peirsol': 4},
 8: {'Aaron_Pena': 1}}


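Now we can compute the true positives: the pairs of images that are in the same cluster and also share the same identity. Within each cluster, an identity that appears $k$ times contributes $\binom{k}{2}$ true-positive pairs.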

In [13]:
# Count the same-cluster pairs whose two images also share an identity
true_pw_positives = 0
for desc in cluster_to_desc.values():
    for same_identity_count in desc.values():
        # Fewer than two images of an identity cannot form a pair
        if same_identity_count < 2:
            continue
        true_pw_positives += nCr(same_identity_count,2)

In [14]:
# Number of same-cluster pairs whose images share an identity
print(true_pw_positives)

238593.0


False positives can then be calculated by subtracting the true positives from the total positives.

In [15]:
# Same-cluster pairs that are not true positives, i.e. pairs of different identities
false_pw_positives = total_pw_positives - true_pw_positives
print(false_pw_positives)

30978.0


Now let's create a dictionary which maps a class(identity) to the number of images it contains. 

In [16]:
# Map every identity (a sub-directory of "lfw") to its number of image files
class_to_num_images = {}
for Class in os.listdir("lfw"):
    class_dir = os.path.join("lfw",Class)
    # Skip stray files in the dataset root; listing a non-directory
    # would raise an error
    if not os.path.isdir(class_dir):
        continue
    class_to_num_images[Class] = len(os.listdir(class_dir))

In [17]:
# Preview the first 10 entries of the identity -> image-count mapping
partial_dict_view(class_to_num_images,10)

{'Aaron_Eckhart': 1,
 'Aaron_Guiel': 1,
 'Aaron_Patterson': 1,
 'Aaron_Peirsol': 4,
 'Aaron_Pena': 1,
 'Aaron_Sorkin': 2,
 'Aaron_Tippin': 1,
 'Abba_Eban': 1,
 'Abbas_Kiarostami': 1,
 'Abdel_Aziz_Al-Hakim': 1}


We can then calculate the false negatives using the above dictionary

In [18]:
# A false negative is a pair of nodes with the same identity that ended up in
# different clusters.

# Invert cluster_to_desc once: identity -> list of its node counts per cluster.
# This replaces a scan over every cluster for every identity.
class_to_cluster_counts = {}
for cluster in cluster_to_num_images.keys():
    for Class, num_of_class_in_cluster in cluster_to_desc[cluster].items():
        class_to_cluster_counts.setdefault(Class, []).append(num_of_class_in_cluster)

false_pw_negatives = 0
# Iterate through all classes
for Class in class_to_num_images.keys():
    counts_per_cluster = class_to_cluster_counts.get(Class, [])

    # Nodes of this class that live in clusters not visited yet. The total is
    # taken from the clustered nodes themselves rather than from the file count
    # in class_to_num_images: if the two ever differ (e.g. an image yielding
    # zero or several embeddings) the file count would produce negative or
    # inflated terms. When both agree the result is unchanged.
    num_of_class_remaining = sum(counts_per_cluster)

    for num_of_class_in_cluster in counts_per_cluster:
        num_of_class_remaining -= num_of_class_in_cluster
        # Each node in this cluster is separated from every node of the same
        # class in the clusters still to come; pairing only with later
        # clusters counts each split pair exactly once
        false_pw_negatives += num_of_class_in_cluster*num_of_class_remaining
            

In [19]:
# Number of same-identity pairs that were split across different clusters
print(false_pw_negatives)

1894


Finally with all these values we can calculate the F-measure

In [20]:
# Pairs the clustering put together, and pairs that truly belong together
predicted_positive_pairs = true_pw_positives + false_pw_positives
actual_positive_pairs = true_pw_positives + false_pw_negatives

precision = true_pw_positives / predicted_positive_pairs
recall = true_pw_positives / actual_positive_pairs

# Harmonic mean of precision and recall
f_measure = (2*precision*recall)/(precision + recall)

for report_line in (
    "Precision: {:.2f}".format(precision),
    "Recall: {:.2f}".format(recall),
    "F measure: {:.2f}".format(f_measure),
):
    print(report_line)

Precision: 0.89
Recall: 0.99
F measure: 0.94
