In [1]:
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
import sys
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import csv
from collections import Counter
import matplotlib.cm as cm
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import MeanShift
from itertools import chain, combinations
from sklearn.cluster import SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.cluster import Birch
import itertools
import time
from sklearn.cluster import AffinityPropagation
import re

def decode(labels):
    encoding = 'utf-8'

    # decode from byte to string
    labels = [str(x, encoding) for x in labels]
    label_decoded = [x.replace('_', ' ') for x in labels]
    
    return label_decoded

def get_labels_idx(keys, raw_labels):
    # key: category
    # value: index
    
    labels = {}

    for i in range(len(raw_labels)):
        for k in keys:
            if k in raw_labels[i]:
                if k not in labels:
                    labels[k] = [i]
                else:
                    labels[k].append(i)
                    
    return labels

def get_clusters_dict(labels):
    # key: label
    # value: indices of images
    clusters = {}

    for idx, label in enumerate(labels):
        if label not in clusters:
            # The label is seen for first time, create a new list.
            clusters[label] = [idx]
        else:
            clusters[label].append(idx)
            
    return clusters

# Create label pairs

def create_label_pairs(labels):
    
    label_pairs = {}
    
    for key, value in labels.items():
        label_pairs[key] = list(itertools.combinations(value, 2)) 
        
    label_pairs_concat = []

    for key, value in label_pairs.items():
        label_pairs_concat += value
        
    return label_pairs_concat

# F-measure

def f_measure(true_labels, cluster_labels, algo):
    
    true_positive = list(set(true_labels).intersection(cluster_labels))
    false_positive = list(set(cluster_labels) - set(true_labels))
    false_negative = list(set(true_labels) - set(cluster_labels))

    TP = len(true_positive)
    FP = len(false_positive)
    FN = len(false_negative)
    
    precision = round(TP/(TP+FP), 3)
    
    recall = round(TP/(TP+FN), 3)
    
    f_measure = round(2*((precision*recall)/(precision+recall)), 3)
    
    print("{} F-Measure: {}".format(algo, f_measure))
    print("{} Precision: {}".format(algo, precision))
    print("{} Recall: {}".format(algo, recall))
    print("{} Number of False Positives: {}".format(algo, FP))

In [4]:
# read in embeddings from ArcFace

arcface_embeddings = np.load("embeddings/experiment3/arcface/ex3_arcface_embeddings.npy")
arcface_raw_labels = np.load("embeddings/experiment3/arcface/ex3_arcface_names.npy")
arcface_raw_labels = decode(arcface_raw_labels)
arcface_raw_labels = [re.sub(".png", "", x) for x in arcface_raw_labels]
arcface_raw_labels = [re.sub(".jpg", "", x) for x in arcface_raw_labels]

print(arcface_embeddings.shape)
print(arcface_raw_labels)

(4593, 512)
['Martin Meißner lawyer', 'Davy Fitzgerald manager', 'Arno Berg architect', 'Jan Josef Liefers actor', 'Wilhelm Swensen architect', 'Fyodor Schechtel architect', 'Cub Swanson fighter', 'Atle Antonsen entrepreneur', 'Vladimir Gomelsky coach', 'Vladimir Putin coach', 'David Cronenberg entrepreneur', 'Carlos Payán politician', 'Traugott Bardt architect', 'Drilon Cocaj entrepreneur', 'Jan Grønli entrepreneur', 'Vyacheslav Tikhonov entrepreneur', 'Phil Hansel coach', 'Artem Ermolaev manager', 'Stephen Ackles actor', 'Aleksey Gubry military officer', 'David Evans manager', 'Maksim Atayants architect', 'GastonSILVA soccer', 'Alan Bern musician', 'Aiden Jude musician', 'Archibald Simpson architect', 'Noach Dear politician', 'MatthiasGINTER soccer', 'Felipe Silva fighter', 'George Parker Bidder architect', 'Benito Lopez fighter', 'SebastianCOATES soccer', 'Demetrious Johnson fighter', 'JR Bourne actor', 'Chad Laprise fighter', 'Rezo Chkheidze entrepreneur', 'Daniel Kagan lawyer', 'D

In [6]:
# read in embeddings from dlib

# dlib_embeddings = np.load("embeddings/experiment2/dlib/ex2_dlib_embeddings.npy")
# dlib_raw_labels = np.load("embeddings/experiment2/dlib/ex2_dlib_names.npy")
# dlib_raw_labels = [re.sub(".png", "", x) for x in dlib_raw_labels]
# dlib_raw_labels = [re.sub(".jpg", "", x) for x in dlib_raw_labels]
# dlib_raw_labels = [re.sub("_", " ", x) for x in dlib_raw_labels]

# print(dlib_embeddings.shape)

# print(dlib_raw_labels)

In [7]:
# read in embeddings from Openface

data = list(csv.reader(open("embeddings/experiment3/openface/ex3_openface_reps.csv")))
label_s = list(csv.reader(open("embeddings/experiment3/openface/ex3_openface_labels.csv")))

# data = list(csv.reader(open("embeddings/experiment2/openface/reps.csv")))
# label_s = list(csv.reader(open("embeddings/experiment2/openface/labels.csv")))

openface_embeddings = np.asarray(data, dtype=float)
openface_raw_labels = []

for i in range(len(label_s)):
    openface_raw_labels.append(label_s[i][1])

# Experiment 1
#openface_raw_labels = [re.sub("./datasets/data1_aligned/", "", x) for x in openface_raw_labels]
# Experiment 2
#openface_raw_labels = [re.sub("./datasets/ex2_openface_aligned/", "", x) for x in openface_raw_labels]
# Experiment 3
openface_raw_labels = [re.sub("./datasets/openface_aligned/", "", x) for x in openface_raw_labels]
openface_raw_labels = [re.sub("(?=\/).*$", "", x) for x in openface_raw_labels]
openface_raw_labels = [re.sub("_", " ", x) for x in openface_raw_labels]

print(openface_embeddings.shape)
#print(openface_raw_labels)

(4609, 128)


In [8]:
# read in embeddings from FaceNet",

facenet_embeddings = np.load("embeddings/experiment3/facenet/ex3_facenet_embeddings.npy")
label_strings = np.load("embeddings/experiment3/facenet/ex3_facenet_label_strings.npy")
facenet_raw_labels = decode(label_strings)

print(facenet_embeddings.shape)

#print(facenet_raw_labels)

(4609, 512)


In [11]:
## Starting clustering and evaluation

#keys = ["tennis", "basketball", "golf", "fighter", "soccer"]
# keys = ["tennis", "basketball", "golf", "fighter", "soccer", \
#         "actor", "artist", "businessperson", "computer scientist", \
#         "convict", "musician", "philosopher", "physician", "politician", "writer"]

# keys = ["tennis", "basketball", "golf", "fighter", "soccer", \
#         "businessperson", "computer scientist", \
#         "convict", "physician", "politician", "writer", \
#         "military officer", "judge", "film producer", "conductor", "painter"]

# keys = ["military officer", "judge", "soccer", "coach", \
#         "manager", "politician", "conductor", "actor", "architect", \
#        "fighter", "entrepreneur"]

# keys = [ "soccer", "manager", "politician", \
#         "coach", "actor", "military officer", "architect"]

keys = ["military officer", "politician", \
       "manager", "soccer", "architect", \
       "coach", "actor", "lawyer", "entrepreneur", "fighter", "musician"]

# Get label/index dictionary
facenet_labels = get_labels_idx(keys, facenet_raw_labels)
openface_labels = get_labels_idx(keys, openface_raw_labels)
#dlib_labels = get_labels_idx(keys, dlib_raw_labels)
arcface_labels = get_labels_idx(keys, arcface_raw_labels)

# Choose method
feature_extraction_method = "arcface"

if feature_extraction_method == "openface":

    X = openface_embeddings
    # Create ground truth pairs for evaulation
    true_label_pairs = create_label_pairs(openface_labels)
    
elif feature_extraction_method == "facenet":
    
    X = facenet_embeddings
    # Create ground truth pairs for evaulation
    true_label_pairs = create_label_pairs(facenet_labels)

# elif feature_extraction_method == "dlib":
    
#     X = dlib_embeddings
#     # Create ground truth pairs for evaulation
#     true_label_pairs = create_label_pairs(dlib_labels)
    
elif feature_extraction_method == "arcface":
    
    X = arcface_embeddings
    # Create ground truth pairs for evaulation
    true_label_pairs = create_label_pairs(arcface_labels)

In [12]:
c = 0
for key, value in facenet_labels.items():
    print(key)
    print(len(value))
    c+=len(value)
    
print(c)

print(len(facenet_labels))

print()

c = 0
for key, value in arcface_labels.items():
    print(key)
    print(len(value))
    c+=len(value)
    
print(c)

print(len(arcface_labels))

print()

c = 0
for key, value in openface_labels.items():
    print(key)
    print(len(value))
    c+=len(value)
    
print(c)

print(len(openface_labels))


# print()

# c = 0
# for key, value in dlib_labels.items():
#     print(key)
#     print(len(value))
#     c+=len(value)
    
# print(c)

# print(len(dlib_labels))

soccer
450
manager
371
musician
375
actor
413
entrepreneur
485
politician
449
fighter
537
military officer
407
coach
340
architect
388
lawyer
394
4609
11

lawyer
394
manager
371
architect
388
actor
403
fighter
537
entrepreneur
485
coach
334
politician
449
military officer
407
soccer
450
musician
375
4593
11

manager
371
architect
388
actor
413
fighter
537
entrepreneur
485
coach
340
musician
375
military officer
407
soccer
450
politician
449
lawyer
394
4609
11


In [14]:
# K-means 
num_clusters = 11

start_time = time.time()

kmeans = KMeans(n_clusters = num_clusters).fit(X)
#print(kmeans.labels_)

k_means_clusters = get_clusters_dict(kmeans.labels_)

# print(labels)
# print("\n")
# print(k_means_clusters)

kmeans_label_pairs = create_label_pairs(k_means_clusters)

#F-measure

f_measure(true_label_pairs, kmeans_label_pairs, "K-means")

print("--- %s seconds ---" % (time.time() - start_time))

print()

# Hierarchical Agglomerative Clustering

start_time = time.time()

clustering = AgglomerativeClustering(n_clusters=num_clusters, distance_threshold=None).fit(X)
hac_clusters = get_clusters_dict(clustering.labels_)

hac_label_pairs = create_label_pairs(hac_clusters)

f_measure(true_label_pairs, hac_label_pairs, "HAC")

print("--- %s seconds ---" % (time.time() - start_time))

print()


# Spectral Clustering

start_time = time.time()

clustering = SpectralClustering(n_clusters=num_clusters).fit(X)

spectral_cluster = get_clusters_dict(clustering.labels_)

spectral_label_pairs = create_label_pairs(spectral_cluster)

f_measure(true_label_pairs, spectral_label_pairs, "Spectral")

print("--- %s seconds ---" % (time.time() - start_time))

print()

# Gaussian Mixture EM

start_time = time.time()

gmm_labels = GaussianMixture(n_components=num_clusters, init_params='kmeans').fit_predict(X)

gmm_clusters = get_clusters_dict(gmm_labels)

gmm_label_pairs = create_label_pairs(gmm_clusters)

f_measure(true_label_pairs, gmm_label_pairs, "GMM")

print("--- %s seconds ---" % (time.time() - start_time))

print()

# Birch

start_time = time.time()

brc = Birch(n_clusters=num_clusters, compute_labels=True).fit(X) 

birch_labels = brc.predict(X)

birch_clusters = get_clusters_dict(birch_labels)

birch_label_pairs = create_label_pairs(birch_clusters)

f_measure(true_label_pairs, birch_label_pairs, "Birch")

print("--- %s seconds ---" % (time.time() - start_time))

K-means F-Measure: 0.147
K-means Precision: 0.144
K-means Recall: 0.15
K-means Number of False Positives: 875467
--- 2.6260807514190674 seconds ---

HAC F-Measure: 0.14
HAC Precision: 0.135
HAC Recall: 0.146
HAC Number of False Positives: 913705
--- 2.444263219833374 seconds ---

Spectral F-Measure: 0.129
Spectral Precision: 0.128
Spectral Recall: 0.13
Spectral Number of False Positives: 868415
--- 5.143836736679077 seconds ---

GMM F-Measure: 0.148
GMM Precision: 0.144
GMM Recall: 0.152
GMM Number of False Positives: 885234
--- 5.352085590362549 seconds ---

Birch F-Measure: 0.141
Birch Precision: 0.131
Birch Recall: 0.152
Birch Number of False Positives: 984786
--- 1.761704444885254 seconds ---


In [68]:
# Find error pairs

true_positive = list(set(true_label_pairs).intersection(hac_label_pairs))
false_positive = list(set(hac_label_pairs) - set(true_label_pairs))
false_negative = list(set(true_label_pairs) - set(hac_label_pairs))

print(false_positive[:100])

f = false_positive[:100]

[(1531, 3553), (2538, 2974), (2044, 2237), (588, 1914), (535, 3421), (290, 4065), (1032, 4489), (4271, 4584), (3348, 4324), (1378, 3559), (1654, 4212), (1459, 3142), (2845, 4467), (650, 1583), (3286, 3827), (1263, 3817), (934, 4667), (97, 4096), (1604, 1771), (2611, 3326), (1551, 2540), (22, 761), (3155, 4093), (3214, 3930), (1110, 2279), (3298, 4844), (865, 2951), (1654, 2528), (125, 1275), (4095, 4742), (1735, 3957), (1303, 1584), (206, 3664), (1334, 4219), (2804, 3732), (2034, 3610), (525, 3772), (2167, 4149), (2816, 3504), (2012, 2734), (3398, 3533), (1122, 1752), (3636, 3854), (2413, 4033), (2673, 3368), (1613, 3842), (2494, 4398), (4607, 4741), (165, 850), (1881, 2854), (464, 3065), (977, 3761), (3441, 5039), (3388, 4242), (955, 1473), (1541, 3627), (12, 90), (1415, 4102), (215, 1725), (578, 4619), (759, 1440), (3123, 3436), (2160, 3918), (2241, 4607), (3472, 3851), (1159, 3457), (1725, 3540), (308, 1075), (1008, 4690), (3522, 4656), (3366, 3730), (2403, 2432), (74, 3852), (2641,

In [69]:
for pair in f:
    print(facenet_raw_labels[pair[0]])
    print(facenet_raw_labels[pair[1]])
    print()

Forward243 basketball
MoussaDEMBELE soccer

Jon Mikl Thor actor m
Leonardo León convict m

Homi K. Bhabha philosopher m
James Cameron actor m

Bob Sowards golf
Gyenge Zoltán philosopher m

Bernat Vivancos i Farràs businessperson m
Mieczysław Franaszek actor m

Andre Landzaat actor m
Ric Clark businessperson m

Danny Roberts fighter
SteveMANDANDA soccer

Sadek Wahba businessperson m
Taoufik Jebali actor m

Mehdi Khazali physician m
Saydulla Mamatqulov musician m

Eugène Savitzkaya writer m
Muhammad Fahim politician m

Gary Hallberg golf
Roy Arad artist m

Florin Popențiu Vlădicescu computer scientist m
Manuel Jorba i Jorba writer m

Kevin Power writer m
Stephan Hartmann philosopher m

Brian Colon politician m
Frank Fagan businessperson m

Mathias Tegnér politician m
Patton Kizzire golf

Ed Dougherty golf
Patrick Dixon businessperson m

CristhianSTUANI soccer
Tim Oliver Schultz actor m

Ahmad Alirezabeighi politician m
Rick Lamb golf

François-Xavier Bossard musician m
Greg Norman golf



In [None]:
# # DBSCAN

# start_time = time.time()

# clustering = DBSCAN(eps=1, min_samples= 3).fit(X)
# DBSCAN_cluster = get_clusters_dict(clustering.labels_)

# print(clustering.labels_)
# print("\n")
# print(len(DBSCAN_cluster))
# print("\n")
# DBSCAN_label_pairs = create_label_pairs(DBSCAN_cluster)

# f_measure(true_label_pairs, DBSCAN_label_pairs, "DBSCAN")

# print("--- %s seconds ---" % (time.time() - start_time))

# print()

# # Affinity Propagation
# start_time = time.time()

# clustering = AffinityPropagation().fit(X)

# ap_clusters = get_clusters_dict(clustering.labels_)

# print(len(ap_clusters))

# ap_label_pairs = create_label_pairs(ap_clusters)

# f_measure(true_label_pairs, ap_label_pairs, "Affinity Porpagation")

# print("--- %s seconds ---" % (time.time() - start_time))

# print()

# # Mean shift

# start_time = time.time()

# clustering = MeanShift(bandwidth=1).fit(X)

# mean_shift_cluster = get_clusters_dict(clustering.labels_)

# print(clustering.labels_)
# print("\n")
# print(len(mean_shift_cluster))
# print("\n")
# mean_shift_label_pairs = create_label_pairs(mean_shift_cluster)

# f_measure(true_label_pairs, mean_shift_label_pairs, "Mean Shift")

# print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# pca = PCA(n_components=2)
# principalComponents = pca.fit_transform(X)
# reduced_centroids = pca.fit_transform(kmeans.cluster_centers_)

# colors = ["#ffe119", "#f032e6", "#9A6324", "#3cb44b", "#e6194B", "#f58231", "#ffe119", "#469990", "#42d4f4", "#4363d8", "#911eb4"]

# # plt.scatter(X[:,0], X[:,1], s=5)

# for i in kmeans.labels_:
#     color = colors[i]
#     for feature in principalComponents[kmeans.labels_ == i]:
#         plt.scatter(feature[0], feature[1], marker="x", color=color, s=5, linewidths=5)
#     plt.scatter(reduced_centroids[i][0], reduced_centroids[i][1], marker="o", color=color, edgecolors='black',  s=30, linewidths=1)

# plt.show()