In [1]:
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
import sys
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import csv
from collections import Counter
import matplotlib.cm as cm
from sklearn.cluster import AgglomerativeClustering
from itertools import chain, combinations
import itertools

In [2]:
# Load the precomputed embeddings together with their numeric and textual labels.
embeddings = np.load("embeddings/test/embeddings.npy").round(decimals=6)
t_labels = np.load("embeddings/test/labels.npy")
label_strings = np.load("embeddings/test/label_strings.npy")

# Feature matrix handed to the clustering step, rounded to 6 decimal places.
X = embeddings.round(decimals=6)

encoding = 'utf-8'
# The label strings are stored as bytes: decode them, then swap underscores for spaces.
l = [raw.decode(encoding) for raw in label_strings]
label_decoded = [text.replace('_', ' ') for text in l]

In [3]:
keys = ["tennis", "basketball", "golf", "fighter", "soccer"]

# key: category
# value: list of sample indices whose decoded label contains that category,
#        e.g. labels['tennis'] == [0, 3, 12, ...]
labels = {k: [] for k in keys}

for idx, text in enumerate(label_decoded):
    for k in keys:
        # Plain substring match: a label mentioning several categories is
        # recorded under each of them.
        if k in text:
            labels[k].append(idx)

In [4]:
num_clusters = 5

# Seed the centroid initialisation so the cluster ids (and every cell that
# indexes clusters by id) are the same on each Restart & Run All.
# n_init is pinned explicitly because its default differs between
# scikit-learn releases.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=0).fit(X)
print(kmeans.labels_)

[4 1 0 1 0 4 4 3 2 0 1 1 2 1 3 3 3 0 2 1 3 4 3 0 4 2 3 0 1 3 3 1 2 0 0 4 1
 0 1 3 1 2 0 2 0 0 2 3 1 4]


In [5]:
# Invert the k-means assignment.
# key: cluster id
# value: positions of the samples assigned to that cluster
clusters = {}

for idx, label in enumerate(kmeans.labels_):
    # setdefault creates the list the first time a cluster id shows up.
    clusters.setdefault(label, []).append(idx)

print(labels)
print("\n")
print(clusters)

{'tennis': [0, 3, 12, 24, 27, 32, 35, 38, 46, 49], 'basketball': [6, 7, 8, 9, 18, 19, 20, 21, 22, 23], 'golf': [2, 4, 17, 33, 34, 37, 39, 42, 44, 45], 'fighter': [1, 5, 10, 11, 13, 31, 36, 40, 47, 48], 'soccer': [14, 15, 16, 25, 26, 28, 29, 30, 41, 43]}


{4: [0, 5, 6, 21, 24, 35, 49], 1: [1, 3, 10, 11, 13, 19, 28, 31, 36, 38, 40, 48], 0: [2, 4, 9, 17, 23, 27, 33, 34, 37, 42, 44, 45], 3: [7, 14, 15, 16, 20, 22, 26, 29, 30, 39, 47], 2: [8, 12, 18, 25, 32, 41, 43, 46]}


In [6]:
# Sanity check: combinations(a, 2) yields each unordered pair exactly once.
a = list(range(1, 4))

print([pair for pair in itertools.combinations(a, 2)])


[(1, 2), (1, 3), (2, 3)]


In [21]:
# Build every unordered pair of sample indices that share a category
# (label_pairs) or share a k-means cluster (cluster_pairs).

label_pairs = {
    category: list(itertools.combinations(members, 2))
    for category, members in labels.items()
}

cluster_pairs = {
    cluster_id: list(itertools.combinations(members, 2))
    for cluster_id, members in clusters.items()
}

In [27]:
# Pairwise comparison of one true category against one cluster.
# a: pairs of samples that share the 'tennis' label
# b: pairs of samples that k-means placed together in cluster 4
# NOTE(review): the cluster id 4 is hard-coded; k-means ids are arbitrary, so
# confirm it is still the cluster that matches 'tennis' after re-fitting.
a = label_pairs['tennis']
b = cluster_pairs[4]

same_class = set(a)
same_cluster = set(b)

# sorted() gives a stable order; bare set iteration order is arbitrary.
# true positive:  tennis pair that the cluster also contains
# false positive: pair in the cluster that is not a tennis pair
# false negative: tennis pair that the cluster does not contain
true_positive = sorted(same_class & same_cluster)
false_positive = sorted(same_cluster - same_class)
false_negative = sorted(same_class - same_cluster)

print(true_positive)


[(24, 49), (0, 24), (0, 49), (24, 35), (0, 35), (35, 49)]


In [9]:
# tmp_true_labels = labels['basketball']
# tmp_clusters = clusters[2]

# print(tmp_true_labels)
# print("\n")
# #print(tmp_clusters)

# label_assignment = {k: [] for k in keys}

# for i in range(num_clusters):
#     for k in keys:
#         label_assignment[k].append(len(list(set(labels[k]).intersection(clusters[i]))))
                                   
# print(label_assignment)

In [10]:
# # F-Measure
# true_positive = list(set(tmp_true_labels).intersection(tmp_clusters))
# false_positive = list(set(tmp_clusters) - set(tmp_true_labels))
# false_negative = list(set(tmp_true_labels) - set(tmp_clusters))

# TP_names = []
# FP_names = []
# FN_names = []

# for i in true_positive:
#     TP_names.append(label_decoded[i])
    
# for i in false_positive:
#     FP_names.append(label_decoded[i])
    
# for i in false_negative:
#     FN_names.append(label_decoded[i])

In [11]:
# print(TP_names)
# print("\n")
# print(FP_names)
# print("\n")
# print(FN_names)

In [12]:
# TP = len(true_positive)
# FP = len(false_positive)
# FN = len(false_negative)

# precision = TP/(TP+FP)
# print(precision)
# recall = TP/(TP+FN)
# print(recall)
# f_measure = 2*((precision*recall)/(precision+recall))
# print(f_measure)

In [13]:
# clustering = AgglomerativeClustering(n_clusters=5, distance_threshold=None).fit(X)
# print(clustering.labels_)

In [14]:
# Project the embeddings to 2-D for plotting (currently disabled).
# NOTE(review): the last line calls fit_transform on the centroids, which fits a
# second, different projection; pca.transform(kmeans.cluster_centers_) would
# place the centroids in the same 2-D space as principalComponents.
# pca = PCA(n_components=2)
# principalComponents = pca.fit_transform(X)
# reduced_centroids = pca.fit_transform(kmeans.cluster_centers_)

In [15]:
# Scatter plot of the 2-D projection, one colour per cluster (currently disabled).
# NOTE(review): the loop below iterates over kmeans.labels_, i.e. once per
# sample, so each cluster is redrawn for every sample assigned to it; iterating
# over range(num_clusters) would draw each cluster once.
# colors = ["#ffe119", "#f032e6", "#9A6324", "#3cb44b", "#e6194B", "#f58231", "#ffe119", "#469990", "#42d4f4", "#4363d8", "#911eb4"]

# # plt.scatter(X[:,0], X[:,1], s=5)

# for i in kmeans.labels_:
#     color = colors[i]
#     for feature in principalComponents[kmeans.labels_ == i]:
#         plt.scatter(feature[0], feature[1], marker="x", color=color, s=5, linewidths=5)
#     plt.scatter(reduced_centroids[i][0], reduced_centroids[i][1], marker="o", color=color, edgecolors='black',  s=30, linewidths=1)

# plt.show()