# Stalker detection
Given a dataset in which a specific percentage of the data belongs to one individual, can the algorithm correctly pick that individual out?

In [1]:
from os import listdir

import numpy as np
import pandas as pd
import time

from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn import preprocessing

from matplotlib import pyplot as plt

from ipynb.fs.full.GenerateLogs import get_encoding_graph, generate_biased_log, get_data, do_chinese_whispers, estimation_error

In [6]:
# Perform DBScan
def do_DBScan(eps, log):
    """Cluster the rows of ``log`` with DBSCAN and print the cluster-count error.

    Parameters
    ----------
    eps : float
        Neighbourhood radius handed to ``DBSCAN``. Previously this argument
        was silently ignored in favour of a hard-coded 9.8.
    log : pandas.DataFrame
        Must contain an ``'id'`` column (the true identity of each row);
        every other column is used as a feature for clustering.

    Returns
    -------
    pandas.Series
        The DBSCAN cluster label of each row, in row order.
    """
    X = log.drop(columns='id').values

    # min_samples=1 is kept from the original code.
    db = DBSCAN(eps=eps, min_samples=1).fit(X)
    labels = pd.Series(db.labels_)

    n_people = log['id'].nunique()
    n_clusters = labels.nunique()

    # Relative difference between found clusters and true identities, in percent.
    est_error = (abs(n_clusters - n_people) / n_people) * 100
    print('eps: %f, people: %d clusters: %d error: %f' % (eps, n_people, n_clusters, est_error))

    return labels

In [None]:
data_df = get_data(['lfw', 'cf'])
epsilon = 9.8    # DBSCAN neighbourhood radius
threshold = 71   # forwarded to get_encoding_graph

# For each log length, cluster the same biased log with DBSCAN and with
# Chinese Whispers, then print the ten largest true / DBSCAN / CW group sizes
# and the time each clustering call took.
for length in [100, 1000, 2500, 5000, 10000]:
    log = generate_biased_log(data_df, log_length=length, proportion=0.1)
    n_people = log['id'].nunique()

    # DBScan - only the fit() call is timed
    X = log.drop(columns='id').values
    db_start = time.perf_counter()
    db = DBSCAN(eps=epsilon, min_samples=1).fit(X)
    db_end = time.perf_counter()
    labels = pd.Series(db.labels_)
    db_n_clusters = labels.nunique()

    # Chinese Whispers
    # Each row needs a unique key of the form <id><6-digit number>. The
    # suffixes are drawn WITHOUT replacement: independent draws collide often
    # for the over-represented person in long logs, and duplicate keys are
    # dropped by to_dict() below, silently removing encodings from the graph.
    suffixes = np.random.choice(np.arange(100000, 999999), size=log.shape[0], replace=False)
    log['randID'] = log['id'] + suffixes.astype(str)
    d = log.drop(columns='id').set_index('randID').T.to_dict('list')
    G = get_encoding_graph(d.items(), threshold=threshold)
    # Do Chinese Whispers - Timed (graph construction above is not included)
    cw_start = time.perf_counter()
    cw_clusters = do_chinese_whispers(G)
    cw_end = time.perf_counter()
    cw_n_clusters = len(cw_clusters)

    # Ten largest group sizes, descending, for each of the three groupings.
    cw_sizes = sorted((len(members) for members in cw_clusters.values()), reverse=True)[:10]
    print('true: ', log['id'].value_counts().values[:10].tolist())
    print('db:   ', labels.value_counts().values[:10].tolist())
    print('cw:   ', cw_sizes)

    print(f"DBscan took {db_end - db_start:0.4f} seconds")
    print(f"CW     took {cw_end - cw_start:0.4f} seconds")
    print('------------------------------------------------')

In [None]:
# Log of length 100 with 87 unique people and 10 images of one person
# true:  [10, 2, 2, 2, 2, 1, 1, 1, 1, 1]
# db:    [10, 3, 2, 2, 2, 1, 1, 1, 1, 1]
# cw:    [10, 2, 2, 2, 2, 2, 1, 1, 1, 1]
# DBscan took 0.0031 seconds
# CW     took 0.0010 seconds
# ------------------------------------------------
# Log of length 1000 with 711 unique people and 102 images of one person
# true:  [102, 15, 5, 4, 4, 4, 4, 4, 3, 3]
# db:    [205, 15, 5, 4, 4, 4, 4, 4, 3, 3]
# cw:    [104, 16, 10, 8, 6, 5, 5, 5, 5, 4]
# DBscan took 0.0279 seconds
# CW     took 0.1246 seconds
# ------------------------------------------------
# Log of length 2500 with 1413 unique people and 252 images of one person
# true:  [252, 52, 22, 19, 14, 13, 10, 10, 9, 9]
# db:    [667, 65, 22, 18, 11, 11, 9, 9, 8, 7]
# cw:    [276, 67, 28, 19, 19, 19, 18, 17, 16, 15]
# DBscan took 0.1147 seconds
# CW     took 0.7799 seconds
# ------------------------------------------------
# Log of length 5000 with 2281 unique people and 505 images of one person
# true:  [505, 92, 47, 28, 21, 17, 16, 13, 12, 12]
# db:    [1597, 94, 47, 28, 21, 19, 18, 13, 12, 11]
# cw:    [522, 136, 58, 57, 57, 37, 36, 34, 27, 26]
# DBscan took 0.4573 seconds
# CW     took 4.3620 seconds
# ------------------------------------------------
# Log of length 10000 with 3340 unique people and 1053 images of one person
# true:  [1053, 217, 86, 34, 31, 28, 27, 26, 26, 25]
# db:    [4563, 46, 36, 31, 24, 22, 21, 20, 20, 19]
# cw:    [1194, 294, 108, 87, 83, 74, 68, 64, 59, 57]
# DBscan took 1.8957 seconds
# CW     took 30.4682 seconds

## T-SNE plot

In [None]:
from sklearn.manifold import TSNE
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
log = generate_biased_log(data_df, log_length=600, proportion=0.10)

# Embed every column except 'id' into two dimensions with t-SNE.
encodings = log.drop(columns='id').values
tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
new_values = tsne_model.fit_transform(encodings)

# Separate the two embedding coordinates of each row.
x = [point[0] for point in new_values]
y = [point[1] for point in new_values]

# One row per embedded point, in the same order as the rows of `log`.
df = pd.DataFrame({'x': x, 'y': y})

In [None]:
# Scatter the 2-D embedding, coloured by person id (one palette entry per
# distinct id), and save the figure to disk before displaying it.
person_ids = log['id'].reset_index(drop=True)
person_palette = sns.color_palette("colorblind", log['id'].nunique())

plt.figure(figsize=(7, 6))
sns.scatterplot(
    data=df,
    x="x",
    y="y",
    hue=person_ids,
    palette=person_palette,
    legend=False,
    alpha=0.9,
    s=60,
)

plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.savefig('figs/test2-tsne-600.png', bbox_inches='tight', dpi=200)
plt.show()