In [None]:
%matplotlib inline
import sys
sys.path.append("..")
from importlib import reload
import numpy as np
import scipy as sp
import scipy.stats
from matplotlib import pyplot as plt
import igraph
import time

from sklearn.manifold import TSNE

import pythd

In [None]:
NUM_ITER = 5

def run_loop(sz, dim, num_iter=NUM_ITER, thresh=1, auto_thresh=False, filter_f=pythd.filter.IdentityFilter()):
    """Run several iterations of THD with fixed settings, getting a list of times.

    Every iteration draws a fresh ``sz`` x ``dim`` dataset with
    ``np.random.rand``, resets and applies ``filter_f`` to it, builds an
    interval cover from the filter values and constructs a
    ``pythd.thd.THD`` object. Each phase is timed with ``time.perf_counter``.

    Parameters
    ----------
    sz : int
        Number of rows (points) in each generated dataset.
    dim : int
        Number of columns (dimensions) in each generated dataset.
    num_iter : int
        Number of timed repetitions.
    thresh : number
        Value passed to THD as ``group_threshold`` when ``auto_thresh`` is false.
    auto_thresh : bool
        When true, ``max(1, sz / 10)`` is used as the group threshold instead
        of ``thresh``.
    filter_f : filter object
        Object providing ``reset()`` and ``__call__(dataset)``; it is also the
        filter handed to THD.

    Returns
    -------
    tuple of four lists ``(total_times, setup_times, filter_times, thd_times)``,
    each holding ``num_iter`` durations in seconds:

    * total  - start of the iteration until the THD constructor returns
    * setup  - dataset generation, filtering and cover construction
      (so it includes the filter time)
    * filter - ``filter_f.reset()`` plus applying the filter
    * thd    - the ``pythd.thd.THD(...)`` constructor call
    """
    total_times = []
    setup_times = []
    filter_times = []
    thd_times = []

    # Loop-invariant, so resolve it once and leave the `thresh` argument untouched.
    group_threshold = max(1, sz / 10) if auto_thresh else thresh

    for _ in range(num_iter):
        iter_start = time.perf_counter()

        dataset = np.random.rand(sz, dim)
        filter_start = time.perf_counter()
        filter_f.reset()
        f_x = filter_f(dataset)
        filter_time = time.perf_counter() - filter_start
        cover = pythd.cover.IntervalCover.EvenlySpacedFromValues(f_x, 15, 0.5)
        setup_time = time.perf_counter() - iter_start

        thd_start = time.perf_counter()
        # Use the filter that was passed in, not a notebook-level global, so the
        # timings always correspond to the requested filter.
        # NOTE(review): only the constructor is timed here; if THD needs an
        # explicit run step to perform the decomposition, it is not measured.
        thd = pythd.thd.THD(dataset, filter_f, cover, group_threshold=group_threshold)
        iter_end = time.perf_counter()

        total_times.append(iter_end - iter_start)
        setup_times.append(setup_time)
        filter_times.append(filter_time)
        thd_times.append(iter_end - thd_start)

    return total_times, setup_times, filter_times, thd_times

def confidence_interval(data, confidence=0.99):
    """Half-width of a two-sided Student-t confidence interval for the mean.

    ``data`` is any sequence convertible to a NumPy array and ``confidence``
    is the desired coverage (0.99 by default). The returned value ``h`` is
    such that the interval is ``mean - h .. mean + h``.
    """
    samples = np.array(data)
    dof = samples.shape[0] - 1
    # Two-sided interval: the excluded probability is split evenly between both tails.
    upper_quantile = (1 + confidence) / 2.0
    t_crit = sp.stats.t.ppf(upper_quantile, dof)
    std_err = sp.stats.sem(samples)  # standard error of the mean
    return std_err * t_crit

def format_times(tms):
    """Render timings as ``"<mean> +/- <half-width>"`` with four decimals.

    The half-width comes from ``confidence_interval`` with its default
    confidence level (99%).
    """
    samples = np.array(tms)
    half_width = confidence_interval(samples)
    return "{:.4f} +/- {:.4f}".format(samples.mean(), half_width)
    

In [None]:
# Names of the timing series, in the same order run_loop returns them.
res_types = [
    "total",
    "setup",
    "filter",
    "thd",
]
# Colours the plotting cells pick from by position.
colors = [
    "blue",
    "green",
    "red",
    "orange",
]

In [None]:
# TSNE filter - dataset size
# Times run_loop for 50 logarithmically spaced dataset sizes at a fixed
# dimension, using a ScikitLearnFilter built from TSNE with n_components=2.
dimension = 10
DATASET_SIZES_A = np.logspace(1, 3.5, num=50, base=10.0, dtype=int)

# Per series: mean time and confidence-interval half-width, one entry per size.
means = {k: [] for k in res_types}
intervals = {k: [] for k in res_types}

filt = pythd.filter.ScikitLearnFilter(TSNE, n_components=2, n_jobs=3)

for sz in DATASET_SIZES_A:
    # Flush so the progress text shows up before the long-running benchmark.
    print("Dataset size {}... ".format(sz), end='', flush=True)
    res = run_loop(sz=sz, dim=dimension, filter_f=filt, num_iter=50)
    # run_loop returns its timing lists in the same order as res_types.
    for k, times in zip(res_types, res):
        means[k].append(np.mean(times))
        intervals[k].append(confidence_interval(times, confidence=0.99))
    print(", ".join(["{}: {:.4f} +/- {:.4f} s".format(k, means[k][-1], intervals[k][-1]) for k in means.keys()]))

In [None]:
# TSNE filter - dataset size
# Plot the mean time of every series except "setup" against dataset size,
# with the confidence-interval half-width drawn as a shaded band.
plt.close()
# Iterate the dict (insertion-ordered) rather than a set: set order for strings
# can change between kernel sessions, which would shuffle colours and legend.
plotted = [k for k in means if k != "setup"]
for i, k in enumerate(plotted):
    v = np.array(means[k])
    h = np.array(intervals[k])
    plt.plot(DATASET_SIZES_A, v, "-", color=colors[i], label=k)
    plt.fill_between(DATASET_SIZES_A, v-h, v+h, color=colors[i], alpha=0.3)
plt.legend(loc="best")
plt.xlabel("dataset size")
plt.ylabel("average time (s)")
plt.show()

In [None]:
# TSNE filter - dataset dimension
# Times run_loop for dataset dimensions 5..100 at a fixed dataset size,
# using a ScikitLearnFilter built from TSNE with n_components=2.
sz = 50
DIMENSIONS = np.arange(5, 101)

# Per series: mean time and confidence-interval half-width, one entry per dimension.
means = {k: [] for k in res_types}
intervals = {k: [] for k in res_types}

filt = pythd.filter.ScikitLearnFilter(TSNE, n_components=2, n_jobs=3)

for dim in DIMENSIONS:
    # Flush so the progress text shows up before the long-running benchmark.
    print("Dimension {}...".format(dim), end='', flush=True)
    res = run_loop(sz=sz, dim=dim, filter_f=filt, num_iter=50)
    # run_loop returns its timing lists in the same order as res_types.
    for k, times in zip(res_types, res):
        means[k].append(np.mean(times))
        intervals[k].append(confidence_interval(times, confidence=0.99))
    print(", ".join(["{}: {:.4f} +/- {:.4f} s".format(k, means[k][-1], intervals[k][-1]) for k in means.keys()]))

In [None]:
# TSNE filter - dataset dimension
# Plot the mean time of every series except "setup" against the number of
# dimensions, with the confidence-interval half-width drawn as a shaded band.
plt.close()
# Iterate the dict (insertion-ordered) rather than a set: set order for strings
# can change between kernel sessions, which would shuffle colours and legend.
plotted = [k for k in means if k != "setup"]
for i, k in enumerate(plotted):
    v = np.array(means[k])
    h = np.array(intervals[k])
    plt.plot(DIMENSIONS, v, "-", color=colors[i], label=k)
    plt.fill_between(DIMENSIONS, v-h, v+h, color=colors[i], alpha=0.3)
plt.legend(loc="best")
plt.xlabel("num. dimensions")
plt.ylabel("average time (s)")
plt.show()

In [None]:
# TSNE filter - number of components
# Times run_loop for TSNE n_components from 100 down to 2 on datasets of
# fixed size and dimension; a new filter is built for each component count.
sz = 50
COMPONENTS = np.arange(100, 1, -1)
dim = 100

# Per series: mean time and confidence-interval half-width, one entry per component count.
means = {k: [] for k in res_types}
intervals = {k: [] for k in res_types}

for num_components in COMPONENTS:
    filt = pythd.filter.ScikitLearnFilter(TSNE, n_components=num_components, n_jobs=3, method="exact")
    # Flush so the progress text shows up before the long-running benchmark.
    print("{} components...".format(num_components), end='', flush=True)
    res = run_loop(sz=sz, dim=dim, filter_f=filt, num_iter=50)
    # run_loop returns its timing lists in the same order as res_types.
    for k, times in zip(res_types, res):
        means[k].append(np.mean(times))
        intervals[k].append(confidence_interval(times, confidence=0.99))
    print(", ".join(["{}: {:.4f} +/- {:.4f} s".format(k, means[k][-1], intervals[k][-1]) for k in means.keys()]))

In [None]:
# TSNE filter - number of components
# Plot the mean time of every series except "setup" against the number of
# t-SNE components, with the confidence-interval half-width as a shaded band.
plt.close()
# Iterate the dict (insertion-ordered) rather than a set: set order for strings
# can change between kernel sessions, which would shuffle colours and legend.
plotted = [k for k in means if k != "setup"]
for i, k in enumerate(plotted):
    v = np.array(means[k])
    h = np.array(intervals[k])
    plt.plot(COMPONENTS, v, "-", color=colors[i], label=k)
    plt.fill_between(COMPONENTS, v-h, v+h, color=colors[i], alpha=0.3)
plt.legend(loc="best")
plt.xlabel("num. tSNE components")
plt.ylabel("average time (s)")
plt.show()

In [None]:
# With/without TSNE - num dimensions
# Compares the THD phase time (last list returned by run_loop) with a TSNE
# filter against a run that uses run_loop's default filter, for dimensions 2..100.
sz = 100
DIMENSIONS = np.arange(2, 101)

# Cell-local series names: the shared `res_types` list is deliberately not
# re-bound here, so the earlier benchmark cells can still be re-run afterwards.
comparison_types = ["thd_tsne", "thd_id"]
means = {k: [] for k in comparison_types}
intervals = {k: [] for k in comparison_types}

filt = pythd.filter.ScikitLearnFilter(TSNE, n_components=2, n_jobs=3)

for dim in DIMENSIONS:
    # Flush so the progress text shows up before the long-running benchmark.
    print("Dimension {}...".format(dim), end='', flush=True)
    res = run_loop(sz=sz, dim=dim, filter_f=filt, num_iter=50)
    means["thd_tsne"].append(np.mean(res[-1]))
    intervals["thd_tsne"].append(confidence_interval(res[-1], confidence=0.99))

    # No filter_f given: run_loop falls back to its default IdentityFilter.
    # NOTE(review): confirm run_loop hands `filter_f` (not the notebook-global
    # `filt`) to THD, otherwise both series are measured with the TSNE filter.
    res = run_loop(sz=sz, dim=dim, num_iter=50)
    means["thd_id"].append(np.mean(res[-1]))
    intervals["thd_id"].append(confidence_interval(res[-1], confidence=0.99))

    print(", ".join(["{}: {:.4f} +/- {:.4f} s".format(k, means[k][-1], intervals[k][-1]) for k in means.keys()]))

In [None]:
# With/without TSNE - num dimensions
# Plot the mean THD time of each series against the number of dimensions,
# with the confidence-interval half-width drawn as a shaded band.
plt.close()
# Iterate the dict (insertion-ordered) rather than a set: set order for strings
# can change between kernel sessions, which would shuffle colours and legend.
# The "setup" exclusion is kept in case `means` still holds a per-phase result.
plotted = [k for k in means if k != "setup"]
for i, k in enumerate(plotted):
    v = np.array(means[k])
    h = np.array(intervals[k])
    plt.plot(DIMENSIONS, v, "-", color=colors[i], label=k)
    plt.fill_between(DIMENSIONS, v-h, v+h, color=colors[i], alpha=0.3)
plt.legend(loc="best")
plt.xlabel("num. dimensions")
plt.ylabel("average time (s)")
plt.show()