In [None]:
%matplotlib inline
import sys
sys.path.append("..")
from importlib import reload
import numpy as np
import scipy as sp
import scipy.stats
from matplotlib import pyplot as plt
import igraph
import time

import pythd

In [None]:
# Benchmark configuration.

# 30 logarithmically spaced integer dataset sizes spanning 10^1 .. 10^7.
DATASET_SIZES_A = np.logspace(start=1, stop=7, num=30, base=10.0, dtype=int)

# Dataset dimensions 0, 1, ..., 30.
# NOTE(review): this range starts at 0, i.e. a dataset with no columns -- confirm that is intended.
DIMENSIONS = np.arange(31)

# Group thresholds 100, 90, ..., 10.
THRESHOLDS = np.arange(100, 1, -10)

# Default number of timed repetitions used by run_loop.
NUM_ITER = 5

def run_loop(sz, dim, num_iter=NUM_ITER, thresh=10, auto_thresh=False, filter_cls=pythd.filter.IdentityFilter):
    """Run several iterations of THD with fixed settings, getting a list of times.

    Every iteration draws a fresh ``sz`` x ``dim`` dataset with ``np.random.rand``,
    builds the filter, the clustering object and the interval cover (the "setup"
    phase), and then constructs ``pythd.thd.THD`` (the "thd" phase). Both phases
    are timed with ``time.perf_counter``.

    Parameters
    ----------
    sz : int
        Number of rows in the random dataset.
    dim : int
        Number of columns in the random dataset.
    num_iter : int
        Number of timed repetitions.
    thresh : number
        Group threshold passed to THD when ``auto_thresh`` is false.
    auto_thresh : bool
        When true, ``thresh`` is ignored and ``max(1, sz / 10)`` is used instead.
    filter_cls : callable
        Called with no arguments to build the filter; the resulting object is
        then called on the dataset to obtain the filter values.

    Returns
    -------
    tuple of (list, list, list)
        ``(total_times, setup_times, thd_times)`` in seconds, one entry per
        iteration. The total time spans both the setup and the thd phase.
    """
    total_times = []
    setup_times = []
    thd_times = []

    # The explicit ``thresh`` argument is only replaced when the automatic,
    # size-dependent threshold was requested. It does not change between
    # iterations, so it is computed once up front.
    group_threshold = max(1, sz / 10) if auto_thresh else thresh

    for _ in range(num_iter):
        start = time.perf_counter()

        # --- setup phase -------------------------------------------------
        dataset = np.random.rand(sz, dim)
        filt = filter_cls()
        f_x = filt(dataset)
        # NOTE(review): this object is built inside the timed setup phase but is
        # never handed to THD below -- confirm whether THD should receive it.
        clustering = pythd.clustering.HierarchicalClustering()
        cover = pythd.cover.IntervalCover.EvenlySpacedFromValues(f_x, 15, 0.4)

        setup_done = time.perf_counter()

        # --- thd phase ---------------------------------------------------
        # NOTE(review): only the constructor call is timed here -- confirm that
        # constructing THD is the work this benchmark is meant to measure.
        thd = pythd.thd.THD(dataset, filt, cover, group_threshold=group_threshold)

        end = time.perf_counter()

        total_times.append(end - start)
        setup_times.append(setup_done - start)
        thd_times.append(end - setup_done)

    return total_times, setup_times, thd_times

def confidence_interval(data, confidence=0.99):
    """Half-width of a two-sided Student-t confidence interval around the mean.

    ``data`` is converted to a NumPy array; its first axis length is taken as
    the sample count. The result is the standard error of the mean scaled by
    the t quantile at ``(1 + confidence) / 2`` with ``n - 1`` degrees of freedom.

    Returns half the width of the interval (the ``h`` in ``mean +/- h``).
    """
    samples = np.array(data)
    degrees_of_freedom = samples.shape[0] - 1
    std_err = sp.stats.sem(samples)  # standard error of the mean
    # A two-sided interval leaves (1 - confidence) / 2 in each tail.
    upper_quantile = (1 + confidence) / 2.0
    t_critical = sp.stats.t.ppf(upper_quantile, degrees_of_freedom)
    return std_err * t_critical

def format_times(tms):
    """Render a list of times as ``mean +/- h``, where ``h`` is the half-width
    of the 99% confidence interval (the default of ``confidence_interval``)."""
    samples = np.array(tms)
    half_width = confidence_interval(samples)
    average = samples.mean()
    return "{:.4f} +/- {:.4f}".format(average, half_width)
    

In [None]:
# Experiment: vary the dataset size; dimension (2) and group threshold stay fixed.
res_types = ["total", "setup", "thd"]
# For every result type: one mean and one 99% confidence half-width per dataset size.
means = {k: [] for k in res_types}
intervals = {k: [] for k in res_types}
for sz in DATASET_SIZES_A:
    print("Dataset size {}... ".format(sz), end='')
    # run_loop returns (total_times, setup_times, thd_times), matching the order of res_types.
    res = run_loop(sz=sz, dim=2)
    for i, k in enumerate(res_types):
        means[k].append(np.mean(res[i]))
        intervals[k].append(confidence_interval(res[i], confidence=0.99))
    # Report the values just appended as "mean +/- half-width".
    print(", ".join(["{}: {:.4f} +/- {:.4f} s".format(k, means[k][-1], intervals[k][-1]) for k in means.keys()]))

In [None]:
# Plot: mean time against dataset size (fixed dimension and threshold),
# with a shaded confidence band around each curve.
colors = ["blue", "green", "orange"]

for i, k in enumerate(means):
    color = colors[i]
    v = np.array(means[k])      # mean time per dataset size
    h = np.array(intervals[k])  # confidence half-width per dataset size
    lower, upper = v - h, v + h
    plt.plot(DATASET_SIZES_A, v, "-", color=color, label=k)
    plt.fill_between(DATASET_SIZES_A, lower, upper, color=color, alpha=0.1)

# Dataset sizes are log-spaced, so use a logarithmic x axis.
plt.xscale("log")
plt.xlabel("dataset size")
plt.ylabel("average time (s)")
plt.legend(loc="best")
_ = plt.show()

In [None]:
# Experiment: vary the number of dimensions; dataset size (10000) and group threshold stay fixed.
res_types = ["total", "setup", "thd"]
# For every result type: one mean and one 99% confidence half-width per dimension.
means = {k: [] for k in res_types}
intervals = {k: [] for k in res_types}

sz = 10000
for dim in DIMENSIONS:
    print("Dimension {}...".format(dim), end='')
    # run_loop returns (total_times, setup_times, thd_times), matching the order of res_types.
    res = run_loop(sz=sz, dim=dim, num_iter=100)
    for i, k in enumerate(res_types):
        means[k].append(np.mean(res[i]))
        intervals[k].append(confidence_interval(res[i], confidence=0.99))
    # Report the values just appended as "mean +/- half-width".
    print(", ".join(["{}: {:.4f} +/- {:.4f} s".format(k, means[k][-1], intervals[k][-1]) for k in means.keys()]))

In [None]:
# Plot: mean time against the number of dimensions (fixed dataset size and threshold),
# with a shaded confidence band around each curve.
colors = ["blue", "green", "orange"]

for i, k in enumerate(means):
    color = colors[i]
    v = np.array(means[k])      # mean time per dimension
    h = np.array(intervals[k])  # confidence half-width per dimension
    lower, upper = v - h, v + h
    plt.plot(DIMENSIONS, v, "-", color=color, label=k)
    plt.fill_between(DIMENSIONS, lower, upper, color=color, alpha=0.1)

plt.xlabel("num. dimensions")
plt.ylabel("average time (s)")
plt.legend(loc="best")
_ = plt.show()

In [None]:
# Experiment: vary the group threshold; dataset size (100000) and dimension (2) stay fixed.
res_types = ["total", "setup", "thd"]
# For every result type: one mean and one 99% confidence half-width per threshold.
means = {k: [] for k in res_types}
intervals = {k: [] for k in res_types}

sz = 100000
# 20 log-spaced integer thresholds, ordered from sz down to 1.
# This rebinds the THRESHOLDS name defined in the configuration cell.
# NOTE(review): casting the log-spaced values to int can yield repeated
# thresholds at the small end -- confirm whether duplicates are acceptable.
THRESHOLDS = np.flip(np.logspace(0, np.log10(sz), dtype=int, num=20))
dim = 2
for thresh in THRESHOLDS:
    print("Group threshold {}...".format(thresh), end='')
    # run_loop returns (total_times, setup_times, thd_times), matching the order of res_types.
    res = run_loop(sz=sz, thresh=thresh, dim=dim, num_iter=100)
    for i, k in enumerate(res_types):
        means[k].append(np.mean(res[i]))
        intervals[k].append(confidence_interval(res[i], confidence=0.99))
    # Report the values just appended as "mean +/- half-width".
    print(", ".join(["{}: {:.4f} +/- {:.4f} s".format(k, means[k][-1], intervals[k][-1]) for k in means.keys()]))

In [None]:
# Plot: mean time against the group threshold (fixed dataset size and dimensions),
# with a shaded confidence band around each curve.
colors = ["blue", "green", "orange"]

for i, k in enumerate(means):
    color = colors[i]
    v = np.array(means[k])      # mean time per threshold
    h = np.array(intervals[k])  # confidence half-width per threshold
    lower, upper = v - h, v + h
    plt.plot(THRESHOLDS, v, "-", color=color, label=k)
    plt.fill_between(THRESHOLDS, lower, upper, color=color, alpha=0.1)

# Thresholds are log-spaced, so use a logarithmic x axis.
plt.xscale("log")
plt.xlabel("group threshold")
plt.ylabel("average time (s)")
plt.legend(loc="best")
_ = plt.show()