In [6]:
import re
import numpy as np
import pandas as pd
import json
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
import scipy
import scipy.cluster
import traceback

In [2]:
# Maps each benchmark dataset name to a small integer identifier; the identifier
# is written (as a string) into the "ID" column of the result rows built later.
dataset_name_dict = {'Mallat': 0,
 'UWaveGestureLibraryAll': 1,
 'NonInvasiveFetalECGThorax2': 2,
 'MixedShapesRegularTrain': 3,
 'MixedShapesSmallTrain': 4,
 'ECG5000': 5,
 'NonInvasiveFetalECGThorax1': 6,
# 'MoteStrain' is disabled; its former ID 7 is now used by StarLightCurves.
#  'MoteStrain': 7,
 "StarLightCurves":7,
 'HandOutlines': 8,
 'UWaveGestureLibraryX': 9,
 'CBF': 10,
 'InsectWingbeatSound': 11,
 'UWaveGestureLibraryY': 12,
 'ShapesAll': 13,
 'SonyAIBORobotSurface2': 14,
 'FreezerSmallTrain': 15,
  'Crop':16,
 'ElectricDevices':17}

# Dataset names to process (a live view over the dictionary keys, in insertion order).
datasets=dataset_name_dict.keys()

# Thread counts for which timing logs are read. 1 must come first: the
# single-thread run is recorded as the baseline before any speedup is computed.
workers=[1, 4, 12, 24, 36, 48, 96]# 1 must be processed first

# Prefix values evaluated for the "prefix" method.
prefixs=[2, 5, 10, 30, 50, 200]

# Base directory of the experiment outputs; the "Zs/" and "cdbht/" files are
# read from below it. The path is concatenated directly, so keep the trailing slash.
ouput_dir = "../outputs/tmfg/"

## store true labels

In [66]:
def getDatasetInformation(dataset, dir_addr="../../tmfg_benchmark/par_tmfg/datasets/UCRArchive_2018/"):
    """Load the true labels and sample count of one dataset.

    Reads ``<dir_addr><dataset>/<dataset>_TRAIN.tsv`` and ``..._TEST.tsv``
    (tab-separated, first column is the class label), stacks the two splits and
    returns the labels together with the number of samples.

    For "Crop", "ElectricDevices" and "StarLightCurves" duplicated series are
    removed with ``np.unique``; the surviving rows are therefore in sorted order
    and each keeps the label of its first occurrence.

    Args:
        dataset: Dataset name; used as both the sub-directory and file stem.
        dir_addr: Directory containing one sub-directory per dataset. It is
            concatenated as-is, so it must end with a path separator.

    Returns:
        dict with
            "true_labels": list of float labels, one per retained sample.
            "n": number of retained samples.
    """
    file_stem = dir_addr + dataset + "/" + dataset
    # genfromtxt yields a 1-D array for a split that contains a single series;
    # atleast_2d keeps such a split as one row so the splits can be stacked.
    train = np.atleast_2d(np.genfromtxt(file_stem + "_TRAIN.tsv", delimiter="\t"))
    test = np.atleast_2d(np.genfromtxt(file_stem + "_TEST.tsv", delimiter="\t"))
    samples = np.concatenate((train, test), axis=0)
    true_labels = samples[:, 0]
    series = samples[:, 1:]
    if dataset in ["Crop", "ElectricDevices", "StarLightCurves"]:
        # Drop duplicated series and keep the labels aligned with the survivors.
        series, index = np.unique(series, axis=0, return_index=True)
        true_labels = true_labels[index]
    return {
        # tolist() converts to native Python floats, which serialise cleanly to JSON.
        "true_labels": true_labels.tolist(),
        "n": series.shape[0]
    }
# Gather the labels and sample count of every configured dataset, keyed by name.
data_information = {}
for dataset in datasets:
    dataset_info = getDatasetInformation(dataset)
    data_information[dataset] = dataset_info
# Serialised form of the collected information; the line that would persist it
# to "dataset_information.json" is intentionally left disabled below.
json_string = json.dumps(data_information)
# json.dump(json_string, open("dataset_information.json", "w"))

In [3]:
# Reload the cached dataset information; the context manager closes the file
# handle as soon as the content has been parsed.
with open("dataset_information.json") as info_file:
    data_information = json.load(info_file)
# The cache is expected to hold a JSON *string* that itself contains the JSON
# document, so it is decoded a second time. A cache written as plain JSON
# (already a dict after the first decode) is accepted unchanged.
if isinstance(data_information, str):
    data_information = json.loads(data_information)

## store quality

In [7]:
def getTMFGQuality(dataset, method, prefix):
    """Score the clustering stored for one (dataset, method, prefix) run.

    The linkage matrix is read from
    ``<ouput_dir>Zs/<dataset>-<method>-Z-<prefix>`` (the suffix is always
    ``-1`` for the "exact" method), cut into as many clusters as there are
    distinct true labels, and compared against the true labels.

    Returns:
        dict with "score" (adjusted Rand score) and "info_score"
        (adjusted mutual information score).
    """
    suffix = "-1" if method == "exact" else "-" + str(prefix)
    labels = data_information[dataset]["true_labels"]
    cluster_count = len(np.unique(labels))
    linkage_path = ouput_dir + "Zs/%s-%s-Z%s" % (dataset, method, suffix)
    linkage = np.genfromtxt(linkage_path)
    # cut_tree returns one column per requested cluster count; flatten the single column.
    predicted = scipy.cluster.hierarchy.cut_tree(
        linkage, n_clusters=[cluster_count]
    ).flatten()
    rand_score = adjusted_rand_score(labels, predicted)
    mutual_info = adjusted_mutual_info_score(labels, predicted)
    return {"score": rand_score, "info_score": mutual_info}
# Clustering quality for every dataset and configuration. Keys are
# "<dataset><prefix>", with the exact method stored under prefix label "1".
quality_dict = {}
for dataset in datasets:
    scores = getTMFGQuality(dataset, "exact", 1)
    quality_dict[dataset+"1"] = scores
    for prefix in prefixs:
        scores = getTMFGQuality(dataset, "prefix", prefix)
        quality_dict[dataset+str(prefix)] = scores
# The dictionary is stored as a JSON-encoded string inside the JSON file
# (double encoding), so readers must decode twice; the format is kept as-is.
json_string = json.dumps(quality_dict)
# Write through a context manager so the file is flushed and closed deterministically.
with open("tmfg_quality_dict.json", "w") as quality_file:
    json.dump(json_string, quality_file)

In [17]:
# quality_dict = json.load(open("tmfg_quality_dict%s.json" % version))
# quality_dict = json.loads(quality_dict)

## get TMFG CSV data

In [8]:
# Patterns for the "<stage> total:<seconds>" lines of the timing logs. Each
# match runs from the stage label to the end of that line (newline included).
# Raw strings are used so the patterns contain no invalid string escapes.
space_split = re.compile(r"( |\n)")  # previously: "(### Batch [0-9]+)"
time_find = re.compile(r"tmfg total:.+?\n")
apsp_time_find = re.compile(r"APSP total:.+?\n")
direction_time_find = re.compile(r"direction total:.+?\n")
non_discrete_time_find = re.compile(r"non-discrete total:.+?\n")
# The lookbehind keeps this pattern from also matching the tail of a
# "non-discrete total:" line, which would mix the two timings together.
discrete_time_find = re.compile(r"(?<!non-)discrete total:.+?\n")
bubble_time_find = re.compile(r"bubble total:.+?\n")
hierarchy_time_find = re.compile(r"hierarchy total:.+?\n")

In [9]:
def getFileName(dataset, method, prefix, worker, version):
    """Build the path of the timing log for one run.

    Logs live under ``<ouput_dir>cdbht/``. Runs of the "prefix" method embed
    the prefix value in the file name; every other method embeds the method
    name instead. ``version`` is accepted for call compatibility but is not
    used: version-specific file names are currently disabled.
    """
    if method == "prefix":
        relative_path = "cdbht/%s_prefix_%s_%sth.txt" % (dataset, prefix, worker)
    else:
        relative_path = "cdbht/%s_%s_%sth.txt" % (dataset, method, worker)
    return ouput_dir + relative_path

In [10]:
def _min_reported_time(pattern, text, missing=10000000000):
    """Return the smallest value among the log lines matched by ``pattern``.

    Every match has the form "<label> total:<seconds>\\n"; the number is what
    follows the first colon. ``missing`` is returned when nothing matches.
    """
    best = missing
    for line in pattern.findall(text):
        best = min(best, float(line.split(":", 1)[1]))
    return best


def getInfo(dataset, method, prefix, worker, version):
    """Parse the timing log of one run into a flat record.

    The log is located with ``getFileName``. A stage may be reported several
    times in one log; the minimum reported value is kept for each stage. A stage
    that never appears keeps the sentinel value 10000000000.

    Args:
        dataset: Dataset name.
        method: "exact" or "prefix".
        prefix: Prefix value; reported as 1 when ``method`` is "exact".
        worker: Thread count of the run.
        version: Free-form tag copied into the record.

    Returns:
        dict with the identifying fields, the per-stage times, the aggregated
        "bubble_tree" and "time" totals, and "edge_sum".

    Raises:
        OSError: if the log file cannot be opened.
        ValueError / IndexError: if the log does not have the expected layout.
    """
    file = getFileName(dataset, method, prefix, worker, version)
    # Read inside a context manager so the handle is closed right away.
    with open(file) as log_file:
        log_text = log_file.read()

    tmfg_time = _min_reported_time(time_find, log_text)
    apsp_time = _min_reported_time(apsp_time_find, log_text)
    direction_time = _min_reported_time(direction_time_find, log_text)
    non_discrete_time = _min_reported_time(non_discrete_time_find, log_text)
    discrete_time = _min_reported_time(discrete_time_find, log_text)
    bubble_time = _min_reported_time(bubble_time_find, log_text)
    hierarchy_time = _min_reported_time(hierarchy_time_find, log_text)

    # The bubble-tree stage is the sum of its four sub-stages.
    bubble_tree_time = direction_time + non_discrete_time + discrete_time + bubble_time

    if method=="exact":
        # The exact method is reported under prefix label 1.
        prefix=1
    return {
        "dataset": dataset,
        "prefix": str(prefix),
        "method":"tmfg_"+str(prefix),
        "workers" : worker,
        "tmfg" : tmfg_time,
        "apsp" : apsp_time,
        "direction": direction_time,
        "non_discrete": non_discrete_time,
        "discrete": discrete_time,
        "bubble": bubble_time,
        "bubble_tree": bubble_tree_time,
        "hierarchy": hierarchy_time,
        "time": tmfg_time + apsp_time + bubble_tree_time + hierarchy_time,
        # The edge sum is read from the ninth line of the log.
        "edge_sum" : float(log_text.split("\n")[8]),
        "version": version
    }

In [11]:
# Stages for which a speedup relative to the single-thread run is computed.
steps = ["time", "tmfg", "apsp", 'bubble_tree', 'hierarchy']
version=""


def _collect_rows(dataset, method, prefix, baseline_times):
    """Build the result rows of one (dataset, method, prefix) configuration.

    One row is produced per entry of ``workers``. The single-thread run
    (worker == 1) stores its stage times in ``baseline_times``; every other
    run divides that baseline by its own time to obtain the speedup.

    A run whose log cannot be read or parsed, or whose baseline is missing, is
    reported and skipped, so no stale or partially filled row is emitted.

    Args:
        dataset: Dataset name.
        method: "prefix" or "exact".
        prefix: Prefix value; pass 1 for the exact method.
        baseline_times: dict updated in place with the single-thread times.

    Returns:
        list of row dicts, in the order of ``workers``.
    """
    is_exact = method == "exact"
    data_score = quality_dict[dataset+str(prefix)]
    n = data_information[dataset]["n"]
    rows = []
    for worker in workers:
        try:
            row = getInfo(dataset, method, prefix, worker, version)
            for step in steps:
                baseline_key = dataset+"-"+str(prefix)+step
                if worker == 1:
                    baseline_times[baseline_key] = row[step]
                    row[step+"speedup"] = 1
                else:
                    row[step+"speedup"] = baseline_times[baseline_key]/row[step]
        except Exception as e:
            print(e)
            traceback.print_exc()
            print(dataset, 0 if is_exact else prefix, worker, is_exact)
            # Skip the failed run instead of re-appending the previous row.
            continue
        row["ID"] = str(dataset_name_dict[dataset])
        row["speedup"] = row["timespeedup"]
        row["score"] = data_score["score"]
        row["info_score"] = data_score["info_score"]
        row["n"] = n
        rows.append(row)
    return rows


data = []

# Prefix method: one configuration per (dataset, prefix) pair.
one_thread_time = {}
for dataset in datasets:
    for prefix in prefixs:
        data.extend(_collect_rows(dataset, "prefix", prefix, one_thread_time))

# Exact method: reported under prefix label 1, with a fresh set of baselines.
one_thread_time = {}
for dataset in datasets:
    data.extend(_collect_rows(dataset, "exact", 1, one_thread_time))

df = pd.DataFrame(data)

In [12]:
# Columns kept for the summary table (overall time and speedup, quality scores
# and run identification); the per-stage timings are left out.
summary_columns = [
    "dataset", "workers", "time", "method", "speedup", 'ID',
    'score', 'info_score', "edge_sum", "n", "prefix",
]
df_tmfg_times = df[summary_columns]

In [13]:
# Write the summary table and the full per-stage table to the working directory.
# With the default to_csv arguments the row index is written as the first column.
df_tmfg_times.to_csv("tmfg.csv")
df.to_csv("tmfg_allsteps.csv")