In [62]:
import re
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use(['seaborn-paper'])
import numpy as np
import pandas as pd
import json
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
import scipy
import traceback
plt.rcParams['ps.useafm'] = True
plt.rcParams['pdf.use14corefonts'] = True
plt.rcParams['text.usetex'] = True

# Dataset names in ID order: a dataset's position in this tuple is its numeric ID.
# A commented-out 'MoteStrain': 7 entry used to sit just before "StarLightCurves";
# it stays excluded, and "StarLightCurves" carries ID 7.
_dataset_names_by_id = (
    'Mallat',
    'UWaveGestureLibraryAll',
    'NonInvasiveFetalECGThorax2',
    'MixedShapesRegularTrain',
    'MixedShapesSmallTrain',
    'ECG5000',
    'NonInvasiveFetalECGThorax1',
    "StarLightCurves",
    'HandOutlines',
    'UWaveGestureLibraryX',
    'CBF',
    'InsectWingbeatSound',
    'UWaveGestureLibraryY',
    'ShapesAll',
    'SonyAIBORobotSurface2',
    'FreezerSmallTrain',
    'Crop',
    'ElectricDevices',
)
dataset_name_dict = {name: idx for idx, name in enumerate(_dataset_names_by_id)}

# Live view over the dataset names, iterated in ID order by the cells below.
datasets = dataset_name_dict.keys()

# Thread counts to aggregate. 1 must stay first: the loops below record the
# single-thread time when they see worker == 1 and use it as the speedup baseline.
workers = [1, 4, 12, 24, 36, 48, 96]

# dataset_information.json is double-encoded: json.load() yields a str that is
# itself a JSON document, which json.loads() then parses into the final object.
# The context manager closes the file handle, which the bare open() call leaked.
with open("dataset_information.json") as info_file:
    data_information = json.load(info_file)
data_information = json.loads(data_information)

## KMEANS

In [63]:
def getJson(dataset, worker, method = "kmeans"):
    """Load the stored result of one run.

    Reads ``../outputs/<method>/<method>_<dataset>_<worker>th.json``. The file
    is double-encoded: ``json.load`` yields a string that is itself a JSON
    document, so it is parsed a second time with ``json.loads``.

    Parameters
    ----------
    dataset : name of the dataset, as used in the output file name.
    worker : thread count, as used in the output file name.
    method : sub-directory and file-name prefix (default ``"kmeans"``).

    Returns
    -------
    The decoded object; callers in this notebook index it like a dict.

    Raises
    ------
    OSError if the file cannot be opened; json.JSONDecodeError (or TypeError
    when the first decode is not a string) if the content is not
    double-encoded JSON.
    """
    path = "../outputs/%s/%s_%s_%sth.json" % (method, method, dataset, worker)
    # The context manager closes the handle even if decoding fails.
    with open(path) as json_file:
        encoded = json.load(json_file)
    return json.loads(encoded)

In [64]:
# Collect one record per (dataset, thread count) for plain k-means.
data = []
one_thread_time = {}  # dataset -> single-thread time, the speedup baseline
for dataset in datasets:
    for worker in workers:
        record = getJson(dataset, worker)
        if worker != 1:
            # workers lists 1 first, so the baseline for this dataset is already stored.
            record["speedup"] = one_thread_time[dataset] / record["time"]
        else:
            one_thread_time[dataset] = record["time"]
            record["speedup"] = 1
        record.update({
            "ID": str(dataset_name_dict[record["dataset"]]),
            "workers": worker,
            "n": data_information[dataset]["n"],
        })
        data.append(record)
df = pd.DataFrame(data)

In [65]:
# k-means has no edge sum; a constant 0 column keeps the CSV schema identical
# to the other per-method CSVs written by this notebook.
df["edge_sum"] = 0
kmeans_csv_columns = [
    "dataset", "workers", "time", "method", "speedup",
    'ID', 'score', 'info_score', 'edge_sum', 'n',
]
df[kmeans_csv_columns].to_csv("kmeans.csv")

## KMEANS spectral

In [72]:
# Manually recorded k-means-spectral sweep for StarLightCurves.
# Tuple layout presumably matches electric_devices_scores:
# (num_neighbors, score, info_score, time) -- TODO confirm.
# Stored as a list (not a set) so entries iterate in ascending num_neighbors
# order instead of hash order.
# The sweep stops at 6500: "Not enough memory to perform factorization."
starlight_scores = [
    (100, 0.5061098019996543, 0.607895603942882, 8.291735649108887),
    (200, 0.5056192323212957, 0.6089419996708064, 9.262848377227783),
    (300, 0.5058662322887388, 0.6083702541519664, 10.58641505241394),
    (400, 0.5064933612046948, 0.6083689325362346, 12.986111879348755),
    (500, 0.5063633959966081, 0.6067173751812562, 14.818368434906006),
    (600, 0.5065924440523653, 0.6064849117628339, 15.213972091674805),
    (700, 0.5067261681495983, 0.6061411750923228, 17.54443049430847),
    (800, 0.506886806814409, 0.6055087345952759, 18.93340802192688),
    (900, 0.5066944951953399, 0.6052827732595785, 19.79792094230652),
    (1000, 0.5067143822822273, 0.6050655228084348, 21.346460342407227),
    (1100, 0.5067412678246552, 0.604779797074312, 23.55123496055603),
    (1200, 0.5066385967631624, 0.60474004283994, 25.93466830253601),
    (1300, 0.5067277722852377, 0.6049220987319601, 27.599619150161743),
    (1400, 0.5070950177344519, 0.605707702063791, 31.565340757369995),
    (1500, 0.5071845764405573, 0.6058910328749189, 31.394762754440308),
    (1600, 0.5070363396647843, 0.606362311115417, 35.447189807891846),
    (1700, 0.5070172551543575, 0.6065856378374793, 38.030200719833374),
    (1800, 0.5071912864751087, 0.6070434594096579, 39.939858198165894),
    (1900, 0.5067445579986425, 0.6073470404577184, 42.87311005592346),
    (2000, 0.5062167639373176, 0.6073839028295714, 45.78275942802429),
    (2100, 0.5051800818621809, 0.6063149065576994, 47.10076189041138),
    (2200, 0.5042049998309011, 0.6070518046228226, 50.71326494216919),
    (2300, 0.5040798301055556, 0.6074460669207039, 52.93810296058655),
    (2400, 0.5038146761591529, 0.6087086750330143, 58.921996116638184),
    (2500, 0.5032757213171354, 0.6094307426387301, 82.43226218223572),
    (2600, 0.5016230202103066, 0.6091760769781157, 111.20344424247742),
    (2700, 0.5001715402278375, 0.6085007168208503, 110.95350503921509),
    (2800, 0.4991881613037194, 0.6091070187256092, 112.46390008926392),
    (2900, 0.49931838630938985, 0.6111437705356892, 118.86069011688232),
    (3000, 0.4980773365750064, 0.6117446125633051, 119.60143041610718),
    (3100, 0.49639278584124386, 0.6130159547866374, 115.17153787612915),
    (3200, 0.4962855006709914, 0.6142921487561067, 119.01373505592346),
    (3300, 0.49663303774866296, 0.6135092304009903, 127.53060483932495),
    (3400, 0.4971306516703107, 0.6116345643702646, 133.2645878791809),
    (3500, 0.49729976105917867, 0.6083845131387727, 125.034823179245),
    (3600, 0.499264204944263, 0.60654320157154, 115.74267864227295),
    (3700, 0.5021253827636166, 0.6049036051518152, 116.70267128944397),
    (3800, 0.504600310551406, 0.6033241883181905, 121.75761914253235),
    (3900, 0.5080146475060394, 0.602293543755976, 135.07952904701233),
    (4000, 0.5109093509132864, 0.6013838824290029, 142.85799932479858),
    (4100, 0.5132005816928775, 0.600026806547927, 142.51463842391968),
    (4200, 0.5180104110470253, 0.6001978243187045, 142.91506624221802),
    (4300, 0.5211933274981085, 0.6007125219716507, 132.29587388038635),
    (4400, 0.5231268110576365, 0.6013137151085775, 146.76605224609375),
    (4500, 0.5239426375703378, 0.6017286383900264, 150.8798336982727),
    (4600, 0.5244279638066773, 0.6029171793112864, 177.60805654525757),
    (4700, 0.5246989824979784, 0.6034388125186444, 163.19072461128235),
    (4800, 0.5247636080938584, 0.6037317207684171, 174.70189023017883),
    (4900, 0.5246366491262795, 0.6031687473826238, 169.09885168075562),
    (5000, 0.5248675807903582, 0.6033131197651761, 169.67251777648926),
    (5100, 0.525851860414372, 0.6036432043984398, 171.67198038101196),
    (5200, 0.526241292417193, 0.6036874763379975, 176.23322749137878),
    (5300, 0.5275046948895032, 0.6040215756654416, 182.7608404159546),
    (5400, 0.5295344422953308, 0.6040963721576186, 192.39973616600037),
    (5500, 0.5294581418118895, 0.6038686792479151, 200.44489932060242),
    (5600, 0.5309509776642783, 0.604023026557542, 192.38972759246826),
    (5700, 0.5340647427137877, 0.60393002699296, 200.23406648635864),
    (5800, 0.5389067178079091, 0.6045769891623352, 204.07197451591492),
    (5900, 0.5473787586453507, 0.60747905170725, 197.7679648399353),
    (6000, 0.5558613700618824, 0.6089425316684884, 197.98874044418335),
    (6100, 0.5612433445189691, 0.6079188499640167, 201.12582921981812),
    (6200, 0.5629180623568818, 0.6080284008916149, 200.49145007133484),
    (6300, 0.5640920811768536, 0.6071818873627739, 200.67538261413574),
    (6400, 0.5658830640890273, 0.6074613040933319, 201.6067361831665),
    (6500, 0.5673469900050551, 0.6067817804030495, 201.57653212547302),
]




# Manually recorded k-means-spectral sweep for ElectricDevices.
# Tuple layout: (num_neighbors, score, info_score, time) (double check).
# Stored as a list (not a set) so entries iterate in ascending num_neighbors
# order instead of hash order.
# The sweep stops at 3200; original note: "4000: not enough memory".
electric_devices_scores = [
    (100, 0.18829962143418558, 0.2729017628320502, 655.2051491737366),
    (200, 0.10046150528326446, 0.17347303064115108, 492.63740730285645),
    (300, 0.13010874229271216, 0.1758113828528965, 501.80300211906433),
    (400, 0.18185418583436846, 0.17889546076078194, 739.7883677482605),
    (500, 0.17696423789334745, 0.1665102837672373, 895.643107175827),
    (600, 0.17706500398082717, 0.16047652033228274, 825.0174105167389),
    (700, 0.1798126021489691, 0.15871990292140856, 900.8294382095337),
    (800, 0.18232498107752265, 0.15755493446229446, 924.0552940368652),
    (900, 0.1868081694331389, 0.15819810080187832, 865.6726467609406),
    (1000, 0.18959593947540884, 0.15883632597583092, 942.6126983165741),
    (1100, 0.1923437557586582, 0.1589167225641881, 893.1333494186401),
    (1200, 0.19526332734936244, 0.15990578244968032, 940.3066806793213),
    (1300, 0.1973136118325618, 0.16004078606088903, 943.0143821239471),
    (1400, 0.19848474554344556, 0.15950218920217618, 935.4410946369171),
    (1500, 0.19966522920180135, 0.15999587659689285, 894.9925932884216),
    (1600, 0.20109895862671978, 0.1602901853084319, 908.1073546409607),
    (1700, 0.2028280881145041, 0.16119654176081025, 921.7520523071289),
    (1800, 0.2034748999870959, 0.16156858291139872, 907.6695325374603),
    (1900, 0.2050958873424448, 0.16234689503953467, 868.718989610672),
    (2000, 0.2062931155077733, 0.16330521957385388, 912.3228387832642),
    (2100, 0.20771074558719194, 0.16433427406794848, 915.9778606891632),
    (2200, 0.20773213744141447, 0.16379337294677346, 906.1362011432648),
    (2300, 0.2078120612281057, 0.1640989343943043, 918.0431733131409),
    (2400, 0.20626131689648902, 0.16273716007355377, 907.2158970832825),
    (2500, 0.2066149168933109, 0.1628308550001415, 903.9970920085907),
    (2600, 0.2061106136675557, 0.16230839286531046, 943.8289759159088),
    (2700, 0.2060434558707785, 0.16212296600559037, 896.2446701526642),
    (2800, 0.20522955710139526, 0.16135582153016823, 863.3647568225861),
    (2900, 0.206230145841852, 0.16158995335416573, 915.7360780239105),
    (3000, 0.20638416743852356, 0.16125478131969875, 929.8999557495117),
    (3100, 0.2058064052907605, 0.1609975673251495, 918.924777507782),
    (3200, 0.2040066852329507, 0.15966534920410297, 904.4392132759094),
]

# Manually recorded k-means-spectral sweep for Crop.
# Tuple layout: (num_neighbors, score, time) (double check) -- note there is no
# info_score field here, unlike the four-field ElectricDevices tuples.
# Stored as a list (not a set) so entries iterate in ascending num_neighbors
# order instead of hash order.
# More than 2900: Not enough memory to perform factorization.
crop_scores = [
    (10, 0.2325393329043042, 32.41151523590088),
    (20, 0.248775452780486, 38.3226158618927),
    (30, 0.2675503128030905, 37.987765073776245),
    (40, 0.2686918790017965, 47.17161750793457),
    (50, 0.2758216305164201, 47.843125343322754),
    (60, 0.26691380136611276, 65.44691824913025),
    (70, 0.2689585181953828, 79.77406191825867),
    (80, 0.2582658731720503, 83.02534484863281),
    (90, 0.2730172511311606, 89.39699482917786),
    (100, 0.2728725334264991, 114.69328999519348),
    (200, 0.2607540525387141, 162.0444529056549),
    (300, 0.2676004432394032, 165.55012226104736),
    (400, 0.26119240849298886, 201.92341589927673),
    (500, 0.2663829478137875, 242.69824314117432),
    (600, 0.2704986247761106, 231.6892213821411),
    (700, 0.2719720950430681, 229.78280782699585),
    (800, 0.27157763809572183, 245.19502353668213),
    (900, 0.2724356834723714, 269.69936895370483),
    (1000, 0.27252891302578497, 277.49880957603455),
    (1100, 0.275783367819378, 320.3424446582794),
    (1200, 0.2789142667330019, 337.9541566371918),
    (1300, 0.28202518723602327, 332.5620450973511),
    (1400, 0.2823456653763388, 346.60200214385986),
    (1500, 0.2844295136974539, 353.40898418426514),
    (1600, 0.2740941101644025, 366.9059998989105),
    (1700, 0.28512221188758874, 361.2432713508606),
    (1800, 0.28517403172628697, 361.2845640182495),
    (1900, 0.28560632346756404, 365.578604221344),
    (2000, 0.2887185429338063, 381.32565426826477),
    (2100, 0.27685011337245924, 381.7315466403961),
    (2200, 0.289779270304498, 413.35465264320374),
    (2300, 0.29383220703335616, 422.3354902267456),
    (2400, 0.29441451750165765, 421.84377098083496),
    (2500, 0.2890898907073904, 443.99335074424744),
    (2600, 0.2893118239329533, 459.4462866783142),
    (2700, 0.2724842320250538, 450.2467637062073),
    (2800, 0.2873894427566623, 509.8471772670746),
    (2900, 0.2836716173683752, 501.8699290752411),
]
# Build the per-dataset (ID, score, n-neighbors) table for the spectral k-means
# neighbour sweep and record, per dataset, the n_neighbors with the best score.
#
# Three datasets were swept by hand (see the *_scores literals above); every
# other dataset is read from the 24-thread "kmeans_spectral" result file.
manual_sweeps = {
    "Crop": crop_scores,
    "ElectricDevices": electric_devices_scores,
    "StarLightCurves": starlight_scores,
}

kmeans_neighbor_dict = {}
x = []  # dataset ID labels
y = []  # scores
z = []  # n_neighbors values
for dataset in datasets:
    # NOTE(review): this table uses a 1-based ID (dict value + 1) while the
    # other CSVs written by this notebook use the 0-based value -- confirm intended.
    plot_id = str(dataset_name_dict[dataset] + 1)
    if dataset in manual_sweeps:
        # sorted() orders the tuples by their first field (num_neighbors), so the
        # output rows are deterministic even if the sweep is stored as a set.
        sweep = sorted(manual_sweeps[dataset])
        sweep_neighbors = [entry[0] for entry in sweep]
        sweep_scores = [entry[1] for entry in sweep]
    else:
        tmp = getJson(dataset, 24, "kmeans_spectral")
        sweep_neighbors = tmp["n_neighbors"]
        sweep_scores = tmp["scores"]
    # Best n_neighbors = first maximum of the score. For the manual sweeps this
    # reproduces the previously hard-coded picks (Crop 2400, ElectricDevices 2300,
    # StarLightCurves 6500).
    kmeans_neighbor_dict[dataset] = sweep_neighbors[int(np.argmax(sweep_scores))]
    x.extend([plot_id] * len(sweep_scores))
    y.extend(sweep_scores)
    z.extend(sweep_neighbors)

df = pd.DataFrame(list(zip(x, y, z)),
                  columns=['ID', 'scores', "n-neighbors"])
df.to_csv("kmeans_spectral_quality.csv")

In [73]:
# Display the n_neighbors value recorded for each dataset by the cell above.
kmeans_neighbor_dict 

{'Mallat': 330,
 'UWaveGestureLibraryAll': 10,
 'NonInvasiveFetalECGThorax2': 10,
 'MixedShapesRegularTrain': 20,
 'MixedShapesSmallTrain': 10,
 'ECG5000': 10,
 'NonInvasiveFetalECGThorax1': 20,
 'StarLightCurves': 6500,
 'HandOutlines': 1070,
 'UWaveGestureLibraryX': 50,
 'CBF': 670,
 'InsectWingbeatSound': 40,
 'UWaveGestureLibraryY': 10,
 'ShapesAll': 20,
 'SonyAIBORobotSurface2': 60,
 'FreezerSmallTrain': 1510,
 'Crop': 2400,
 'ElectricDevices': 2300}

In [80]:
# Collect one record per (dataset, thread count) for spectral k-means.
data = []
one_thread_time = {}  # dataset -> single-thread time, the speedup baseline
for dataset in datasets:
    for worker in workers:
        # The 96-worker row is not read from its own file: the 48-thread result
        # is loaded and reported under workers == 96.
        source_worker = 48 if worker == 96 else worker
        run = getJson(dataset, source_worker, "kmeans_spectral")

        elapsed = run["time"]
        if source_worker == 24:
            # The 24-thread file holds a sweep ("times"/"scores" lists);
            # use the time of the best-scoring entry instead.
            elapsed = run["times"][np.argmax(run["scores"])]

        if source_worker == 1:
            one_thread_time[dataset] = elapsed
            speedup = 1
        else:
            speedup = one_thread_time[dataset] / elapsed

        data.append({
            "dataset": dataset,
            "workers": worker,
            "time": elapsed,
            "method": "kmeans_spectral",
            'ID': str(dataset_name_dict[dataset]),
            'score': run["score"],
            'info_score': run["info_score"],
            "n": data_information[dataset]["n"],
            "speedup": speedup,
        })
df = pd.DataFrame(data)

In [81]:
# Spectral k-means has no edge sum; a constant 0 column keeps the CSV schema
# identical to the other per-method CSVs written by this notebook.
df["edge_sum"] = 0
kmeans_spectral_csv_columns = [
    "dataset", "workers", "time", "method", "speedup",
    'ID', 'score', 'info_score', 'edge_sum', 'n',
]
df[kmeans_spectral_csv_columns].to_csv("kmeans_spectral.csv")

## HAC

In [66]:
def getHACQuality(dataset, method, version=""):
    """Score a stored HAC dendrogram against the dataset's true labels.

    Loads ``../outputs/hac/<dataset><version>_<method>_dendro`` as a numeric
    table, cuts it into as many clusters as there are distinct true labels,
    and compares the resulting flat clustering with the true labels.

    Returns a dict with ``"score"`` (adjusted Rand score) and ``"info_score"``
    (adjusted mutual information score).
    """
    ground_truth = data_information[dataset]["true_labels"]
    k = len(np.unique(ground_truth))

    dendrogram_path = "../outputs/hac/%s_%s_dendro" % (dataset+version, method)
    linkage = np.genfromtxt(dendrogram_path)
    predicted = scipy.cluster.hierarchy.cut_tree(linkage, n_clusters=[k]).flatten()

    return {
        "score": adjusted_rand_score(ground_truth, predicted),
        "info_score": adjusted_mutual_info_score(ground_truth, predicted),
    }

In [67]:
def getHACOutput(dataset, worker, method, version=""):
    """Extract the fastest recorded HAC time from one timing log.

    Reads ``../outputs/hac/<dataset><version>_<method>_<worker>th_timing.txt``
    and takes the minimum over every ``hac_time:`` line found in it.

    Parameters
    ----------
    dataset : dataset name used in the file name and echoed in the result.
    worker : thread count used in the file name and echoed in the result.
    method : HAC variant used in the file name and echoed in the result.
    version : optional suffix appended to the dataset name in the file name only.

    Returns
    -------
    dict with keys ``"dataset"``, ``"workers"``, ``"time"``, ``"method"``.
    ``"time"`` stays at the sentinel 10000000000 when the log contains no
    ``hac_time:`` line.

    Raises
    ------
    OSError if the log cannot be opened; ValueError if a matched line does not
    hold a parseable number.
    """
    time_find = re.compile("hac_time:.+?\n")
    timing_path = "../outputs/hac/%s_%s_%sth_timing.txt" % (dataset+version, method, worker)
    # The context manager closes the log, which the bare open() call leaked.
    with open(timing_path) as f1:
        data1 = f1.read()
    time = 10000000000  # sentinel: larger than any expected timing
    for timetxt in time_find.findall(data1):
        # Skip the 9-character "hac_time:" prefix plus the one character after it.
        time = min(time, float(timetxt[10:]))
    result = {
        "dataset":dataset,
        "workers":worker,
        "time":time,
        "method":method
    }
    return result

In [68]:
# Collect one record per (HAC method, dataset, thread count).
data = []
# dataset -> single-thread time. Overwritten for each method, which is safe
# because workers lists 1 first, so the baseline is refreshed before it is used.
one_thread_time = {}
for method in ["comp", "avg"]:
    for dataset in datasets:
        # Clustering quality is read once per (dataset, method) and shared by all thread counts.
        quality = getHACQuality(dataset, method)
        for worker in workers:
            row = getHACOutput(dataset, worker, method)
            if worker != 1:
                row["speedup"] = one_thread_time[dataset] / row["time"]
            else:
                one_thread_time[dataset] = row["time"]
                row["speedup"] = 1
            row.update({
                "ID": str(dataset_name_dict[row["dataset"]]),
                "score": quality["score"],
                "info_score": quality["info_score"],
                "n": data_information[dataset]["n"],
            })
            data.append(row)
df = pd.DataFrame(data)

In [69]:
# HAC has no edge sum; a constant 0 column keeps the CSV schema identical to
# the other per-method CSVs written by this notebook.
df["edge_sum"] = 0
hac_csv_columns = [
    "dataset", "workers", "time", "method", "speedup",
    'ID', 'score', 'info_score', 'edge_sum', 'n',
]
df[hac_csv_columns].to_csv("hac.csv")

## PMFG

In [83]:
# Line patterns for the PMFG timing logs; each matches from its label to the
# end of that line. Raw strings are used because "\ " in a normal string
# literal is an invalid escape sequence (a warning on current Python versions);
# an unescaped space matches the same text.
pmfg_time_find = re.compile(r"PMFG time:.+?\n")
apsp_time_find = re.compile(r"shortest path time:.+?\n")
bubble_time_find = re.compile(r"bubble time:.+?\n")
hierarchy_time_find = re.compile(r"hierarchy time:.+?\n")
# The dots after "PMFG" are regex wildcards, not literal dots: the label is
# followed by at least five arbitrary characters before the newline.
edge_sum_find = re.compile(r"Outputing PMFG.....+?\n")

In [84]:
def modifyPMFGZ(Z):
    """Return a copy of merge table ``Z`` with 0-based ids and a cluster-size column.

    Each row of ``Z`` describes one merge; columns 0 and 1 hold 1-based node
    ids. The result has one extra trailing column. Per row, both ids are
    shifted down by one and column 3 is set to the size of the merged cluster:
    an id below ``n`` (``n = rows + 1``) counts as a single point, and an id
    ``>= n`` takes the size already stored in row ``id - n``.

    Assumes ``Z`` is a 2-D array with exactly three columns, so the appended
    column is column 3 -- TODO confirm. ``Z`` itself is not modified.
    """
    n_merges = Z.shape[0]
    n_leaves = n_merges + 1
    # Append the size column, provisionally filled with 2.
    linkage = np.hstack((Z, np.full((n_merges, 1), 2.0)))
    for row in linkage:  # each row is a view, so assignments write into linkage
        left = int(row[0]) - 1
        right = int(row[1]) - 1
        row[0] = left
        row[1] = right
        left_size = linkage[left - n_leaves][3] if left >= n_leaves else 1
        right_size = linkage[right - n_leaves][3] if right >= n_leaves else 1
        row[3] = left_size + right_size
    return linkage

In [85]:
def getPMFGQuality(dataset):
    """Score the stored PMFG hierarchy of ``dataset`` against its true labels.

    Loads ``../outputs/pmfg/<dataset>-pmfg-Z``, converts it with
    ``modifyPMFGZ``, cuts the tree into as many clusters as there are distinct
    true labels, and compares the flat clustering with the true labels.

    Returns a dict with ``"score"`` (adjusted Rand score) and ``"info_score"``
    (adjusted mutual information score).
    """
    ground_truth = data_information[dataset]["true_labels"]
    k = len(np.unique(ground_truth))

    raw_merges = np.genfromtxt("../outputs/pmfg/%s-pmfg-Z" % (dataset))
    linkage = modifyPMFGZ(raw_merges)
    predicted = scipy.cluster.hierarchy.cut_tree(linkage, n_clusters=[k]).flatten()

    return {
        "score": adjusted_rand_score(ground_truth, predicted),
        "info_score": adjusted_mutual_info_score(ground_truth, predicted),
    }

In [86]:
def getPMFGTime(dataset):
    """Parse the per-step timings and edge sum from a PMFG timing log.

    Reads ``../outputs/pmfg/<dataset>_timing.txt`` and takes the first match of
    each module-level pattern (``pmfg_time_find``, ``apsp_time_find``,
    ``bubble_time_find``, ``hierarchy_time_find``, ``edge_sum_find``).

    NOTE: ``edge_sum_find`` is rebound in the TMFG section of this notebook, so
    the PMFG pattern cell must be the most recently executed one when this runs.

    Returns
    -------
    dict with keys ``"pmfg"``, ``"apsp"``, ``"bubble_tree"``, ``"hierarchy"``,
    ``"time"`` and ``"edge_sum"``. ``"time"`` is pmfg + bubble_tree + hierarchy;
    the shortest-path time is reported separately and is not part of it.

    Raises
    ------
    OSError if the log cannot be opened; IndexError if a pattern has no match;
    ValueError if a matched line does not hold a parseable number.
    """
    # The context manager closes the log, which the bare open() call leaked.
    with open("../outputs/pmfg/%s_timing.txt"% (dataset)) as f1:
        data1 = f1.read()
    # Each slice drops the label prefix of the matched line before float().
    pmfgtime = float(pmfg_time_find.findall(data1)[0][10:])
    apsptime = float(apsp_time_find.findall(data1)[0][19:])
    bubbletime = float(bubble_time_find.findall(data1)[0][13:])
    hierarchytime = float(hierarchy_time_find.findall(data1)[0][15:])
    edge_sum = float(edge_sum_find.findall(data1)[0][18:])
    return {
        "pmfg" : pmfgtime,
        "apsp" :apsptime,
        "bubble_tree":bubbletime,
        "hierarchy":hierarchytime,
        "time":pmfgtime + bubbletime+hierarchytime,
        "edge_sum":edge_sum
        }

In [87]:
# Collect one single-thread record per dataset for PMFG.
# Crop, ElectricDevices and StarLightCurves are skipped.
pmfg_skipped = ("Crop", "ElectricDevices", "StarLightCurves")
data = []
for dataset in datasets:
    if dataset not in pmfg_skipped:
        try:
            row = getPMFGQuality(dataset)
            row.update({"dataset": dataset, "workers": 1})
            row.update(getPMFGTime(dataset))
            row.update({
                "method": "pmfg",
                "speedup": 1,
                "ID": str(dataset_name_dict[dataset]),
                "n": data_information[dataset]["n"],
            })
            data.append(row)
        except Exception as e:
            # Best effort: report the failing dataset and carry on with the rest.
            # (Call traceback.print_exc() here for the full stack when debugging.)
            print(e)
            print(dataset)
df = pd.DataFrame(data)

In [88]:
# Write the shared summary columns; edge_sum here comes from the PMFG timing log.
pmfg_csv_columns = [
    "dataset", "workers", "time", "method", "speedup",
    'ID', 'score', 'info_score', 'edge_sum', 'n',
]
df[pmfg_csv_columns].to_csv("pmfg.csv")

## TMFG MATLAB

In [75]:
# Line patterns for the TMFG (MATLAB) timing logs; each matches from its label
# to the end of that line. Raw strings are used because "\ " in a normal string
# literal is an invalid escape sequence (a warning on current Python versions);
# an unescaped space matches the same text.
#
# NOTE: this cell rebinds the module-level names defined in the PMFG section.
# edge_sum_find now matches "edge_sum:" lines, so getPMFGTime must not be
# called after this cell without re-running the PMFG pattern cell first.
pmfg_time_find = re.compile(r"PMFG time:.+?\n")
apsp_time_find = re.compile(r"shortest path time:.+?\n")
bubble_time_find = re.compile(r"bubble time:.+?\n")
hierarchy_time_find = re.compile(r"hierarchy time:.+?\n")
edge_sum_find = re.compile(r"edge_sum:.+?\n")
# Matches any line tail starting at "time:", including the tails of the
# labelled timing lines above.
time_find = re.compile(r"time:.+?\n")

In [76]:
def getTMFGQuality(dataset):
    """Score the stored TMFG (MATLAB) hierarchy of ``dataset`` against its true labels.

    Loads ``../outputs/tmfg_matlab/<dataset>-tmfg-Z``, converts it with
    ``modifyPMFGZ``, cuts the tree into as many clusters as there are distinct
    true labels, and compares the flat clustering with the true labels.

    Returns a dict with ``"score"`` (adjusted Rand score) and ``"info_score"``
    (adjusted mutual information score).
    """
    ground_truth = data_information[dataset]["true_labels"]
    k = len(np.unique(ground_truth))

    raw_merges = np.genfromtxt("../outputs/tmfg_matlab/%s-tmfg-Z" % (dataset))
    linkage = modifyPMFGZ(raw_merges)
    predicted = scipy.cluster.hierarchy.cut_tree(linkage, n_clusters=[k]).flatten()

    return {
        "score": adjusted_rand_score(ground_truth, predicted),
        "info_score": adjusted_mutual_info_score(ground_truth, predicted),
    }
def getTMFGTime(dataset):
    """Parse the per-step timings and edge sum from a TMFG (MATLAB) timing log.

    Reads ``../outputs/tmfg_matlab/<dataset>_timing.txt`` using the
    module-level patterns. The graph-construction time is taken from the
    ``PMFG time:`` line (the log is parsed with ``pmfg_time_find``), and the
    total time from the last line tail matching ``time:``.

    Returns
    -------
    dict with keys ``"tmfg"``, ``"apsp"``, ``"bubble_tree"``, ``"hierarchy"``,
    ``"time"`` and ``"edge_sum"``. ``"bubble_tree"`` is not read from the log;
    it is derived as total time - tmfg - hierarchy.

    Raises
    ------
    OSError if the log cannot be opened; IndexError if a pattern has no match;
    ValueError if a matched line does not hold a parseable number.
    """
    # The context manager closes the log, which the bare open() call leaked.
    with open("../outputs/tmfg_matlab/%s_timing.txt"% (dataset)) as f1:
        data1 = f1.read()
    # Each slice drops the label prefix of the matched line before float().
    tmfgtime = float(pmfg_time_find.findall(data1)[0][10:])
    apsptime = float(apsp_time_find.findall(data1)[0][19:])
    hierarchytime = float(hierarchy_time_find.findall(data1)[0][15:])
    edge_sum = float(edge_sum_find.findall(data1)[0][9::])
    # time_find also matches the labelled timing lines, so take the last match.
    time = float(time_find.findall(data1)[-1][5:])
    bubbletime = time - tmfgtime-hierarchytime
    return {
        "tmfg" : tmfgtime,
        "apsp" :apsptime,
        "bubble_tree":bubbletime,
        "hierarchy":hierarchytime,
        "time":time,
        "edge_sum":edge_sum
        }

In [77]:
# Collect one single-thread record per dataset for the MATLAB TMFG baseline.
# Crop and ElectricDevices are skipped.
tmfg_skipped = ("Crop", "ElectricDevices")
data = []
for dataset in datasets:
    if dataset not in tmfg_skipped:
        try:
            row = getTMFGQuality(dataset)
            row.update({"dataset": dataset, "workers": 1})
            row.update(getTMFGTime(dataset))
            row.update({
                "method": "TMFG-M",
                "speedup": 1,
                "ID": str(dataset_name_dict[dataset]),
                "n": data_information[dataset]["n"],
            })
            data.append(row)
        except Exception as e:
            # Best effort: report the failure with its stack and the dataset
            # name, then carry on with the remaining datasets.
            print(e)
            traceback.print_exc()
            print(dataset)
df = pd.DataFrame(data)

In [78]:
# Write the shared summary columns; edge_sum here comes from the TMFG timing log.
tmfg_csv_columns = [
    "dataset", "workers", "time", "method", "speedup",
    'ID', 'score', 'info_score', 'edge_sum', 'n',
]
df[tmfg_csv_columns].to_csv("tmfg-matlab.csv")

In [79]:
# Also dump every column of the TMFG frame, including the per-step timings
# ("tmfg", "apsp", "bubble_tree", "hierarchy") that the summary CSV omits.
df.to_csv("tmfg-matlab_allsteps.csv")