In [1]:
import re
import matplotlib.pyplot as plt
import seaborn as sns
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# 3.8 removed the old names, so use whichever spelling this install provides.
_paper_style = 'seaborn-v0_8-paper' if 'seaborn-v0_8-paper' in plt.style.available else 'seaborn-paper'
plt.style.use([_paper_style])
import numpy as np
import pandas as pd
import json
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.metrics.cluster import adjusted_mutual_info_score
import scipy
import traceback
# Font / text-rendering switches applied to every figure in this notebook.
plt.rcParams.update({
    'ps.useafm': True,
    'pdf.use14corefonts': True,
    'text.usetex': True,
})

# Dataset name -> numeric ID (stored as a string in the "ID" column of the results).
# Insertion order is also the order in which datasets are processed.
dataset_name_dict = {
    'Mallat': 0,
    'UWaveGestureLibraryAll': 1,
    'NonInvasiveFetalECGThorax2': 2,
    'MixedShapesRegularTrain': 3,
    'MixedShapesSmallTrain': 4,
    'ECG5000': 5,
    'NonInvasiveFetalECGThorax1': 6,
    # 'MoteStrain': 7,
    'StarLightCurves': 7,
    'HandOutlines': 8,
    'UWaveGestureLibraryX': 9,
    'CBF': 10,
    'InsectWingbeatSound': 11,
    'UWaveGestureLibraryY': 12,
    'ShapesAll': 13,
    'SonyAIBORobotSurface2': 14,
    'FreezerSmallTrain': 15,
    'Crop': 16,
    'ElectricDevices': 17,
}

datasets = dataset_name_dict.keys()

# 1 must come first: the single-worker run time is the baseline that the
# speedup of every other worker count is computed against.
workers = [1, 4, 12, 24, 36, 48, 96]

# Load the per-dataset metadata. The file is decoded twice: json.loads() only
# accepts text, so the first decode must yield a string that itself holds the
# JSON document. The context manager closes the file handle deterministically.
with open("dataset_information.json") as info_file:
    data_information = json.load(info_file)
data_information = json.loads(data_information)

In [14]:
# for dataset in datasets:
#     print(len(np.unique(data_information[dataset]["true_labels"])))

## K-means (C implementation): run time, speedup and clustering quality

In [15]:
def getKMeansQuality(dataset, worker):
    """Score one k-means run against the reference labels of ``dataset``.

    Parameters
    ----------
    dataset : str
        Key into ``data_information`` and prefix of the membership file name.
    worker : int or str
        Worker count of the run; formatted into the membership file name.

    Returns
    -------
    dict
        ``"score"`` is the adjusted Rand score and ``"info_score"`` the
        adjusted mutual information score of the clustering versus
        ``data_information[dataset]["true_labels"]``.
    """
    true_labels = data_information[dataset]["true_labels"]
    membership_path = "../outputs/kmeans_c/%s_X.dat.membership_%s" % (dataset, worker)
    # Column 1 is used as the cluster assignment (column 0 is presumably the
    # point index -- confirm against the program that writes this file).
    predicted_labels = np.genfromtxt(membership_path)[:, 1]
    return {
        "score": adjusted_rand_score(true_labels, predicted_labels),
        "info_score": adjusted_mutual_info_score(true_labels, predicted_labels),
    }

In [40]:
def getKMeansTimes(dataset, worker):
    """Parse the initialisation and computation times from a k-means run log.

    Parameters
    ----------
    dataset : str
        Dataset name; formatted into the log file name.
    worker : int or str
        Worker count of the run; formatted into the log file name.

    Returns
    -------
    dict
        Keys ``dataset``, ``workers``, ``time`` (init + computation, seconds),
        ``init_time``, ``comp_time`` and ``method`` (always ``"kmeans_c"``).

    Raises
    ------
    OSError
        If the log file cannot be opened.
    ValueError
        If either timing line is missing from the log.
    """
    init_pattern = r'Initialize time\s+=\s+([\d.]+)\s+sec'
    comp_pattern = r'Computation timing\s+=\s+([\d.]+)\s+sec'

    log_path = "../outputs/kmeans_c/%s_%sth_kmeans.txt" % (dataset, worker)
    # Context manager so the handle is closed even when parsing fails.
    with open(log_path) as log_file:
        log_text = log_file.read()

    init_match = re.search(init_pattern, log_text)
    comp_match = re.search(comp_pattern, log_text)
    if init_match is None or comp_match is None:
        # Name the offending file instead of failing on ``None.group``.
        raise ValueError("timing lines not found in %s" % log_path)

    init_time = float(init_match.group(1))
    comp_time = float(comp_match.group(1))

    return {
        "dataset": dataset,
        "workers": worker,
        "time": init_time + comp_time,
        "init_time": init_time,
        "comp_time": comp_time,
        "method": "kmeans_c",
    }

In [41]:
# Build one record per (dataset, worker) run: timings, quality scores and
# speedup relative to the single-worker run of the same dataset.
data = []
one_thread_time = {}  # dataset -> total time of the worker == 1 run (speedup baseline)
# datasets = ["Crop", "ElectricDevices"]
for dataset in datasets:
    for worker in workers:
        try:
            record = getKMeansTimes(dataset, worker)
            record.update(getKMeansQuality(dataset, worker))
            if worker == 1:
                one_thread_time[dataset] = record["time"]
                record["speedup"] = 1
            else:
                # Raises KeyError when the single-worker run of this dataset
                # failed; the run is then reported and skipped below.
                record["speedup"] = one_thread_time[dataset] / record["time"]
            record["ID"] = str(dataset_name_dict[record["dataset"]])
            record["n"] = data_information[dataset]["n"]
            data.append(record)
        except Exception as exc:
            # Best effort: skip runs that cannot be read, but say why, and let
            # KeyboardInterrupt through so the cell can still be interrupted.
            print(dataset, worker, "%s: %s" % (type(exc).__name__, exc))
df = pd.DataFrame(data)

In [45]:
df["edge_sum"] = 0
# Attach the per-dataset median of both quality scores to every row of that
# dataset. Iterate over the IDs actually present instead of a hard-coded
# range: range(17) stopped at ID 16 and left ID 17 without medians.
for dataset_id in df["ID"].unique():
    rows = df["ID"] == dataset_id
    df.loc[rows, "median_score"] = np.median(df.loc[rows, "score"])
    df.loc[rows, "median_info_score"] = np.median(df.loc[rows, "info_score"])
df[["dataset", "workers", "time", "method", "speedup", 'ID', 'score','info_score','edge_sum', 'n', "median_score", "median_info_score", 'init_time', 'comp_time']].to_csv("kmeans_c.csv")