# Analysis of the time/performance trade-off

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import os
# Switch the working directory to the parent folder: every relative path used
# below (the partition result files and the "out/" figure directory) is
# resolved from there.
# NOTE(review): presumably the notebook lives one level below the project
# root — confirm. Re-running this cell moves up one more level each time.
os.chdir("../")

## 1. Listing Partitions to Process

In [None]:
# Partition files to analyse. The three trailing integers of each file name
# are read as max_cluster_size, max_neighborhood_size and max_siblings_size.
partition_files = (
    "conceptnet_partition_10000_50000_100.tsv",
    "conceptnet_partition_100_500_5.tsv",
    "conceptnet_partition_1000_5000_10.tsv",
    "conceptnet_partition_100_100_5.tsv",
    "conceptnet_partition_100_5000_5.tsv",
    "conceptnet_partition_1_100_10.tsv",
)

partitions = []
for filename in partition_files:
    # Drop the ".tsv" extension, then keep everything after the
    # "conceptnet_partition" prefix: the three size parameters.
    size_tokens = filename[:-4].split("_")[2:]
    max_cluster_size, max_neighborhood_size, max_siblings_size = (
        int(token) for token in size_tokens
    )
    # Stripping only "tsv" keeps the trailing dot, so the derived names read
    # "<name>.min.tsv" and "<name>.min.tsv.txt".
    dotted_stem = filename[:-3]
    partitions.append(dict(
        original_path=filename,
        max_cluster_size=max_cluster_size,
        max_neighborhood_size=max_neighborhood_size,
        max_siblings_size=max_siblings_size,
        min_path=dotted_stem + "min.tsv",
        results_path=dotted_stem + "min.tsv.txt",
    ))

# Show the minimised partition files that are expected to exist.
for partition in partitions:
    print(partition["min_path"])

## 2. Results Gathering

In [None]:
# Processing time of each run, in seconds, keyed by results file name.
# The arithmetic expressions are kept as written rather than pre-computed.
time = {
    "conceptnet_partition_10000_50000_100.min.tsv.txt": 250,
    "conceptnet_partition_100_500_5.min.tsv.txt": 60,
    "conceptnet_partition_1000_5000_10.min.tsv.txt": 34*60+9-25*60-22,
    "conceptnet_partition_100_100_5.min.tsv.txt": 33 + 18,
    "conceptnet_partition_100_5000_5.min.tsv.txt": 25 + 28,
    "conceptnet_partition_1_100_10.min.tsv.txt": 60+16-29,
}

# Load the ppref scores of every partition whose results file is present;
# partitions without a results file are skipped.
data = []
for partition in partitions:
    path = partition["results_path"]
    if not os.path.isfile(path):
        continue

    ppref = {}
    with open(path) as file:
        # The first line is a header and carries no score.
        next(file, None)
        for line in file:
            # Tab-separated row: first cell is the dimension name, second
            # cell is its score.
            row = [cell.strip() for cell in line.split("\t")]
            ppref[row[0]] = float(row[1])

    data.append({
        "filename": path,
        "ppref": ppref,
        "parameters": {
            "max_cluster_size": partition["max_cluster_size"],
            "max_neighborhood_size": partition["max_neighborhood_size"],
            "max_siblings_size": partition["max_siblings_size"],
        },
    })

# Attach the processing time to each loaded entry.
for entry in data:
    entry["time"] = time[entry["filename"]]

## 3. Plotting

In [None]:
def plot(feature):
    """Plot every ppref dimension against ``feature`` and save the figure.

    One line is drawn per ppref dimension found in the module-level ``data``
    list, on a logarithmic x axis. The figure is written to
    ``out/<name>.png`` and then shown.

    Args:
        feature: ``"time"`` or one of the partition parameters
            (``"max_neighborhood_size"``, ``"max_cluster_size"``,
            ``"max_siblings_size"``). The value is looked up on the entry
            itself first, then in its ``"parameters"`` dict.

    Raises:
        KeyError: if ``feature`` has no axis label / output name defined.
    """
    feature_name = {
        "time": "Processing time (seconds)",
        "max_neighborhood_size": "Maximum cluster size",
        "max_cluster_size": "Maximum cluster center size",
        "max_siblings_size": "Maximum no. of siblings considered",
    }
    feature_path = {
        "time": "ppref-over-time",
        "max_neighborhood_size": "ppref-over-cluster-size",
        "max_cluster_size": "ppref-over-cluster-center-size",
        "max_siblings_size": "ppref-over-siblings-count",
    }
    # Resolve both up front so an unknown feature fails before a figure is
    # created instead of after the curves have been drawn.
    x_label = feature_name[feature]
    output_path = "out/" + feature_path[feature] + ".png"

    # Keep each y value paired with the x value of the entry it came from.
    # Collecting x and y in independent lists would silently misalign the
    # curves as soon as one entry lacks a dimension or the feature.
    points = dict()
    for entry in data:
        if feature in entry:
            x_value = entry[feature]
        elif feature in entry["parameters"]:
            x_value = entry["parameters"][feature]
        else:
            # No x coordinate for this entry: nothing to plot for it.
            continue
        for dimension, score in entry["ppref"].items():
            points.setdefault(dimension, list()).append((x_value, score))

    plt.figure(figsize=(10, 4))
    for label, series in points.items():
        # Sort by x so the line is drawn from left to right.
        xp, yp = zip(*sorted(series))
        plt.semilogx(xp, yp, marker="s", linewidth=2, label=label)
    plt.ylabel("ppref")
    plt.xlabel(x_label)
    plt.legend()
    plt.savefig(output_path)
    plt.show()
    
# Draw one figure per x-axis feature, in this order.
for feature in (
    "time",
    "max_neighborhood_size",
    "max_cluster_size",
    "max_siblings_size",
):
    plot(feature)

In [None]:
# Summary table: one row per loaded partition, ordered by processing time.
# Each row is a fresh dict, so adding "time" does not write into the
# "parameters" dict that is stored in `data`.
df_vals = [
    {**entry["parameters"], "time": entry["time"]}
    for entry in sorted(data, key=lambda entry: entry["time"])
]

# Rename the columns to the labels used for display.
print(pd.DataFrame(df_vals).rename(columns={
    "time": "time (seconds)",
    "max_neighborhood_size": "cluster-size",
    "max_cluster_size": "cluster center size",
    "max_siblings_size": "siblings count",
}))