In [None]:
import os
import yaml
import plotly.graph_objects as go
import pandas as pd

In [None]:
# Input/output file names are supplied through environment variables and are
# appended to fixed /veld/input/ and /veld/output/ directories.
# os.environ[...] is used instead of os.getenv(...) so that an unset variable
# fails immediately with a KeyError naming the variable, rather than with a
# "can only concatenate str (not NoneType)" TypeError.
IN_EVALUATION_SUMMARY_PATH = "/veld/input/" + os.environ["in_evaluation_summary_file"]
OUT_VISUALIZATION_HTML_PATH = "/veld/output/" + os.environ["out_visualization_html_file"]
OUT_VISUALIZATION_PNG_PATH = "/veld/output/" + os.environ["out_visualization_png_file"]

# Echo the resolved paths so they appear in the notebook output.
print(f"IN_EVALUATION_SUMMARY_PATH: {IN_EVALUATION_SUMMARY_PATH}")
print(f"OUT_VISUALIZATION_HTML_PATH: {OUT_VISUALIZATION_HTML_PATH}")
print(f"OUT_VISUALIZATION_PNG_PATH: {OUT_VISUALIZATION_PNG_PATH}")

# load summary data

In [None]:
# Read the evaluation summary as text. The encoding is pinned to UTF-8 so the
# result does not depend on the platform's default locale.
with open(IN_EVALUATION_SUMMARY_PATH, "r", encoding="utf-8") as file_in:
    eval_summary_raw = file_in.read()

# Echo the raw file content into the notebook output, then parse it.
print(eval_summary_raw)
eval_summary = yaml.safe_load(eval_summary_raw)

# The following cells iterate over eval_summary with .items(); reject an empty
# or non-mapping file here, where the cause is still obvious.
if not isinstance(eval_summary, dict):
    raise ValueError(
        f"expected a YAML mapping in {IN_EVALUATION_SUMMARY_PATH}, "
        f"got {type(eval_summary).__name__}"
    )

# transform to dataframe

In [None]:
def load_summary_data(summary=None):
    """Flatten the nested evaluation summary into one DataFrame row per model.

    Args:
        summary: Mapping ``{architecture: {model_id: entry}}`` where each entry
            holds a ``"model_details"`` dict and a ``"score"`` dict. When
            omitted, the module-level ``eval_summary`` is used.

    Returns:
        pandas.DataFrame with the columns "model id", "score synonyms",
        "score homonyms", "score antonyms", "vector size", "min word count",
        "training epochs", "training duration (hours)", "model data size (GB)",
        "train data size (GB)" and "train data hash". An empty summary yields
        an empty DataFrame.

    Raises:
        ValueError: if a size string has an unsupported unit, or if no
            epoch / vector-size key is present for a model.
        KeyError: if another required key (scores, duration, sizes, hash) is
            missing from an entry.
    """
    import re  # only needed here, for extracting the numeric id suffix

    if summary is None:
        summary = eval_summary

    def normalize_size(size_str):
        """Convert a size such as "500M" or "2G" to a float number of GB.

        Units step by a factor of 1000, matching the M -> G conversion.
        """
        number, unit = size_str[:-1], size_str[-1:]
        if unit == "G":
            return float(number)
        if unit == "M":
            return float(number) / 1000
        if unit == "K":
            return float(number) / 1_000_000
        if unit == "T":
            return float(number) * 1000
        raise ValueError(
            f"unsupported size {size_str!r}; expected a number followed by K, M, G or T"
        )

    def first_int(details, keys, default=None):
        """Return int(details[key]) for the first key present, else ``default``.

        Raises ValueError when no key is present and no default was given.
        """
        for key in keys:
            if key in details:
                return int(details[key])
        if default is None:
            raise ValueError(f"none of the keys {keys} found in model details")
        return default

    def normalize_epochs(model_id_details_dict):
        return first_int(model_id_details_dict, ("training_epochs", "max_iter"))

    def normalize_vectors(model_id_details_dict):
        return first_int(model_id_details_dict, ("vector_size", "training_vector_size"))

    def normalize_min_count(model_id_details_dict):
        # -1 stands in for "no minimum count recorded".
        return first_int(model_id_details_dict, ("vocab_min_count", "min_count"), default=-1)

    def short_model_id(model_arch, model_id):
        """Build a compact label: first letter of the architecture + id suffix.

        The suffix is the whole trailing number of the id, so that ids ending
        in e.g. 2 and 12 stay distinct; ids without a trailing number
        contribute their last character.
        """
        trailing_number = re.search(r"[0-9]+\Z", model_id)
        suffix = trailing_number.group() if trailing_number else model_id[-1]
        return model_arch[:1] + suffix

    # Collect plain dict records and build the frame once; this avoids
    # repeated pd.concat calls that start from an empty DataFrame.
    records = []
    for model_arch, model_arch_dict in summary.items():
        for model_id, model_id_dict in model_arch_dict.items():
            details = model_id_dict["model_details"]
            scores = model_id_dict["score"]
            records.append(
                {
                    "model id": short_model_id(model_arch, model_id),
                    "score synonyms": scores["synonyms"],
                    "score homonyms": scores["homonyms"],
                    "score antonyms": scores["antonyms"],
                    "vector size": normalize_vectors(details),
                    "min word count": normalize_min_count(details),
                    "training epochs": normalize_epochs(details),
                    # truncated to whole hours
                    "training duration (hours)": int(details["training_duration (minutes)"] / 60),
                    "model data size (GB)": normalize_size(details["model_data_size"]),
                    "train data size (GB)": normalize_size(details["train_data_size"]),
                    # the first 8 characters are enough to tell data sets apart at a glance
                    "train data hash": details["train_data_md5_hash"][:8],
                }
            )
    return pd.DataFrame(records)
    
# Build the per-model summary table; the bare `df` on the last line renders it
# as this cell's output.
df = load_summary_data()
df

# transform for visualization

In [None]:
def create_dimensions(df):
    """Build one parallel-coordinates axis description per data column.

    Args:
        df: DataFrame with a "model id" column plus any number of value
            columns. Every column other than "model id" becomes one axis.

    Returns:
        List of dicts with the keys "tickvals", "ticktext", "label" and
        "values", in column order.
    """

    def is_roundable(value):
        """Return True when ``round(value, 3)`` works, i.e. value counts as numeric."""
        try:
            round(value, 3)
        except TypeError:
            return False
        return True

    def create_axis(df, col_name):
        """Describe a single axis from a two-column frame (label column, value column)."""
        labels = df.iloc[:, 0].tolist()
        raw_values = df.iloc[:, 1].tolist()

        # The axis is numeric only if every value in the column can be rounded.
        is_numeric = all(is_roundable(raw_value) for raw_value in raw_values)

        value_list = []
        ticks_dict = {}  # (rounded) value -> comma-joined labels sharing it
        non_numeric_dict = {}  # non-numeric value -> 1-based stand-in position
        for label, value in zip(labels, raw_values):
            try:
                value = round(value, 3)
            except TypeError:
                pass  # non-numeric values are kept as they are

            # Rows sharing a value share one tick; merge their labels.
            label_pre = ticks_dict.get(value)
            ticks_dict[value] = label if label_pre is None else label_pre + "," + label

            if is_numeric:
                value_list.append(value)
            else:
                # Non-numeric values are placed at 1, 2, 3, ... in order of
                # first appearance, since the axis needs numbers.
                value_list.append(
                    non_numeric_dict.setdefault(value, len(non_numeric_dict) + 1)
                )

        # One tick per distinct value, labelled "<model ids>: <value>".
        tick_label_list = []
        tick_value_list = []
        for value, label in ticks_dict.items():
            value_str = str(value)
            if value_str == "-1":
                # -1 is displayed as "null" (presumably a missing-value sentinel).
                value_str = "null"
            tick_label_list.append(label + ": " + value_str)
            tick_value_list.append(value if is_numeric else non_numeric_dict[value])

        return {
            "tickvals": tick_value_list,
            "ticktext": tick_label_list,
            "label": col_name,
            "values": value_list,
        }

    return [
        create_axis(df[["model id", col_name]], col_name)
        for col_name in df
        if col_name != "model id"
    ]
    
# Turn every column except "model id" into an axis description; the bare
# `dim_list` on the last line renders it as this cell's output.
dim_list = create_dimensions(df)
dim_list

# visualize

In [None]:
# Parallel-coordinates plot: one line per row of df, coloured by the row's
# index value, with the axes prepared in dim_list.
fig = go.Figure(
    data=go.Parcoords(
        dimensions=dim_list,
        line=dict(color=list(df.index), colorscale="Rainbow"),
    )
)
fig.update_layout(height=700)

# Show the figure in the notebook, then export it as HTML and as PNG.
fig.show()
fig.write_html(OUT_VISUALIZATION_HTML_PATH)
fig.write_image(OUT_VISUALIZATION_PNG_PATH)