In [None]:
import glob
import os
import pandas as pd
import numpy as np

In [None]:
import itertools

# Evaluation axes; each glob pattern below encodes one value from each list.
metrics = ["bleu", "meteor", "rouge"]
candidates = [1, 3, 5, 10]
distances = ["cosine", "euclidean", "manhattan"]


def _keep_combination(candidate, distance, metric):
    """Tell whether a (candidate, distance, metric) triple should be processed.

    Every triple with 10 candidates is kept. For any other candidate count the
    triple is kept only when the metric is neither meteor nor rouge and the
    distance is neither manhattan nor euclidean.
    """
    if candidate == 10:
        return True
    return metric not in ("meteor", "rouge") and distance not in ("manhattan", "euclidean")


# Glob patterns of the result directories to process, in
# candidates x distances x metrics iteration order.
combinations = [
    f"**{distance}_{metric}_C{candidate}_**"
    for candidate, distance, metric in itertools.product(candidates, distances, metrics)
    if _keep_combination(candidate, distance, metric)
]

In [None]:
# Vectorizer ranking per (distance, metric) pair. The position in each list is
# what find_fastest_model_for_given_list compares: the model with the smallest
# position wins.
speeds = {
    ("cosine", "bleu"): ["word2vec", "doc2vec", "bert", "tfidf", "bow"],
    ("cosine", "meteor"): ["word2vec", "doc2vec", "bert", "tfidf", "bow"],
    ("cosine", "rouge"): ["word2vec", "doc2vec", "bert", "bow", "tfidf"],
    ("euclidean", "bleu"): ["doc2vec", "word2vec", "bert", "tfidf", "bow"],
    ("euclidean", "meteor"): ["bert", "doc2vec", "word2vec", "tfidf", "bow"],
    ("euclidean", "rouge"): ["doc2vec", "bert", "word2vec", "tfidf", "bow"],
    ("manhattan", "bleu"): ["tfidf", "bow", "doc2vec", "word2vec", "bert"],
    ("manhattan", "meteor"): ["tfidf", "bow", "doc2vec", "word2vec", "bert"],
    ("manhattan", "rouge"): ["tfidf", "bow", "doc2vec", "word2vec", "bert"],
}


In [None]:
def find_fastest_model_for_given_list(distance, metric, model_list):
    """Pick the entry of ``model_list`` that is ranked earliest in ``speeds``.

    Args:
        distance: Distance name, first half of the ``speeds`` key.
        metric: Metric name, second half of the ``speeds`` key.
        model_list: Model names to choose from; each one must appear in the
            ranking stored under ``speeds[(distance, metric)]``.

    Returns:
        The element of ``model_list`` with the smallest position in that
        ranking (the first such element if several share it).

    Raises:
        KeyError: If ``(distance, metric)`` is not a key of ``speeds``.
        ValueError: If a model is absent from the ranking or ``model_list``
            is empty.
    """
    ranking = speeds[(distance, metric)]
    # min() with the ranking position as key returns the first minimal entry.
    return min(model_list, key=ranking.index)

In [None]:
def read_csv_files_and_merge(path, csv_files, vectorizer, metric_col):
    """Load the result CSVs of one directory into two single-column frames.

    Every file is read with its "test_idx" column as the index. From each file
    the ``metric_col`` column and the "recommended_idx" column are kept, and
    both are relabelled with the vectorizer name (with "-mean" removed).

    Args:
        path: Directory that contains the CSV files.
        csv_files: File names inside ``path`` to read.
        vectorizer: Vectorizer name used as the column label of both outputs.
        metric_col: Name of the score column to extract from each file.

    Returns:
        A tuple ``(scores, recommended)``: the row-wise concatenation of the
        score columns and of the "recommended_idx" columns, each sorted by
        index.
    """
    # Both outputs carry the same label: the vectorizer without "-mean".
    label = f"{vectorizer}".replace("-mean", "")

    score_parts = []
    recommended_parts = []
    for file_name in csv_files:
        frame = pd.read_csv(os.path.join(path, file_name), index_col="test_idx")
        score_parts.append(frame[[metric_col]].rename(columns={metric_col: label}))
        recommended_parts.append(
            frame[["recommended_idx"]].rename(columns={"recommended_idx": label})
        )

    merged_scores = pd.concat(score_parts).sort_index()
    merged_recommended = pd.concat(recommended_parts).sort_index()
    return merged_scores, merged_recommended

def read_results_and_get_max_for_combination(evaluation_paths):
    """Merge the per-vectorizer results of one combination and mark the best models.

    Each path must point at a directory whose name has the form
    ``<vectorizer>_<distance>_<metric>_...``. Only the directory name itself
    is parsed, so parent directories may contain underscores and either "/"
    or "\\" may be used as separator. All paths are expected to share the same
    distance and metric; the values parsed from the last path are the ones
    passed to ``find_fastest_model_for_given_list``.

    Args:
        evaluation_paths: Non-empty list of result directory paths, one per
            vectorizer.

    Returns:
        A DataFrame with one score column per vectorizer plus:
          * "max": comma-separated names (in column order) of every
            vectorizer whose score equals the row maximum,
          * "fastest": the model chosen among those names by
            ``find_fastest_model_for_given_list``,
          * "max_idx": the "recommended_idx" value of the first name listed
            in "max".

    Raises:
        ValueError: If ``evaluation_paths`` is empty.
        IndexError: If a directory name has fewer than three "_"-separated
            parts.
    """
    if not evaluation_paths:
        raise ValueError("evaluation_paths is empty: no result directory to read")

    merged_csvs = []
    merged_idx_csvs = []
    for evaluation_path in evaluation_paths:
        # Isolate the directory name first, then split it on "_".
        dir_name = evaluation_path.replace("\\", "/").rstrip("/").split("/")[-1]
        name_parts = dir_name.split("_")
        vectorizer = name_parts[0]
        distance = name_parts[1]
        metric = name_parts[2]

        csv_files = [name for name in os.listdir(evaluation_path) if name.endswith(".csv")]
        res, idx = read_csv_files_and_merge(evaluation_path, csv_files, vectorizer, metric)
        merged_csvs.append(res)
        merged_idx_csvs.append(idx)

    resulted = pd.concat(merged_csvs, axis=1)
    idx_merged = pd.concat(merged_idx_csvs, axis=1)

    # Compare against the row maximum while the frame holds score columns only,
    # so no helper (string) column ever takes part in the reduction.
    is_best = resulted.eq(resulted.max(axis=1), axis=0)
    model_names = resulted.columns.to_numpy()

    resulted["max"] = [",".join(model_names[mask]) for mask in is_best.to_numpy()]
    resulted["fastest"] = resulted["max"].apply(
        lambda names: find_fastest_model_for_given_list(distance, metric, names.split(","))
    )
    # The recommendation is taken from the first tied model, not from "fastest".
    resulted["max_idx"] = [
        idx_merged.at[row_label, names.split(",")[0]]
        for row_label, names in resulted["max"].items()
    ]
    return resulted

In [None]:
# Lookup table for get_code / get_comment, which read its "code" and "comment"
# columns by row label.
data = pd.read_csv("./data/comment_finder/all.csv")

def get_code(idx):
    """Return the "code" value of the ``data`` row labelled ``idx``.

    Args:
        idx: Index label of the row in the module-level ``data`` frame.

    Returns:
        The value stored in the "code" column of that row.

    Raises:
        KeyError: If ``idx`` is not an index label of ``data``.
    """
    # One .loc lookup reads the cell directly instead of building the whole row.
    return data.loc[idx, "code"]

def get_comment(idx):
    """Return the "comment" value of the ``data`` row labelled ``idx``.

    Args:
        idx: Index label of the row in the module-level ``data`` frame.

    Returns:
        The value stored in the "comment" column of that row.

    Raises:
        KeyError: If ``idx`` is not an index label of ``data``.
    """
    # One .loc lookup reads the cell directly instead of building the whole row.
    return data.loc[idx, "comment"]


In [None]:
# Export one CSV per combination: the merged scores together with the code and
# comment of the recommendation selected for every test row.
output_dir = "./evaluation/speed"
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing directories

for combination in combinations:
    # os.path.join yields a pattern with the separator of the running OS.
    evaluation_paths = glob.glob(os.path.join(".", "results", combination))
    result = read_results_and_get_max_for_combination(evaluation_paths)

    # Only "max_idx" is needed for the lookups, so map over that column
    # instead of applying a function to every full row.
    result["code"] = result["max_idx"].map(get_code)
    result["comment"] = result["max_idx"].map(get_comment)

    # The row index and the lookup key are not part of the exported table.
    result = result.reset_index(drop=True).drop(columns=["max_idx"])

    # Move the two columns added above ("code", "comment") to the front.
    cols = list(result.columns)
    result = result[cols[-2:] + cols[:-2]]

    result.to_csv(f"{output_dir}/{combination.replace('*', '')}.csv", index=False)

In [62]:
# Display the table left in `result` by the export loop (its last iteration).
result

Unnamed: 0,code,comment,bert,bow,doc2vec,tfidf,word2vec,max,fastest
0,public void testExecutePrimaryKeyLookupQuery()...,"Please use a better name (primaryKeys, primary...",1.0000,1.0000,1.0000,1.0000,1.0000,"bert,bow,doc2vec,tfidf,word2vec",tfidf
1,public void testStartProcessFromDisplayer(){ k...,would be good if you match the expected values...,0.2500,0.4167,0.3333,0.4167,0.4167,"bow,tfidf,word2vec",tfidf
2,protected void validateAssertion(final SamlAss...,"@mhpnguyen As a rule of thumb, you want to log...",0.1310,0.1190,0.2143,0.1429,0.0595,doc2vec,doc2vec
3,private void collectOverriddenMethodsInInterfa...,it looks that collectOverriddenMethodsInInterf...,0.2222,0.3333,0.2222,0.2222,0.3333,"bow,word2vec",bow
4,public DeploymentConfiguration createPropertyD...,I wonder if the static methods should have bee...,0.0909,0.1515,0.1515,0.1515,0.1212,"bow,doc2vec,tfidf",tfidf
...,...,...,...,...,...,...,...,...,...
150672,public void maybeStopRequesting( StreamRTPMana...,I get it that the JRE is optimized for small s...,0.1667,0.0000,0.0000,0.1667,0.5000,word2vec,word2vec
150673,PersistentStoragePathSanitizer() {},Constructor should be the first method general...,0.1818,0.2727,0.2727,0.3636,0.2727,tfidf,tfidf
150674,MockChartLayout() { layoutWidth = ComponentCon...,I think we should use separate constants for t...,0.0800,0.0800,0.0800,0.1200,0.1200,"tfidf,word2vec",tfidf
150675,public String getFQDNHostname() { if (fqdnHost...,I am wondering whether it wouldn't make sense ...,0.0714,0.2857,0.1786,0.2143,0.1429,bow,bow
