In [1]:
import glob
import os
import pandas as pd
import numpy as np

In [7]:
import itertools
metrics = ["bleu", "meteor", "rouge"]
candidates = [1,3,5,10]
distances = ["cosine", "euclidean", "manhattan"]

combinations = [
    f"**{distance}_{metric}_C{candidate}_**"
    for candidate, distance, metric in itertools.product(
        candidates, distances, metrics
    )
    if candidate == 10
    or metric != "meteor"
    and metric != "rouge"
    and distance != "manhattan"
    and distance != "euclidean"
]


In [3]:
def read_csv_files_and_merge(path, csv_files, vectorizer, metric_col):
    dataframes = []
    recommended_idx_dfs = []
    for csv_file in csv_files:
        df = pd.read_csv(os.path.join(path, csv_file), index_col="test_idx")
        metric_df = df[[metric_col]]
        recommended_idx_df = df[["recommended_idx"]]
        metric_df.columns = metric_df.columns.str.replace(metric_col, f"{vectorizer}").str.replace("-mean", "")
        recommended_idx_df.columns = recommended_idx_df.columns.str.replace("recommended_idx", f"{vectorizer}").str.replace("-mean", "")
        dataframes.append(metric_df)
        recommended_idx_dfs.append(recommended_idx_df)
    return pd.concat(dataframes).sort_index(), pd.concat(recommended_idx_dfs).sort_index()

def read_results_and_get_max_for_combination(evaluation_paths):
    merged_csvs = []
    merged_idx_csvs = []
    for evaluation_path in evaluation_paths:
        splitted = evaluation_path.split("_")
        vectorizer = splitted[0].split("\\")[-1]
        distance = splitted[1]
        metric = splitted[2]
        candidate = splitted[3]

        csv_files = [path for path in os.listdir(evaluation_path) if path.endswith(".csv")]
        res, idx = read_csv_files_and_merge(evaluation_path, csv_files, vectorizer, metric)
        merged_csvs.append(res)
        merged_idx_csvs.append(idx)

    resulted = pd.concat(merged_csvs, axis=1)
    resulted["max"] = resulted.idxmax(axis=1)
    df = resulted.eq(resulted.max(axis=1), axis=0)
    resulted["max"] = df.mul(df.columns.to_series()).apply(','.join, axis=1).str.strip(',').str.replace(f'_{metric}', '')
    idx_merged = pd.concat(merged_idx_csvs, axis=1)
    resulted["max_idx"] = df.apply(lambda row: idx_merged.loc[row.name, resulted.loc[row.name, "max"].split(",")[0]], axis=1) 
    return resulted

In [4]:
data = pd.read_csv("./data/comment_finder/all.csv")

def get_code(idx):
    return data.loc[idx]["code"]

def get_comment(idx):
    return data.loc[idx]["comment"]


In [10]:
for combination in combinations:
    evaluation_paths = glob.glob(f".\\results/{combination}")
    result = read_results_and_get_max_for_combination(evaluation_paths)
    result["code"] = result.apply(lambda row: get_code(row["max_idx"]), axis=1)
    result["comment"] = result.apply(lambda row: get_comment(row["max_idx"]), axis=1)
    result = result.reset_index().drop(columns=["test_idx", "max_idx"])
    cols = list(result.columns)
    new_cols = np.concatenate((cols[-2:], cols[:-2]))
    result = result[new_cols]
    result.to_csv(f"./evaluation/{combination.replace('*','')}.csv", index=False)

  df = resulted.eq(resulted.max(axis=1), axis=0)
  df = resulted.eq(resulted.max(axis=1), axis=0)
  df = resulted.eq(resulted.max(axis=1), axis=0)
  df = resulted.eq(resulted.max(axis=1), axis=0)
  df = resulted.eq(resulted.max(axis=1), axis=0)
  df = resulted.eq(resulted.max(axis=1), axis=0)
  df = resulted.eq(resulted.max(axis=1), axis=0)
  df = resulted.eq(resulted.max(axis=1), axis=0)
  df = resulted.eq(resulted.max(axis=1), axis=0)
  df = resulted.eq(resulted.max(axis=1), axis=0)
  df = resulted.eq(resulted.max(axis=1), axis=0)
  df = resulted.eq(resulted.max(axis=1), axis=0)


In [None]:
# x = read_results_and_get_max_for_combination(evaluation_paths)