In [1]:
import pandas as pd
import numpy as np
import time
import math
# Pandarallelの準備
from pandarallel import pandarallel
pandarallel.initialize()

#PRED_FILES = ["gcbert.pred", "codet5_old.pred", "codebert.pred"]
PRED_FILES = ["gcbert.pred", "codet5.pred", "codebert.pred"]
DATA_DIR = "./raw_data/"
COMMENT_FILE= "data.comment"
SOURCE_FILE = "data.source"
TARGET_FILE = "data.target"

COMMENT_PATHS = f"{DATA_DIR}{COMMENT_FILE}"
DATA_SOURCE_PATH = f"{DATA_DIR}{SOURCE_FILE}"
DATA_TARGET_PATH = f"{DATA_DIR}{TARGET_FILE}"
PRED_FULL_PATHS = [f"{DATA_DIR}{f}" for f in PRED_FILES]

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
# read file per line and return dataframe and column name is filepath
def read_file_per_line(path):
    with open(path, "r") as f:
        lines = f.readlines()
        lines = [l.rstrip() for l in lines]
        return pd.DataFrame(lines, columns=[path.split("/")[-1]])
# read all files and combine them and return dataframe
def read_files(paths):
    dfs = [read_file_per_line(p) for p in paths]
    return pd.concat(dfs, axis=1)

dfs = read_files([DATA_SOURCE_PATH, DATA_TARGET_PATH, COMMENT_PATHS, *PRED_FULL_PATHS, ] )
dfs.to_excel("./data/data.xls", index=False)

  dfs.to_excel("./data/data.xls", index=False)


In [3]:
# add columns for is_correct between target and pred columns
def add_is_correct_column(df, pred_col):
    df[f"is_correct_{pred_col}"] = (df[TARGET_FILE] == df[pred_col])
    print(f"EMScore of {pred_col} is {df[f'is_correct_{pred_col}'].sum() / len(df)}")
    return df
for pred_col in PRED_FILES:
    
    dfs = add_is_correct_column(dfs, pred_col)


EMScore of gcbert.pred is 0.10296684118673648
EMScore of codet5.pred is 0.04712041884816754
EMScore of codebert.pred is 0.09307737056428156


In [4]:
# calc is_correct between each preds
def add_is_correct_between_preds(df):
    for pred_col in PRED_FILES:
        for pred_col2 in PRED_FILES:
            if pred_col == pred_col2:
                continue
            df[f"em_{pred_col}_and_{pred_col2}"] =  (df[pred_col] == df[pred_col2])
            print(f"EMScore of {pred_col} and {pred_col2} is {df[f'em_{pred_col}_and_{pred_col2}'].sum() / len(df)}")
    return df
dfs = add_is_correct_between_preds(dfs)

EMScore of gcbert.pred and codet5.pred is 0.33740546829552065
EMScore of gcbert.pred and codebert.pred is 0.43339150668993603
EMScore of codet5.pred and gcbert.pred is 0.33740546829552065
EMScore of codet5.pred and codebert.pred is 0.3228621291448517
EMScore of codebert.pred and gcbert.pred is 0.43339150668993603
EMScore of codebert.pred and codet5.pred is 0.3228621291448517


In [6]:
# add columns of all_correct for all is_correct columns
def add_all_correct_column(df):
    df["all_correct"] = df[[f"is_correct_{pred_col}" for pred_col in PRED_FILES]].all(axis=1)
    print("Number of all_correct: ", df["all_correct"].sum())
    return df
dfs = add_all_correct_column(dfs)
dfs[dfs["all_correct"]].loc[:,["data.source", "data.target", "data.comment"]].to_excel("./data/all_correct.xls", index=False)

dfs["is_correct_only_gcbert"] = dfs["is_correct_gcbert.pred"] & ~dfs["is_correct_codet5.pred"] & ~dfs["is_correct_codebert.pred"]
dfs[dfs["is_correct_only_gcbert"] ].loc[:,["data.source", "data.target", "data.comment", "codebert.pred", "codet5.pred"]].to_excel("./data/is_correct_only_gcbert.xls", index=False)
print("Sum of correct only gcbert: " + str(sum(dfs["is_correct_only_gcbert"])))
dfs["is_correct_only_codet5"] = dfs["is_correct_codet5.pred"] & ~dfs["is_correct_gcbert.pred"] & ~dfs["is_correct_codebert.pred"]
dfs[dfs["is_correct_only_codet5"] ].loc[:,["data.source", "data.target", "data.comment", "codebert.pred", "gcbert.pred"]].to_excel("./data/is_correct_only_codet5.xls", index=False)
print("Sum of correct only codet5: " + str(sum(dfs["is_correct_only_codet5"])))
dfs["is_correct_only_codebert"] = dfs["is_correct_codebert.pred"] & ~dfs["is_correct_codet5.pred"] & ~dfs["is_correct_gcbert.pred"]
dfs[dfs["is_correct_only_codebert"] ].loc[:,["data.source", "data.target", "data.comment", "codet5.pred", "gcbert.pred"]].to_excel("./data/is_correct_only_codebert.xls", index=False)
print("Sum of correct only codebert: " + str(sum(dfs["is_correct_only_codebert"])))

dfs["is_incorrect_only_gcbert"] = ~dfs["is_correct_gcbert.pred"] & dfs["is_correct_codet5.pred"] & dfs["is_correct_codebert.pred"]
dfs[dfs["is_incorrect_only_gcbert"] ].loc[:,["data.source", "data.target", "data.comment", "gcbert.pred"]].to_excel("./data/is_incorrect_only_gcbert.xls", index=False)
print("Sum of incorrect only gcbert: " + str(sum(dfs["is_incorrect_only_gcbert"])))

dfs["is_incorrect_only_codet5"] = dfs["is_correct_gcbert.pred"] & ~dfs["is_correct_codet5.pred"] & dfs["is_correct_codebert.pred"]
dfs[dfs["is_incorrect_only_codet5"] ].loc[:,["data.source", "data.target", "data.comment", "codet5.pred"]].to_excel("./data/is_incorrect_only_codet5.xls", index=False)
print("Sum of incorrect only codet5: " + str(sum(dfs["is_incorrect_only_codet5"])))

dfs["is_incorrect_only_codebert"] = dfs["is_correct_gcbert.pred"] & dfs["is_correct_codet5.pred"] & ~dfs["is_correct_codebert.pred"]
dfs[dfs["is_incorrect_only_codebert"] ].loc[:,["data.source", "data.target", "data.comment", "codebert.pred"]].to_excel("./data/is_incorrect_only_codebert.xls", index=False)
print("Sum of incorrect only codebert: " + str(sum(dfs["is_incorrect_only_codebert"])))


dfs["all_incorrect"] = ~dfs["is_correct_gcbert.pred"] & ~dfs["is_correct_codet5.pred"] & ~dfs["is_correct_codebert.pred"]
dfs[dfs["all_incorrect"] ].loc[:,["data.source", "data.target", "data.comment", "codebert.pred"]].to_excel("./data/all_incorrect.xls", index=False)
print("Sum of all_incorrect: " + str(sum(dfs["all_incorrect"])))


Number of all_correct:  35
Sum of correct only gcbert: 73
Sum of correct only codet5: 25
Sum of correct only codebert: 51
Sum of incorrect only gcbert: 13
Sum of incorrect only codet5: 61
Sum of incorrect only codebert: 8
Sum of all_incorrect: 1453


  dfs[dfs["all_correct"]].loc[:,["data.source", "data.target", "data.comment"]].to_excel("./data/all_correct.xls", index=False)
  dfs[dfs["is_correct_only_gcbert"] ].loc[:,["data.source", "data.target", "data.comment", "codebert.pred", "codet5.pred"]].to_excel("./data/is_correct_only_gcbert.xls", index=False)
  dfs[dfs["is_correct_only_codet5"] ].loc[:,["data.source", "data.target", "data.comment", "codebert.pred", "gcbert.pred"]].to_excel("./data/is_correct_only_codet5.xls", index=False)
  dfs[dfs["is_correct_only_codebert"] ].loc[:,["data.source", "data.target", "data.comment", "codet5.pred", "gcbert.pred"]].to_excel("./data/is_correct_only_codebert.xls", index=False)
  dfs[dfs["is_incorrect_only_gcbert"] ].loc[:,["data.source", "data.target", "data.comment", "gcbert.pred"]].to_excel("./data/is_incorrect_only_gcbert.xls", index=False)
  dfs[dfs["is_incorrect_only_codet5"] ].loc[:,["data.source", "data.target", "data.comment", "codet5.pred"]].to_excel("./data/is_incorrect_only_codet5.

In [22]:
# calc bleu score
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
def calc_bleu4(pred:str, tgt:str) -> float:
    fn = SmoothingFunction().method1
    return sentence_bleu([tgt.split()], pred.split())

# calc bles score between preds and target
def calc_bleu4_between_preds_and_target(df):
    for pred_col in PRED_FILES:
        df[f"bleu4_{pred_col}"] = df.apply(lambda x: calc_bleu4(x[pred_col], x[TARGET_FILE]), axis=1)
        print(f"Average bleu4_{pred_col}:{np.average(df[f'bleu4_{pred_col}'])}")
    return df

dfs = calc_bleu4_between_preds_and_target(dfs)

Average bleu4_gcbert.pred:0.7503426944875469
Average bleu4_codet5.pred:0.755592552249078
Average bleu4_codebert.pred:0.748118504192774


In [23]:
# calc bleu4 between each preds
def calc_bleu4_between_preds(df):
    for pred_col in PRED_FILES:
        for pred_col2 in PRED_FILES:
            if pred_col == pred_col2:
                continue
            df[f"bleu4_{pred_col}_and_{pred_col2}"] = df.apply(lambda x: calc_bleu4(x[pred_col], x[pred_col2]), axis=1)
            print(f"Average bleu4_{pred_col}_and_{pred_col2}:{np.average(df[f'bleu4_{pred_col}_and_{pred_col2}'])}")
    return df
dfs = calc_bleu4_between_preds(dfs)

Average bleu4_gcbert.pred_and_codet5.pred:0.8398327863187209
Average bleu4_gcbert.pred_and_codebert.pred:0.8689556180856352
Average bleu4_codet5.pred_and_gcbert.pred:0.8436324670445154
Average bleu4_codet5.pred_and_codebert.pred:0.8432105093066808
Average bleu4_codebert.pred_and_gcbert.pred:0.8708210022627797
Average bleu4_codebert.pred_and_codet5.pred:0.8413426594119572


In [135]:
import textdistance
# calc Levenshtein distance between preds and target
def calc_levenshtein_between_preds_and_target(df):
    for pred_col in PRED_FILES:
        df[f"levenshtein_{pred_col}"] = df.loc[:,[pred_col, TARGET_FILE]].apply(lambda x: textdistance.levenshtein.normalized_similarity(x[pred_col], x[TARGET_FILE]), axis=1)
        print(f"Average levenshtein_{pred_col}:{np.average(df[f'levenshtein_{pred_col}'])}")
    return df
dfs = calc_levenshtein_between_preds_and_target(dfs)
    

KeyError: "None of [Index(['gcbert.pred', 'data.target'], dtype='object')] are in the [index]"

In [136]:
# calc Levenshtein distance between each preds
def calc_levenshtein_between_preds(df):
    for pred_col in PRED_FILES:
        for pred_col2 in PRED_FILES:
            if pred_col == pred_col2:
                continue
            df[f"levenshtein_{pred_col}_and_{pred_col2}"] = df.loc[:, [pred_col,pred_col2]].apply(lambda x: textdistance.levenshtein.normalized_similarity(x[pred_col], x[pred_col2]), axis=1)
            print(f"Average levenshtein_{pred_col}_and_{pred_col2}:{np.average(df[f'levenshtein_{pred_col}_and_{pred_col2}'])}")
    return df
dfs = calc_levenshtein_between_preds(dfs)


Average levenshtein_gcbert.pred_and_codet5.pred:0.8828459633624532
Average levenshtein_gcbert.pred_and_codebert.pred:0.9022055387884687
Average levenshtein_codet5.pred_and_gcbert.pred:0.8828459633624532
Average levenshtein_codet5.pred_and_codebert.pred:0.8813396701129622
Average levenshtein_codebert.pred_and_gcbert.pred:0.9022055387884687
Average levenshtein_codebert.pred_and_codet5.pred:0.8813396701129622


In [118]:
dfs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1719 entries, 0 to 1718
Data columns (total 22 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   data.source                          1719 non-null   object 
 1   data.target                          1719 non-null   object 
 2   data.comment                         1719 non-null   object 
 3   gcbert.pred                          1719 non-null   object 
 4   codet5.pred                          1719 non-null   object 
 5   codebert.pred                        1719 non-null   object 
 6   is_correct_gcbert.pred               1719 non-null   bool   
 7   is_correct_codet5.pred               1719 non-null   bool   
 8   is_correct_codebert.pred             1719 non-null   bool   
 9   all_correct                          1719 non-null   bool   
 10  is_correct_only_gcbert               1719 non-null   bool   
 11  is_correct_only_codet5        