In [1]:
# eval on predictions
import ast
import re
import difflib
from termcolor import colored

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from evaluator.CodeBLEU.calc_code_bleu import get_codebleu
from nltk.translate.bleu_score import sentence_bleu

from utils.regex_parse import comment


# Functions

In [2]:
def exclude_same_io(df):
    """Return ``df`` without the rows whose ``inputs`` value equals its ``labels`` value.

    Args:
        df: DataFrame with ``inputs`` and ``labels`` columns.

    Returns:
        A new DataFrame holding only the rows where the two columns differ.
        The original index labels of the kept rows are preserved and ``df``
        itself is not modified.
    """
    is_identity_pair = df["inputs"] == df["labels"]
    # Filter with the boolean mask instead of dropping by index label:
    # a label-based ``drop`` also removes non-matching rows that merely share
    # an index label with a matching row (e.g. after concatenating frames).
    return df[~is_identity_pair]

In [3]:
# parsable eval
def is_parsable(input_code):
    """Report whether ``input_code`` can be parsed by ``ast.parse``.

    Returns:
        True when parsing succeeds. False on ``SyntaxError`` (silently) and on
        any other exception, in which case the offending input and the
        exception are printed first.
    """
    try:
        ast.parse(input_code)
        return True
    except SyntaxError:
        # Ordinary unparsable code: nothing worth logging.
        pass
    except Exception as err:
        # Anything else is unexpected, so surface the input that caused it.
        print(input_code)
        print(err)
    return False

In [4]:
def evaluate_codebleu(pred_filename, weights="0.25,0.25,0.25,0.25", replaced_df=None, dropna=False, is_exclude_same_io=False):
    """Compute a corpus-level CodeBLEU score for a predictions table.

    Args:
        pred_filename: CSV path read with ``pd.read_csv``; ignored when
            ``replaced_df`` is given.
        weights: Weight string forwarded unchanged to ``get_codebleu``.
        replaced_df: Optional DataFrame used instead of reading the CSV.
        dropna: When true, rows containing any NaN are dropped first.
        is_exclude_same_io: When true, rows whose ``inputs`` equals
            ``labels`` are removed via ``exclude_same_io``.

    Returns:
        Whatever ``get_codebleu`` returns for the ``labels`` column (as the
        single reference set) against the ``preds`` column, with language
        ``"python"``.
    """
    pred_df = pd.read_csv(pred_filename) if replaced_df is None else replaced_df
    if dropna:
        pred_df = pred_df.dropna()
    if is_exclude_same_io:
        pred_df = exclude_same_io(pred_df)

    # get_codebleu takes a list of reference sets; only one gold variant exists here.
    references = [pred_df["labels"]]
    hypotheses = pred_df["preds"]
    return get_codebleu(references, hypotheses, "python", weights)

In [5]:
def get_docstring(text):
    """Extract the bodies of triple-quoted strings that start a line in ``text``.

    A match is a ``'''...'''`` or ``\"\"\"...\"\"\"`` literal preceded only by
    whitespace since the start of a line, so triple-quoted strings that follow
    other code on the same line (e.g. ``x = \"\"\"...\"\"\"``) are not returned.

    Args:
        text: Source code to scan.

    Returns:
        List of the string bodies (without the quotes), in order of appearance.
    """
    # Raw strings: the previous non-raw literal relied on invalid escape
    # sequences such as "\s", which newer Python versions warn about.
    triple_single = r"^\s*'{3}([\s\S]*?)'{3}"
    triple_double = r'^\s*"{3}([\s\S]*?)"{3}'
    pattern = f"{triple_single}|{triple_double}"

    # Each findall result is (single-quoted body, double-quoted body); the
    # alternative that did not match contributes an empty string.
    matches = re.findall(pattern, text, re.M | re.S)
    return [single_body if single_body else double_body for single_body, double_body in matches]

In [6]:
def print_split_line(s):
    """Print ``s`` upper-cased inside a ``=`` banner, with a blank line before and after."""
    title = s.upper()
    print(f"\n====================={title}=====================\n")

In [7]:
def tokenize(s):
    """Split ``s`` into tokens on runs of whitespace.

    Uses ``re.split``, so leading or trailing whitespace yields an empty-string
    token at that end (unlike ``str.split()``), and an empty input yields ``[""]``.
    """
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    return re.split(r'\s+', s)

def get_diff_list(str_1, str_2):
    """Collect the tokens of each string that are not part of a matching block.

    Both strings are tokenized with ``tokenize`` and aligned with
    ``difflib.SequenceMatcher`` (default settings).

    Returns:
        ``(unmatched_1, unmatched_2)``: the tokens of ``str_1`` and of
        ``str_2`` that fall between matching blocks, each in original order.
    """
    tokens_1 = tokenize(str_1)
    tokens_2 = tokenize(str_2)
    matcher = difflib.SequenceMatcher(a=tokens_1, b=tokens_2)

    unmatched_1 = []
    unmatched_2 = []
    # Cursors mark the end of the previous matching block in each sequence;
    # everything between a cursor and the next block start is unmatched.
    cursor_1 = 0
    cursor_2 = 0
    for block in matcher.get_matching_blocks():
        unmatched_1.extend(tokens_1[cursor_1:block.a])
        unmatched_2.extend(tokens_2[cursor_2:block.b])
        cursor_1 = block.a + block.size
        cursor_2 = block.b + block.size
    return unmatched_1, unmatched_2

def get_diff_str(input_str, output_str):
    """Return the tokens of ``output_str`` not matched in ``input_str``, joined by single spaces."""
    _, changed_output_tokens = get_diff_list(input_str, output_str)
    return " ".join(changed_output_tokens)

In [8]:
import re
def remove_nl_prompt(script):
    """Strip ``<nl>...</nl>`` spans from ``script``.

    The match is greedy and ``.`` does not cross newlines, so on a line with
    several tagged spans everything from the first ``<nl>`` to the last
    ``</nl>`` is removed, and a span whose tags sit on different lines is left
    untouched.
    """
    # "/" needs no escaping in a regex; the old "\/" was an invalid string
    # escape sequence in a non-raw literal.
    return re.sub(r"<nl>.*</nl>", "", script)

In [9]:
def _bleu_or_zero(gold_text, pred_text):
    """Sentence BLEU of ``pred_text`` against ``gold_text`` on whitespace tokens; 0 when the prediction has no tokens."""
    if len(pred_text.split()) > 0:
        return sentence_bleu([gold_text.split()], pred_text.split(), auto_reweigh=True)
    return 0


def _threshold_rates(scores, total_len):
    """Return the fractions of ``scores`` that are exactly 1 and that are at least 0.9."""
    values = np.array(scores)
    return sum(values == 1) / total_len, sum(values >= 0.9) / total_len


def evaluate_pred_df(pred_df, target_feats, is_nl=False, parse_test=True):
    """Score every row of a predictions table and summarise the results.

    Args:
        pred_df: DataFrame with ``inputs``, ``labels`` and ``preds`` columns.
        target_feats: Collection of feature names. ``"docstring"`` and
            ``"comment"`` switch on the corresponding text-level BLEU scores;
            other names have no effect here.
        is_nl: When true, ``remove_nl_prompt`` is applied to each input before
            the diffs are computed (the ``inputs`` stored in the report are
            left as they are).
        parse_test: When true, each prediction is checked with ``is_parsable``
            and ``parse_test_accuracy`` is added to the report.

    Returns:
        dict with the raw ``inputs``/``labels``/``preds`` arrays, per-row
        ``pred_diffs``/``gold_diffs``, per-row ``codebleu`` results and
        ``diff_bleu`` scores, their ``*_perfect`` / ``*_above_90`` rates,
        ``diff_bleu_avg``, and - depending on the arguments - the docstring,
        comment and parse-test entries.
    """
    inputs = pred_df["inputs"].to_numpy()
    labels = pred_df["labels"].to_numpy()
    preds = pred_df["preds"].to_numpy()
    total_len = preds.shape[0]

    score_docstrings = "docstring" in target_feats
    score_comments = "comment" in target_feats

    code_scores, diff_bleu_scores = [], []
    pred_diffs, gold_diffs = [], []
    gold_docstrings, pred_docstrings, docstr_text_scores = [], [], []
    gold_comments, pred_comments, comment_text_scores = [], [], []
    is_parsables = []

    for idx in tqdm(range(total_len)):
        input_code = remove_nl_prompt(inputs[idx]) if is_nl else inputs[idx]
        gold = labels[idx]
        pred = preds[idx]

        # Per-example CodeBLEU: one reference set holding the single gold code.
        code_scores.append(get_codebleu([[gold]], [pred], "python", '0.25,0.25,0.25,0.25'))

        if score_docstrings:
            gold_docstr = get_docstring(gold)
            pred_docstr = get_docstring(pred)
            gold_docstrings.append(gold_docstr)
            pred_docstrings.append(pred_docstr)
            docstr_text_scores.append(_bleu_or_zero("\n".join(gold_docstr), "\n".join(pred_docstr)))

        if score_comments:
            gold_comment = comment(gold)
            pred_comment = comment(pred)
            gold_comments.append(gold_comment)
            pred_comments.append(pred_comment)
            comment_text_scores.append(_bleu_or_zero("\n".join(gold_comment), "\n".join(pred_comment)))

        # Diff BLEU: compare what the gold changed relative to the input with
        # what the prediction changed relative to the same input.
        gold_diff_str = get_diff_str(input_code, gold)
        pred_diff_str = get_diff_str(input_code, pred)
        pred_diffs.append(pred_diff_str)
        gold_diffs.append(gold_diff_str)
        diff_bleu_scores.append(_bleu_or_zero(gold_diff_str, pred_diff_str))

        if parse_test:
            is_parsables.append(is_parsable(pred))

    code_bleus = np.array([s["code_bleu"] for s in code_scores])
    codebleu_perfect, codebleu_above_90 = _threshold_rates(code_bleus, total_len)
    diff_bleu_avg = np.mean(diff_bleu_scores)
    diff_bleu_perfect, diff_bleu_above_90 = _threshold_rates(diff_bleu_scores, total_len)

    report = {
        "inputs": inputs,
        "labels": labels,
        "preds": preds,
        "pred_diffs": pred_diffs,
        "gold_diffs": gold_diffs,
        "codebleu": code_scores,
        "codebleu_perfect": codebleu_perfect,
        "codebleu_above_90": codebleu_above_90,
        "diff_bleu": diff_bleu_scores,
        "diff_bleu_avg": diff_bleu_avg,
        "diff_bleu_perfect": diff_bleu_perfect,
        "diff_bleu_above_90": diff_bleu_above_90,
    }

    if score_docstrings:
        report["gold_docstrings"] = gold_docstrings
        report["pred_docstrings"] = pred_docstrings
        report["docstr_text_scores"] = docstr_text_scores
        report["docstr_text_scores_avg"] = np.array(docstr_text_scores).mean()
        perfect, above_90 = _threshold_rates(docstr_text_scores, total_len)
        report["docstr_text_scores_perfect"] = perfect
        report["docstr_text_scores_above_90"] = above_90

    if score_comments:
        report["gold_comments"] = gold_comments
        report["pred_comments"] = pred_comments
        report["comment_text_scores"] = comment_text_scores
        report["comment_text_scores_avg"] = np.array(comment_text_scores).mean()
        perfect, above_90 = _threshold_rates(comment_text_scores, total_len)
        report["comment_text_scores_perfect"] = perfect
        report["comment_text_scores_above_90"] = above_90

    if parse_test:
        report["parse_test_accuracy"] = sum(np.array(is_parsables)) / total_len

    return report

In [10]:
def print_colored_diff(str_1, str_2):
    """Build copies of two strings with their non-matching spans colored red.

    Despite the name, nothing is printed: the two strings are aligned
    character by character with ``difflib.SequenceMatcher`` and returned.

    Returns:
        ``(text_1, text_2)``: ``str_1`` and ``str_2`` where every span that
        lies outside a matching block is wrapped with ``colored(..., "red")``.
    """
    pieces_1 = []
    pieces_2 = []
    # Cursors mark where the previous matching block ended in each string.
    cursor_1 = 0
    cursor_2 = 0
    matcher = difflib.SequenceMatcher(a=str_1, b=str_2)
    for block in matcher.get_matching_blocks():
        if cursor_1 < block.a:
            pieces_1.append(colored(str_1[cursor_1:block.a], "red"))
        if cursor_2 < block.b:
            pieces_2.append(colored(str_2[cursor_2:block.b], "red"))

        pieces_1.append(str_1[block.a:block.a + block.size])
        pieces_2.append(str_2[block.b:block.b + block.size])

        cursor_1 = block.a + block.size
        cursor_2 = block.b + block.size

    # No tail handling is needed: get_matching_blocks() always ends with a
    # zero-size block at (len(str_1), len(str_2)), so the loop above already
    # colors any unmatched trailing text.
    return "".join(pieces_1), "".join(pieces_2)

In [11]:
def lookup_examples(report, score_upper_bound, score_lower_bound, metric="diff_bleu", start_idx=0, count=10):
    """Print examples from ``report`` whose ``metric`` score lies within the given bounds.

    Args:
        report: Report dict holding ``inputs``, ``preds``, ``labels`` and a
            per-example sequence under ``metric``.
        score_upper_bound: Examples scoring above this are skipped.
        score_lower_bound: Examples scoring below this are skipped.
        metric: Report key of the per-example scores.
        start_idx: Examples with a smaller index are skipped.
        count: Maximum number of examples to print, or ``"all"`` for no limit.

    For each selected example the input, prediction and gold label are printed
    with their differences from the input highlighted, followed by the score.
    """
    total = len(report["inputs"])
    limit = total if count == "all" else count

    shown = 0
    for idx in range(total):
        if shown == limit:
            break
        if idx < start_idx:
            continue

        score = report[metric][idx]
        if score > score_upper_bound or score < score_lower_bound:
            continue

        input_code = report["inputs"][idx]
        pred_code = report["preds"][idx]
        gold_code = report["labels"][idx]

        # The input is colored relative to the gold label only.
        c_input, c_gold = print_colored_diff(input_code, gold_code)
        _, c_pred = print_colored_diff(input_code, pred_code)

        sections = (
            (f"{idx}-input", c_input),
            (f"{idx}-prediction", c_pred),
            (f"{idx}-gold labels", c_gold),
            (f"{idx}-{metric}", score),
        )
        for heading, body in sections:
            print_split_line(heading)
            print(body)

        shown += 1


# =====================SPLIT LINE=====================

In [None]:
# finetuned nl
folder = "seq2seq_results"
model_name = "combined_nl_prompt_base_features_codet5small"
ckpt = "epoch 2/checkpoint-85000"

In [None]:
pred_csvfile = "codet5_finetuned_nl_docstring_list_comp_class_casing_comment_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["docstring", "list_comp", "class", "casing", "comment"]

In [None]:
all_feat_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25')
all_feat_codebleu

In [None]:

all_feat_pred_df = pd.read_csv(file_name)

In [None]:
all_feat_report = evaluate_pred_df(all_feat_pred_df, target_feats)

In [None]:
all_feat_report["diff_bleu_avg"]

In [None]:
max(all_feat_report["diff_bleu"])

In [None]:
# Dump every example whose diff BLEU is at least 0.14: raw input, prediction,
# gold label and the score, each under its own banner.
total = len(all_feat_pred_df)
for idx in range(total):
    if all_feat_report["diff_bleu"][idx] >= 0.14:
        print_split_line(f"{idx}-input")
        print(all_feat_report["inputs"][idx])
        print_split_line(f"{idx}-prediction")
        print(all_feat_report["preds"][idx])
        print_split_line(f"{idx}-gold labels")
        print(all_feat_report["labels"][idx])
        print_split_line(f"{idx}-score")
        print(all_feat_report["diff_bleu"][idx])

In [None]:
pred_csvfile = "codet5_finetuned_nl_tok_type_4_docstring_list_comp_class_casing_comment_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["docstring", "list_comp", "class", "casing", "comment"]

In [None]:
all_feat_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25')
all_feat_codebleu

In [None]:

all_feat_pred_df = pd.read_csv(file_name)

In [None]:
all_feat_report = evaluate_pred_df(all_feat_pred_df, target_feats)

In [None]:
all_feat_report["diff_bleu_avg"]

In [None]:
max(all_feat_report["diff_bleu"])

# CodeT5 Finetuned

## codet5_finetuned_nl_casing_preds

In [None]:
pred_csvfile = "codet5_finetuned_nl_casing_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["casing"]

In [None]:
casing_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25')
casing_codebleu

In [None]:
casing_pred_df = pd.read_csv(file_name)

In [None]:
casing_report = evaluate_pred_df(casing_pred_df, target_feats, is_nl=True)

In [None]:
# Print only the scalar summary metrics: list entries are skipped, and so are
# arrays, whose shape is non-empty (only 0-d values have an empty shape).
for key, val in casing_report.items():
    if type(val) != list and len(val.shape) == 0:
        print(key, ":", val)

## codet5_finetuned_nl_class_preds

In [None]:
pred_csvfile = "codet5_finetuned_nl_class_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["class"]

In [None]:
class_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25')
class_codebleu

In [None]:
class_pred_df = pd.read_csv(file_name)

In [None]:
class_report = evaluate_pred_df(class_pred_df, target_feats, is_nl=True)

In [None]:
for key, val in class_report.items():
    if type(val) != list and len(val.shape) == 0:
        print(key, ":", val)

## codet5_finetuned_nl_list_comp_preds

In [None]:
pred_csvfile = "codet5_finetuned_nl_list_comp_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["list_comp"]

In [None]:
list_comp_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', is_exclude_same_io=True)
list_comp_codebleu

In [None]:
list_comp_pred_df = pd.read_csv(file_name)
list_comp_pred_df = exclude_same_io(list_comp_pred_df)

In [None]:
list_comp_report = evaluate_pred_df(list_comp_pred_df, target_feats, is_nl=True)

In [None]:
for key, val in list_comp_report.items():
    if type(val) != list and len(val.shape) == 0:
        print(key, ":", val)

In [None]:
total = len(list_comp_pred_df)
for idx in range(total):
    if list_comp_report["diff_bleu"][idx] >= 0.:
        if list_comp_report["diff_bleu"][idx] >= 0.3: continue
        print_split_line(f"{idx}-input")
        print(list_comp_report["inputs"][idx])
        print_split_line(f"{idx}-prediction")
        print(list_comp_report["preds"][idx])
        print_split_line(f"{idx}-gold labels")
        print(list_comp_report["labels"][idx])
        print_split_line(f"{idx}-score")
        print(list_comp_report["diff_bleu"][idx])

## codet5_finetuned_nl_docstring_preds

In [None]:
pred_csvfile = "codet5_finetuned_nl_docstring_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["docstring"]

In [None]:
docstring_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25')
docstring_codebleu

In [None]:
docstring_pred_df = pd.read_csv(file_name)

In [None]:
docstring_report = evaluate_pred_df(docstring_pred_df, target_feats, is_nl=True)

In [None]:
for key, val in docstring_report.items():
    if type(val) != list and len(val.shape) == 0:
        print(key, ":", val)

## codet5_finetuned_nl_comment_preds

In [None]:
pred_csvfile = "codet5_finetuned_nl_comment_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["comment"]

In [None]:
comment_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', dropna=True)
comment_codebleu

In [None]:
comment_pred_df = pd.read_csv(file_name)

In [None]:
# Some rows contain NaN (seen in preds; cause not yet investigated), so drop them before evaluating.
comment_pred_df = comment_pred_df.dropna()

In [None]:
comment_report = evaluate_pred_df(comment_pred_df, target_feats, is_nl=True)

In [None]:
for key, val in comment_report.items():
    if type(val) != list and len(val.shape) == 0:
        print(key, ":", val)

# Baseline

In [12]:
# baseline nl
folder = "seq2seq_results"
model_name = "combined_nl_prompt_base_features_baseline_codet5small"
ckpt = "checkpoint-144856"

## codet5_baseline_nl_casing_preds

In [None]:
pred_csvfile = "codet5_baseline_nl_casing_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["casing"]

In [None]:
# NOTE(review): this CodeBLEU excludes rows whose input equals the label
# (is_exclude_same_io=True), but the casing report computed below reads the
# unfiltered CSV, and the finetuned casing CodeBLEU earlier in this notebook
# is computed without the exclusion - confirm the numbers are meant to be
# compared on different row sets.
casing_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', is_exclude_same_io=True)
casing_codebleu

In [None]:
casing_pred_df = pd.read_csv(file_name)

In [None]:
casing_report = evaluate_pred_df(casing_pred_df, target_feats, is_nl=True)

In [None]:
for key, val in casing_report.items():
    if type(val) != list and len(val.shape) == 0:
        print(key, ":", val)

## codet5_baseline_nl_class_preds

In [None]:
pred_csvfile = "codet5_baseline_nl_class_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["class"]

In [None]:
class_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', is_exclude_same_io=True)
class_codebleu

In [None]:
class_pred_df = pd.read_csv(file_name)

In [None]:
class_report = evaluate_pred_df(class_pred_df, target_feats, is_nl=True)

In [None]:
for key, val in class_report.items():
    if type(val) != list and len(val.shape) == 0:
        print(key, ":", val)

## codet5_baseline_nl_list_comp_preds

In [None]:
pred_csvfile = "codet5_baseline_nl_list_comp_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["list_comp"]

In [None]:
list_comp_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', is_exclude_same_io=True)
list_comp_codebleu

In [None]:
list_comp_pred_df = pd.read_csv(file_name)
list_comp_pred_df = exclude_same_io(list_comp_pred_df)

In [None]:
list_comp_report = evaluate_pred_df(list_comp_pred_df, target_feats, is_nl=True)

In [None]:
for key, val in list_comp_report.items():
    if type(val) != list and len(val.shape) == 0:
        print(key, ":", val)

In [None]:
total = len(list_comp_pred_df)
for idx in range(total):
    if list_comp_report["diff_bleu"][idx] >= 0.:
        if list_comp_report["diff_bleu"][idx] >= 0.3: continue
        print_split_line(f"{idx}-input")
        print(list_comp_report["inputs"][idx])
        print_split_line(f"{idx}-prediction")
        print(list_comp_report["preds"][idx])
        print_split_line(f"{idx}-gold labels")
        print(list_comp_report["labels"][idx])
        print_split_line(f"{idx}-score")
        print(list_comp_report["diff_bleu"][idx])

## codet5_baseline_nl_docstring_preds

In [None]:
pred_csvfile = "codet5_baseline_nl_docstring_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["docstring"]

In [None]:
docstring_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', is_exclude_same_io=True)
docstring_codebleu

In [None]:
docstring_pred_df = pd.read_csv(file_name)

In [None]:
docstring_report = evaluate_pred_df(docstring_pred_df, target_feats, is_nl=True)

In [None]:
for key, val in docstring_report.items():
    if type(val) != list and len(val.shape) == 0:
        print(key, ":", val)

## codet5_baseline_nl_comment_preds

In [None]:
pred_csvfile = "codet5_baseline_nl_comment_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["comment"]

In [None]:
comment_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', dropna=True, is_exclude_same_io=True)
comment_codebleu

In [None]:
comment_pred_df = pd.read_csv(file_name)

In [None]:
# Some rows contain NaN (seen in preds; cause not yet investigated), so drop them before evaluating.
comment_pred_df = comment_pred_df.dropna()

In [None]:
comment_report = evaluate_pred_df(comment_pred_df, target_feats, is_nl=True)

In [None]:
for key, val in comment_report.items():
    if type(val) != list and len(val.shape) == 0:
        print(key, ":", val)

# Baseline Short

In [13]:
# baseline nl
folder = "seq2seq_results"
model_name = "combined_nl_prompt_base_features_baseline_codet5small"
ckpt = "checkpoint-144856"

## casing

In [66]:
pred_csvfile = "codet5_casing_short_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["casing"]

In [67]:
casing_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', is_exclude_same_io=True)
casing_codebleu

{'ngram': 1.517511475212327e-05,
 'weighted_ngram': 5.843949724229512e-05,
 'syntax_match': 0.09198491911353814,
 'dataflow_match': 0.010240176351472586,
 'code_bleu': 0.025574677519251285}

In [68]:
casing_pred_df = pd.read_csv(file_name)

In [69]:
casing_report = evaluate_pred_df(casing_pred_df, target_feats, is_nl=True)

  0%|          | 0/1999 [00:00<?, ?it/s]



In [70]:
for key, val in casing_report.items():
    if type(val) != list and len(val.shape) == 0:
        print(key, ":", val)

codebleu_perfect : 0.0
codebleu_above_90 : 0.0
diff_bleu_avg : 2.5276902502119783e-235
diff_bleu_perfect : 0.0
diff_bleu_above_90 : 0.0
parse_test_accuracy : 0.0005002501250625312


## class

In [71]:
pred_csvfile = "codet5_class_short_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["class"]

In [72]:
class_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', is_exclude_same_io=True)
class_codebleu

{'ngram': 7.24059655746018e-06,
 'weighted_ngram': 2.9984021021420193e-05,
 'syntax_match': 0.10043044823411601,
 'dataflow_match': 0.010553449970011357,
 'code_bleu': 0.02775528070542656}

In [73]:
class_pred_df = pd.read_csv(file_name)

In [74]:
class_report = evaluate_pred_df(class_pred_df, target_feats, is_nl=True)

  0%|          | 0/2001 [00:00<?, ?it/s]



In [75]:
for key, val in class_report.items():
    if type(val) != list and len(val.shape) == 0:
        print(key, ":", val)

codebleu_perfect : 0.0
codebleu_above_90 : 0.0
diff_bleu_avg : 1.515098293905171e-234
diff_bleu_perfect : 0.0
diff_bleu_above_90 : 0.0
parse_test_accuracy : 0.0


## list_comp

In [76]:
pred_csvfile = "codet5_list_comp_short_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["list_comp"]

In [77]:
list_comp_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', is_exclude_same_io=True)
list_comp_codebleu

{'ngram': 1.4727286449988927e-05,
 'weighted_ngram': 4.1436510129058734e-05,
 'syntax_match': 0.07192252844621323,
 'dataflow_match': 0.013202695423268861,
 'code_bleu': 0.021295346916515287}

In [78]:
list_comp_pred_df = pd.read_csv(file_name)
list_comp_pred_df = exclude_same_io(list_comp_pred_df)

In [79]:
list_comp_report = evaluate_pred_df(list_comp_pred_df, target_feats, is_nl=True)

  0%|          | 0/1687 [00:00<?, ?it/s]

In [80]:
for key, val in list_comp_report.items():
    if type(val) != list and len(val.shape) == 0:
        print(key, ":", val)

codebleu_perfect : 0.0
codebleu_above_90 : 0.0
diff_bleu_avg : 0.0
diff_bleu_perfect : 0.0
diff_bleu_above_90 : 0.0
parse_test_accuracy : 0.0


## docstring

In [81]:
pred_csvfile = "codet5_docstring_short_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["docstring"]

In [82]:
docstring_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', is_exclude_same_io=True)
docstring_codebleu

{'ngram': 1.2793149695384782e-05,
 'weighted_ngram': 4.07234396033062e-05,
 'syntax_match': 0.09751385185303463,
 'dataflow_match': 0.008874278482575547,
 'code_bleu': 0.026610411731227218}

In [83]:
docstring_pred_df = pd.read_csv(file_name)

In [84]:
docstring_report = evaluate_pred_df(docstring_pred_df, target_feats, is_nl=True)

  0%|          | 0/2001 [00:00<?, ?it/s]



In [85]:
for key, val in docstring_report.items():
    if type(val) != list and len(val.shape) == 0:
        print(key, ":", val)

codebleu_perfect : 0.0
codebleu_above_90 : 0.0
diff_bleu_avg : 3.429426043763702e-234
diff_bleu_perfect : 0.0
diff_bleu_above_90 : 0.0
docstr_text_scores_avg : 0.0
docstr_text_scores_perfect : 0.0
docstr_text_scores_above_90 : 0.0
parse_test_accuracy : 0.0


## comment

In [86]:
pred_csvfile = "codet5_comment_short_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["comment"]

In [87]:
comment_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', dropna=True, is_exclude_same_io=True)
comment_codebleu

{'ngram': 8.261377779406627e-06,
 'weighted_ngram': 2.575801296555232e-05,
 'syntax_match': 0.11844219680499102,
 'dataflow_match': 0.0064500124038700075,
 'code_bleu': 0.031231557149901498}

In [88]:
comment_pred_df = pd.read_csv(file_name)

In [89]:
# Some rows contain NaN (seen in preds; cause not yet investigated), so drop them before evaluating.
comment_pred_df = comment_pred_df.dropna()

In [90]:
comment_report = evaluate_pred_df(comment_pred_df, target_feats, is_nl=True)

  0%|          | 0/2001 [00:00<?, ?it/s]



In [91]:
for key, val in comment_report.items():
    if type(val) != list and len(val.shape) == 0:
        print(key, ":", val)

codebleu_perfect : 0.0
codebleu_above_90 : 0.0
diff_bleu_avg : 2.199801032708017e-82
diff_bleu_perfect : 0.0
diff_bleu_above_90 : 0.0
comment_text_scores_avg : 0.0
comment_text_scores_perfect : 0.0
comment_text_scores_above_90 : 0.0
parse_test_accuracy : 0.0


# Baseline Combine short

## docstring + comment

In [92]:
pred_csvfile = "codet5_baseline_short_docstring_comment_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["comment", "docstring"]

In [93]:
codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', dropna=True, is_exclude_same_io=True)
codebleu

{'ngram': 1.6281791191374666e-05,
 'weighted_ngram': 2.731580852067673e-05,
 'syntax_match': 0.13656164888275624,
 'dataflow_match': 0.004070646264817418,
 'code_bleu': 0.03516897318682143}

In [94]:
pred_df = pd.read_csv(file_name)

In [95]:
# Some rows contain NaN (seen in preds; cause not yet investigated), so drop them before evaluating.
pred_df = pred_df.dropna()

In [96]:
report = evaluate_pred_df(pred_df, target_feats, is_nl=True)

  0%|          | 0/2000 [00:00<?, ?it/s]



In [97]:
for key, val in report.items():
    if type(val) != list and len(val.shape) == 0:
        print(key, ":", val)

codebleu_perfect : 0.0
codebleu_above_90 : 0.0
diff_bleu_avg : 4.533343429250006e-82
diff_bleu_perfect : 0.0
diff_bleu_above_90 : 0.0
docstr_text_scores_avg : 0.0
docstr_text_scores_perfect : 0.0
docstr_text_scores_above_90 : 0.0
comment_text_scores_avg : 0.0
comment_text_scores_perfect : 0.0
comment_text_scores_above_90 : 0.0
parse_test_accuracy : 0.0


## class + casing

In [98]:
pred_csvfile = "codet5_baseline_short_class_casing_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["class", "casing"]

In [99]:
codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', dropna=True, is_exclude_same_io=True)
codebleu

{'ngram': 6.2734127420795734e-06,
 'weighted_ngram': 2.3968916501912857e-05,
 'syntax_match': 0.09180456781011374,
 'dataflow_match': 0.01088815551754872,
 'code_bleu': 0.025680741414226615}

In [100]:
pred_df = pd.read_csv(file_name)

In [101]:
# Some rows contain NaN (seen in preds; cause not yet investigated), so drop them before evaluating.
pred_df = pred_df.dropna()

In [102]:
report = evaluate_pred_df(pred_df, target_feats, is_nl=True)

  0%|          | 0/2002 [00:00<?, ?it/s]



In [103]:
for key, val in report.items():
    if type(val) != list and len(val.shape) == 0:
        print(key, ":", val)

codebleu_perfect : 0.0
codebleu_above_90 : 0.0
diff_bleu_avg : 1.514341501550573e-234
diff_bleu_perfect : 0.0
diff_bleu_above_90 : 0.0
parse_test_accuracy : 0.0


## list_comp + class + casing

In [104]:
pred_csvfile = "codet5_baseline_short_list_comp_class_casing_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["class", "casing", "list_comp"]

In [105]:
codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', dropna=True, is_exclude_same_io=True)
codebleu

{'ngram': 1.0648014970577506e-05,
 'weighted_ngram': 2.6693105168194964e-05,
 'syntax_match': 0.060325011493333865,
 'dataflow_match': 0.014472209401991166,
 'code_bleu': 0.01870864050386595}

In [106]:
pred_df = pd.read_csv(file_name)

In [107]:
# Some rows contain NaN (seen in preds; cause not yet investigated), so drop them before evaluating.
pred_df = pred_df.dropna()

In [108]:
report = evaluate_pred_df(pred_df, target_feats, is_nl=True)

  0%|          | 0/468 [00:00<?, ?it/s]

In [109]:
for key, val in report.items():
    if type(val) != list and len(val.shape) == 0:
        print(key, ":", val)

codebleu_perfect : 0.0
codebleu_above_90 : 0.0
diff_bleu_avg : 0.0
diff_bleu_perfect : 0.0
diff_bleu_above_90 : 0.0
parse_test_accuracy : 0.0


## list_comp + class + casing + docstring + comment

In [110]:
pred_csvfile = "codet5_baseline_short_docstring_list_comp_class_casing_comment_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["list_comp", "casing", "class", "docstring", "comment"]

In [111]:
all_feat_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25')
all_feat_codebleu

{'ngram': 7.810693399227178e-05,
 'weighted_ngram': 0.0001236303364247739,
 'syntax_match': 0.09545068027210885,
 'dataflow_match': 0.004863644259162758,
 'code_bleu': 0.025129015450422162}

In [112]:
all_pred_df = pd.read_csv(file_name)

In [113]:
all_report = evaluate_pred_df(all_pred_df, target_feats, is_nl=True)

  0%|          | 0/98 [00:00<?, ?it/s]

In [114]:
for key, val in all_report.items():
    if type(val) != list and len(val.shape) == 0:
        print(key, ":", val)

codebleu_perfect : 0.0
codebleu_above_90 : 0.0
diff_bleu_avg : 1.8352065729549962e-157
diff_bleu_perfect : 0.0
diff_bleu_above_90 : 0.0
docstr_text_scores_avg : 0.0
docstr_text_scores_perfect : 0.0
docstr_text_scores_above_90 : 0.0
comment_text_scores_avg : 0.0
comment_text_scores_perfect : 0.0
comment_text_scores_above_90 : 0.0
parse_test_accuracy : 0.0


# =====================SPLIT LINE=====================

In [None]:
pred_csvfile = "codet5_finetuned_nl_casing_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["casing"]

In [None]:
casing_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25')
casing_codebleu

In [None]:
casing_pred_df = pd.read_csv(file_name)

In [None]:
casing_report = evaluate_pred_df(casing_pred_df, target_feats, is_nl=True)

In [None]:
for key, val in casing_report.items():
    if type(val) != list and len(val.shape) == 0:
        print(key, ":", val)

In [None]:
# NOTE(review): this cell reads all_feat_pred_df / all_feat_report, which are
# not assigned anywhere in the casing cells directly above; when the notebook
# is run top to bottom they still hold the values from the earlier
# "tok_type_4" all-feature evaluation. Confirm that is the intended data.
# Dumps every example whose diff BLEU is at least 0.14.
total = len(all_feat_pred_df)
for idx in range(total):
    if all_feat_report["diff_bleu"][idx] >= 0.14:
        print_split_line(f"{idx}-input")
        print(all_feat_report["inputs"][idx])
        print_split_line(f"{idx}-prediction")
        print(all_feat_report["preds"][idx])
        print_split_line(f"{idx}-gold labels")
        print(all_feat_report["labels"][idx])
        print_split_line(f"{idx}-score")
        print(all_feat_report["diff_bleu"][idx])

In [None]:
# finetuned codet5small
folder = "seq2seq_results"
model_name = "combined_base_features_updated_mask_codet5small"
ckpt = "checkpoint-97000"

In [None]:
# all feat
pred_csvfile = "codet5_finetuned_all_feat_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["docstring", "list_comp", "class", "casing", "comment"]

In [None]:
all_feat_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25')

In [None]:
all_feat_codebleu

In [None]:

all_feat_pred_df = pd.read_csv(file_name)

In [None]:
all_feat_report = evaluate_pred_df(all_feat_pred_df, target_feats)

In [None]:
all_feat_report["diff_bleu_avg"]

In [None]:
max(all_feat_report["diff_bleu"])

In [None]:
total = len(all_feat_pred_df)
for idx in range(total):
    if all_feat_report["diff_bleu"][idx] >= 0.3:
        print_split_line(f"{idx}-input")
        print(all_feat_report["inputs"][idx])
        print_split_line(f"{idx}-prediction")
        print(all_feat_report["preds"][idx])
        print_split_line(f"{idx}-gold labels")
        print(all_feat_report["labels"][idx])
        print_split_line(f"{idx}-score")
        print(all_feat_report["diff_bleu"][idx])

In [None]:
# casing
pred_csvfile = "codet5_finetuned_casing_preds_downsized.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["casing"]

In [None]:
casing_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25')
casing_codebleu

In [None]:

casing_pred_df = pd.read_csv(file_name)

In [None]:
casing_report = evaluate_pred_df(casing_pred_df, target_feats)

In [None]:
# The report has no "code_bleu" key (that lookup raised KeyError): the
# per-example CodeBLEU results are stored under "codebleu", each one a dict
# with a "code_bleu" entry. Show their mean, mirroring "diff_bleu_avg" below.
np.mean([score["code_bleu"] for score in casing_report["codebleu"]])

In [None]:
casing_report["diff_bleu_avg"]

In [None]:
# Dump input / prediction / gold label / score for every casing example whose
# diff-BLEU lies in [0, 0.1).
total = len(casing_pred_df)
for idx in range(total):
    score = casing_report["diff_bleu"][idx]
    # Negative and NaN scores fail the range test and are skipped.
    if not (0 <= score < 0.1):
        continue
    for title, column in (("input", "inputs"), ("prediction", "preds"), ("gold labels", "labels")):
        print_split_line(f"{idx}-{title}")
        print(casing_report[column][idx])
    print_split_line(f"{idx}-score")
    print(score)

In [None]:
# class
pred_csvfile = "codet5_finetuned_class_preds_downsized.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["class"]

In [None]:

class_pred_df = pd.read_csv(file_name)

In [None]:
class_report = evaluate_pred_df(class_pred_df, target_feats)

In [None]:
class_report["diff_bleu_avg"]

In [None]:
# list comp
pred_csvfile = "codet5_finetuned_list_comp_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["list_comp"]

In [None]:

comp_pred_df = pd.read_csv(file_name)

In [None]:
comp_report = evaluate_pred_df(comp_pred_df, target_feats)

In [None]:
comp_report["diff_bleu_avg"]

In [None]:
comp_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25')
comp_codebleu

In [None]:
# baseline codet5small
folder = "seq2seq_results"
model_name = "combined_base_features_baseline_codet5small"
ckpt = "checkpoint-115882"

In [None]:
# casing
pred_csvfile = "baseline_casing_preds_downsized.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["casing"]

In [None]:
casing_pred_df = pd.read_csv(file_name)

In [None]:
casing_report = evaluate_pred_df(casing_pred_df, target_feats)

In [None]:
casing_report["diff_bleu_avg"]

In [None]:
casing_report

In [None]:
# Dump input / prediction / gold label / score for every baseline casing
# example whose diff-BLEU lies in [0, 0.1).
total = len(casing_pred_df)
for idx in range(total):
    score = casing_report["diff_bleu"][idx]
    # Negative and NaN scores fail the range test and are skipped.
    if not (0 <= score < 0.1):
        continue
    for title, column in (("input", "inputs"), ("prediction", "preds"), ("gold labels", "labels")):
        print_split_line(f"{idx}-{title}")
        print(casing_report[column][idx])
    print_split_line(f"{idx}-score")
    print(score)

In [None]:
# class
pred_csvfile = "baseline_class_preds_downsized.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["class"]

In [None]:

class_pred_df = pd.read_csv(file_name)

In [None]:
class_report = evaluate_pred_df(class_pred_df, target_feats)

In [None]:
class_report["diff_bleu_avg"]

# CodeT5 Finetuned Contd

In [None]:
# combined_nl_prompt_base_features_contd_codet5small
folder = "seq2seq_results"
model_name = "combined_nl_prompt_base_features_contd_codet5small"
ckpt = "checkpoint-144856"

## codet5_finetuned_nl_tok_type_4_contd_codet5small_docstring_list_comp_class_casing_comment_preds

In [None]:
pred_csvfile = "codet5_finetuned_nl_tok_type_4_contd_codet5small_docstring_list_comp_class_casing_comment_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["list_comp", "casing", "class", "docstring", "comment"]

In [None]:
all_feat_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25')
all_feat_codebleu

In [None]:
all_pred_df = pd.read_csv(file_name)

In [None]:
all_report = evaluate_pred_df(all_pred_df, target_feats, is_nl=True)

In [None]:
# Print only the scalar summary metrics; list-valued and array-like entries
# are skipped. np.ndim also accepts plain Python numbers, which have no
# `.shape` attribute.
for key, val in all_report.items():
    if isinstance(val, list):
        continue
    if np.ndim(val) == 0:
        print(key, ":", val)

## codet5_finetuned_nl_casing_preds

In [None]:
pred_csvfile = "codet5_finetuned_contd_nl_casing_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["casing"]

In [None]:
casing_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25')
casing_codebleu

In [None]:
casing_pred_df = pd.read_csv(file_name)

In [None]:
casing_report = evaluate_pred_df(casing_pred_df, target_feats, is_nl=True)

In [None]:
# Print only the scalar summary metrics; list-valued and array-like entries
# are skipped. np.ndim also accepts plain Python numbers, which have no
# `.shape` attribute.
for key, val in casing_report.items():
    if isinstance(val, list):
        continue
    if np.ndim(val) == 0:
        print(key, ":", val)

## codet5_finetuned_nl_class_preds

In [None]:
pred_csvfile = "codet5_finetuned_contd_nl_class_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["class"]

In [None]:
class_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25')
class_codebleu

In [None]:
class_pred_df = pd.read_csv(file_name)

In [None]:
class_report = evaluate_pred_df(class_pred_df, target_feats, is_nl=True)

In [None]:
# Print only the scalar summary metrics; list-valued and array-like entries
# are skipped. np.ndim also accepts plain Python numbers, which have no
# `.shape` attribute.
for key, val in class_report.items():
    if isinstance(val, list):
        continue
    if np.ndim(val) == 0:
        print(key, ":", val)

## codet5_finetuned_nl_list_comp_preds

In [None]:
pred_csvfile = "codet5_finetuned_contd_nl_list_comp_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["list_comp"]

In [None]:
list_comp_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', is_exclude_same_io=True)
list_comp_codebleu

In [None]:
list_comp_pred_df = pd.read_csv(file_name)
list_comp_pred_df = exclude_same_io(list_comp_pred_df)

In [None]:
list_comp_report = evaluate_pred_df(list_comp_pred_df, target_feats, is_nl=True)

In [None]:
# Print only the scalar summary metrics; list-valued and array-like entries
# are skipped. np.ndim also accepts plain Python numbers, which have no
# `.shape` attribute.
for key, val in list_comp_report.items():
    if isinstance(val, list):
        continue
    if np.ndim(val) == 0:
        print(key, ":", val)

In [None]:
# Dump input / prediction / gold label / score for every list-comprehension
# example whose diff-BLEU lies in [0, 0.3).
# NOTE(review): list_comp_pred_df was filtered with exclude_same_io, which
# drops rows without resetting the index. If the report columns are
# index-labelled Series rather than lists, the positional `idx` lookups below
# may not line up -- confirm against evaluate_pred_df.
total = len(list_comp_pred_df)
for idx in range(total):
    score = list_comp_report["diff_bleu"][idx]
    # Negative and NaN scores fail the range test and are skipped.
    if not (0. <= score < 0.3):
        continue
    for title, column in (("input", "inputs"), ("prediction", "preds"), ("gold labels", "labels")):
        print_split_line(f"{idx}-{title}")
        print(list_comp_report[column][idx])
    print_split_line(f"{idx}-score")
    print(score)

## codet5_finetuned_nl_docstring_preds

In [None]:
pred_csvfile = "codet5_finetuned_contd_nl_docstring_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["docstring"]

In [None]:
docstring_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25')
docstring_codebleu

In [None]:
docstring_pred_df = pd.read_csv(file_name)

In [None]:
docstring_report = evaluate_pred_df(docstring_pred_df, target_feats, is_nl=True)

In [None]:
# Print only the scalar summary metrics; list-valued and array-like entries
# are skipped. np.ndim also accepts plain Python numbers, which have no
# `.shape` attribute.
for key, val in docstring_report.items():
    if isinstance(val, list):
        continue
    if np.ndim(val) == 0:
        print(key, ":", val)

## codet5_finetuned_nl_comment_preds

In [None]:
pred_csvfile = "codet5_finetuned_contd_nl_comment_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["comment"]

In [None]:
comment_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', dropna=True, is_exclude_same_io=True)
comment_codebleu

In [None]:
comment_pred_df = pd.read_csv(file_name)

In [None]:
# Drop every row that has a NaN in any column. The NaNs in `preds` are
# presumably empty predictions, which pd.read_csv parses as NaN by default
# -- TODO confirm.
# NOTE(review): the CodeBLEU call for this file also passed
# is_exclude_same_io=True; this frame is not filtered that way, so the two
# metrics may cover different rows -- confirm this is intended.
comment_pred_df = comment_pred_df.dropna()

In [None]:
comment_report = evaluate_pred_df(comment_pred_df, target_feats, is_nl=True)

In [None]:
# Print only the scalar summary metrics; list-valued and array-like entries
# are skipped. np.ndim also accepts plain Python numbers, which have no
# `.shape` attribute.
for key, val in comment_report.items():
    if isinstance(val, list):
        continue
    if np.ndim(val) == 0:
        print(key, ":", val)

# CodeT5 Finetuned Contd Short (4 epochs)

In [None]:
# combined_nl_prompt_base_features_contd_codet5small
folder = "seq2seq_results"
model_name = "combined_nl_prompt_base_features_contd_codet5small"
ckpt = "checkpoint-144856"

## codet5_finetuned_nl_casing_preds

In [None]:
pred_csvfile = "codet5_finetuned_short_contd_casing_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["casing"]

In [None]:
casing_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25')
casing_codebleu

In [None]:
casing_pred_df = pd.read_csv(file_name)

In [None]:
casing_report = evaluate_pred_df(casing_pred_df, target_feats, is_nl=True, parse_test=True)

In [None]:
# Print only the scalar summary metrics; list-valued and array-like entries
# are skipped. np.ndim also accepts plain Python numbers, which have no
# `.shape` attribute.
for key, val in casing_report.items():
    if isinstance(val, list):
        continue
    if np.ndim(val) == 0:
        print(key, ":", val)

## codet5_finetuned_nl_class_preds

In [None]:
pred_csvfile = "codet5_finetuned_short_contd_class_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["class"]

In [None]:
class_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25')
class_codebleu

In [None]:
class_pred_df = pd.read_csv(file_name)

In [None]:
class_report = evaluate_pred_df(class_pred_df, target_feats, is_nl=True, parse_test=True)

In [None]:
# Print only the scalar summary metrics; list-valued and array-like entries
# are skipped. np.ndim also accepts plain Python numbers, which have no
# `.shape` attribute.
for key, val in class_report.items():
    if isinstance(val, list):
        continue
    if np.ndim(val) == 0:
        print(key, ":", val)

## codet5_finetuned_nl_list_comp_preds

In [None]:
pred_csvfile = "codet5_finetuned_short_contd_list_comp_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["list_comp"]

In [None]:
list_comp_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', is_exclude_same_io=True)
list_comp_codebleu

In [None]:
list_comp_pred_df = pd.read_csv(file_name)
list_comp_pred_df = exclude_same_io(list_comp_pred_df)

In [None]:
list_comp_report = evaluate_pred_df(list_comp_pred_df, target_feats, is_nl=True, parse_test=True)

In [None]:
# Print only the scalar summary metrics; list-valued and array-like entries
# are skipped. np.ndim also accepts plain Python numbers, which have no
# `.shape` attribute.
for key, val in list_comp_report.items():
    if isinstance(val, list):
        continue
    if np.ndim(val) == 0:
        print(key, ":", val)

## codet5_finetuned_nl_docstring_preds

In [None]:
pred_csvfile = "codet5_finetuned_short_contd_docstring_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["docstring"]

In [None]:
docstring_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25')
docstring_codebleu

In [None]:
docstring_pred_df = pd.read_csv(file_name)

In [None]:
docstring_report = evaluate_pred_df(docstring_pred_df, target_feats, is_nl=True, parse_test=True)

In [None]:
# Print only the scalar summary metrics; list-valued and array-like entries
# are skipped. np.ndim also accepts plain Python numbers, which have no
# `.shape` attribute.
for key, val in docstring_report.items():
    if isinstance(val, list):
        continue
    if np.ndim(val) == 0:
        print(key, ":", val)

## codet5_finetuned_nl_comment_preds

In [None]:
pred_csvfile = "codet5_finetuned_short_contd_comment_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["comment"]

In [None]:
comment_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', dropna=True, is_exclude_same_io=True)
comment_codebleu

In [None]:
comment_pred_df = pd.read_csv(file_name)

In [None]:
# Drop every row that has a NaN in any column. The NaNs in `preds` are
# presumably empty predictions, which pd.read_csv parses as NaN by default
# -- TODO confirm.
# NOTE(review): the CodeBLEU call for this file also passed
# is_exclude_same_io=True; this frame is not filtered that way, so the two
# metrics may cover different rows -- confirm this is intended.
comment_pred_df = comment_pred_df.dropna()

In [None]:
comment_report = evaluate_pred_df(comment_pred_df, target_feats, is_nl=True, parse_test=True)

In [None]:
# Print only the scalar summary metrics; list-valued and array-like entries
# are skipped. np.ndim also accepts plain Python numbers, which have no
# `.shape` attribute.
for key, val in comment_report.items():
    if isinstance(val, list):
        continue
    if np.ndim(val) == 0:
        print(key, ":", val)

# CodeT5 Finetuned Contd Combine Epoch 4

## codet5_finetuned_contd_nl_docstring_comment_preds.csv

In [None]:
pred_csvfile = "codet5_finetuned_contd_nl_docstring_comment_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["comment", "docstring"]

In [None]:
codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', dropna=True, is_exclude_same_io=True)
codebleu

In [None]:
pred_df = pd.read_csv(file_name)

In [None]:
# Drop every row that has a NaN in any column. The NaNs in `preds` are
# presumably empty predictions, which pd.read_csv parses as NaN by default
# -- TODO confirm.
# NOTE(review): the CodeBLEU call for this file also passed
# is_exclude_same_io=True; this frame is not filtered that way, so the two
# metrics may cover different rows -- confirm this is intended.
pred_df = pred_df.dropna()

In [None]:
report = evaluate_pred_df(pred_df, target_feats, is_nl=True)

In [None]:
# Print only the scalar summary metrics; list-valued and array-like entries
# are skipped. np.ndim also accepts plain Python numbers, which have no
# `.shape` attribute.
for key, val in report.items():
    if isinstance(val, list):
        continue
    if np.ndim(val) == 0:
        print(key, ":", val)

## codet5_finetuned_contd_nl_tok_type_4_class_casing_preds.csv

In [None]:
pred_csvfile = "codet5_finetuned_contd_nl_tok_type_4_class_casing_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["class", "casing"]

In [None]:
codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', dropna=True, is_exclude_same_io=True)
codebleu

In [None]:
pred_df = pd.read_csv(file_name)

In [None]:
# Drop every row that has a NaN in any column. The NaNs in `preds` are
# presumably empty predictions, which pd.read_csv parses as NaN by default
# -- TODO confirm.
# NOTE(review): the CodeBLEU call for this file also passed
# is_exclude_same_io=True; this frame is not filtered that way, so the two
# metrics may cover different rows -- confirm this is intended.
pred_df = pred_df.dropna()

In [None]:
report = evaluate_pred_df(pred_df, target_feats, is_nl=True)

In [None]:
# Print only the scalar summary metrics; list-valued and array-like entries
# are skipped. np.ndim also accepts plain Python numbers, which have no
# `.shape` attribute.
for key, val in report.items():
    if isinstance(val, list):
        continue
    if np.ndim(val) == 0:
        print(key, ":", val)

## codet5_finetuned_contd_nl_tok_type_4_list_comp_class_casing_preds.csv

In [None]:
pred_csvfile = "codet5_finetuned_contd_nl_tok_type_4_list_comp_class_casing_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["class", "casing", "list_comp"]

In [None]:
codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', dropna=True, is_exclude_same_io=True)
codebleu

In [None]:
pred_df = pd.read_csv(file_name)

In [None]:
# Drop every row that has a NaN in any column. The NaNs in `preds` are
# presumably empty predictions, which pd.read_csv parses as NaN by default
# -- TODO confirm.
# NOTE(review): the CodeBLEU call for this file also passed
# is_exclude_same_io=True; this frame is not filtered that way, so the two
# metrics may cover different rows -- confirm this is intended.
pred_df = pred_df.dropna()

In [None]:
report = evaluate_pred_df(pred_df, target_feats, is_nl=True)

In [None]:
# Print only the scalar summary metrics; list-valued and array-like entries
# are skipped. np.ndim also accepts plain Python numbers, which have no
# `.shape` attribute.
for key, val in report.items():
    if isinstance(val, list):
        continue
    if np.ndim(val) == 0:
        print(key, ":", val)

In [None]:
lookup_examples(report, 1, 0.2, metric="diff_bleu", start_idx=0, count="all")

# CodeT5 Finetuned Contd Combine Short Epoch 4

```text
-rw-r--r-- 1 cting3 grads    274746 Aug 17 19:50 codet5_finetuned_short_contd_docstring_list_comp_class_casing_comment_preds.csv
-rw-r--r-- 1 cting3 grads   5356738 Aug 17 19:50 codet5_finetuned_short_contd_docstring_comment_preds.csv
-rw-r--r-- 1 cting3 grads   1321747 Aug 17 19:41 codet5_finetuned_short_contd_list_comp_class_casing_preds.csv
-rw-r--r-- 1 cting3 grads   4241306 Aug 17 19:39 codet5_finetuned_short_contd_class_casing_preds.csv
```

In [None]:
# combined_nl_prompt_base_features_contd_codet5small
folder = "seq2seq_results"
model_name = "combined_nl_prompt_base_features_contd_codet5small"
ckpt = "checkpoint-144856"

## docstring + list_comp + class + casing + comment

In [None]:
# This file holds predictions for the five-feature combination
# (docstring + list_comp + class + casing + comment), so request all five
# features, as the other all-feature runs in this notebook do. The previous
# value listed only "comment" and "docstring".
pred_csvfile = "codet5_finetuned_short_contd_docstring_list_comp_class_casing_comment_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["list_comp", "casing", "class", "docstring", "comment"]

In [None]:
codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', dropna=True, is_exclude_same_io=True)
codebleu

In [None]:
pred_df = pd.read_csv(file_name)

In [None]:
# Drop every row that has a NaN in any column. The NaNs in `preds` are
# presumably empty predictions, which pd.read_csv parses as NaN by default
# -- TODO confirm.
# NOTE(review): the CodeBLEU call for this file also passed
# is_exclude_same_io=True; this frame is not filtered that way, so the two
# metrics may cover different rows -- confirm this is intended.
pred_df = pred_df.dropna()

In [None]:
report = evaluate_pred_df(pred_df, target_feats, is_nl=True, parse_test=True)

In [None]:
# Print only the scalar summary metrics; list-valued and array-like entries
# are skipped. np.ndim also accepts plain Python numbers, which have no
# `.shape` attribute.
for key, val in report.items():
    if isinstance(val, list):
        continue
    if np.ndim(val) == 0:
        print(key, ":", val)

In [None]:
lookup_examples(report, 1, 0, metric="diff_bleu")

## docstring + comment

In [None]:
pred_csvfile = "codet5_finetuned_short_contd_docstring_comment_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["comment", "docstring"]

In [None]:
codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', dropna=True, is_exclude_same_io=True)
codebleu

In [None]:
pred_df = pd.read_csv(file_name)

In [None]:
# Drop every row that has a NaN in any column. The NaNs in `preds` are
# presumably empty predictions, which pd.read_csv parses as NaN by default
# -- TODO confirm.
# NOTE(review): the CodeBLEU call for this file also passed
# is_exclude_same_io=True; this frame is not filtered that way, so the two
# metrics may cover different rows -- confirm this is intended.
pred_df = pred_df.dropna()

In [None]:
report = evaluate_pred_df(pred_df, target_feats, is_nl=True, parse_test=True)

In [None]:
# Print only the scalar summary metrics; list-valued and array-like entries
# are skipped. np.ndim also accepts plain Python numbers, which have no
# `.shape` attribute.
for key, val in report.items():
    if isinstance(val, list):
        continue
    if np.ndim(val) == 0:
        print(key, ":", val)

## class+casing

In [None]:
pred_csvfile = "codet5_finetuned_short_contd_class_casing_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["class", "casing"]

In [None]:
codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', dropna=True, is_exclude_same_io=True)
codebleu

In [None]:
pred_df = pd.read_csv(file_name)

In [None]:
# Drop every row that has a NaN in any column. The NaNs in `preds` are
# presumably empty predictions, which pd.read_csv parses as NaN by default
# -- TODO confirm.
# NOTE(review): the CodeBLEU call for this file also passed
# is_exclude_same_io=True; this frame is not filtered that way, so the two
# metrics may cover different rows -- confirm this is intended.
pred_df = pred_df.dropna()

In [None]:
report = evaluate_pred_df(pred_df, target_feats, is_nl=True, parse_test=True)

In [None]:
# Print only the scalar summary metrics; list-valued and array-like entries
# are skipped. np.ndim also accepts plain Python numbers, which have no
# `.shape` attribute.
for key, val in report.items():
    if isinstance(val, list):
        continue
    if np.ndim(val) == 0:
        print(key, ":", val)

In [None]:
lookup_examples(report, 1.1, 0.8, metric="diff_bleu")

In [None]:
lookup_examples(report, 0.8, 0, metric="diff_bleu")

## list_comp+class+casing

In [None]:
pred_csvfile = "codet5_finetuned_short_contd_list_comp_class_casing_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["class", "casing", "list_comp"]

In [None]:
codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', dropna=True, is_exclude_same_io=True)
codebleu

In [None]:
pred_df = pd.read_csv(file_name)

In [None]:
# Drop every row that has a NaN in any column. The NaNs in `preds` are
# presumably empty predictions, which pd.read_csv parses as NaN by default
# -- TODO confirm.
# NOTE(review): the CodeBLEU call for this file also passed
# is_exclude_same_io=True; this frame is not filtered that way, so the two
# metrics may cover different rows -- confirm this is intended.
pred_df = pred_df.dropna()

In [None]:
report = evaluate_pred_df(pred_df, target_feats, is_nl=True, parse_test=True)

In [None]:
# Print only the scalar summary metrics; list-valued and array-like entries
# are skipped. np.ndim also accepts plain Python numbers, which have no
# `.shape` attribute.
for key, val in report.items():
    if isinstance(val, list):
        continue
    if np.ndim(val) == 0:
        print(key, ":", val)

In [None]:
lookup_examples(report, 1.1, 0, metric="diff_bleu")

# CodeT5 Finetuned Contd Short 8 epoch

In [None]:
# combined_nl_prompt_base_features_contd_codet5small
folder = "seq2seq_results"
model_name = "combined_nl_prompt_base_features_contd_codet5small"
ckpt = "checkpoint-434568"

## codet5_finetuned_nl_casing_preds

In [None]:
pred_csvfile = "codet5_finetuned_short_contd_casing_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["casing"]

In [None]:
casing_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25')
casing_codebleu

In [None]:
casing_pred_df = pd.read_csv(file_name)

In [None]:
casing_report = evaluate_pred_df(casing_pred_df, target_feats, is_nl=True, parse_test=True)

In [None]:
# Print only the scalar summary metrics; list-valued and array-like entries
# are skipped. np.ndim also accepts plain Python numbers, which have no
# `.shape` attribute.
for key, val in casing_report.items():
    if isinstance(val, list):
        continue
    if np.ndim(val) == 0:
        print(key, ":", val)

## codet5_finetuned_nl_class_preds

In [None]:
pred_csvfile = "codet5_finetuned_short_contd_class_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["class"]

In [None]:
class_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25')
class_codebleu

In [None]:
class_pred_df = pd.read_csv(file_name)

In [None]:
class_report = evaluate_pred_df(class_pred_df, target_feats, is_nl=True, parse_test=True)

In [None]:
# Print only the scalar summary metrics; list-valued and array-like entries
# are skipped. np.ndim also accepts plain Python numbers, which have no
# `.shape` attribute.
for key, val in class_report.items():
    if isinstance(val, list):
        continue
    if np.ndim(val) == 0:
        print(key, ":", val)

## codet5_finetuned_nl_list_comp_preds

In [None]:
pred_csvfile = "codet5_finetuned_short_contd_list_comp_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["list_comp"]

In [None]:
list_comp_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', is_exclude_same_io=True)
list_comp_codebleu

In [None]:
list_comp_pred_df = pd.read_csv(file_name)
list_comp_pred_df = exclude_same_io(list_comp_pred_df)

In [None]:
list_comp_report = evaluate_pred_df(list_comp_pred_df, target_feats, is_nl=True, parse_test=True)

In [None]:
# Print only the scalar summary metrics; list-valued and array-like entries
# are skipped. np.ndim also accepts plain Python numbers, which have no
# `.shape` attribute.
for key, val in list_comp_report.items():
    if isinstance(val, list):
        continue
    if np.ndim(val) == 0:
        print(key, ":", val)

## codet5_finetuned_nl_docstring_preds

In [None]:
pred_csvfile = "codet5_finetuned_short_contd_docstring_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["docstring"]

In [None]:
docstring_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25')
docstring_codebleu

In [None]:
docstring_pred_df = pd.read_csv(file_name)

In [None]:
docstring_report = evaluate_pred_df(docstring_pred_df, target_feats, is_nl=True, parse_test=True)

In [None]:
# Print only the scalar summary metrics; list-valued and array-like entries
# are skipped. np.ndim also accepts plain Python numbers, which have no
# `.shape` attribute.
for key, val in docstring_report.items():
    if isinstance(val, list):
        continue
    if np.ndim(val) == 0:
        print(key, ":", val)

## codet5_finetuned_nl_comment_preds

In [None]:
pred_csvfile = "codet5_finetuned_short_contd_comment_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["comment"]

In [None]:
comment_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', dropna=True, is_exclude_same_io=True)
comment_codebleu

In [None]:
comment_pred_df = pd.read_csv(file_name)

In [None]:
# Drop every row that has a NaN in any column. The NaNs in `preds` are
# presumably empty predictions, which pd.read_csv parses as NaN by default
# -- TODO confirm.
# NOTE(review): the CodeBLEU call for this file also passed
# is_exclude_same_io=True; this frame is not filtered that way, so the two
# metrics may cover different rows -- confirm this is intended.
comment_pred_df = comment_pred_df.dropna()

In [None]:
comment_report = evaluate_pred_df(comment_pred_df, target_feats, is_nl=True, parse_test=True)

In [None]:
# Print only the scalar summary metrics; list-valued and array-like entries
# are skipped. np.ndim also accepts plain Python numbers, which have no
# `.shape` attribute.
for key, val in comment_report.items():
    if isinstance(val, list):
        continue
    if np.ndim(val) == 0:
        print(key, ":", val)

# CodeT5 Finetuned Contd Combine Short Epoch 8

```text
-rw-r--r-- 1 cting3 grads    276311 Aug 17 19:48 codet5_finetuned_short_contd_docstring_list_comp_class_casing_comment_preds.csv
-rw-r--r-- 1 cting3 grads   5275345 Aug 17 19:47 codet5_finetuned_short_contd_docstring_comment_preds.csv
-rw-r--r-- 1 cting3 grads   1320497 Aug 17 19:40 codet5_finetuned_short_contd_list_comp_class_casing_preds.csv
-rw-r--r-- 1 cting3 grads   4251789 Aug 17 19:38 codet5_finetuned_short_contd_class_casing_preds.csv
```

In [None]:
# combined_nl_prompt_base_features_contd_codet5small
folder = "seq2seq_results"
model_name = "combined_nl_prompt_base_features_contd_codet5small"
ckpt = "checkpoint-434568"

## docstring + list_comp + class + casing + comment

In [None]:
# This file holds predictions for the five-feature combination
# (docstring + list_comp + class + casing + comment), so request all five
# features, as the other all-feature runs in this notebook do. The previous
# value listed only "comment" and "docstring".
pred_csvfile = "codet5_finetuned_short_contd_docstring_list_comp_class_casing_comment_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["list_comp", "casing", "class", "docstring", "comment"]

In [None]:
codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', dropna=True, is_exclude_same_io=True)
codebleu

In [None]:
pred_df = pd.read_csv(file_name)

In [None]:
# Drop every row that has a NaN in any column. The NaNs in `preds` are
# presumably empty predictions, which pd.read_csv parses as NaN by default
# -- TODO confirm.
# NOTE(review): the CodeBLEU call for this file also passed
# is_exclude_same_io=True; this frame is not filtered that way, so the two
# metrics may cover different rows -- confirm this is intended.
pred_df = pred_df.dropna()

In [None]:
report = evaluate_pred_df(pred_df, target_feats, is_nl=True, parse_test=True)

In [None]:
# Print only the scalar summary metrics; list-valued and array-like entries
# are skipped. np.ndim also accepts plain Python numbers, which have no
# `.shape` attribute.
for key, val in report.items():
    if isinstance(val, list):
        continue
    if np.ndim(val) == 0:
        print(key, ":", val)

## docstring + comment

In [None]:
pred_csvfile = "codet5_finetuned_short_contd_docstring_comment_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["comment", "docstring"]

In [None]:
codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', dropna=True, is_exclude_same_io=True)
codebleu

In [None]:
pred_df = pd.read_csv(file_name)

In [None]:
# Drop every row that has a NaN in any column. The NaNs in `preds` are
# presumably empty predictions, which pd.read_csv parses as NaN by default
# -- TODO confirm.
# NOTE(review): the CodeBLEU call for this file also passed
# is_exclude_same_io=True; this frame is not filtered that way, so the two
# metrics may cover different rows -- confirm this is intended.
pred_df = pred_df.dropna()

In [None]:
report = evaluate_pred_df(pred_df, target_feats, is_nl=True, parse_test=True)

In [None]:
# Print only the scalar summary metrics; list-valued and array-like entries
# are skipped. np.ndim also accepts plain Python numbers, which have no
# `.shape` attribute.
for key, val in report.items():
    if isinstance(val, list):
        continue
    if np.ndim(val) == 0:
        print(key, ":", val)

## class+casing

In [None]:
pred_csvfile = "codet5_finetuned_short_contd_class_casing_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["class", "casing"]

In [None]:
codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', dropna=True, is_exclude_same_io=True)
codebleu

In [None]:
pred_df = pd.read_csv(file_name)

In [None]:
# Drop every row that has a NaN in any column. The NaNs in `preds` are
# presumably empty predictions, which pd.read_csv parses as NaN by default
# -- TODO confirm.
# NOTE(review): the CodeBLEU call for this file also passed
# is_exclude_same_io=True; this frame is not filtered that way, so the two
# metrics may cover different rows -- confirm this is intended.
pred_df = pred_df.dropna()

In [None]:
report = evaluate_pred_df(pred_df, target_feats, is_nl=True, parse_test=True)

In [None]:
# Print only the scalar summary metrics; list-valued and array-like entries
# are skipped. np.ndim also accepts plain Python numbers, which have no
# `.shape` attribute.
for key, val in report.items():
    if isinstance(val, list):
        continue
    if np.ndim(val) == 0:
        print(key, ":", val)

## list_comp+class+casing

In [None]:
pred_csvfile = "codet5_finetuned_short_contd_list_comp_class_casing_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["class", "casing", "list_comp"]

In [None]:
codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', dropna=True, is_exclude_same_io=True)
codebleu

In [None]:
pred_df = pd.read_csv(file_name)

In [None]:
# Drop every row that has a NaN in any column. The NaNs in `preds` are
# presumably empty predictions, which pd.read_csv parses as NaN by default
# -- TODO confirm.
# NOTE(review): the CodeBLEU call for this file also passed
# is_exclude_same_io=True; this frame is not filtered that way, so the two
# metrics may cover different rows -- confirm this is intended.
pred_df = pred_df.dropna()

In [None]:
report = evaluate_pred_df(pred_df, target_feats, is_nl=True, parse_test=True)

In [None]:
# Print only the scalar summary metrics; list-valued and array-like entries
# are skipped. np.ndim also accepts plain Python numbers, which have no
# `.shape` attribute.
for key, val in report.items():
    if isinstance(val, list):
        continue
    if np.ndim(val) == 0:
        print(key, ":", val)

# CodeT5 Finetuned Contd Short epoch 13

In [12]:
# combined_nl_prompt_base_features_contd_codet5small
folder = "seq2seq_results"
model_name = "combined_nl_prompt_base_features_contd_codet5small"
ckpt = "checkpoint-796708"

## codet5_finetuned_nl_casing_preds

In [13]:
pred_csvfile = "codet5_casing_short_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["casing"]

In [14]:
casing_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25')
casing_codebleu

{'ngram': 0.8515370682933681,
 'weighted_ngram': 0.8632592595105788,
 'syntax_match': 0.9603313267926237,
 'dataflow_match': 0.9350088432061108,
 'code_bleu': 0.9025341244506704}

In [15]:
casing_pred_df = pd.read_csv(file_name)

In [16]:
casing_report = evaluate_pred_df(casing_pred_df, target_feats, is_nl=True, parse_test=True)

  0%|          | 0/1999 [00:00<?, ?it/s]

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()




In [17]:
# Print only the scalar summary metrics; list-valued and array-like entries
# are skipped. np.ndim also accepts plain Python numbers, which have no
# `.shape` attribute.
for key, val in casing_report.items():
    if isinstance(val, list):
        continue
    if np.ndim(val) == 0:
        print(key, ":", val)

codebleu_perfect : 0.33466733366683343
codebleu_above_90 : 0.6143071535767884
diff_bleu_avg : 0.4095457724284954
diff_bleu_perfect : 0.33766883441720863
diff_bleu_above_90 : 0.3391695847923962
parse_test_accuracy : 0.9154577288644322


## codet5_finetuned_nl_class_preds

In [18]:
pred_csvfile = "codet5_class_short_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["class"]

In [19]:
class_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25')
class_codebleu

{'ngram': 0.7583811884881181,
 'weighted_ngram': 0.7699428430150793,
 'syntax_match': 0.8502548313330444,
 'dataflow_match': 0.8627158225182803,
 'code_bleu': 0.8103236713386305}

In [20]:
class_pred_df = pd.read_csv(file_name)

In [21]:
class_report = evaluate_pred_df(class_pred_df, target_feats, is_nl=True, parse_test=True)

  0%|          | 0/2001 [00:00<?, ?it/s]



In [22]:
# Print only the scalar summary metrics; list-valued and array-like entries
# are skipped. np.ndim also accepts plain Python numbers, which have no
# `.shape` attribute.
for key, val in class_report.items():
    if isinstance(val, list):
        continue
    if np.ndim(val) == 0:
        print(key, ":", val)

codebleu_perfect : 0.22988505747126436
codebleu_above_90 : 0.3983008495752124
diff_bleu_avg : 0.38277694882269986
diff_bleu_perfect : 0.23088455772113944
diff_bleu_above_90 : 0.23438280859570215
parse_test_accuracy : 0.9175412293853074


## codet5_finetuned_nl_list_comp_preds

In [25]:
pred_csvfile = "codet5_list_comp_short_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["list_comp"]

In [26]:
list_comp_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', is_exclude_same_io=True)
list_comp_codebleu

{'ngram': 0.9221047580957143,
 'weighted_ngram': 0.9259730884213488,
 'syntax_match': 0.9319856560678293,
 'dataflow_match': 0.9350537153543588,
 'code_bleu': 0.9287793044848128}

In [27]:
list_comp_pred_df = pd.read_csv(file_name)
list_comp_pred_df = exclude_same_io(list_comp_pred_df)

In [28]:
list_comp_report = evaluate_pred_df(list_comp_pred_df, target_feats, is_nl=True, parse_test=True)

  0%|          | 0/1687 [00:00<?, ?it/s]

In [29]:
# Print only the scalar summary metrics; list-valued and array-like entries
# are skipped. np.ndim also accepts plain Python numbers, which have no
# `.shape` attribute.
for key, val in list_comp_report.items():
    if isinstance(val, list):
        continue
    if np.ndim(val) == 0:
        print(key, ":", val)

codebleu_perfect : 0.4066390041493776
codebleu_above_90 : 0.7338470657972732
diff_bleu_avg : 0.47917780477819827
diff_bleu_perfect : 0.4090100770598696
diff_bleu_above_90 : 0.4096028452874926
parse_test_accuracy : 0.7545939537640782


## codet5_finetuned_nl_docstring_preds

In [30]:
pred_csvfile = "codet5_docstring_short_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["docstring"]

In [31]:
docstring_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25')
docstring_codebleu

{'ngram': 0.3672733662557726,
 'weighted_ngram': 0.5239876414871829,
 'syntax_match': 0.7835198875884203,
 'dataflow_match': 0.8714541469889188,
 'code_bleu': 0.6365587605800737}

In [32]:
docstring_pred_df = pd.read_csv(file_name)

In [33]:
docstring_report = evaluate_pred_df(docstring_pred_df, target_feats, is_nl=True, parse_test=True)

  0%|          | 0/2001 [00:00<?, ?it/s]



In [34]:
# Print only the scalar summary metrics; list-valued and array-like entries
# are skipped. np.ndim also accepts plain Python numbers, which have no
# `.shape` attribute.
for key, val in docstring_report.items():
    if isinstance(val, list):
        continue
    if np.ndim(val) == 0:
        print(key, ":", val)

codebleu_perfect : 0.0
codebleu_above_90 : 0.11144427786106946
diff_bleu_avg : 0.004418664442209302
diff_bleu_perfect : 0.0
diff_bleu_above_90 : 0.0
docstr_text_scores_avg : 0.0
docstr_text_scores_perfect : 0.0
docstr_text_scores_above_90 : 0.0
parse_test_accuracy : 0.7891054472763618


## codet5_finetuned_nl_comment_preds

In [35]:
pred_csvfile = "codet5_comment_short_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["comment"]

In [36]:
comment_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', dropna=True, is_exclude_same_io=True)
comment_codebleu

{'ngram': 0.6519745504806481,
 'weighted_ngram': 0.6697556498470272,
 'syntax_match': 0.9319492304651582,
 'dataflow_match': 0.953981642272389,
 'code_bleu': 0.8019152682663055}

In [37]:
comment_pred_df = pd.read_csv(file_name)

In [38]:
# Drop every row that has a NaN in any column. The NaNs in `preds` are
# presumably empty predictions, which pd.read_csv parses as NaN by default
# -- TODO confirm.
# NOTE(review): the CodeBLEU call for this file also passed
# is_exclude_same_io=True; this frame is not filtered that way, so the two
# metrics may cover different rows -- confirm this is intended.
comment_pred_df = comment_pred_df.dropna()

In [39]:
comment_report = evaluate_pred_df(comment_pred_df, target_feats, is_nl=True, parse_test=True)

  0%|          | 0/2001 [00:00<?, ?it/s]



In [40]:
# Print only the scalar summary metrics; list-valued and array-like entries
# are skipped. np.ndim also accepts plain Python numbers, which have no
# `.shape` attribute.
for key, val in comment_report.items():
    if isinstance(val, list):
        continue
    if np.ndim(val) == 0:
        print(key, ":", val)

codebleu_perfect : 0.16141929035482258
codebleu_above_90 : 0.384807596201899
diff_bleu_avg : 0.2813279087674655
diff_bleu_perfect : 0.12093953023488256
diff_bleu_above_90 : 0.13693153423288357
comment_text_scores_avg : 0.08391034927961877
comment_text_scores_perfect : 0.04147926036981509
comment_text_scores_above_90 : 0.047476261869065464
parse_test_accuracy : 0.904047976011994


# CodeT5 Finetuned Contd Combine Short Epoch 13

```text
-rw-r--r-- 1 cting3 grads    276311 Aug 17 19:48 codet5_finetuned_short_contd_docstring_list_comp_class_casing_comment_preds.csv
-rw-r--r-- 1 cting3 grads   5275345 Aug 17 19:47 codet5_finetuned_short_contd_docstring_comment_preds.csv
-rw-r--r-- 1 cting3 grads   1320497 Aug 17 19:40 codet5_finetuned_short_contd_list_comp_class_casing_preds.csv
-rw-r--r-- 1 cting3 grads   4251789 Aug 17 19:38 codet5_finetuned_short_contd_class_casing_preds.csv
```

In [44]:
# combined_nl_prompt_base_features_contd_codet5small
folder = "seq2seq_results"
model_name = "combined_nl_prompt_base_features_contd_codet5small"
ckpt = "checkpoint-796708"

## docstring + list_comp + class + casing + comment

In [45]:
# NOTE(review): the file name covers five features (docstring, list_comp,
# class, casing, comment) but only two are passed as target_feats, unlike the
# earlier all-feature runs in this notebook -- confirm this is intended. The
# outputs recorded after this cell appear to have been produced with this
# two-feature setting.
pred_csvfile = "codet5_finetuned_short_docstring_list_comp_class_casing_comment_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["comment", "docstring"]

In [46]:
codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', dropna=True, is_exclude_same_io=True)
codebleu

{'ngram': 0.2370711143795277,
 'weighted_ngram': 0.2666262612468731,
 'syntax_match': 0.641156462585034,
 'dataflow_match': 0.4375542817439639,
 'code_bleu': 0.3956020299888497}

In [47]:
pred_df = pd.read_csv(file_name)

In [48]:
# Drop every row that has a NaN in any column. The NaNs in `preds` are
# presumably empty predictions, which pd.read_csv parses as NaN by default
# -- TODO confirm.
# NOTE(review): the CodeBLEU call for this file also passed
# is_exclude_same_io=True; this frame is not filtered that way, so the two
# metrics may cover different rows -- confirm this is intended.
pred_df = pred_df.dropna()

In [49]:
report = evaluate_pred_df(pred_df, target_feats, is_nl=True, parse_test=True)

  0%|          | 0/98 [00:00<?, ?it/s]

In [50]:
# Print only the scalar summary metrics; list-valued and array-like entries
# are skipped. np.ndim also accepts plain Python numbers, which have no
# `.shape` attribute.
for key, val in report.items():
    if isinstance(val, list):
        continue
    if np.ndim(val) == 0:
        print(key, ":", val)

codebleu_perfect : 0.0
codebleu_above_90 : 0.0
diff_bleu_avg : 0.0015539349830256176
diff_bleu_perfect : 0.0
diff_bleu_above_90 : 0.0
docstr_text_scores_avg : 4.820800458386333e-90
docstr_text_scores_perfect : 0.0
docstr_text_scores_above_90 : 0.0
comment_text_scores_avg : 0.021789135543230743
comment_text_scores_perfect : 0.02040816326530612
comment_text_scores_above_90 : 0.02040816326530612
parse_test_accuracy : 0.7857142857142857


## docstring + comment

In [51]:
# Prediction file for the docstring + comment section and the features to score.
# NOTE(review): the directory listing earlier in the notebook shows
# "codet5_finetuned_short_contd_docstring_comment_preds.csv" (with "_contd_");
# the name used here has no "_contd_" — confirm this is the intended file.
pred_csvfile = "codet5_finetuned_short_docstring_comment_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["comment", "docstring"]

In [52]:
# CodeBLEU over the whole prediction file, equal weight on each of the four
# components. Rows containing NaN and rows whose input already equals the
# label are removed before scoring.
codebleu = evaluate_codebleu(
    file_name,
    weights="0.25,0.25,0.25,0.25",
    dropna=True,
    is_exclude_same_io=True,
)
codebleu

{'ngram': 0.27801499852601474,
 'weighted_ngram': 0.29462125948909834,
 'syntax_match': 0.7661626487310451,
 'dataflow_match': 0.8689570129146889,
 'code_bleu': 0.5519389799152118}

In [53]:
# Load the predictions again: evaluate_codebleu reads the CSV itself and
# returns only the score, not the frame.
pred_df = pd.read_csv(file_name)

In [54]:
# Drop every row that has a NaN in any column (DataFrame.dropna defaults).
# NOTE(review): the cause of the NaNs in `preds` is unknown; possibly empty
# predictions that pd.read_csv parsed as missing values — unconfirmed.
pred_df = pred_df.dropna()

In [55]:
# Evaluate the NaN-filtered predictions for the selected target features.
# NOTE(review): unlike the evaluate_codebleu call for this file, pred_df has
# not been passed through exclude_same_io here, so the two results may cover
# different rows unless evaluate_pred_df filters internally — confirm.
report = evaluate_pred_df(pred_df, target_feats, is_nl=True, parse_test=True)

  0%|          | 0/2000 [00:00<?, ?it/s]



In [56]:
# Print only the scalar summary metrics of the report; list-valued and
# array-valued entries are skipped.
for key, val in report.items():
    # np.ndim handles numpy scalars and plain Python numbers alike (the latter
    # have no .shape attribute); isinstance is the idiomatic type check.
    if not isinstance(val, list) and np.ndim(val) == 0:
        print(key, ":", val)

codebleu_perfect : 0.0
codebleu_above_90 : 0.0015
diff_bleu_avg : 0.012957864485672797
diff_bleu_perfect : 0.0
diff_bleu_above_90 : 0.0
docstr_text_scores_avg : 0.0226052589831815
docstr_text_scores_perfect : 0.0005
docstr_text_scores_above_90 : 0.001
comment_text_scores_avg : 0.0040017806028904675
comment_text_scores_perfect : 0.0015
comment_text_scores_above_90 : 0.0025
parse_test_accuracy : 0.7775


## class + casing

In [57]:
# Prediction file for the class + casing section and the features to score.
# NOTE(review): the directory listing earlier in the notebook shows
# "codet5_finetuned_short_contd_class_casing_preds.csv" (with "_contd_");
# the name used here has no "_contd_" — confirm this is the intended file.
pred_csvfile = "codet5_finetuned_short_class_casing_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["class", "casing"]

In [58]:
# CodeBLEU over the whole prediction file, equal weight on each of the four
# components. Rows containing NaN and rows whose input already equals the
# label are removed before scoring.
codebleu = evaluate_codebleu(
    file_name,
    weights="0.25,0.25,0.25,0.25",
    dropna=True,
    is_exclude_same_io=True,
)
codebleu

{'ngram': 0.5675763890543825,
 'weighted_ngram': 0.5799559744299995,
 'syntax_match': 0.7018852046271323,
 'dataflow_match': 0.5375063233204057,
 'code_bleu': 0.5967309728579799}

In [59]:
# Load the predictions again: evaluate_codebleu reads the CSV itself and
# returns only the score, not the frame.
pred_df = pd.read_csv(file_name)

In [60]:
# Drop every row that has a NaN in any column (DataFrame.dropna defaults).
# NOTE(review): the cause of the NaNs in `preds` is unknown; possibly empty
# predictions that pd.read_csv parsed as missing values — unconfirmed.
pred_df = pred_df.dropna()

In [61]:
# Evaluate the NaN-filtered predictions for the selected target features.
# NOTE(review): unlike the evaluate_codebleu call for this file, pred_df has
# not been passed through exclude_same_io here, so the two results may cover
# different rows unless evaluate_pred_df filters internally — confirm.
report = evaluate_pred_df(pred_df, target_feats, is_nl=True, parse_test=True)

  0%|          | 0/2002 [00:00<?, ?it/s]



In [62]:
# Print only the scalar summary metrics of the report; list-valued and
# array-valued entries are skipped.
for key, val in report.items():
    # np.ndim handles numpy scalars and plain Python numbers alike (the latter
    # have no .shape attribute); isinstance is the idiomatic type check.
    if not isinstance(val, list) and np.ndim(val) == 0:
        print(key, ":", val)

codebleu_perfect : 0.003996003996003996
codebleu_above_90 : 0.02097902097902098
diff_bleu_avg : 0.03481417975360696
diff_bleu_perfect : 0.003996003996003996
diff_bleu_above_90 : 0.003996003996003996
parse_test_accuracy : 0.7892107892107892


## list_comp + class + casing

In [63]:
# Prediction file for the list_comp + class + casing section and the features to score.
# NOTE(review): the directory listing earlier in the notebook shows
# "codet5_finetuned_short_contd_list_comp_class_casing_preds.csv" (with "_contd_");
# the name used here has no "_contd_" — confirm this is the intended file.
pred_csvfile = "codet5_finetuned_short_list_comp_class_casing_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["class", "casing", "list_comp"]

In [64]:
# CodeBLEU over the whole prediction file, equal weight on each of the four
# components. Rows containing NaN and rows whose input already equals the
# label are removed before scoring.
codebleu = evaluate_codebleu(
    file_name,
    weights="0.25,0.25,0.25,0.25",
    dropna=True,
    is_exclude_same_io=True,
)
codebleu

{'ngram': 0.7081693792580135,
 'weighted_ngram': 0.7168083212232143,
 'syntax_match': 0.7760898678766316,
 'dataflow_match': 0.506791059537153,
 'code_bleu': 0.6769646569737531}

In [65]:
# Load the predictions again: evaluate_codebleu reads the CSV itself and
# returns only the score, not the frame.
pred_df = pd.read_csv(file_name)

In [66]:
# Drop every row that has a NaN in any column (DataFrame.dropna defaults).
# NOTE(review): the cause of the NaNs in `preds` is unknown; possibly empty
# predictions that pd.read_csv parsed as missing values — unconfirmed.
pred_df = pred_df.dropna()

In [67]:
# Evaluate the NaN-filtered predictions for the selected target features.
# NOTE(review): unlike the evaluate_codebleu call for this file, pred_df has
# not been passed through exclude_same_io here, so the two results may cover
# different rows unless evaluate_pred_df filters internally — confirm.
report = evaluate_pred_df(pred_df, target_feats, is_nl=True, parse_test=True)

  0%|          | 0/468 [00:00<?, ?it/s]

In [68]:
# Print only the scalar summary metrics of the report; list-valued and
# array-valued entries are skipped.
for key, val in report.items():
    # np.ndim handles numpy scalars and plain Python numbers alike (the latter
    # have no .shape attribute); isinstance is the idiomatic type check.
    if not isinstance(val, list) and np.ndim(val) == 0:
        print(key, ":", val)

codebleu_perfect : 0.0
codebleu_above_90 : 0.042735042735042736
diff_bleu_avg : 0.06586243624288024
diff_bleu_perfect : 0.0
diff_bleu_above_90 : 0.0
parse_test_accuracy : 0.7222222222222222
