In [2]:
# %pip install -r requirements.txt

Collecting nltk
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m31m89.7 MB/s[0m eta [36m0:00:01[0m
Installing collected packages: nltk
Successfully installed nltk-3.7
Note: you may need to restart the kernel to use updated packages.


In [1]:
# eval on predictions
import ast
import re
import difflib
from termcolor import colored

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from evaluator.CodeBLEU.calc_code_bleu import get_codebleu
from nltk.translate.bleu_score import sentence_bleu

from utils.regex_parse import comment
from utils.eval_utils import evaluate_codebleu, evaluate_pred_df, lookup_examples, get_valid_pred_df


In [20]:
def exclude_same_io(df):
    """Return ``df`` without the rows whose ``inputs`` value equals its ``labels`` value.

    Rows are removed by index label, and the frame that was passed in is left
    untouched (``DataFrame.drop`` returns a new frame).
    """
    unchanged_rows = (df["inputs"] == df["labels"]).to_numpy()
    return df.drop(index=df.index[unchanged_rows])

In [21]:
# parsable eval
def is_parsable(input_code):
    """Report whether ``input_code`` is accepted by :func:`ast.parse`.

    A ``SyntaxError`` gives ``False`` silently. Any other exception also gives
    ``False``, but the offending input and the exception are printed first so
    that unexpected failures stay visible.
    """
    try:
        ast.parse(input_code)
        return True
    except SyntaxError:
        return False
    except Exception as err:
        print(input_code)
        print(err)
        return False

In [22]:
def evaluate_codebleu(pred_filename, weights="0.25,0.25,0.25,0.25", replaced_df=None, dropna=False, is_exclude_same_io=False):
    """Compute one corpus-level CodeBLEU result for a prediction table.

    NOTE: this definition shadows the ``evaluate_codebleu`` imported from
    ``utils.eval_utils`` in the notebook's import cell.

    Args:
        pred_filename: CSV path read with ``pd.read_csv``; ignored when
            ``replaced_df`` is given.
        weights: Weight string forwarded unchanged to ``get_codebleu``.
        replaced_df: Optional DataFrame used instead of reading ``pred_filename``.
        dropna: When true, rows containing NaN are dropped first.
        is_exclude_same_io: When true, rows whose input equals the label are
            removed with ``exclude_same_io``.

    Returns:
        Whatever ``get_codebleu`` returns for the ``labels`` column (the single
        reference set) against the ``preds`` column, with language ``"python"``.
    """
    pred_df = replaced_df if replaced_df is not None else pd.read_csv(pred_filename)
    if dropna:
        pred_df = pred_df.dropna()
    if is_exclude_same_io:
        pred_df = exclude_same_io(pred_df)

    # get_codebleu takes a list of reference sets; only one (the gold labels) is supplied.
    references = [pred_df["labels"]]
    hypotheses = pred_df["preds"]
    return get_codebleu(references, hypotheses, "python", weights)

In [23]:
def get_docstring(text):
    """Collect the bodies of triple-quoted strings that begin a line of ``text``.

    A match must start at the beginning of a line (optionally after
    whitespace) and be delimited by three single quotes or three double
    quotes. Strings delimited by a single quote character are not recognised.

    Args:
        text: Source code to scan.

    Returns:
        A list with the inner text of every match, in order of appearance.
    """
    # Raw strings keep the regex escapes away from Python's own string-escape
    # handling, which warns on sequences such as a backslash followed by "s".
    pattern = (
        r"^\s*'{3}([\s\S]*?)'{3}"   # three single quotes ... three single quotes
        r'|^\s*"{3}([\s\S]*?)"{3}'  # three double quotes ... three double quotes
    )
    # findall yields one (single_quoted, double_quoted) pair per match; only the
    # alternative that matched is non-empty.
    return [
        single_quoted or double_quoted
        for single_quoted, double_quoted in re.findall(pattern, text, re.M | re.S)
    ]

In [24]:
def print_split_line(s):
    """Print ``s`` upper-cased inside a row of ``=`` signs, with a blank line before and after."""
    banner = f"\n====================={s.upper()}=====================\n"
    print(banner)

In [25]:
def tokenize(s):
    """Split ``s`` on runs of whitespace.

    This uses ``re.split`` rather than ``str.split``, so leading or trailing
    whitespace produces an empty-string token at that end.
    """
    # Raw string: a plain literal would contain an invalid string escape.
    return re.split(r'\s+', s)


def get_diff_list(str_1, str_2):
    """Return the tokens of each string that fall outside the matched regions.

    Both strings are tokenised with ``tokenize`` and aligned with
    ``difflib.SequenceMatcher``. Every token that is not part of an ``equal``
    region is collected, in original order.

    Args:
        str_1: First string (side ``a`` of the matcher).
        str_2: Second string (side ``b`` of the matcher).

    Returns:
        A ``(tokens_only_in_1, tokens_only_in_2)`` pair of lists.
    """
    tokens_1 = tokenize(str_1)
    tokens_2 = tokenize(str_2)
    matcher = difflib.SequenceMatcher(a=tokens_1, b=tokens_2)

    diff_blocks_a = []
    diff_blocks_b = []
    # The non-"equal" opcodes are exactly the gaps between consecutive matching
    # blocks, including the gap before the first block and after the last one.
    for tag, a_start, a_end, b_start, b_end in matcher.get_opcodes():
        if tag == "equal":
            continue
        diff_blocks_a += tokens_1[a_start:a_end]
        diff_blocks_b += tokens_2[b_start:b_end]
    return diff_blocks_a, diff_blocks_b


def get_diff_str(input_str, output_str):
    """Return the tokens of ``output_str`` that are not matched in ``input_str``, joined by single spaces."""
    _, changed_in_output = get_diff_list(input_str, output_str)
    return " ".join(changed_in_output)

In [26]:
import re
def remove_nl_prompt(script):
    """Strip a ``<nl>...</nl>`` prompt from ``script``.

    The match is greedy and ``.`` does not cross newlines, so everything from
    the first ``<nl>`` to the last ``</nl>`` on the same line is removed, and a
    prompt that spans several lines is left in place.
    """
    # A forward slash needs no escaping in a regex; the raw string also avoids
    # the invalid string escape the previous literal contained.
    return re.sub(r"<nl>.*</nl>", "", script)

In [27]:
def _bleu_or_zero(gold_text, pred_text):
    """Sentence BLEU of ``pred_text`` against ``gold_text`` on whitespace tokens; 0 when the prediction has no tokens."""
    pred_tokens = pred_text.split()
    if len(pred_tokens) == 0:
        return 0
    return sentence_bleu([gold_text.split()], pred_tokens, auto_reweigh=True)


def _perfect_and_above_90(scores, total_len):
    """Return the fraction of ``scores`` equal to 1 and the fraction that is at least 0.9, both over ``total_len``."""
    score_arr = np.array(scores)
    return sum(score_arr == 1) / total_len, sum(score_arr >= 0.9) / total_len


def evaluate_pred_df(pred_df, target_feats, is_nl=False, parse_test=True):
    """Score every row of a prediction frame and collect the results in a report dict.

    NOTE: this definition shadows the ``evaluate_pred_df`` imported from
    ``utils.eval_utils`` in the notebook's import cell.

    Args:
        pred_df: DataFrame with ``inputs``, ``labels`` and ``preds`` columns.
        target_feats: Container of feature names. ``"docstring"`` and
            ``"comment"`` switch on the corresponding text-level BLEU scores;
            any other value adds nothing to the report.
        is_nl: When true, each input is passed through ``remove_nl_prompt``
            before it is diffed against the label and the prediction.
        parse_test: When true, each prediction is checked with ``is_parsable``
            and the parsable fraction is reported.

    Returns:
        A dict holding, in this order: the ``inputs`` / ``labels`` / ``preds``
        sequences, ``pred_diffs`` / ``gold_diffs`` (tokens changed relative to
        the input), the per-row ``codebleu`` results with ``codebleu_perfect``
        and ``codebleu_above_90`` rates, the per-row ``diff_bleu`` scores with
        their ``_avg`` / ``_perfect`` / ``_above_90`` aggregates, then the
        optional docstring keys, comment keys and ``parse_test_accuracy``.

    Raises:
        ZeroDivisionError: if ``pred_df`` has no rows, because every rate is
            divided by the row count.
    """
    inputs = pred_df["inputs"].to_numpy()
    labels = pred_df["labels"].to_numpy()
    preds = pred_df["preds"].to_numpy()

    if is_nl:
        inputs = [remove_nl_prompt(input_script) for input_script in inputs]

    score_docstrings = "docstring" in target_feats
    score_comments = "comment" in target_feats

    code_scores = []
    diff_bleu_scores = []
    pred_diffs = []
    gold_diffs = []

    # Only filled when "docstring" is a target feature.
    gold_docstrings = []
    pred_docstrings = []
    docstr_text_scores = []

    # Only filled when "comment" is a target feature.
    gold_comments = []
    pred_comments = []
    comment_text_scores = []

    # Only filled when parse_test is set.
    is_parsables = []

    total_len = preds.shape[0]

    for idx in tqdm(range(total_len)):
        input_code = inputs[idx]
        gold = labels[idx]
        pred = preds[idx]

        # CodeBLEU of this single prediction against its single reference.
        code_scores.append(get_codebleu([[gold]], [pred], "python", '0.25,0.25,0.25,0.25'))

        if score_docstrings:
            gold_docstr = get_docstring(gold)
            pred_docstr = get_docstring(pred)
            gold_docstrings.append(gold_docstr)
            pred_docstrings.append(pred_docstr)
            docstr_text_scores.append(_bleu_or_zero("\n".join(gold_docstr), "\n".join(pred_docstr)))

        if score_comments:
            gold_comment = comment(gold)
            pred_comment = comment(pred)
            gold_comments.append(gold_comment)
            pred_comments.append(pred_comment)
            comment_text_scores.append(_bleu_or_zero("\n".join(gold_comment), "\n".join(pred_comment)))

        # BLEU restricted to the tokens each output changed relative to the input.
        gold_diff_str = get_diff_str(input_code, gold)
        pred_diff_str = get_diff_str(input_code, pred)
        pred_diffs.append(pred_diff_str)
        gold_diffs.append(gold_diff_str)
        diff_bleu_scores.append(_bleu_or_zero(gold_diff_str, pred_diff_str))

        if parse_test:
            is_parsables.append(is_parsable(pred))

    code_bleu_perfect, code_bleu_above_90 = _perfect_and_above_90(
        [s["code_bleu"] for s in code_scores], total_len
    )
    diff_bleu_perfect, diff_bleu_above_90 = _perfect_and_above_90(diff_bleu_scores, total_len)

    report = {
        "inputs": inputs,
        "labels": labels,
        "preds": preds,
        "pred_diffs": pred_diffs,
        "gold_diffs": gold_diffs,
        "codebleu": code_scores,
        "codebleu_perfect": code_bleu_perfect,
        "codebleu_above_90": code_bleu_above_90,
        "diff_bleu": diff_bleu_scores,
        "diff_bleu_avg":  np.mean(diff_bleu_scores),
        "diff_bleu_perfect": diff_bleu_perfect,
        "diff_bleu_above_90": diff_bleu_above_90,
    }

    if score_docstrings:
        docstr_perfect, docstr_above_90 = _perfect_and_above_90(docstr_text_scores, total_len)
        report["gold_docstrings"] = gold_docstrings
        report["pred_docstrings"] = pred_docstrings
        report["docstr_text_scores"] = docstr_text_scores
        report["docstr_text_scores_avg"] = np.array(docstr_text_scores).mean()
        report["docstr_text_scores_perfect"] = docstr_perfect
        report["docstr_text_scores_above_90"] = docstr_above_90

    if score_comments:
        comment_perfect, comment_above_90 = _perfect_and_above_90(comment_text_scores, total_len)
        report["gold_comments"] = gold_comments
        report["pred_comments"] = pred_comments
        report["comment_text_scores"] = comment_text_scores
        report["comment_text_scores_avg"] = np.array(comment_text_scores).mean()
        report["comment_text_scores_perfect"] = comment_perfect
        report["comment_text_scores_above_90"] = comment_above_90

    if parse_test:
        report["parse_test_accuracy"] = sum(np.array(is_parsables)) / total_len

    return report.copy()

In [28]:
def print_colored_diff(str_1, str_2):
    """Return copies of both strings with the spans that differ wrapped in red.

    Despite the name, nothing is printed: the two strings are aligned
    character by character with ``difflib.SequenceMatcher`` and every span
    that is not part of a matching region is passed through
    ``colored(..., "red")``. Matching regions are copied unchanged.

    Args:
        str_1: First string (side ``a`` of the matcher).
        str_2: Second string (side ``b`` of the matcher).

    Returns:
        A ``(marked_str_1, marked_str_2)`` tuple.
    """
    pieces_1 = []
    pieces_2 = []
    matcher = difflib.SequenceMatcher(a=str_1, b=str_2)
    # Opcodes cover both strings from start to end, so no separate handling of
    # a trailing remainder is needed after the loop.
    for tag, lo_1, hi_1, lo_2, hi_2 in matcher.get_opcodes():
        span_1 = str_1[lo_1:hi_1]
        span_2 = str_2[lo_2:hi_2]
        if tag == "equal":
            pieces_1.append(span_1)
            pieces_2.append(span_2)
            continue
        # An insert or delete leaves one side empty; only colour real text.
        if span_1:
            pieces_1.append(colored(span_1, "red"))
        if span_2:
            pieces_2.append(colored(span_2, "red"))
    return "".join(pieces_1), "".join(pieces_2)

In [29]:
def lookup_examples(report, score_upper_bound, score_lower_bound, metric="diff_bleu", start_idx=0, count=10):
    """Print examples from ``report`` whose ``metric`` score lies within the given bounds.

    NOTE: this definition shadows the ``lookup_examples`` imported from
    ``utils.eval_utils`` in the notebook's import cell.

    Args:
        report: Dict with ``inputs``, ``preds``, ``labels`` and a per-row
            sequence stored under ``metric``.
        score_upper_bound: Rows scoring above this value are skipped.
        score_lower_bound: Rows scoring below this value are skipped.
        metric: Key of the per-row score sequence used for filtering.
        start_idx: Rows with a smaller index are skipped.
        count: Maximum number of examples to print, or ``"all"`` for no limit.
    """
    total = len(report["inputs"])
    limit = total if count == "all" else count

    shown = 0
    for idx in range(total):
        if shown == limit:
            break
        if idx < start_idx:
            continue

        score = report[metric][idx]
        if score > score_upper_bound:
            continue
        if score < score_lower_bound:
            continue

        input_code = report["inputs"][idx]
        # Both diffs are taken against the input; the input itself is shown
        # with the spans that differ from the gold label highlighted.
        c_input, c_gold = print_colored_diff(input_code, report["labels"][idx])
        _, c_pred = print_colored_diff(input_code, report["preds"][idx])

        sections = (
            ("input", c_input),
            ("prediction", c_pred),
            ("gold labels", c_gold),
            (metric, score),
        )
        for title, body in sections:
            print_split_line(f"{idx}-{title}")
            print(body)

        shown += 1


In [14]:
DATA_DIR = "/data/ken/data/code"

In [20]:
codex_list_comp_df = pd.read_csv(f"{DATA_DIR}/codex_output.csv")
codex_list_comp_df = exclude_same_io(codex_list_comp_df)

In [21]:
codex_list_comp_df

Unnamed: 0,inputs,preds,labels
0,\n__author__ = 'morta@digitus.itk.ppke.hu'\nim...,\n__author__ = 'morta@digitus.itk.ppke.hu'\nim...,\n__author__ = 'morta@digitus.itk.ppke.hu'\nim...
22,\nimport re\nfrom django import template\nfrom...,\nimport re\nfrom django import template\nfrom...,\nimport re\nfrom django import template\nfrom...
49,"\nfrom __future__ import unicode_literals, pri...","\nfrom __future__ import unicode_literals, pri...","\nfrom __future__ import unicode_literals, pri..."
76,"\nfrom setuptools import setup, find_packages\...","\nfrom setuptools import setup, find_packages\...","\nfrom setuptools import setup, find_packages\..."
86,\nimport random\nimport unittest\nfrom algo.ch...,\nimport random\nimport unittest\nfrom algo.ch...,\nimport random\nimport unittest\nfrom algo.ch...
...,...,...,...
3886,\nfrom CommonServerPython import *\nimport jso...,\nfrom CommonServerPython import *\nimport jso...,\nfrom CommonServerPython import *\nimport jso...
3898,\nimport z3\nfrom mythril.laser.smt.model impo...,\nimport z3\nfrom mythril.laser.smt.model impo...,\nimport z3\nfrom mythril.laser.smt.model impo...
3909,\n'\nCreated on Fri Mar 11 15:29:19 2016\n\n@a...,\n'\nCreated on Fri Mar 11 15:29:19 2016\n\n@a...,\n'\nCreated on Fri Mar 11 15:29:19 2016\n\n@a...
3933,\n'Helpers for training an agent using imitati...,\n'Helpers for training an agent using imitati...,\n'Helpers for training an agent using imitati...


In [22]:

target_feats=['list_comp']
codex_list_comp_report = evaluate_pred_df(codex_list_comp_df, target_feats, is_nl=False, parse_test=True)
for key, val in codex_list_comp_report.items():
    if type(val) != list and len(val.shape) == 0:
        print(key, ":", val)

  0%|          | 0/248 [00:00<?, ?it/s]

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


codebleu_perfect : 0.0
codebleu_above_90 : 0.12096774193548387
diff_bleu_avg : 0.1124273545448584
diff_bleu_perfect : 0.056451612903225805
diff_bleu_above_90 : 0.056451612903225805
parse_test_accuracy : 0.24596774193548387


Unnamed: 0,inputs,preds,labels
0,\n__author__ = 'morta@digitus.itk.ppke.hu'\nim...,\n__author__ = 'morta@digitus.itk.ppke.hu'\nim...,\n__author__ = 'morta@digitus.itk.ppke.hu'\nim...
1,\n'security converge saved queries\n\nRevision...,\n'security converge saved queries\n\nRevision...,\n'security converge saved queries\n\nRevision...
2,\nfrom gng import GrowingNeuralGas\nfrom sklea...,\nfrom gng import GrowingNeuralGas\nfrom sklea...,\nfrom gng import GrowingNeuralGas\nfrom sklea...
3,"\n'\nCreated on Mar 27, 2018\n\n@author: Starl...","\n'\nCreated on Mar 27, 2018\n\n@author: Starl...","\n'\nCreated on Mar 27, 2018\n\n@author: Starl..."
4,\nimport os\nimport tempfile\nfrom os.path imp...,\nimport os\nimport tempfile\nfrom os.path imp...,\nimport os\nimport tempfile\nfrom os.path imp...
...,...,...,...
3995,\nimport numpy as np\nprint(np.empty(3))\nprin...,\nimport numpy as np\nprint(np.empty(3))\nprin...,\nimport numpy as np\nprint(np.empty(3))\nprin...
3996,"\nfrom math import cos, pi, sin\nfrom PyQt5.Qt...","\nfrom math import cos, pi, sin\nfrom PyQt5.Qt...","\nfrom math import cos, pi, sin\nfrom PyQt5.Qt..."
3997,\nimport numpy as np\nimport pytest\nfrom devi...,\nimport numpy as np\nimport pytest\nfrom devi...,\nimport numpy as np\nimport pytest\nfrom devi...
3998,\n'Run PEtab test suite (https://github.com/PE...,\n'Run PEtab test suite (https://github.com/PE...,\n'Run PEtab test suite (https://github.com/PE...


In [19]:
lookup_examples(codex_list_comp_report, 0.01, 0, metric="diff_bleu")




__author__ = 'morta@digitus.itk.ppke.hu'
import os
from docmodel import token
from purepos.common.analysisqueue import AnalysisQueue
STEM_FILTER_FILE = 'purepos_stems.txt'
UNKOWN_VALUE = (- 99.0)
LEMMA_MAPPER = None
analysis_queue = AnalysisQueue()
CONFIGURATION = None

class Constants():

    def __init__(self):
        pass

class StemFilter():

    def __init__(self, filename: str):
        self.stems = set()
        with open(filename) as file:
            self.stems = set(file.readlines())

    def filter_stem(self, candidates) -> list:
        if (len(self.stems) == 0):
            return candidates
        ret = [][31m
        for t in candidates:
            if (t.stem in self.stems):
                ret.append(t)[0m
        if (len(ret) == 0):
            return candidates
        return ret

    @staticmethod
    def create_stem_filter():
        if os.path.isfile(STEM_FILTER_FILE):
            return StemFilter(STEM_FILTER_FILE)

def simplify_lemma(t: token.Token):
    

# CodeT5 Individual Finetuned - Short Dataset

In [None]:
# drwxr-xr-x 27 cting3 grads 4096 Aug  5 12:15 outlier_fixed_list_comp_codet5small
# drwxr-xr-x 32 cting3 grads 4096 Aug  5 11:34 outlier_updated_docstring_codet5small
# drwxr-xr-x 25 cting3 grads 4096 Jul 28 14:36 outlier_codet5small
# drwxr-xr-x 32 cting3 grads 4096 Jul 28 14:18 outlier_casing_codet5small
# drwxr-xr-x 32 cting3 grads 4096 Jul 24 15:47 outlier_class_codet5small

# checkpoint-12000
# checkpoint-85500
# checkpoint-40500
# checkpoint-27000
# checkpoint-49000

## casing

In [None]:
# model config
folder = "seq2seq_results"
model_name = "outlier_casing_codet5small"
ckpt = "checkpoint-27000"

In [None]:
# model config
pred_csvfile = "codet5_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["casing"]

In [None]:
# codebleu score
casing_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25')
casing_codebleu

In [None]:
# getting prediction
casing_pred_df = pd.read_csv(file_name)

In [None]:
# getting the score report, containing all the eval metrics
casing_report = evaluate_pred_df(casing_pred_df, target_feats, is_nl=True, parse_test=True)

In [None]:
# you can use this function for looking up the example
# you can search with score upper bound and lower bound with a certain eval metric
# and you can also pick how many examples you want to see by using `count` argument
lookup_examples(casing_report, 0.6, 0.4, metric="diff_bleu")

## class

In [None]:
folder = "seq2seq_results"
model_name = "outlier_class_codet5small"
ckpt = "checkpoint-49000"

In [None]:
pred_csvfile = "codet5_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["class"]

In [None]:
class_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25')
class_codebleu

In [None]:
class_pred_df = pd.read_csv(file_name)

In [None]:
class_report = evaluate_pred_df(class_pred_df, target_feats, is_nl=True, parse_test=True)

In [None]:
for key, val in class_report.items():
    if type(val) != list and len(val.shape) == 0:
        print(key, ":", val)

## list_comp

In [None]:
folder = "seq2seq_results"
model_name = "outlier_fixed_list_comp_codet5small"
ckpt = "checkpoint-12000"

In [None]:
pred_csvfile = "codet5_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["list_comp"]

In [None]:
list_comp_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', is_exclude_same_io=True)
list_comp_codebleu

In [None]:
list_comp_pred_df = pd.read_csv(file_name)
list_comp_pred_df = exclude_same_io(list_comp_pred_df)

In [None]:
list_comp_report = evaluate_pred_df(list_comp_pred_df, target_feats, is_nl=True, parse_test=True)

In [None]:
for key, val in list_comp_report.items():
    if type(val) != list and len(val.shape) == 0:
        print(key, ":", val)

In [None]:
lookup_examples(list_comp_report, 1, 1, metric="diff_bleu")

## docstring

In [None]:
folder = "seq2seq_results"
model_name = "outlier_updated_docstring_codet5small"
ckpt = "checkpoint-85500"

In [None]:
pred_csvfile = "codet5_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["docstring"]

In [None]:
docstring_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25')
docstring_codebleu

In [None]:
docstring_pred_df = pd.read_csv(file_name)

In [None]:
docstring_report = evaluate_pred_df(docstring_pred_df, target_feats, is_nl=True, parse_test=True)

In [None]:
for key, val in docstring_report.items():
    if type(val) != list and len(val.shape) == 0:
        print(key, ":", val)

## comment

In [None]:
folder = "seq2seq_results"
model_name = "outlier_codet5small"
ckpt = "checkpoint-40500"

In [None]:
pred_csvfile = "codet5_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["comment"]

In [None]:
comment_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', dropna=True)
comment_codebleu

In [None]:
comment_pred_df = pd.read_csv(file_name)

In [None]:
# Some rows have NaN in `preds` (cause not yet understood); drop them before evaluating.
comment_pred_df = comment_pred_df.dropna()

In [None]:
comment_report = evaluate_pred_df(comment_pred_df, target_feats, is_nl=True, parse_test=True)

In [None]:
for key, val in comment_report.items():
    if type(val) != list and len(val.shape) == 0:
        print(key, ":", val)

# CodeT5 Individual Finetuned - Truncated Dataset

In [12]:
# drwxr-xr-x 27 cting3 grads 4096 Aug  5 12:15 outlier_fixed_list_comp_codet5small
# drwxr-xr-x 32 cting3 grads 4096 Aug  5 11:34 outlier_updated_docstring_codet5small
# drwxr-xr-x 25 cting3 grads 4096 Jul 28 14:36 outlier_codet5small
# drwxr-xr-x 32 cting3 grads 4096 Jul 28 14:18 outlier_casing_codet5small
# drwxr-xr-x 32 cting3 grads 4096 Jul 24 15:47 outlier_class_codet5small

# checkpoint-12000
# checkpoint-85500
# checkpoint-40500
# checkpoint-27000
# checkpoint-49000

## casing

In [14]:
folder = "seq2seq_results"
model_name = "outlier_casing_codet5small"
ckpt = "checkpoint-27000"

In [15]:
pred_csvfile = "codet5_eval_set_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["casing"]

In [16]:
casing_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25')
casing_codebleu

{'ngram': 0.9502663480841979,
 'weighted_ngram': 0.9550267101422462,
 'syntax_match': 0.9901381003399783,
 'dataflow_match': 0.9544287595353866,
 'code_bleu': 0.9624649795254523}

In [17]:
casing_pred_df = pd.read_csv(file_name)

In [18]:
casing_report = evaluate_pred_df(casing_pred_df, target_feats, is_nl=True, parse_test=True)

  0%|          | 0/2011 [00:00<?, ?it/s]

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()




In [19]:
for key, val in casing_report.items():
    if type(val) != list and len(val.shape) == 0:
        print(key, ":", val)

codebleu_perfect : 0.4724017901541522
codebleu_above_90 : 0.8547986076578816
diff_bleu_avg : 0.5964453082218742
diff_bleu_perfect : 0.45997016409746394
diff_bleu_above_90 : 0.4733963202386872
parse_test_accuracy : 0.4669318746892093


## class

In [22]:
folder = "seq2seq_results"
model_name = "outlier_class_codet5small"
ckpt = "checkpoint-49000"

In [23]:
pred_csvfile = "codet5_eval_set_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["class"]

In [24]:
class_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25')
class_codebleu

{'ngram': 0.7575970498680648,
 'weighted_ngram': 0.784300360086646,
 'syntax_match': 0.8638880792771787,
 'dataflow_match': 0.8626689507992045,
 'code_bleu': 0.8171136100077735}

In [25]:
class_pred_df = pd.read_csv(file_name)

In [26]:
class_report = evaluate_pred_df(class_pred_df, target_feats, is_nl=True, parse_test=True)

  0%|          | 0/2005 [00:00<?, ?it/s]



In [27]:
for key, val in class_report.items():
    if type(val) != list and len(val.shape) == 0:
        print(key, ":", val)

codebleu_perfect : 0.08478802992518704
codebleu_above_90 : 0.31022443890274315
diff_bleu_avg : 0.29821448465689815
diff_bleu_perfect : 0.08528678304239401
diff_bleu_above_90 : 0.10274314214463841
parse_test_accuracy : 0.4967581047381546


## list_comp

In [28]:
folder = "seq2seq_results"
model_name = "outlier_fixed_list_comp_codet5small"
ckpt = "checkpoint-12000"

In [29]:
pred_csvfile = "codet5_eval_set_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["list_comp"]

In [30]:
list_comp_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', is_exclude_same_io=True)
list_comp_codebleu

{'ngram': 0.9750531654811976,
 'weighted_ngram': 0.9777989820886288,
 'syntax_match': 0.9729383740338027,
 'dataflow_match': 0.9273258171790089,
 'code_bleu': 0.9632790846956594}

In [31]:
list_comp_pred_df = pd.read_csv(file_name)
list_comp_pred_df = exclude_same_io(list_comp_pred_df)

In [32]:
list_comp_report = evaluate_pred_df(list_comp_pred_df, target_feats, is_nl=True, parse_test=True)

  0%|          | 0/816 [00:00<?, ?it/s]

In [33]:
for key, val in list_comp_report.items():
    if type(val) != list and len(val.shape) == 0:
        print(key, ":", val)

codebleu_perfect : 0.21200980392156862
codebleu_above_90 : 0.8970588235294118
diff_bleu_avg : 0.38956310338746586
diff_bleu_perfect : 0.21200980392156862
diff_bleu_above_90 : 0.21446078431372548
parse_test_accuracy : 0.32598039215686275


## docstring

In [35]:
folder = "seq2seq_results"
model_name = "outlier_updated_docstring_codet5small"
ckpt = "checkpoint-85500"

In [36]:
pred_csvfile = "codet5_eval_set_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["docstring"]

In [37]:
docstring_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25')
docstring_codebleu

{'ngram': 0.421175022475854,
 'weighted_ngram': 0.5404834582717518,
 'syntax_match': 0.8606034809804484,
 'dataflow_match': 0.8820790293490737,
 'code_bleu': 0.676085247769282}

In [38]:
docstring_pred_df = pd.read_csv(file_name)

In [39]:
docstring_report = evaluate_pred_df(docstring_pred_df, target_feats, is_nl=True, parse_test=True)

  0%|          | 0/1999 [00:00<?, ?it/s]



In [40]:
for key, val in docstring_report.items():
    if type(val) != list and len(val.shape) == 0:
        print(key, ":", val)

codebleu_perfect : 0.0
codebleu_above_90 : 0.10355177588794397
diff_bleu_avg : 0.015511289362665262
diff_bleu_perfect : 0.0
diff_bleu_above_90 : 0.0
docstr_text_scores_avg : 0.0
docstr_text_scores_perfect : 0.0
docstr_text_scores_above_90 : 0.0
parse_test_accuracy : 0.832416208104052


## comment

In [41]:
folder = "seq2seq_results"
model_name = "outlier_codet5small"
ckpt = "checkpoint-40500"

In [42]:
pred_csvfile = "codet5_eval_set_preds.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["comment"]

In [43]:
comment_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25', dropna=True)
comment_codebleu

{'ngram': 0.6448604000984216,
 'weighted_ngram': 0.658919766640146,
 'syntax_match': 0.8721128194792865,
 'dataflow_match': 0.8420958814147669,
 'code_bleu': 0.7544972169081553}

In [44]:
comment_pred_df = pd.read_csv(file_name)

In [45]:
# Some rows have NaN in `preds` (cause not yet understood); drop them before evaluating.
comment_pred_df = comment_pred_df.dropna()

In [46]:
comment_report = evaluate_pred_df(comment_pred_df, target_feats, is_nl=True, parse_test=True)

  0%|          | 0/2014 [00:00<?, ?it/s]



In [47]:
for key, val in comment_report.items():
    if type(val) != list and len(val.shape) == 0:
        print(key, ":", val)

codebleu_perfect : 0.05958291956305859
codebleu_above_90 : 0.2701092353525323
diff_bleu_avg : 0.28453211512775645
diff_bleu_perfect : 0.05114200595829196
diff_bleu_above_90 : 0.09384309831181728
comment_text_scores_avg : 0.17852446160161348
comment_text_scores_perfect : 0.07944389275074479
comment_text_scores_above_90 : 0.11519364448857994
parse_test_accuracy : 0.5099304865938431


# Eval on test split from training data

## Uncomment Parallel Corpus

In [None]:
# no_outlier_codet5small
evaluate_codebleu("seq2seq_results/no_outlier_codet5small/codet5_preds.csv")

In [None]:
# outlier_codet5small
evaluate_codebleu("seq2seq_results/outlier_codet5small/codet5_preds.csv")

In [None]:
comment_pred_df = pd.read_csv("seq2seq_results/outlier_codet5small/codet5_preds.csv")

In [None]:
# excluding those input exactly same as the output
exact_match_bool = comment_pred_df["inputs"] == comment_pred_df["labels"]
cleaned_comment_pred_df = comment_pred_df.drop(comment_pred_df[exact_match_bool].index)

In [None]:
evaluate_codebleu("", weights="0.25,0.25,0.25,0.25", replaced_df=cleaned_comment_pred_df)

In [None]:
comment_pred_df = cleaned_comment_pred_df

In [None]:
comment_inputs = comment_pred_df["inputs"].to_numpy()
comment_labels = comment_pred_df["labels"].to_numpy()
comment_preds = comment_pred_df["preds"].to_numpy()

In [None]:
# Per-example ("unit") scores for the comment corpus: one entry per row of
# comment_preds is appended to every list below, so all lists stay index-aligned.
comment_code_scores = []
comment_text_scores = []
comment_diff_bleu_scores = []

# Extracted comments and simple statistics about them, for gold and prediction.
gold_comments = []
pred_comments = []
gold_comment_texts = []
pred_comment_texts = []
gold_comments_count = []
pred_comments_count = []
gold_has_comments_list = []
pred_has_comments_list = []

for idx in tqdm(range(comment_preds.shape[0])):
    input_code = comment_inputs[idx]
    gold = comment_labels[idx]
    pred = comment_preds[idx]
    # get_codebleu is called with a list of reference sets and a list of
    # hypotheses; here there is one reference set holding one gold script.
    refs = [
        [gold]
    ]
    hyp = [pred]
    
    # CodeBLEU of the whole predicted script against the gold script.
    comment_code_score = get_codebleu(refs, hyp, "python", '0.25,0.25,0.25,0.25')
    
    # `comment` comes from utils.regex_parse; its result is joined with "\n"
    # and measured with len() below.
    gold_comment = comment(gold)
    pred_comment = comment(pred)
    gold_comment_text = "\n".join(gold_comment)
    pred_comment_text = "\n".join(pred_comment)
    gold_comment_count = len(gold_comment)
    pred_comment_count = len(pred_comment)
    gold_has_comment = len(gold_comment) > 0
    pred_has_comment = len(pred_comment) > 0
    
    
    
    # Tokens each output changed relative to the input.
    gold_diff_str = get_diff_str(input_code, gold)
    pred_diff_str = get_diff_str(input_code, pred)
    
    # BLEU on the changed tokens only; stays 0 when the prediction changed nothing.
    comment_diff_bleu_score = 0
    if len(pred_diff_str.split()) > 0:
        comment_diff_bleu_score = sentence_bleu([gold_diff_str.split()], pred_diff_str.split(), auto_reweigh=True)
    
    # Weights '1,0,0,0': presumably all weight on the first (n-gram) component
    # of CodeBLEU -- TODO confirm against get_codebleu.
    comment_text_score = get_codebleu([[gold_comment_text]], [pred_comment_text], "python", '1,0,0,0')
    
    comment_code_scores += [comment_code_score]
    comment_text_scores += [comment_text_score]
    comment_diff_bleu_scores += [comment_diff_bleu_score]
       
    gold_comments += [gold_comment]
    pred_comments += [pred_comment]
    gold_comment_texts += [gold_comment_text]
    pred_comment_texts += [pred_comment_text]
    gold_comments_count += [gold_comment_count]
    pred_comments_count += [pred_comment_count]
    gold_has_comments_list += [gold_has_comment]
    pred_has_comments_list += [pred_has_comment]

In [None]:
comment_bleu_scores = np.array([s["ngram"] for s in comment_text_scores])

In [None]:
comment_bleu_scores.mean()

In [None]:
"Comment BLEU score on only comparing difference in prediction:", np.mean(comment_diff_bleu_scores)

In [None]:
comment_bleu_scores.max()

In [None]:
comment_bleu_scores[3236]

In [None]:
idx = 188
print_split_line(f"{idx}-prediction")
print(comment_preds[idx])
print_split_line(f"{idx}-gold labels")
print(comment_labels[idx])
print_split_line(f"{idx}-score")
print(comment_bleu_scores[idx])

In [None]:
comment_total = len(comment_preds)
sum(pred_has_comments_list), sum(gold_has_comments_list)

In [None]:
for idx in range(comment_total):
    if comment_bleu_scores[idx] < 0.5 or comment_bleu_scores[idx] > 0.95:    
        continue
    
    if not pred_has_comments_list[idx]:
        continue
    if not gold_has_comments_list[idx]:
        continue
        
#     if "copyright" in pred_comment_texts[idx].lower():
#         continue
        
#     if "copyright" in gold_comment_texts[idx].lower():
#         continue
        
#     if "license" in pred_comment_texts[idx].lower():
#         continue
        
#     if "license" in gold_comment_texts[idx].lower():
#         continue
        
    
        
#     if "\n#" in pred_comment_texts[idx].lower():
#         continue
        
    # if " #" not in gold_comment_texts[idx].lower():
    #     continue
    
    # if " #" in pred_comment_texts[idx].lower():
    print_split_line(f"{idx}-prediction")
    print(comment_preds[idx])
    print_split_line(f"{idx}-gold labels")
    print(comment_labels[idx])
    print_split_line(f"{idx}-score")
    print(comment_bleu_scores[idx])
    
        

In [None]:
print("Accuracy of whether both do or do not have comments")
sum(np.array(pred_has_comments_list) == np.array(gold_has_comments_list)) / comment_total 

In [None]:
print("Accuracy of whether both have same comment counts")
# Element-wise agreement between gold and predicted comment counts.
comment_count_agreement = np.array(gold_comments_count) == np.array(pred_comments_count)
sum(comment_count_agreement) / comment_total

In [None]:
# Boolean mask: perfect comment BLEU AND prediction has comments AND gold has comments.
# np.logical_and is a binary ufunc whose third positional argument is `out`, so the
# three conditions are combined with .reduce() to make all of them take part.
np.logical_and.reduce([
    comment_bleu_scores == 1,
    np.array(pred_has_comments_list),
    np.array(gold_has_comments_list),
])

In [None]:
# Share of examples with a perfect / at-least-0.9 comment BLEU.
for rate_label, hit_mask in (
    ("Perfect Prediction Rate:", comment_bleu_scores == 1),
    ("Above 0.9 Comment BLEU Prediction Rate:", comment_bleu_scores >= 0.9),
):
    print(rate_label, sum(hit_mask) / comment_total)

In [None]:
# Same rates, counted only over examples where BOTH prediction and gold contain comments.
# The two presence masks are AND-ed first: np.logical_and takes exactly two operands and
# treats a third positional argument as the `out` buffer, which would drop the gold mask.
both_have_comments = np.logical_and(np.array(pred_has_comments_list), np.array(gold_has_comments_list))
print("Perfect Prediction Rate:", sum(np.logical_and(comment_bleu_scores == 1, both_have_comments)) / comment_total)
print("Above 0.9 Comment CodeBLEU Prediction Rate:", sum(np.logical_and(comment_bleu_scores >= 0.9, both_have_comments)) / comment_total)

In [None]:

print_split_line("prediction")
print(preds[3236])
print_split_line("gold labels")
print(labels[3236])

In [None]:
get_codebleu([[labels[20]]], [preds[20]], "python", '0.25,0.25,0.25,0.25')

## Removed Class Parallel Corpus - with outliers

In [None]:
# outlier_class_codet5small
evaluate_codebleu("seq2seq_results/outlier_class_codet5small/codet5_preds.csv",  '0.25,0.25,0.25,0.25')

In [None]:
class_pred_df = pd.read_csv("seq2seq_results/outlier_class_codet5small/codet5_preds.csv")

In [None]:
class_pred_df

In [None]:
# Remove rows whose input is already identical to its gold label.
exact_match_bool = class_pred_df["inputs"] == class_pred_df["labels"]
identical_row_labels = class_pred_df.index[exact_match_bool.to_numpy()]
cleaned_class_pred_df = class_pred_df.drop(index=identical_row_labels)

In [None]:
class_pred_df = cleaned_class_pred_df

In [None]:
# Pull the three text columns out as NumPy arrays for positional indexing.
class_inputs, class_labels, class_preds = (
    class_pred_df[column].to_numpy() for column in ("inputs", "labels", "preds")
)

In [None]:
# getting unit score
# For each example: CodeBLEU of the full prediction, plus sentence-level BLEU
# between the strings returned by get_diff_str for gold and prediction
# (get_diff_str is defined elsewhere; presumably a diff against the input - confirm).
class_scores = []
class_diff_bleu_scores = []
for idx in tqdm(range(class_preds.shape[0])):
    input_code, gold, pred = class_inputs[idx], class_labels[idx], class_preds[idx]
    # One reference group holding one reference, and one hypothesis.
    refs, hyp = [[gold]], [pred]

    gold_diff_str = get_diff_str(input_code, gold)
    pred_diff_str = get_diff_str(input_code, pred)

    # A prediction diff without tokens scores 0 without calling sentence_bleu.
    pred_diff_tokens = pred_diff_str.split()
    if pred_diff_tokens:
        class_diff_bleu_score = sentence_bleu([gold_diff_str.split()], pred_diff_tokens, auto_reweigh=True)
    else:
        class_diff_bleu_score = 0
    class_diff_bleu_scores.append(class_diff_bleu_score)

    score = get_codebleu(refs, hyp, "python", '0.25,0.25,0.25,0.25')
    class_scores.append(score)

In [None]:
"Class BLEU score on only comparing difference in prediction:", np.mean(class_diff_bleu_scores)

In [None]:
class_total = class_preds.shape[0]

In [None]:
class_code_bleus = np.array([s["code_bleu"] for s in class_scores])

In [None]:
# Share of class-corpus examples with a perfect / above-0.9 CodeBLEU.
for rate_label, hit_mask in (
    ("Perfect Prediction Rate:", class_code_bleus == 1),
    ("Above 0.9 CodeBLEU Prediction Rate:", class_code_bleus > 0.9),
):
    print(rate_label, sum(hit_mask) / class_total)

In [None]:
# a perfect case
print_split_line("input")
print(eval_dataset["train"]["no_class_content"][63833])
print_split_line("prediction")
print(class_preds[0])
print_split_line("gold labels")
print(class_labels[0])
print_split_line("score")
print(class_scores[0])

In [None]:

input_idx = 53092
output_idx = 4293
print_split_line("input")
print(eval_dataset["train"]["no_class_content"][input_idx])
print_split_line("prediction")
print(class_preds[output_idx])
print_split_line("gold labels")
print(class_labels[output_idx])
print_split_line("score")
print(class_scores[output_idx])

In [None]:
print(eval_dataset["train"]["no_class_content"][74459])

In [None]:
# from transformers import RobertaTokenizer
# tokenizer = RobertaTokenizer.from_pretrained("Salesforce/codet5-small")

In [None]:
# outlier_no_class_no_super_codet5small
evaluate_codebleu("seq2seq_results/outlier_no_class_no_super_codet5small/codet5_preds.csv",  '0.25,0.25,0.25,0.25')

In [None]:
class_super_pred_df = pd.read_csv("seq2seq_results/outlier_no_class_no_super_codet5small/codet5_preds.csv")

In [None]:
class_super_pred_df

In [None]:
# Remove rows whose input is already identical to its gold label.
# NOTE: the result is stored under the name cleaned_class_pred_df, which the
# following cell reads.
exact_match_bool = class_super_pred_df["inputs"] == class_super_pred_df["labels"]
identical_row_labels = class_super_pred_df.index[exact_match_bool.to_numpy()]
cleaned_class_pred_df = class_super_pred_df.drop(index=identical_row_labels)

In [None]:
class_super_pred_df = cleaned_class_pred_df

In [None]:
# Pull the three text columns out as NumPy arrays for positional indexing.
class_super_inputs, class_super_labels, class_super_preds = (
    class_super_pred_df[column].to_numpy() for column in ("inputs", "labels", "preds")
)

In [None]:
# getting unit score
# For each example: CodeBLEU of the full prediction, plus sentence-level BLEU
# between the strings returned by get_diff_str for gold and prediction
# (get_diff_str is defined elsewhere; presumably a diff against the input - confirm).
class_super_scores = []
class_super_diff_bleu_scores = []
for idx in tqdm(range(class_super_preds.shape[0])):
    input_code, gold, pred = class_super_inputs[idx], class_super_labels[idx], class_super_preds[idx]
    # One reference group holding one reference, and one hypothesis.
    refs, hyp = [[gold]], [pred]

    gold_diff_str = get_diff_str(input_code, gold)
    pred_diff_str = get_diff_str(input_code, pred)

    # A prediction diff without tokens scores 0 without calling sentence_bleu.
    pred_diff_tokens = pred_diff_str.split()
    if pred_diff_tokens:
        class_super_diff_bleu_score = sentence_bleu([gold_diff_str.split()], pred_diff_tokens, auto_reweigh=True)
    else:
        class_super_diff_bleu_score = 0
    class_super_diff_bleu_scores.append(class_super_diff_bleu_score)

    score = get_codebleu(refs, hyp, "python", '0.25,0.25,0.25,0.25')
    class_super_scores.append(score)

In [None]:
"Class BLEU score on only comparing difference in prediction:", np.mean(class_super_diff_bleu_scores)

In [None]:
class_super_total = class_super_preds.shape[0]

In [None]:
class_super_code_bleus = np.array([s["code_bleu"] for s in class_super_scores])

In [None]:
# Share of examples with a perfect / above-0.9 CodeBLEU.
for rate_label, hit_mask in (
    ("Perfect Prediction Rate:", class_super_code_bleus == 1),
    ("Above 0.9 CodeBLEU Prediction Rate:", class_super_code_bleus > 0.9),
):
    print(rate_label, sum(hit_mask) / class_super_total)

In [None]:
# Print every example whose diff BLEU is at least 0.8.
for idx in range(class_super_total):
    diff_bleu = class_super_diff_bleu_scores[idx]
    if diff_bleu >= 0.8:
        for title, content in (
            ("prediction", class_super_preds[idx]),
            ("gold labels", class_super_labels[idx]),
            ("score", diff_bleu),
        ):
            print_split_line(f"{idx}-{title}")
            print(content)

In [None]:
# Print every example whose diff BLEU lies strictly between 0.1 and 0.3.
for idx in range(class_super_total):
    diff_bleu = class_super_diff_bleu_scores[idx]
    if 0.1 < diff_bleu < 0.3:
        for title, content in (
            ("input", class_super_inputs[idx]),
            ("prediction", class_super_preds[idx]),
            ("gold labels", class_super_labels[idx]),
            ("score", diff_bleu),
        ):
            print_split_line(f"{idx}-{title}")
            print(content)

## Removed Doc String Parallel Corpus - with outliers

In [None]:
# outlier_docstring_codet5small
evaluate_codebleu("seq2seq_results/outlier_docstring_codet5small/codet5_preds.csv",  '0.25,0.25,0.25,0.25')

In [None]:
docstr_pred_df = pd.read_csv("seq2seq_results/outlier_docstring_codet5small/codet5_preds.csv")

In [None]:
docstr_pred_df = docstr_pred_df.dropna()

In [None]:
# Remove rows whose input is already identical to its gold label, then
# continue with the filtered frame under the original name.
exact_match_bool = docstr_pred_df["inputs"] == docstr_pred_df["labels"]
identical_row_labels = docstr_pred_df.index[exact_match_bool.to_numpy()]
cleaned_docstr_pred_df = docstr_pred_df.drop(index=identical_row_labels)
docstr_pred_df = cleaned_docstr_pred_df

In [None]:
# Pull the three text columns out as NumPy arrays for positional indexing.
docstr_inputs, docstr_labels, docstr_preds = (
    docstr_pred_df[column].to_numpy() for column in ("inputs", "labels", "preds")
)

In [None]:
import re
def get_docstring(text):
    """Return the bodies of triple-quoted strings that open at the start of a line.

    A match must begin at a line start (optionally preceded by whitespace) with
    either ''' or three double quotes, and runs lazily to the next occurrence of
    the same delimiter. Triple-quoted strings that follow other code on the same
    line are not matched.

    Args:
        text: Source code to scan.

    Returns:
        A list with the inner text of each match (delimiters stripped), in order
        of appearance. Empty list when nothing matches.
    """
    # Raw string so that \s and \S reach the regex engine untouched; in a normal
    # string literal they are invalid escape sequences and trigger a warning.
    regex_docstr = r"^\s*'{3}([\s\S]*?)'{3}|^\s*\"{3}([\s\S]*?)\"{3}"
    docstr_matches = re.findall(regex_docstr, text, re.M | re.S)
    # Each match is a (single-quoted body, double-quoted body) pair; only the
    # alternative that matched is non-empty.
    return [docstr_a if docstr_a else docstr_b for docstr_a, docstr_b in docstr_matches]

In [None]:
# getting unit score
# Per-example bookkeeping for the docstring corpus: extracted docstrings (list,
# joined text, count, presence flag) for gold and prediction, plus three scores.
gold_docstrs, pred_docstrs = [], []
gold_docstr_counts, pred_docstr_counts = [], []
gold_docstr_texts, pred_docstr_texts = [], []
gold_has_docstr_list, pred_has_docstr_list = [], []

docstr_code_scores = []
docstr_text_scores = []
docstr_diff_bleu_scores = []

for idx in tqdm(range(docstr_preds.shape[0])):
    input_code, gold, pred = docstr_inputs[idx], docstr_labels[idx], docstr_preds[idx]
    # One reference group holding one reference, and one hypothesis.
    refs, hyp = [[gold]], [pred]

    gold_docstr = get_docstring(gold)
    pred_docstr = get_docstring(pred)
    gold_docstr_text, pred_docstr_text = "\n".join(gold_docstr), "\n".join(pred_docstr)
    gold_docstr_count, pred_docstr_count = len(gold_docstr), len(pred_docstr)
    gold_has_docstr, pred_has_docstr = gold_docstr_count > 0, pred_docstr_count > 0

    gold_diff_str = get_diff_str(input_code, gold)
    pred_diff_str = get_diff_str(input_code, pred)

    # A prediction diff without tokens scores 0 without calling sentence_bleu.
    pred_diff_tokens = pred_diff_str.split()
    if pred_diff_tokens:
        docstr_diff_bleu_score = sentence_bleu([gold_diff_str.split()], pred_diff_tokens, auto_reweigh=True)
    else:
        docstr_diff_bleu_score = 0

    # Whole-code CodeBLEU, then a docstring-text-only score with weights '1,0,0,0'
    # (presumably all weight on the n-gram component - confirm against get_codebleu).
    docstr_code_score = get_codebleu(refs, hyp, "python", '0.25,0.25,0.25,0.25')
    docstr_text_score = get_codebleu([[gold_docstr_text]], [pred_docstr_text], "python", '1,0,0,0')

    docstr_code_scores.append(docstr_code_score)
    docstr_text_scores.append(docstr_text_score)
    docstr_diff_bleu_scores.append(docstr_diff_bleu_score)

    gold_docstrs.append(gold_docstr)
    pred_docstrs.append(pred_docstr)
    gold_docstr_texts.append(gold_docstr_text)
    pred_docstr_texts.append(pred_docstr_text)
    gold_docstr_counts.append(gold_docstr_count)
    pred_docstr_counts.append(pred_docstr_count)
    gold_has_docstr_list.append(gold_has_docstr)
    pred_has_docstr_list.append(pred_has_docstr)

In [None]:
docstr_text_bleus = np.array([s["ngram"] for s in docstr_text_scores])

In [None]:
docstr_code_bleus = np.array([s["code_bleu"] for s in docstr_code_scores])

In [None]:
docstr_diff_bleu_scores = np.array(docstr_diff_bleu_scores)

In [None]:
docstr_text_bleus.mean()

In [None]:
docstr_total = docstr_preds.shape[0]

In [None]:
"Docstring BLEU score on only comparing difference in prediction:", np.mean(docstr_diff_bleu_scores)

In [None]:
# Rates computed on the docstring-text BLEU scores.
for rate_label, hit_mask in (
    ("Perfect Prediction Rate:", docstr_text_bleus == 1),
    ("Above 0.9 CodeBLEU Prediction Rate:", docstr_text_bleus > 0.9),
):
    print(rate_label, sum(hit_mask) / docstr_total)

In [None]:
# Rates computed on the diff-only BLEU scores.
for rate_label, hit_mask in (
    ("Perfect Prediction Rate:", docstr_diff_bleu_scores == 1),
    ("Above 0.9 CodeBLEU Prediction Rate:", docstr_diff_bleu_scores > 0.9),
):
    print(rate_label, sum(hit_mask) / docstr_total)

In [None]:
idx = 67
print_split_line(f"{idx}-prediction")
print(docstr_preds[idx])
print_split_line(f"{idx}-gold labels")
print(docstr_labels[idx])
print_split_line(f"{idx}-score")
print(docstr_text_bleus[idx])

In [None]:
# Print docstring pairs with text BLEU >= 0.5 whose predicted docstring
# text contains the substring "oStart".
for idx in range(docstr_total):
    text_bleu = docstr_text_bleus[idx]
    if text_bleu >= 0.5 and "oStart" in pred_docstr_texts[idx]:
        for title, content in (
            ("prediction", pred_docstr_texts[idx]),
            ("gold labels", gold_docstr_texts[idx]),
            ("score", text_bleu),
        ):
            print_split_line(f"{idx}-{title}")
            print(content)

In [None]:
# outlier_updated_docstring_codet5small
evaluate_codebleu("seq2seq_results/outlier_updated_docstring_codet5small/codet5_preds.csv")

In [None]:
updated_docstr_pred_df = pd.read_csv("seq2seq_results/outlier_updated_docstring_codet5small/codet5_preds.csv")

In [None]:
# Remove rows whose input is already identical to its gold label, then
# continue with the filtered frame under the original name.
exact_match_bool = updated_docstr_pred_df["inputs"] == updated_docstr_pred_df["labels"]
identical_row_labels = updated_docstr_pred_df.index[exact_match_bool.to_numpy()]
cleaned_docstr_pred_df = updated_docstr_pred_df.drop(index=identical_row_labels)
updated_docstr_pred_df = cleaned_docstr_pred_df

In [None]:
# Pull the three text columns out as NumPy arrays for positional indexing.
updated_docstr_inputs, updated_docstr_labels, updated_docstr_preds = (
    updated_docstr_pred_df[column].to_numpy() for column in ("inputs", "labels", "preds")
)

In [None]:
# getting unit score (updated docstring corpus)
# Per-example bookkeeping: extracted docstrings (list, joined text, count,
# presence flag) for gold and prediction, plus three scores per example.
# NOTE: the gold_/pred_ docstring lists below share their names with the lists
# filled by the first docstring section, so running this cell replaces them.
gold_docstrs = []
pred_docstrs = []
gold_docstr_counts = []
pred_docstr_counts = []
gold_docstr_texts = []
pred_docstr_texts = []
gold_has_docstr_list = []
pred_has_docstr_list = []

updated_docstr_code_scores = []
updated_docstr_text_scores = []
updated_docstr_diff_bleu_scores = []

# Iterate over the UPDATED corpus arrays: the results are stored in and reported
# as updated_docstr_* metrics, so they must be computed from updated_docstr_*.
for idx in tqdm(range(updated_docstr_preds.shape[0])):
    input_code = updated_docstr_inputs[idx]
    gold = updated_docstr_labels[idx]
    pred = updated_docstr_preds[idx]

    # One reference group holding one reference, and one hypothesis.
    refs = [
        [gold]
    ]
    hyp = [pred]

    gold_docstr = get_docstring(gold)
    pred_docstr = get_docstring(pred)
    gold_docstr_text = "\n".join(gold_docstr)
    pred_docstr_text = "\n".join(pred_docstr)
    gold_docstr_count = len(gold_docstr)
    pred_docstr_count = len(pred_docstr)
    gold_has_docstr = len(gold_docstr) > 0
    pred_has_docstr = len(pred_docstr) > 0

    gold_diff_str = get_diff_str(input_code, gold)
    pred_diff_str = get_diff_str(input_code, pred)

    # A prediction diff without tokens scores 0 without calling sentence_bleu.
    docstr_diff_bleu_score = 0
    if len(pred_diff_str.split()) > 0:
        docstr_diff_bleu_score = sentence_bleu([gold_diff_str.split()], pred_diff_str.split(), auto_reweigh=True)

    # Whole-code CodeBLEU, then a docstring-text-only score with weights '1,0,0,0'
    # (presumably all weight on the n-gram component - confirm against get_codebleu).
    docstr_code_score = get_codebleu(refs, hyp, "python", '0.25,0.25,0.25,0.25')
    docstr_text_score = get_codebleu([[gold_docstr_text]], [pred_docstr_text], "python", '1,0,0,0')

    updated_docstr_code_scores += [docstr_code_score]
    updated_docstr_text_scores += [docstr_text_score]
    updated_docstr_diff_bleu_scores += [docstr_diff_bleu_score]

    gold_docstrs += [gold_docstr]
    pred_docstrs += [pred_docstr]
    gold_docstr_texts += [gold_docstr_text]
    pred_docstr_texts += [pred_docstr_text]
    gold_docstr_counts += [gold_docstr_count]
    pred_docstr_counts += [pred_docstr_count]
    gold_has_docstr_list += [gold_has_docstr]
    pred_has_docstr_list += [pred_has_docstr]

In [None]:
updated_docstr_text_bleus = np.array([s["ngram"] for s in updated_docstr_text_scores])

In [None]:
updated_docstr_code_bleus = np.array([s["code_bleu"] for s in updated_docstr_code_scores])

In [None]:
updated_docstr_text_bleus.mean()

In [None]:
updated_docstr_diff_bleu_scores = np.array(updated_docstr_diff_bleu_scores)

In [None]:
"Updated Docstring BLEU score on only comparing difference in prediction:", np.mean(updated_docstr_diff_bleu_scores)

In [None]:
updated_docstr_total = updated_docstr_preds.shape[0]

In [None]:
# Rates computed on the docstring-text BLEU scores.
for rate_label, hit_mask in (
    ("Perfect Prediction Rate:", updated_docstr_text_bleus == 1),
    ("Above 0.9 CodeBLEU Prediction Rate:", updated_docstr_text_bleus > 0.9),
):
    print(rate_label, sum(hit_mask) / updated_docstr_total)

In [None]:
# Rates computed on the diff-only BLEU scores.
for rate_label, hit_mask in (
    ("Perfect Prediction Rate:", updated_docstr_diff_bleu_scores == 1),
    ("Above 0.9 CodeBLEU Prediction Rate:", updated_docstr_diff_bleu_scores > 0.9),
):
    print(rate_label, sum(hit_mask) / updated_docstr_total)

In [None]:
# Print updated-docstring examples whose diff BLEU lies in [0.0, 0.1).
# The loop is bounded by updated_docstr_total because every array indexed in
# the body belongs to the updated corpus, whose size can differ from docstr_total.
for idx in range(updated_docstr_total):
    if updated_docstr_diff_bleu_scores[idx] >= 0.0 and updated_docstr_diff_bleu_scores[idx] < 0.1:
        print_split_line(f"{idx}-input")
        print(updated_docstr_inputs[idx])
        print_split_line(f"{idx}-prediction")
        print(updated_docstr_preds[idx])
        print_split_line(f"{idx}-gold labels")
        print(updated_docstr_labels[idx])
        print_split_line(f"{idx}-score")
        print(updated_docstr_diff_bleu_scores[idx])

## Casing

In [None]:
# outlier_casing_codet5small
evaluate_codebleu("seq2seq_results/outlier_casing_codet5small/codet5_preds.csv",  '0.25,0.25,0.25,0.25')

In [None]:
casing_pred_df = pd.read_csv("seq2seq_results/outlier_casing_codet5small/codet5_preds.csv")

In [None]:
# Remove rows whose input is already identical to its gold label.
exact_match_bool = casing_pred_df["inputs"] == casing_pred_df["labels"]
identical_row_labels = casing_pred_df.index[exact_match_bool.to_numpy()]
cleaned_casing_pred_df = casing_pred_df.drop(index=identical_row_labels)

In [None]:
evaluate_codebleu("",  '0.25,0.25,0.25,0.25', replaced_df=cleaned_casing_pred_df)

In [None]:
casing_pred_df = cleaned_casing_pred_df

In [None]:
# Pull the three text columns out as NumPy arrays for positional indexing.
casing_inputs, casing_labels, casing_preds = (
    casing_pred_df[column].to_numpy() for column in ("inputs", "labels", "preds")
)

In [None]:
# getting unit score
# For each casing example: CodeBLEU of the full prediction, plus sentence-level
# BLEU between the strings returned by get_diff_str for gold and prediction
# (get_diff_str is defined elsewhere; presumably a diff against the input - confirm).
casing_code_scores = []
casing_diff_bleu_scores = []

for idx in tqdm(range(casing_preds.shape[0])):
    input_code, gold, pred = casing_inputs[idx], casing_labels[idx], casing_preds[idx]
    # One reference group holding one reference, and one hypothesis.
    refs, hyp = [[gold]], [pred]

    gold_diff_str = get_diff_str(input_code, gold)
    pred_diff_str = get_diff_str(input_code, pred)

    # A prediction diff without tokens scores 0 without calling sentence_bleu.
    pred_diff_tokens = pred_diff_str.split()
    if pred_diff_tokens:
        casing_diff_bleu_score = sentence_bleu([gold_diff_str.split()], pred_diff_tokens, auto_reweigh=True)
    else:
        casing_diff_bleu_score = 0

    casing_code_score = get_codebleu(refs, hyp, "python", '0.25,0.25,0.25,0.25')

    casing_code_scores.append(casing_code_score)
    casing_diff_bleu_scores.append(casing_diff_bleu_score)

In [None]:
"Casing BLEU score on only comparing difference in prediction:", np.mean(casing_diff_bleu_scores)

In [None]:
# docstr_text_bleus = np.array([s["ngram"] for s in docstr_text_scores])

In [None]:
casing_code_bleus = np.array([s["code_bleu"] for s in casing_code_scores])

In [None]:
# docstr_text_bleus.mean()

In [None]:
casing_total = casing_preds.shape[0]

In [None]:
# Share of casing examples with a perfect / above-0.9 CodeBLEU.
for rate_label, hit_mask in (
    ("Perfect Prediction Rate:", casing_code_bleus == 1),
    ("Above 0.9 CodeBLEU Prediction Rate:", casing_code_bleus > 0.9),
):
    print(rate_label, sum(hit_mask) / casing_total)

In [None]:
idx = 531
print_split_line(f"{idx}-prediction")
print(casing_preds[idx])
print_split_line(f"{idx}-gold labels")
print(casing_labels[idx])
print_split_line(f"{idx}-score")
print(casing_code_bleus[idx])

In [None]:
# Print casing examples whose CodeBLEU lies strictly between 0.5 and 0.6.
for idx in range(casing_total):
    code_bleu = casing_code_bleus[idx]
    if 0.5 < code_bleu < 0.6:
        for title, content in (
            ("input", casing_inputs[idx]),
            ("prediction", casing_preds[idx]),
            ("gold labels", casing_labels[idx]),
            ("score", code_bleu),
        ):
            print_split_line(f"{idx}-{title}")
            print(content)

## List Comprehension

In [None]:
# outlier_comp_codet5small
evaluate_codebleu("seq2seq_results/outlier_comp_codet5small/codet5_preds.csv",  '0.25,0.25,0.25,0.25')

In [None]:
comp_pred_df = pd.read_csv("seq2seq_results/outlier_comp_codet5small/codet5_preds.csv")

In [None]:
# Remove rows whose input is already identical to its gold label.
exact_match_bool = comp_pred_df["inputs"] == comp_pred_df["labels"]
identical_row_labels = comp_pred_df.index[exact_match_bool.to_numpy()]
cleaned_comp_pred_df = comp_pred_df.drop(index=identical_row_labels)

In [None]:
evaluate_codebleu("",  '0.25,0.25,0.25,0.25', replaced_df=cleaned_comp_pred_df)

In [None]:
comp_pred_df = cleaned_comp_pred_df

In [None]:
# Pull the three text columns out as NumPy arrays for positional indexing.
comp_inputs, comp_labels, comp_preds = (
    comp_pred_df[column].to_numpy() for column in ("inputs", "labels", "preds")
)

In [None]:
# getting unit score
# For each list-comprehension example: CodeBLEU of the full prediction, plus
# sentence-level BLEU between the strings returned by get_diff_str for gold and
# prediction (get_diff_str is defined elsewhere; presumably a diff against the
# input - confirm).
comp_code_scores = []
comp_diff_bleu_scores = []

for idx in tqdm(range(comp_preds.shape[0])):
    gold, pred = comp_labels[idx], comp_preds[idx]
    # One reference group holding one reference, and one hypothesis.
    refs, hyp = [[gold]], [pred]

    input_code = comp_inputs[idx]
    gold_diff_str = get_diff_str(input_code, gold)
    pred_diff_str = get_diff_str(input_code, pred)

    # A prediction diff without tokens scores 0 without calling sentence_bleu.
    pred_diff_tokens = pred_diff_str.split()
    if pred_diff_tokens:
        comp_diff_bleu_score = sentence_bleu([gold_diff_str.split()], pred_diff_tokens, auto_reweigh=True)
    else:
        comp_diff_bleu_score = 0

    comp_code_score = get_codebleu(refs, hyp, "python", '0.25,0.25,0.25,0.25')

    comp_code_scores.append(comp_code_score)
    comp_diff_bleu_scores.append(comp_diff_bleu_score)

In [None]:
"List Comp BLEU score on only comparing difference in prediction:", np.mean(comp_diff_bleu_scores)

In [None]:
# docstr_text_bleus = np.array([s["ngram"] for s in docstr_text_scores])

In [None]:
comp_code_bleus = np.array([s["code_bleu"] for s in comp_code_scores])

In [None]:
# docstr_text_bleus.mean()

In [None]:
comp_total = comp_preds.shape[0]

In [None]:
# Share of list-comprehension examples with a perfect / above-0.9 CodeBLEU.
for rate_label, hit_mask in (
    ("Perfect Prediction Rate:", comp_code_bleus == 1),
    ("Above 0.9 CodeBLEU Prediction Rate:", comp_code_bleus > 0.9),
):
    print(rate_label, sum(hit_mask) / comp_total)

In [None]:
# idx = 760
# print_split_line(f"{idx}-prediction")
# print(docstr_preds[idx])
# print_split_line(f"{idx}-gold labels")
# print(docstr_labels[idx])
# print_split_line(f"{idx}-score")
# print(docstr_text_bleus[idx])

In [None]:
comp_code_bleus.max()

In [None]:
# Print list-comprehension examples, skipping perfect scores, scores of 0.7 or
# more, and scores below 0.4.
for idx in range(comp_total):
    code_bleu = comp_code_bleus[idx]
    if code_bleu == 1 or code_bleu >= 0.7 or code_bleu < 0.4:
        continue
    for title, content in (
        ("input", comp_inputs[idx]),
        ("prediction", comp_preds[idx]),
        ("gold labels", comp_labels[idx]),
        ("score", code_bleu),
    ):
        print_split_line(f"{idx}-{title}")
        print(content)

In [None]:
# outlier_fixed_list_comp_codet5small
evaluate_codebleu("seq2seq_results/outlier_fixed_list_comp_codet5small/codet5_preds.csv",  '0.25,0.25,0.25,0.25')

In [None]:
fixed_comp_pred_df = pd.read_csv("seq2seq_results/outlier_fixed_list_comp_codet5small/codet5_preds.csv")

In [None]:
# Remove rows whose input is already identical to its gold label.
# NOTE: the result is stored under the name cleaned_comp_pred_df, which the
# following cells read.
exact_match_bool = fixed_comp_pred_df["inputs"] == fixed_comp_pred_df["labels"]
identical_row_labels = fixed_comp_pred_df.index[exact_match_bool.to_numpy()]
cleaned_comp_pred_df = fixed_comp_pred_df.drop(index=identical_row_labels)

In [None]:
evaluate_codebleu("",  '0.25,0.25,0.25,0.25', replaced_df=cleaned_comp_pred_df)

In [None]:
fixed_comp_pred_df = cleaned_comp_pred_df

In [None]:
# Pull the three text columns out as NumPy arrays for positional indexing.
fixed_comp_inputs, fixed_comp_labels, fixed_comp_preds = (
    fixed_comp_pred_df[column].to_numpy() for column in ("inputs", "labels", "preds")
)

In [None]:
# getting unit score
# For each fixed list-comprehension example: CodeBLEU of the full prediction,
# plus sentence-level BLEU between the strings returned by get_diff_str for gold
# and prediction (get_diff_str is defined elsewhere; presumably a diff against
# the input - confirm).
fixed_comp_code_scores = []
fixed_comp_diff_bleu_scores = []

for idx in tqdm(range(fixed_comp_preds.shape[0])):
    gold, pred = fixed_comp_labels[idx], fixed_comp_preds[idx]
    # One reference group holding one reference, and one hypothesis.
    refs, hyp = [[gold]], [pred]

    input_code = fixed_comp_inputs[idx]
    gold_diff_str = get_diff_str(input_code, gold)
    pred_diff_str = get_diff_str(input_code, pred)

    # A prediction diff without tokens scores 0 without calling sentence_bleu.
    pred_diff_tokens = pred_diff_str.split()
    if pred_diff_tokens:
        fixed_comp_diff_bleu_score = sentence_bleu([gold_diff_str.split()], pred_diff_tokens, auto_reweigh=True)
    else:
        fixed_comp_diff_bleu_score = 0

    fixed_comp_code_score = get_codebleu(refs, hyp, "python", '0.25,0.25,0.25,0.25')

    fixed_comp_code_scores.append(fixed_comp_code_score)
    fixed_comp_diff_bleu_scores.append(fixed_comp_diff_bleu_score)

In [None]:
fixed_comp_diff_bleu_scores = np.array(fixed_comp_diff_bleu_scores)

In [None]:
"List Comp BLEU score on only comparing difference in prediction:", np.mean(fixed_comp_diff_bleu_scores)

In [None]:
# docstr_text_bleus = np.array([s["ngram"] for s in docstr_text_scores])

In [None]:
fixed_comp_code_bleus = np.array([s["code_bleu"] for s in fixed_comp_code_scores])

In [None]:
# docstr_text_bleus.mean()

In [None]:
fixed_comp_total = fixed_comp_preds.shape[0]

In [None]:
# Rates computed on the whole-code CodeBLEU scores.
for rate_label, hit_mask in (
    ("Perfect Prediction Rate:", fixed_comp_code_bleus == 1),
    ("Above 0.9 CodeBLEU Prediction Rate:", fixed_comp_code_bleus > 0.9),
):
    print(rate_label, sum(hit_mask) / fixed_comp_total)

In [None]:
# Rates computed on the diff-only BLEU scores.
for rate_label, hit_mask in (
    ("Perfect Prediction Rate:", fixed_comp_diff_bleu_scores == 1),
    ("Above 0.9 CodeBLEU Prediction Rate:", fixed_comp_diff_bleu_scores > 0.9),
):
    print(rate_label, sum(hit_mask) / fixed_comp_total)

In [None]:
# idx = 760
# print_split_line(f"{idx}-prediction")
# print(docstr_preds[idx])
# print_split_line(f"{idx}-gold labels")
# print(docstr_labels[idx])
# print_split_line(f"{idx}-score")
# print(docstr_text_bleus[idx])

In [None]:
fixed_comp_code_bleus.max()

In [None]:
# Print the examples whose diff BLEU is exactly 1.
# (The "<= 0.9" test can never fire once the score equals 1; it is kept so the
# threshold is easy to adjust.)
for idx in range(fixed_comp_total):
    diff_bleu = fixed_comp_diff_bleu_scores[idx]
    if diff_bleu != 1 or diff_bleu <= 0.9:
        continue
    for title, content in (
        ("input", fixed_comp_inputs[idx]),
        ("prediction", fixed_comp_preds[idx]),
        ("gold labels", fixed_comp_labels[idx]),
        ("score", diff_bleu),
    ):
        print_split_line(f"{idx}-{title}")
        print(content)

# Codex

## list comp - post processing

In [38]:
folder = "/data/ken/data/code"
model_name = "."
ckpt = "."

In [41]:
pred_csvfile = "codex_output_postprocessed.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["list_comp"]

In [42]:
list_comp_codex_codebleu = evaluate_codebleu(file_name,  '0.25,0.25,0.25,0.25')
list_comp_codex_codebleu

{'ngram': 0.8944182364863242,
 'weighted_ngram': 0.8976747447630965,
 'syntax_match': 0.8812310617313291,
 'dataflow_match': 0.7832284157996021,
 'code_bleu': 0.8641381146950879}

In [43]:
list_comp_codex_pred_df = pd.read_csv(file_name)

In [44]:
list_comp_codex_report = evaluate_pred_df(list_comp_codex_pred_df, target_feats, is_nl=True, parse_test=True)

  0%|          | 0/473 [00:00<?, ?it/s]

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [45]:
# Print only the report entries that are not lists and have an empty shape,
# i.e. the scalar summary metrics.
for key, val in list_comp_codex_report.items():
    if type(val) == list:
        continue
    if len(val.shape) != 0:
        continue
    print(key, ":", val)

codebleu_perfect : 0.0021141649048625794
codebleu_above_90 : 0.5095137420718816
diff_bleu_avg : 0.16295339387378543
diff_bleu_perfect : 0.048625792811839326
diff_bleu_above_90 : 0.0507399577167019
parse_test_accuracy : 0.5665961945031712


In [49]:
lookup_examples(list_comp_codex_report, 0.01, 0)




import numpy as np
from tensorpack.RL import HistoryFramePlayer
__all__ = ['HistoryPlayerWithVar']

class HistoryPlayerWithVar(HistoryFramePlayer):

    def current_state(self):
        assert (len(self.history) != 0)
        assert (len(self.history[0]) == 2), 'state needs to be like [img, vars]'
        diff_len = (self.history.maxlen - len(self.history))
        zeros = [[31m]
        for [0mk in range(diff_len)[31m:
            zeros.append(np.zeros_like(self.history[0][0]))[0m
        for k in self.history:
            zeros.append(k[0])
        img = np.concatenate(zeros, axis=2)
        gvar = self.history[(- 1)][1]
        return (img, gvar)




import numpy as np
from tensorpack.RL import HistoryFramePlayer
__all__ = ['HistoryPlayerWithVar']

class HistoryPlayerWithVar(HistoryFramePlayer):

    def current_state(self):
        assert (len(self.history) != 0)
        assert (len(self.history[0]) == 2), 'state needs to be like [img, vars]'
        diff_len = (self.history

## decorator - post processing

In [2]:
folder = "/data/ken/data/code"
model_name = "."
ckpt = "."

In [3]:
pred_csvfile = "decorator.output_post_process.csv"
file_name = f"{folder}/{model_name}/{ckpt}/{pred_csvfile}"
target_feats = ["decorator"]

In [4]:
decorator_codex_pred_df = pd.read_csv(file_name)
decorator_codex_pred_df = get_valid_pred_df(decorator_codex_pred_df)

In [5]:
decorator_codex_codebleu = evaluate_codebleu(file_name, replaced_df=decorator_codex_pred_df, weights='0.25,0.25,0.25,0.25')
decorator_codex_codebleu

{'ngram': 0.634677744038495,
 'weighted_ngram': 0.6514251474979365,
 'syntax_match': 0.7381001301280734,
 'dataflow_match': 0.6642056330325403,
 'code_bleu': 0.6721021636742612}

In [6]:
decorator_codex_report = evaluate_pred_df(decorator_codex_pred_df, target_feats, is_nl=True, parse_test=True)

  0%|          | 0/173 [00:00<?, ?it/s]

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()




In [7]:
# Print only the report entries that are not lists and have an empty shape,
# i.e. the scalar summary metrics.
for key, val in decorator_codex_report.items():
    if type(val) == list:
        continue
    if len(val.shape) != 0:
        continue
    print(key, ":", val)

codebleu_perfect : 0.005780346820809248
codebleu_above_90 : 0.08670520231213873
diff_bleu_avg : 0.01865318485074729
diff_bleu_perfect : 0.017341040462427744
diff_bleu_above_90 : 0.017341040462427744
parse_test_accuracy : 0.3583815028901734


In [8]:
decorator_codex_report = evaluate_pred_df(decorator_codex_pred_df, target_feats, clean_diff=True, is_nl=True, parse_test=True)

  0%|          | 0/173 [00:00<?, ?it/s]



In [9]:
# Print only the report entries that are not lists and have an empty shape,
# i.e. the scalar summary metrics.
for key, val in decorator_codex_report.items():
    if type(val) == list:
        continue
    if len(val.shape) != 0:
        continue
    print(key, ":", val)

codebleu_perfect : 0.005780346820809248
codebleu_above_90 : 0.08670520231213873
diff_bleu_avg : 0.06637365209541911
diff_bleu_perfect : 0.057803468208092484
diff_bleu_above_90 : 0.057803468208092484
parse_test_accuracy : 0.3583815028901734


In [16]:
lookup_examples(decorator_codex_report, 1, 0.5, count=25)



import os
import pytest
EXAMINATORS = ['bamboo.buildKey', 'BUILD_ID', 'BUILD_NUMBER', 'BUILDKITE', 'CI', 'CIRCLECI', 'CONTINUOUS_INTEGRATION', 'GITHUB_ACTIONS', 'HUDSON_URL', 'JENKINS_URL', 'TEAMCITY_VERSION', 'TRAVIS']

def pytest_runtest_makereport(item):
    [31m"""[0mFailing test cases are not a problem anymore.[31m"""[0m
    outcome = (yield)
    rep = outcome.get_result()
    examinators = EXAMINATORS
    for examinator in item.config.getini('vw_examinators').split('\n'):
        examinators.append(examinator.strip())
    if any((os.environ.get(gaze, False) for gaze in examinators)):
        rep.outcome = 'passed'

def pytest_addoption(parser):
    parser.addini('vw_examinators', 'List of additional VW examinators.')


import os
import pytest
EXAMINATORS = ['bamboo.buildKey', 'BUILD_ID', 'BUILD_NUMBER', 'BUILDKITE', 'CI', 'CIRCLECI', 'CONTINUOUS_INTEGRATION', 'GITHUB_ACTIONS', 'HUDSON_URL', 'JENKINS_URL', 'TEAMCITY_VERSION', 'TRAVIS']
[31m
@pytest.hookimpl(hookwrapper=True

In [12]:
decorator_codex_report["pred_diffs"][0], decorator_codex_report["gold_diffs"][0]

('@mock.patch socket.gethostbyname_ex ip',
 'None : None : @mock.patch socket.gethostbyname_ex ')