In [121]:
# eval on predictions
import re
import difflib

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from evaluator.CodeBLEU.calc_code_bleu import get_codebleu
from nltk.translate.bleu_score import sentence_bleu

from utils.regex_parse import comment


In [2]:
def evaluate_codebleu(pred_filename, weights="0.25,0.25,0.25,0.25", replaced_df=None):
    pred_df = None
    if replaced_df is not None:
        pred_df = replaced_df
    else:
        pred_df = pd.read_csv(pred_filename)
    # a list of gold codes (which is just some variants of the same code, we can use every code of different styles)
    refs = [
        pred_df["labels"]
    ]
    # the prediction code
    hyp = pred_df["preds"]
    score = get_codebleu(refs, hyp, "python", weights)
    return score

In [3]:
def print_split_line(s):
    print(f"\n====================={s.upper()}=====================\n")

In [None]:
def tokenize(s):
    return re.split('\s+', s)

def get_diff_list(str_1, str_2):
    s1 = tokenize(str_1)
    s2 = tokenize(str_2)

    matcher = difflib.SequenceMatcher(a=s1, b=s2)

    diff_blocks_a = []
    diff_blocks_b = []

    prev_match = None
    for idx, match in enumerate(matcher.get_matching_blocks()):

        if idx == 0: 
            prev_match = match
            if match.a != 0:
                start_idx_a = 0
                end_idx_a = match.a
                diff_blocks_a += s1[start_idx_a:end_idx_a]
            if match.b != 0:
                start_idx_b = 0
                end_idx_b = match.b
                diff_blocks_b += s2[start_idx_b:end_idx_b]
            continue

        start_idx_a = prev_match.a + prev_match.size
        end_idx_a = match.a

        start_idx_b = prev_match.b + prev_match.size
        end_idx_b = match.b

        diff_list_a = s1[start_idx_a:end_idx_a]
        diff_list_b = s2[start_idx_b:end_idx_b]
        if len(diff_list_a):
            diff_blocks_a += diff_list_a
        if len(diff_list_b):
            diff_blocks_b += diff_list_b

        prev_match = match
    return diff_blocks_a, diff_blocks_b

def get_diff_str(input_str, output_str):
    return " ".join(get_diff_list(input_str, output_str)[1])

# Uncomment Parallel Corpus

In [5]:
# no_outlier_codet5small
evaluate_codebleu("seq2seq_results/no_outlier_codet5small/codet5_preds.csv")

{'ngram': 0.7020337009365258,
 'weighted_ngram': 0.7158332483308997,
 'syntax_match': 0.9308169419505175,
 'dataflow_match': 0.8687742139204994,
 'code_bleu': 0.8043645262846106}

In [7]:
# outlier_codet5small
evaluate_codebleu("seq2seq_results/outlier_codet5small/codet5_preds.csv")

{'ngram': 0.7020194689240385,
 'weighted_ngram': 0.7148537062012494,
 'syntax_match': 0.9295950989323382,
 'dataflow_match': 0.8680684288664943,
 'code_bleu': 0.8036341757310301}

In [118]:
comment_pred_df = pd.read_csv("seq2seq_results/outlier_codet5small/codet5_preds.csv")

In [123]:
# excluding those input exactly same as the output
exact_match_bool = comment_pred_df["inputs"] == comment_pred_df["labels"]
cleaned_comment_pred_df = comment_pred_df.drop(comment_pred_df[exact_match_bool].index)

In [None]:
evaluate_codebleu("", weights="0.25,0.25,0.25,0.25", replaced_df=cleaned_comment_pred_df)

In [124]:
comment_pred_df = cleaned_comment_pred_df

In [125]:
comment_inputs = comment_pred_df["inputs"].to_numpy()
comment_labels = comment_pred_df["labels"].to_numpy()
comment_preds = comment_pred_df["preds"].to_numpy()

In [None]:
# getting unit score
comment_code_scores = []
comment_text_scores = []
comment_diff_bleu_scores = []

gold_comments = []
pred_comments = []
gold_comment_texts = []
pred_comment_texts = []
gold_comments_count = []
pred_comments_count = []
gold_has_comments_list = []
pred_has_comments_list = []

for idx in tqdm(range(comment_preds.shape[0])):
    input_code = comment_inputs[idx]
    gold = comment_labels[idx]
    pred = comment_preds[idx]
    refs = [
        [gold]
    ]
    hyp = [pred]
    
    comment_code_score = get_codebleu(refs, hyp, "python", '0.25,0.25,0.25,0.25')
    
    gold_comment = comment(gold)
    pred_comment = comment(pred)
    gold_comment_text = "\n".join(gold_comment)
    pred_comment_text = "\n".join(pred_comment)
    gold_comment_count = len(gold_comment)
    pred_comment_count = len(pred_comment)
    gold_has_comment = len(gold_comment) > 0
    pred_has_comment = len(pred_comment) > 0
    
    
    
    gold_diff_str = get_diff_str(input_code, gold)
    pred_diff_str = get_diff_str(input_code, pred)
    
    comment_diff_bleu_score = 0
    if len(pred_diff_str.split()) > 0:
        comment_diff_bleu_score = sentence_bleu([gold_diff_str.split()], pred_diff_str.split(), auto_reweigh=True)
    
    comment_text_score = get_codebleu([[gold_comment_text]], [pred_comment_text], "python", '1,0,0,0')
    
    comment_code_scores += [comment_code_score]
    comment_text_scores += [comment_text_score]
    comment_diff_bleu_scores += [comment_diff_bleu_score]
       
    gold_comments += [gold_comment]
    pred_comments += [pred_comment]
    gold_comment_texts += [gold_comment_text]
    pred_comment_texts += [pred_comment_text]
    gold_comments_count += [gold_comment_count]
    pred_comments_count += [pred_comment_count]
    gold_has_comments_list += [gold_has_comment]
    pred_has_comments_list += [pred_has_comment]

In [128]:
comment_bleu_scores = np.array([s["ngram"] for s in comment_text_scores])

In [129]:
comment_bleu_scores.mean()

0.18750932499757741

In [131]:
"Comment BLEU score on only comparing difference in prediction:", np.mean(comment_diff_bleu_scores)

('Comment BLEU score on only comparing difference in prediction:',
 0.2515634110214649)

In [12]:
comment_bleu_scores.max()

1.0

In [18]:
comment_bleu_scores[3236]

0.9013987750892306

In [17]:
idx = 188
print_split_line(f"{idx}-prediction")
print(comment_preds[idx])
print_split_line(f"{idx}-gold labels")
print(comment_labels[idx])
print_split_line(f"{idx}-score")
print(comment_bleu_scores[idx])



#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import worlds as my_world

fldr = os.getcwd() + os.sep + 'data' + os.sep + 'worlds'

class Planet(object):
    """
    class to manage the simplified evolution of a planet to 
    build a virtual world. Takes basic atmospheric parameters
    and *very* roughly uses these to guess what the world would
    look like.
    The idea is to be able to auto generate worlds as follows:
    green lush worlds: sun > 0.15, rain > 0.15
    earth like worlds: sun=0.2, rain=0.1, wind=0.1
    metal rich worlds: sun<0.2, wind>0.2, seismic_activity>0.6
    """
    def __init__(self, name, num_seeds, width, height, wind, rain, sun, lava):
        """
        All parameters must be between 0 and 1 and show the probability of
        that event. The numbers below are rough guidelines for normal planets
        wind 0.0 -> 0.2 : determines air currents, rain movement, topsoil
        rain 0.1 -> 0.6 : determines plant growth, river networks
        

In [13]:
comment_total = len(comment_preds)
sum(pred_has_comments_list), sum(gold_has_comments_list)

(1449, 2334)

In [None]:
for idx in range(comment_total):
    if comment_bleu_scores[idx] < 0.5 or comment_bleu_scores[idx] > 0.95:    
        continue
    
    if not pred_has_comments_list[idx]:
        continue
    if not gold_has_comments_list[idx]:
        continue
        
#     if "copyright" in pred_comment_texts[idx].lower():
#         continue
        
#     if "copyright" in gold_comment_texts[idx].lower():
#         continue
        
#     if "license" in pred_comment_texts[idx].lower():
#         continue
        
#     if "license" in gold_comment_texts[idx].lower():
#         continue
        
    
        
#     if "\n#" in pred_comment_texts[idx].lower():
#         continue
        
    # if " #" not in gold_comment_texts[idx].lower():
    #     continue
    
    # if " #" in pred_comment_texts[idx].lower():
    print_split_line(f"{idx}-prediction")
    print(comment_preds[idx])
    print_split_line(f"{idx}-gold labels")
    print(comment_labels[idx])
    print_split_line(f"{idx}-score")
    print(comment_bleu_scores[idx])
    
        

In [19]:
print("Accuracy of whether both do or do not have comments")
sum(np.array(pred_has_comments_list) == np.array(gold_has_comments_list)) / comment_total 

Accuracy of whether both do or do not have comments


0.6509259259259259

In [21]:
print("Accuracy of whether both have same comment counts")
sum(np.array(gold_comments_count) == np.array(pred_comments_count)) / comment_total 

Accuracy of whether both have same comment counts


0.4049382716049383

In [67]:
np.logical_and(comment_bleu_scores == 1, np.array(pred_has_comments_list), np.array(gold_has_comments_list))

array([False,  True, False, ..., False,  True, False])

In [64]:
print("Perfect Prediction Rate:", sum(comment_bleu_scores == 1) / comment_total)
print("Above 0.9 Comment BLEU Prediction Rate:", sum(comment_bleu_scores >= 0.9) / comment_total)

Perfect Prediction Rate: 0.09598765432098766
Above 0.9 Comment BLEU Prediction Rate: 0.13425925925925927


In [69]:
print("Perfect Prediction Rate:", sum(np.logical_and(comment_bleu_scores == 1, np.array(pred_has_comments_list), np.array(gold_has_comments_list))) / comment_total)
print("Above 0.9 Comment CodeBLEU Prediction Rate:", sum(np.logical_and(comment_bleu_scores >= 0.9, np.array(pred_has_comments_list), np.array(gold_has_comments_list))) / comment_total)

Perfect Prediction Rate: 0.09598765432098766
Above 0.9 Comment BLEU Prediction Rate: 0.13425925925925927


In [50]:

print_split_line("prediction")
print(preds[3236])
print_split_line("gold labels")
print(labels[3236])

# coding=utf-8
# Copyright 2022 The TensorFlow Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for trajectory."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import tensorflow as tf

from tf_agents.drivers import dynamic_episode_driver
from tf_agents.drivers import test_utils as drivers_test_utils
from tf_agents.environments import tf_py_environment
from tf_agents.trajectories import t

In [47]:
get_codebleu([[labels[20]]], [preds[20]], "python", '0.25,0.25,0.25,0.25')

{'ngram': 0.030049336124714957,
 'weighted_ngram': 0.049595120779045175,
 'syntax_match': 0.9565217391304348,
 'dataflow_match': 1.0,
 'code_bleu': 0.5090415490085487}

# Removed Class Parallel Corpus - with outliers

In [6]:
# outlier_class_codet5small
evaluate_codebleu("seq2seq_results/outlier_class_codet5small/codet5_preds.csv",  '0.25,0.25,0.25,0.25')

{'ngram': 0.9539663181494397,
 'weighted_ngram': 0.955850824785441,
 'syntax_match': 0.9574235970530824,
 'dataflow_match': 0.8715078962664325,
 'code_bleu': 0.9346871590635989}

In [29]:
class_pred_df = pd.read_csv("seq2seq_results/outlier_class_codet5small/codet5_preds.csv")

In [30]:
class_pred_df

Unnamed: 0.1,Unnamed: 0,preds,labels
0,0,"""""""Sequence-to-sequence model with an attentio...","""""""Sequence-to-sequence model with an attentio..."
1,1,"from django.db import models, migrations\n\ncl...","from django.db import models, migrations\n\ncl..."
2,2,"from lampost.di.resource import Injected, modu...","from lampost.di.resource import Injected, modu..."
3,3,"import logging\nfrom typing import Any, Dict\n...","import logging\nfrom typing import Any, Dict\n..."
4,4,"""""""Module for finding vulnerabilities based on...","""""""Module for finding vulnerabilities based on..."
...,...,...,...
7840,7840,from.dice import Dice\nfrom.player import Play...,from.dice import Dice\nfrom.player import Play...
7841,7841,from.views import BaseView\nfrom forums.extens...,from.views import BaseView\nfrom forums.extens...
7842,7842,"""""""\nutils.py\n========\nThis submodule contai...","""""""\nutils.py\n========\nThis submodule contai..."
7843,7843,"""""""Tests for audio_classifier.""""""\nimport enum...","""""""Tests for audio_classifier.""""""\nimport enum..."


In [31]:
class_labels = class_pred_df["labels"].to_numpy()
class_preds = class_pred_df["preds"].to_numpy()

In [None]:
# getting unit score
class_scores = []
for idx in tqdm(range(class_preds.shape[0])):
    refs = [
        [class_labels[idx]]
    ]
    hyp = [class_preds[idx]]
    score = get_codebleu(refs, hyp, "python", '0.25,0.25,0.25,0.25')
    class_scores += [score]

In [54]:
class_total = class_preds.shape[0]

In [43]:
for idx, score in enumerate(class_scores):
    if score["weighted_ngram"] == 1:
        print(idx)
        break

0


In [None]:

from datasets import load_from_disk, load_metric
fname_prefix = ""

test_codet5_dataset = load_from_disk(fname_prefix + 'datasets/codet5_test_class_bq_padded.hf') #codet5_train_class_bq_padded.hf


In [48]:
keyword = "Construct sequence encoder"
for idx, code in enumerate(test_codet5_dataset["content"]):
    if keyword in code:
        print(idx)

train 63833


In [51]:
class_code_bleus = np.array([s["code_bleu"] for s in class_scores])

In [60]:
print("Perfect Prediction Rate:", sum(class_code_bleus == 1) / class_total)
print("Above 0.9 CodeBLEU Prediction Rate:", sum(class_code_bleus > 0.9) / class_total)

Perfect Prediction Rate: 0.28846398980242194
Above 0.9 CodeBLEU Prediction Rate: 0.7445506692160612


In [None]:
# a perfect case
print_split_line("input")
print(eval_dataset["train"]["no_class_content"][63833])
print_split_line("prediction")
print(class_preds[0])
print_split_line("gold labels")
print(class_labels[0])
print_split_line("score")
print(class_scores[0])

In [None]:

input_idx = 53092
output_idx = 4293
print_split_line("input")
print(eval_dataset["train"]["no_class_content"][input_idx])
print_split_line("prediction")
print(class_preds[output_idx])
print_split_line("gold labels")
print(class_labels[output_idx])
print_split_line("score")
print(class_scores[output_idx])

In [34]:
print(eval_dataset["train"]["no_class_content"][74459])


import ctypes
pass

def __init__(message):
    message += (' (%s)' % ctypes.WinError())
    super(PyperclipWindowsException, self).__init__(message)



In [52]:
# from transformers import RobertaTokenizer
# tokenizer = RobertaTokenizer.from_pretrained("Salesforce/codet5-small")

# Removed Doc String Parallel Corpus - with outliers

In [3]:
# outlier_docstring_codet5small
evaluate_codebleu("seq2seq_results/outlier_docstring_codet5small/codet5_preds.csv",  '0.25,0.25,0.25,0.25')

{'ngram': 0.6007622882171681,
 'weighted_ngram': 0.6152656721984298,
 'syntax_match': 0.8764716714385848,
 'dataflow_match': 0.7972510418090768,
 'code_bleu': 0.7224376684158149}

In [114]:
docstr_pred_df = pd.read_csv("seq2seq_results/outlier_docstring_codet5small/codet5_preds.csv")

In [20]:
docstr_labels = docstr_pred_df["labels"].to_numpy()
docstr_preds = docstr_pred_df["preds"].to_numpy()

In [22]:
import re
def get_docstring(text):
    regex_docstr = "^\s*\'{3}([\s\S]*?)\'{3}|^\s*\"{3}([\s\S]*?)\"{3}"
    docstr_matches = re.findall(regex_docstr, text, re.M | re.S)
    docstrs = []
    for match in docstr_matches:
        docstr_a, docstr_b = match
        if docstr_a:
            docstrs += [docstr_a]
        else:
            docstrs += [docstr_b]
    return docstrs

In [None]:
# getting unit score
gold_docstrs = []
pred_docstrs = []
gold_docstr_counts = []
pred_docstr_counts = []
gold_docstr_texts = []
pred_docstr_texts = []
gold_has_docstr_list = []
pred_has_docstr_list = []

docstr_code_scores = []
docstr_text_scores = []

for idx in tqdm(range(docstr_preds.shape[0])):
    
    gold = docstr_labels[idx]
    pred = docstr_preds[idx]
    
    refs = [
        [gold]
    ]
    hyp = [pred]
    
    gold_docstr = get_docstring(gold)
    pred_docstr = get_docstring(pred)
    gold_docstr_text = "\n".join(gold_docstr)
    pred_docstr_text = "\n".join(pred_docstr)
    gold_docstr_count = len(gold_docstr)
    pred_docstr_count = len(pred_docstr)
    gold_has_docstr = len(gold_docstr) > 0
    pred_has_docstr = len(pred_docstr) > 0
    
    
    docstr_code_score = get_codebleu(refs, hyp, "python", '0.25,0.25,0.25,0.25')
    docstr_text_score = get_codebleu([[gold_docstr_text]], [pred_docstr_text], "python", '1,0,0,0')
    
    docstr_code_scores += [docstr_code_score]
    docstr_text_scores += [docstr_text_score]
       
    gold_docstrs += [gold_docstr]
    pred_docstrs += [pred_docstr]
    gold_docstr_texts += [gold_docstr_text]
    pred_docstr_texts += [pred_docstr_text]
    gold_docstr_counts += [gold_docstr_count]
    pred_docstr_counts += [pred_docstr_count]
    gold_has_docstr_list += [gold_has_docstr]
    pred_has_docstr_list += [pred_has_docstr]

In [24]:
docstr_text_bleus = np.array([s["ngram"] for s in docstr_text_scores])

In [25]:
docstr_code_bleus = np.array([s["code_bleu"] for s in docstr_code_scores])

In [26]:
docstr_text_bleus.mean()

0.06557760175998402

In [27]:
docstr_total = docstr_preds.shape[0]

In [28]:
print("Perfect Prediction Rate:", sum(docstr_text_bleus == 1) / docstr_total)
print("Above 0.9 CodeBLEU Prediction Rate:", sum(docstr_text_bleus > 0.9) / docstr_total)

Perfect Prediction Rate: 0.02549928673323823
Above 0.9 CodeBLEU Prediction Rate: 0.03245363766048502


In [None]:
idx = 760
print_split_line(f"{idx}-prediction")
print(docstr_preds[idx])
print_split_line(f"{idx}-gold labels")
print(docstr_labels[idx])
print_split_line(f"{idx}-score")
print(docstr_text_bleus[idx])

In [None]:
for idx in range(docstr_total):
    if docstr_text_bleus[idx] >= 0.5:
        print_split_line(f"{idx}-prediction")
        print(pred_docstr_texts[idx])
        print_split_line(f"{idx}-gold labels")
        print(gold_docstr_texts[idx])
        print_split_line(f"{idx}-score")
        print(docstr_text_bleus[idx])

# Casing

In [4]:
# outlier_casing_codet5small
evaluate_codebleu("seq2seq_results/outlier_casing_codet5small/codet5_preds.csv",  '0.25,0.25,0.25,0.25')

{'ngram': 0.9464984063637811,
 'weighted_ngram': 0.9492120027209221,
 'syntax_match': 0.9920926165526851,
 'dataflow_match': 0.9604937079397652,
 'code_bleu': 0.9620741833942883}

In [12]:
casing_pred_df = pd.read_csv("seq2seq_results/outlier_casing_codet5small/codet5_preds.csv")

In [90]:
# excluding those input exactly same as the output
exact_match_bool = casing_pred_df["inputs"] == casing_pred_df["labels"]
cleaned_casing_pred_df = casing_pred_df.drop(casing_pred_df[exact_match_bool].index)

In [91]:
evaluate_codebleu("",  '0.25,0.25,0.25,0.25', replaced_df=cleaned_casing_pred_df)

{'ngram': 0.9440698608811208,
 'weighted_ngram': 0.9469331824692371,
 'syntax_match': 0.9917691355554074,
 'dataflow_match': 0.9588187759294317,
 'code_bleu': 0.9603977387087993}

In [92]:
casing_pred_df = cleaned_casing_pred_df

In [93]:
casing_inputs = casing_pred_df["inputs"].to_numpy()
casing_labels = casing_pred_df["labels"].to_numpy()
casing_preds = casing_pred_df["preds"].to_numpy()

In [None]:
# getting unit score

# input_casing = []
# gold_casing = []
# pred_casing = []

# gold_docstrs = []
# pred_docstrs = []
# gold_docstr_counts = []
# pred_docstr_counts = []
# gold_docstr_texts = []
# pred_docstr_texts = []
# gold_has_docstr_list = []
# pred_has_docstr_list = []

casing_code_scores = []
casing_diff_bleu_scores = []
# docstr_text_scores = []

for idx in tqdm(range(casing_preds.shape[0])):
    input_code = casing_inputs[idx]
    gold = casing_labels[idx]
    pred = casing_preds[idx]
    
    refs = [
        [gold]
    ]
    hyp = [pred]
    
    gold_diff_str = get_diff_str(input_code, gold)
    pred_diff_str = get_diff_str(input_code, pred)
    
    casing_diff_bleu_score = 0
    if len(pred_diff_str) > 0:
        casing_diff_bleu_score = sentence_bleu([gold_diff_str.split()], pred_diff_str.split(), auto_reweigh=True)
    # gold_docstr = get_docstring(gold)
    # pred_docstr = get_docstring(pred)
    # gold_docstr_text = "\n".join(gold_docstr)
    # pred_docstr_text = "\n".join(pred_docstr)
    # gold_docstr_count = len(gold_docstr)
    # pred_docstr_count = len(pred_docstr)
    # gold_has_docstr = len(gold_docstr) > 0
    # pred_has_docstr = len(pred_docstr) > 0
    
    
    casing_code_score = get_codebleu(refs, hyp, "python", '0.25,0.25,0.25,0.25')
    # docstr_text_score = get_codebleu([[gold_docstr_text]], [pred_docstr_text], "python", '1,0,0,0')
    
    casing_code_scores += [casing_code_score]
    casing_diff_bleu_scores += [casing_diff_bleu_score]
    # docstr_text_scores += [docstr_text_score]
       
    # gold_docstrs += [gold_docstr]
    # pred_docstrs += [pred_docstr]
    # gold_docstr_texts += [gold_docstr_text]
    # pred_docstr_texts += [pred_docstr_text]
    # gold_docstr_counts += [gold_docstr_count]
    # pred_docstr_counts += [pred_docstr_count]
    # gold_has_docstr_list += [gold_has_docstr]
    # pred_has_docstr_list += [pred_has_docstr]

In [112]:
"Casing BLEU score on only comparing difference in prediction:", np.mean(casing_diff_bleu_scores)

('Casing BLEU score on only comparing difference in prediction:',
 0.7060336355803001)

In [96]:
# docstr_text_bleus = np.array([s["ngram"] for s in docstr_text_scores])

In [97]:
casing_code_bleus = np.array([s["code_bleu"] for s in casing_code_scores])

In [98]:
# docstr_text_bleus.mean()

In [99]:
casing_total = casing_preds.shape[0]

In [100]:
print("Perfect Prediction Rate:", sum(casing_code_bleus == 1) / casing_total)
print("Above 0.9 CodeBLEU Prediction Rate:", sum(casing_code_bleus > 0.9) / casing_total)

Perfect Prediction Rate: 0.5115195663222091
Above 0.9 CodeBLEU Prediction Rate: 0.8553277994240217


In [None]:
idx = 760
print_split_line(f"{idx}-prediction")
print(docstr_preds[idx])
print_split_line(f"{idx}-gold labels")
print(docstr_labels[idx])
print_split_line(f"{idx}-score")
print(docstr_text_bleus[idx])

In [None]:
for idx in range(casing_total):
    if casing_code_bleus[idx] < 0.6 and casing_code_bleus[idx] > 0.5:
        print_split_line(f"{idx}-input")
        print(casing_inputs[idx])
        print_split_line(f"{idx}-prediction")
        print(casing_preds[idx])
        print_split_line(f"{idx}-gold labels")
        print(casing_labels[idx])
        print_split_line(f"{idx}-score")
        print(casing_code_bleus[idx])

# List Comprehension

In [4]:
# outlier_comp_codet5small
evaluate_codebleu("seq2seq_results/outlier_comp_codet5small/codet5_preds.csv",  '0.25,0.25,0.25,0.25')

{'ngram': 0.9872128913481273,
 'weighted_ngram': 0.9873731766619129,
 'syntax_match': 0.9819772416779816,
 'dataflow_match': 0.9498150078983433,
 'code_bleu': 0.9765945793965913}

In [101]:
comp_pred_df = pd.read_csv("seq2seq_results/outlier_comp_codet5small/codet5_preds.csv")

In [102]:
# excluding those input exactly same as the output
exact_match_bool = comp_pred_df["inputs"] == comp_pred_df["labels"]
cleaned_comp_pred_df = comp_pred_df.drop(comp_pred_df[exact_match_bool].index)

In [43]:
evaluate_codebleu("",  '0.25,0.25,0.25,0.25', replaced_df=cleaned_comp_pred_df)

{'ngram': 0.968530550095664,
 'weighted_ngram': 0.9689526941115969,
 'syntax_match': 0.960221464239374,
 'dataflow_match': 0.8916389008662929,
 'code_bleu': 0.947335902328232}

In [103]:
comp_pred_df = cleaned_comp_pred_df

In [104]:
comp_inputs = comp_pred_df["inputs"].to_numpy()
comp_labels = comp_pred_df["labels"].to_numpy()
comp_preds = comp_pred_df["preds"].to_numpy()

In [105]:
# getting unit score

# input_casing = []
# gold_casing = []
# pred_casing = []

# gold_docstrs = []
# pred_docstrs = []
# gold_docstr_counts = []
# pred_docstr_counts = []
# gold_docstr_texts = []
# pred_docstr_texts = []
# gold_has_docstr_list = []
# pred_has_docstr_list = []

comp_code_scores = []
comp_diff_bleu_scores = []
# docstr_text_scores = []

for idx in tqdm(range(comp_preds.shape[0])):
    gold = comp_labels[idx]
    pred = comp_preds[idx]
    
    refs = [
        [gold]
    ]
    hyp = [pred]
    
    input_code = casing_inputs[idx]
    
    gold_diff_str = get_diff_str(input_code, gold)
    pred_diff_str = get_diff_str(input_code, pred)
    
    comp_diff_bleu_score = 0
    if len(pred_diff_str) > 0:
        comp_diff_bleu_score = sentence_bleu([gold_diff_str.split()], pred_diff_str.split(), auto_reweigh=True)
    # gold_docstr = get_docstring(gold)
    # pred_docstr = get_docstring(pred)
    # gold_docstr_text = "\n".join(gold_docstr)
    # pred_docstr_text = "\n".join(pred_docstr)
    # gold_docstr_count = len(gold_docstr)
    # pred_docstr_count = len(pred_docstr)
    # gold_has_docstr = len(gold_docstr) > 0
    # pred_has_docstr = len(pred_docstr) > 0
    
    
    comp_code_score = get_codebleu(refs, hyp, "python", '0.25,0.25,0.25,0.25')
    # docstr_text_score = get_codebleu([[gold_docstr_text]], [pred_docstr_text], "python", '1,0,0,0')
    
    comp_code_scores += [comp_code_score]
    comp_diff_bleu_scores += [comp_diff_bleu_score]
    # docstr_text_scores += [docstr_text_score]
       
    # gold_docstrs += [gold_docstr]
    # pred_docstrs += [pred_docstr]
    # gold_docstr_texts += [gold_docstr_text]
    # pred_docstr_texts += [pred_docstr_text]
    # gold_docstr_counts += [gold_docstr_count]
    # pred_docstr_counts += [pred_docstr_count]
    # gold_has_docstr_list += [gold_has_docstr]
    # pred_has_docstr_list += [pred_has_docstr]

  0%|          | 0/2795 [00:00<?, ?it/s]



In [113]:
"List Comp BLEU score on only comparing difference in prediction:", np.mean(comp_diff_bleu_scores)

('List Comp BLEU score on only comparing difference in prediction:',
 0.9597028263460841)

In [15]:
# docstr_text_bleus = np.array([s["ngram"] for s in docstr_text_scores])

In [107]:
comp_code_bleus = np.array([s["code_bleu"] for s in comp_code_scores])

In [108]:
# docstr_text_bleus.mean()

In [109]:
comp_total = comp_preds.shape[0]

In [110]:
print("Perfect Prediction Rate:", sum(comp_code_bleus == 1) / comp_total)
print("Above 0.9 CodeBLEU Prediction Rate:", sum(comp_code_bleus > 0.9) / comp_total)

Perfect Prediction Rate: 0.237567084078712
Above 0.9 CodeBLEU Prediction Rate: 0.8014311270125224


In [13]:
# idx = 760
# print_split_line(f"{idx}-prediction")
# print(docstr_preds[idx])
# print_split_line(f"{idx}-gold labels")
# print(docstr_labels[idx])
# print_split_line(f"{idx}-score")
# print(docstr_text_bleus[idx])

In [52]:
comp_code_bleus.max()

1.0

In [56]:
for idx in range(comp_total):
    if comp_code_bleus[idx] == 1: continue
    if comp_code_bleus[idx] >= 0.7: continue
    if comp_code_bleus[idx] < 0.4: continue
    print_split_line(f"{idx}-input")
    print(comp_inputs[idx])
    print_split_line(f"{idx}-prediction")
    print(comp_preds[idx])
    print_split_line(f"{idx}-gold labels")
    print(comp_labels[idx])
    print_split_line(f"{idx}-score")
    print(comp_code_bleus[idx])



def add_stacking_base_pairs(graph_list=None):
    for g in graph_list:
        for (n, d) in g.nodes_iter(data=True):
            if d.get('position', False) == 0 or d.get('position', False) is not False:
                pos = d['position']
                neighbors = g.neighbors(n)
                if len(neighbors) >= 2:
                    greater_position_neighbors = []
                    for v in neighbors:
                        greater_position_neighbors.append(v)
                    if len(greater_position_neighbors) >= 2:
                        greater_position_neighbor_connected_by_backbone_list = []
                        for v in greater_position_neighbors:
                            greater_position_neighbor_connected_by_backbone_list.append(v)
                        if len(greater_position_neighbor_connected_by_backbone_list) > 0:
                            greater_position_neighbor_connected_by_backbone = greater_position_neighbor_connected_by_backbone_list[0]
  