## BLEU tokenizer

In [30]:
# Source: https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/tokenizers/tokenizer_13a.py
# Copyright 2020 SacreBLEU Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from functools import lru_cache


class BaseTokenizer:
    """A base dummy tokenizer to derive from."""

    def signature(self):
        """
        Returns a signature for the tokenizer.
        :return: signature string
        """
        return "none"

    def __call__(self, line):
        """
        Tokenizes an input line with the tokenizer.
        :param line: a segment to tokenize
        :return: the tokenized line
        """
        return line


class TokenizerRegexp(BaseTokenizer):
    def signature(self):
        return "re"

    def __init__(self):
        self._re = [
            # language-dependent part (assuming Western languages)
            (re.compile(r"([\{-\~\[-\` -\&\(-\+\:-\@\/])"), r" \1 "),
            # tokenize period and comma unless preceded by a digit
            (re.compile(r"([^0-9])([\.,])"), r"\1 \2 "),
            # tokenize period and comma unless followed by a digit
            (re.compile(r"([\.,])([^0-9])"), r" \1 \2"),
            # tokenize dash when preceded by a digit
            (re.compile(r"([0-9])(-)"), r"\1 \2 "),
            # one space only between words
            # NOTE: Doing this in Python (below) is faster
            # (re.compile(r'\s+'), r' '),
        ]

    @lru_cache(maxsize=2**16)
    def __call__(self, line):
        """Common post-processing tokenizer for `13a` and `zh` tokenizers.
        :param line: a segment to tokenize
        :return: the tokenized line
        """
        for (_re, repl) in self._re:
            line = _re.sub(repl, line)

        # no leading or trailing spaces, single space within words
        # return ' '.join(line.split())
        # This line is changed with regards to the original tokenizer (seen above) to return individual words
        return line.split()


class Tokenizer13a(BaseTokenizer):
    def signature(self):
        return "13a"

    def __init__(self):
        self._post_tokenizer = TokenizerRegexp()

    @lru_cache(maxsize=2**16)
    def __call__(self, line):
        """Tokenizes an input line using a relatively minimal tokenization
        that is however equivalent to mteval-v13a, used by WMT.

        :param line: a segment to tokenize
        :return: the tokenized line
        """

        # language-independent part:
        line = line.replace("<skipped>", "")
        line = line.replace("-\n", "")
        line = line.replace("\n", " ")

        if "&" in line:
            line = line.replace("&quot;", '"')
            line = line.replace("&amp;", "&")
            line = line.replace("&lt;", "<")
            line = line.replace("&gt;", ">")

        return self._post_tokenizer(f" {line} ")

In [31]:
tokenizer=Tokenizer13a()

### Average length of RACE in angular format dataset

In [31]:
import json
with open('../data/angular_filtered/subsets/generation/test_race_v1.json') as f:
    race_data = json.load(f)

race_length_list = []
for item in race_data:
    race_length_list.append(len(tokenizer(item['race'])))
print(sum(race_length_list) / len(race_length_list))

5.35187969924812


### Average length of RAG in angular format dataset

In [30]:
import json
with open('../data/angular_filtered/subsets/generation/test_gpt35_model_classified_rag.json') as f:
    rag_data = json.load(f)

rag_length_list = []
for item in rag_data:
    rag_length_list.append(len(tokenizer(item['chatgpt_rag'])))
print(sum(rag_length_list) / len(rag_length_list))

12.25062656641604


### Average length of reference message in angular format dataset

In [32]:
import json
with open('../data/angular_filtered/subsets/generation/test_gpt35_model_classified_rag.json') as f:
    ref_data = json.load(f)

ref_length_list = []
for item in ref_data:
    ref_length_list.append(len(tokenizer(item['msg'])))
print(sum(ref_length_list) / len(ref_length_list))

11.685213032581453


In [3]:
from nltk import meteor_score

In [36]:
import evaluate
bleu = evaluate.load("bleu")

In [11]:
rouge = evaluate.load('rouge')

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [37]:
predictions = ["add select_order_by_with_table_star_table_name"]
references = ['change name select_order_by_with_table_star_table_name for parser']
results = bleu.compute(predictions=predictions, references=references)
print(results)

{'bleu': 0.7714985257158095, 'precisions': [0.9375, 0.9333333333333333, 0.9285714285714286, 0.9230769230769231], 'brevity_penalty': 0.8290291181804004, 'length_ratio': 0.8421052631578947, 'translation_length': 16, 'reference_length': 19}


In [12]:
predictions = ["docs: add note about firebase"]
references = ['docs(auth): mention of package requirement for server implementation']
results = rouge.compute(predictions=predictions, references=references)
print(results)

{'rouge1': 0.14285714285714285, 'rouge2': 0.0, 'rougeL': 0.14285714285714285, 'rougeLsum': 0.14285714285714285}


In [22]:
predictions = ["docs: Update warning message for programmatic server implementation"]
references = ['docs(auth): mention of package requirement for server implementation']
results = bleu.compute(predictions=predictions, references=references)
print(results)

{'bleu': 0.0, 'precisions': [0.6363636363636364, 0.3, 0.0, 0.0], 'brevity_penalty': 0.9131007162822622, 'length_ratio': 0.9166666666666666, 'translation_length': 11, 'reference_length': 12}


In [13]:
predictions = ["docs: Update warning message for programmatic server implementation"]
references = ['docs(auth): mention of package requirement for server implementation']
results = rouge.compute(predictions=predictions, references=references)
print(results)

{'rouge1': 0.47058823529411764, 'rouge2': 0.13333333333333333, 'rougeL': 0.47058823529411764, 'rougeLsum': 0.47058823529411764}


In [8]:
math.exp(1-2)

0.36787944117144233

In [9]:
results

{'bleu': 1.0,
 'precisions': [1.0, 1.0, 1.0, 1.0],
 'brevity_penalty': 1.0,
 'length_ratio': 1.1666666666666667,
 'translation_length': 7,
 'reference_length': 6}

In [5]:
def _generate_enums(hypothesis, reference, preprocess=str.lower):
    """
    Takes in string inputs for hypothesis and reference and returns
    enumerated word lists for each of them

    :param hypothesis: hypothesis string
    :type hypothesis: str
    :param reference: reference string
    :type reference: str
    :preprocess: preprocessing method (default str.lower)
    :type preprocess: method
    :return: enumerated words list
    :rtype: list of 2D tuples, list of 2D tuples
    """
    hypothesis_list = list(enumerate(preprocess(hypothesis).split()))
    reference_list = list(enumerate(preprocess(reference).split()))
    return hypothesis_list, reference_list


def exact_match(hypothesis, reference):
    """
    matches exact words in hypothesis and reference
    and returns a word mapping based on the enumerated
    word id between hypothesis and reference

    :param hypothesis: hypothesis string
    :type hypothesis: str
    :param reference: reference string
    :type reference: str
    :return: enumerated matched tuples, enumerated unmatched hypothesis tuples,
             enumerated unmatched reference tuples
    :rtype: list of 2D tuples, list of 2D tuples,  list of 2D tuples
    """
    hypothesis_list, reference_list = _generate_enums(hypothesis, reference)
    return _match_enums(hypothesis_list, reference_list)


def _match_enums(enum_hypothesis_list, enum_reference_list):
    """
    matches exact words in hypothesis and reference and returns
    a word mapping between enum_hypothesis_list and enum_reference_list
    based on the enumerated word id.

    :param enum_hypothesis_list: enumerated hypothesis list
    :type enum_hypothesis_list: list of tuples
    :param enum_reference_list: enumerated reference list
    :type enum_reference_list: list of 2D tuples
    :return: enumerated matched tuples, enumerated unmatched hypothesis tuples,
             enumerated unmatched reference tuples
    :rtype: list of 2D tuples, list of 2D tuples,  list of 2D tuples
    """
    word_match = []
    for i in range(len(enum_hypothesis_list))[::-1]:
        for j in range(len(enum_reference_list))[::-1]:
            if enum_hypothesis_list[i][1] == enum_reference_list[j][1]:
                word_match.append(
                    (enum_hypothesis_list[i][0], enum_reference_list[j][0])
                )
                (enum_hypothesis_list.pop(i)[1], enum_reference_list.pop(j)[1])
                break
    return word_match, enum_hypothesis_list, enum_reference_list

In [7]:
hypothesis="docs: Update warning message for programmatic server implementation"
reference = "docs(auth): mention of package requirement for server implementation"

([(7, 7), (6, 6), (4, 5)],
 [(0, 'docs:'),
  (1, 'update'),
  (3, 'message'),
  (5, 'programmatic')],
 [(0, 'docs(auth):'),
  (1, 'mention'),
  (2, 'of'),
  (3, 'package'),
  (4, 'requirement')])

In [13]:
import re
punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''

#refs_new = []
for ele in reference:
    if ele in punc:
        reference = reference.replace(ele, " ")
        reference = re.sub(r'\s+', ' ', reference).strip()

for ele in hypothesis:
    if ele in punc:
        hypothesis = hypothesis.replace(ele, " ")
        hypothesis = re.sub(r'\s+', ' ', hypothesis).strip()

In [16]:
hypothesis_list, reference_list = _generate_enums(hypothesis, reference)

In [17]:
hypothesis_list

[(0, 'docs'),
 (1, 'update'),
 (3, 'message'),
 (4, 'for'),
 (5, 'programmatic'),
 (6, 'server'),
 (7, 'implementation')]

In [18]:
reference_list

[(0, 'docs'),
 (1, 'auth'),
 (2, 'mention'),
 (3, 'of'),
 (4, 'package'),
 (5, 'requirement'),
 (6, 'for'),
 (7, 'server'),
 (8, 'implementation')]

In [19]:
exact_match(hypothesis, reference)

([(7, 8), (6, 7), (4, 6), (0, 0)],
 [(1, 'auth'), (2, 'mention'), (3, 'of'), (4, 'package'), (5, 'requirement')])

In [24]:
F1 = ((7/12)* (7/10)) / (0.85*(7/12) + 0.15*(7/10))
F1

0.6796116504854369

In [26]:
F1 = ((7/11)* (7/10)) / (0.85*(7/11) + 0.15*(7/10))
F1

0.6896551724137931

In [27]:
F1 = ((7/9)* (7/10)) / (0.85*(7/9) + 0.15*(7/10))
F1

0.7106598984771574

In [25]:
F1 = ((7/10)* (7/12)) / (0.5*(7/12) + 0.5*(7/10))
F1

0.6363636363636365

## B-Norm

In [2]:
import re

def splitPuncts(line):
    # This regex matches words and punctuation, treating punctuation as separate tokens
    return ' '.join(re.findall(r'\w+|[^\w\s]', line))

# Example usage:
text = "docs(auth): mention of package requirement for server implementation"
print(splitPuncts(text))

docs ( auth ) : mention of package requirement for server implementation


In [34]:
splitPuncts(text).split()

['docs',
 '(',
 'auth',
 ')',
 ':',
 'mention',
 'of',
 'package',
 'requirement',
 'for',
 'server',
 'implementation']

B-Moses

In [1]:
import collections
import math


def _get_ngrams(segment, max_order):
  """Extracts all n-grams upto a given maximum order from an input segment.

  Args:
    segment: text segment from which n-grams will be extracted.
    max_order: maximum length in tokens of the n-grams returned by this
        methods.

  Returns:
    The Counter containing all n-grams upto max_order in segment
    with a count of how many times each n-gram occurred.
  """
  ngram_counts = collections.Counter()
  for order in range(1, max_order + 1):
    for i in range(0, len(segment) - order + 1):
      ngram = tuple(segment[i:i+order])
      ngram_counts[ngram] += 1
  return ngram_counts


def compute_bleu(reference_corpus, translation_corpus, max_order=4,
                 smooth=False):
  """Computes BLEU score of translated segments against one or more references.

  Args:
    reference_corpus: list of lists of references for each translation. Each
        reference should be tokenized into a list of tokens.
    translation_corpus: list of translations to score. Each translation
        should be tokenized into a list of tokens.
    max_order: Maximum n-gram order to use when computing BLEU score.
    smooth: Whether or not to apply Lin et al. 2004 smoothing.

  Returns:
    3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram
    precisions and brevity penalty.
  """
  matches_by_order = [0] * max_order
  possible_matches_by_order = [0] * max_order
  reference_length = 0
  translation_length = 0
  for (references, translation) in zip(reference_corpus,
                                       translation_corpus):
    reference_length += min(len(r) for r in references)
    translation_length += len(translation)

    merged_ref_ngram_counts = collections.Counter()
    for reference in references:
      merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
    translation_ngram_counts = _get_ngrams(translation, max_order)
    overlap = translation_ngram_counts & merged_ref_ngram_counts
    for ngram in overlap:
      matches_by_order[len(ngram)-1] += overlap[ngram]
    for order in range(1, max_order+1):
      possible_matches = len(translation) - order + 1
      if possible_matches > 0:
        possible_matches_by_order[order-1] += possible_matches

  precisions = [0] * max_order
  for i in range(0, max_order):
    if smooth:
      precisions[i] = ((matches_by_order[i] + 1.) /
                       (possible_matches_by_order[i] + 1.))
    else:
      if possible_matches_by_order[i] > 0:
        precisions[i] = (float(matches_by_order[i]) /
                         possible_matches_by_order[i])
      else:
        precisions[i] = 0.0

  if min(precisions) > 0:
    p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
    geo_mean = math.exp(p_log_sum)
  else:
    geo_mean = 0

  ratio = float(translation_length) / reference_length

  if ratio > 1.0:
    bp = 1.
  else:
    bp = math.exp(1 - 1. / ratio)

  bleu = geo_mean * bp

  return (bleu, precisions, bp, ratio, translation_length, reference_length)

In [10]:
hypothesis="Upgrade to plexus - utils 3 . 0 . 24"
reference = "Upgrade to Plexus Utils 3 . 0 . 24"



In [11]:
compute_bleu(splitPuncts(reference).split(), splitPuncts(hypothesis).split())

(0.0, [0.56, 0.0, 0.0, 0.0], 1.0, 2.7777777777777777, 25, 9)

In [5]:
import math
import copy

def bleu_count(hypothesis, references, max_n=4):
    ret_len_hyp = 0
    ret_len_ref = 0
    ret_clip_count = [0]*max_n
    ret_count = [0]*max_n
    for m in range(len(hypothesis)):
        hyp, ref = hypothesis[m], references[m]
        x = hyp.split()
        y = [r.split() for r in ref]
        x_len = len(x)
        y_len = [len(s) for s in y]
        n_ref = len(ref)

        closest_diff = 9999
        closest_length = 9999
        ref_ngram = dict()

        for i in range(n_ref):
            diff = abs(y_len[i]-x_len)
            if diff < closest_diff:
                closest_diff = diff
                closest_length = y_len[i]
            elif diff==closest_diff and y_len[i] < closest_length:
                closest_length = y_len[i]

            for n in range(max_n):
                sent_ngram = dict()
                for st in range(0, y_len[i]-n):
                    ngram = "%d"%(n+1)
                    for k in range(n+1):
                        j = st+k
                        ngram += " %s"%(y[i][j])
                    if ngram not in sent_ngram:
                        sent_ngram[ngram]=0
                    sent_ngram[ngram]+=1
                for ngram in sent_ngram.keys():
                    if ngram not in ref_ngram or ref_ngram[ngram]<sent_ngram[ngram]:
                        ref_ngram[ngram] = sent_ngram[ngram]

        ret_len_hyp += x_len
        ret_len_ref += closest_length

        for n in range(max_n):
            hyp_ngram = dict()
            for st in range(0, x_len-n):
                ngram = "%d"%(n+1)
                for k in range(n+1):
                    j = st+k
                    ngram += " %s"%(x[j])
                if ngram not in hyp_ngram:
                    hyp_ngram[ngram]=0
                hyp_ngram[ngram]+=1
            for ngram in hyp_ngram.keys():
                if ngram in ref_ngram:
                    ret_clip_count[n] += min(ref_ngram[ngram], hyp_ngram[ngram])
                ret_count[n] += hyp_ngram[ngram]

    return ret_clip_count, ret_count, ret_len_hyp, ret_len_ref

def corpus_bleu(hypothesis, references, max_n=4):
    assert(len(hypothesis) == len(references))
    clip_count, count, total_len_hyp, total_len_ref = bleu_count(hypothesis, references, max_n=max_n)
    brevity_penalty = 1.0
    bleu_scores = []
    bleu = 0
    for n in range(max_n):
        if count[n]>0:
            bleu_scores.append(clip_count[n]/count[n])
        else:
            bleu_scores.append(0)
    if total_len_hyp < total_len_ref:
        if total_len_hyp==0:
            brevity_penalty = 0.0
        else:
            brevity_penalty = math.exp(1 - total_len_ref/total_len_hyp)
    def my_log(x):
        if x == 0:
            return -9999999999.0
        elif x < 0:
            raise Exception("Value Error")
        return math.log(x)
    log_bleu = 0.0
    for n in range(max_n):
        log_bleu += my_log(bleu_scores[n])
    bleu = brevity_penalty*math.exp(log_bleu / float(max_n))
    return [bleu]+bleu_scores, [brevity_penalty, total_len_hyp/total_len_ref, total_len_hyp, total_len_ref]


In [13]:
corpus_bleu([hypothesis], [reference])

([0.0, 0.3, 0.0, 0.0, 0.0], [1.0, 10.0, 10, 1])

In [1]:
import json
from collections import defaultdict


def is_content_identical(msg):
    # 按换行符分隔每行内容
    lines = msg.split('\n')
    if len(lines) == 1:
        return False
    # 去除每行前后的空白字符并比较
    return all(line.strip() == lines[0].strip() for line in lines)

def check_msg_duplicates(json_data):

    # 遍历 JSON 数据，将每个项的 msg 字段作为键存储在 msg_map 中
    for item in json_data:
        if is_content_identical(item['msg']):
            item['msg'] = item['msg'].split('\n')[0]
            # result.append(item)
    
    return json_data

with open('../data/angular_filtered/subsets/dev_test.json', 'r', encoding='UTF-8') as f:
    json_data = json.load(f)

with open('../data/angular_filtered/subsets/dev_test.json', 'w', encoding='UTF-8') as f:
    json.dump(check_msg_duplicates(json_data), f, ensure_ascii=False, indent=4)

In [2]:
output_file = '../data/angular_filtered/subsets/generation/chunksize/dev_test_ref.txt'
with open(output_file, 'w', encoding='UTF-8') as f:
    for item in json_data:
        f.write(item['msg'].replace('\n', '\\n').replace('\r', '\\r') + '\n')

In [9]:
with open('../data/angular_filtered/subsets/generation/embedding/dev_test_gpt35_rag_mxbai.json', 'r', encoding='UTF-8') as f:
    data = json.load(f)

with open('../data/angular_filtered/subsets/generation/embedding/dev_test_gpt35_rag_mxbai.json', 'w', encoding='UTF-8') as f:
    json.dump(check_msg_duplicates(data), f, ensure_ascii=False, indent=4)

## Log-MNEXT

In [21]:
from nltk import word_tokenize

In [22]:
"chore(package.json): update release script and add prepublish script".split()

['chore(package.json):',
 'update',
 'release',
 'script',
 'and',
 'add',
 'prepublish',
 'script']

In [23]:
word_tokenize("chore(package.json): update release script and add prepublish script")

['chore',
 '(',
 'package.json',
 ')',
 ':',
 'update',
 'release',
 'script',
 'and',
 'add',
 'prepublish',
 'script']

In [25]:
import re
punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
refs = ["build(docs): update build command fo full static"]
for i in range(len(refs)):
    for ele in refs[i]:
        if ele in punc:
            refs[i] = refs[i].replace(ele, " ")
            refs[i] = re.sub(r'\s+', ' ', refs[i]).strip()

In [26]:
refs

['build docs update build command fo full static']