In [3]:
# add f1/em from baidu https://github.com/baidu/DuReader/blob/master/DuReader-Robust/evaluate.py#L188
def _tokenize_chinese_chars(text):
    """
    :param text: input text, unicode string
    :return:
        tokenized text, list
    """

    def _is_chinese_char(cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #     https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
        # despite its name. The modern Korean Hangul alphabet is a different block,
        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
        # space-separated words, so they are not treated specially and handled
        # like the all of the other languages.
        if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
            (cp >= 0x3400 and cp <= 0x4DBF) or  #
            (cp >= 0x20000 and cp <= 0x2A6DF) or  #
            (cp >= 0x2A700 and cp <= 0x2B73F) or  #
            (cp >= 0x2B740 and cp <= 0x2B81F) or  #
            (cp >= 0x2B820 and cp <= 0x2CEAF) or
            (cp >= 0xF900 and cp <= 0xFAFF) or  #
            (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
            return True

        return False

    output = []
    buff = ""
    for char in text:
        cp = ord(char)
        if _is_chinese_char(cp) or char == "=":
            if buff != "":
                output.append(buff)
                buff = ""
            output.append(char)
        else:
            buff += char

    if buff != "":
        output.append(buff)

    return output

def _normalize(in_str):
    """
    normalize the input unicode string
    """
    in_str = in_str.lower()
    sp_char = [
        u':', u'_', u'`', u'，', u'。', u'：', u'？', u'！', u'(', u')',
        u'“', u'”', u'；', u'’', u'《', u'》', u'……', u'·', u'、', u',',
        u'「', u'」', u'（', u'）', u'－', u'～', u'『', u'』', '|'
    ]
    out_segs = []
    for char in in_str:
        if char in sp_char:
            continue
        else:
            out_segs.append(char)
    return ''.join(out_segs)


def find_lcs(s1, s2):
    """find the longest common subsequence between s1 ans s2"""
    m = [[0 for i in range(len(s2)+1)] for j in range(len(s1)+1)]
    max_len = 0
    p = 0
    for i in range(len(s1)):
        for j in range(len(s2)):
            if s1[i] == s2[j]:
                m[i+1][j+1] = m[i][j]+1
                if m[i+1][j+1] > max_len:
                    max_len = m[i+1][j+1]
                    p = i+1
    return s1[p-max_len:p], max_len

def calc_f1_score(answers, prediction):
    f1_scores = []
    for ans in answers:
        ans_segs = _tokenize_chinese_chars(_normalize(ans))
        prediction_segs = _tokenize_chinese_chars(_normalize(prediction))
        lcs, lcs_len = find_lcs(ans_segs, prediction_segs)
        if lcs_len == 0:
            f1_scores.append(0)
            continue
        prec = 1.0*lcs_len/len(prediction_segs)
        rec = 1.0*lcs_len/len(ans_segs)
        f1 = (2 * prec * rec) / (prec + rec)
        f1_scores.append(f1)
    return max(f1_scores)


def calc_em_score(answers, prediction):
    em = 0
    for ans in answers:
        ans_ = _normalize(ans)
        prediction_ = _normalize(prediction)
        if ans_ == prediction_:
            em = 1
            break
    return em
# add f1/em end


def get_gen_metric(examples, flag=False):
    metric = {"f1": 0, "em": 0}
    f1 = 0
    em = 0
    total_count = 0
    skip_count = 0
    
    for item in examples:
        total_count += 1
        idx = item['idx']
        answers = item['ans']
        try:
            prediction = item['predict'].strip()  # should be text
            if flag:
                prediction = item['predict'].strip()[:20]  # should be text
        except:
            skip_count += 1
            print("Skipped")
            print('----------------------------')
            continue
        _f1 = calc_f1_score(answers, prediction)
        f1 += _f1
        em += calc_em_score(answers, prediction)
        
    f1_score = 100.0 * f1 / total_count
    em_score = 100.0 * em / total_count

    metric['f1'] = f1_score
    metric['em'] = em_score
    return metric

import json
with open('dureader_results.json', 'r') as f:
    examples = json.load(f)
print(get_gen_metric(examples, True))

{'f1': 13.448054433123485, 'em': 2.8934368383909668}


In [1]:
datalist = ['/home/ma-user/work/notebook_code/data_prompt_mindrecord_code_thu_code_level/code/transfered_mindrecord_1.mindrecord', '/home/ma-user/work/notebook_code/data_prompt_mindrecord_code_thu_code_level/code/transfered_mindrecord_2.mindrecord', '/home/ma-user/work/notebook_code/data_prompt_mindrecord_code_thu_code_level/code/transfered_mindrecord_4.mindrecord', '/home/ma-user/work/notebook_code/data_prompt_mindrecord_code_thu_code_level/code/transfered_mindrecord_3.mindrecord']
datalist.sort()
print(datalist)

['/home/ma-user/work/notebook_code/data_prompt_mindrecord_code_thu_code_level/code/transfered_mindrecord_1.mindrecord', '/home/ma-user/work/notebook_code/data_prompt_mindrecord_code_thu_code_level/code/transfered_mindrecord_2.mindrecord', '/home/ma-user/work/notebook_code/data_prompt_mindrecord_code_thu_code_level/code/transfered_mindrecord_3.mindrecord', '/home/ma-user/work/notebook_code/data_prompt_mindrecord_code_thu_code_level/code/transfered_mindrecord_4.mindrecord']


In [1]:
# Copyright 2017 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Python implementation of BLEU and smooth-BLEU.

This module provides a Python implementation of BLEU and smooth-BLEU.
Smooth BLEU is computed following the method outlined in the paper:
Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic
evaluation metrics for machine translation. COLING 2004.
"""
# coding=utf-8
"""Tokenization classes for OpenAI GPT."""
# from __future__ import (absolute_import, division, print_function,
#                         unicode_literals)
import collections
import math


def _get_ngrams(segment, max_order):
  """Extracts all n-grams upto a given maximum order from an input segment.

  Args:
    segment: text segment from which n-grams will be extracted.
    max_order: maximum length in tokens of the n-grams returned by this
        methods.

  Returns:
    The Counter containing all n-grams upto max_order in segment
    with a count of how many times each n-gram occurred.
  """
  ngram_counts = collections.Counter()
  for order in range(1, max_order + 1):
    for i in range(0, len(segment) - order + 1):
      ngram = tuple(segment[i:i+order])
      ngram_counts[ngram] += 1
  return ngram_counts


def compute_bleu(reference_corpus, translation_corpus, max_order=4,
                 smooth=False):
  """Computes BLEU score of translated segments against one or more references.

  Args:
    reference_corpus: list of lists of references for each translation. Each
        reference should be tokenized into a list of tokens.
    translation_corpus: list of translations to score. Each translation
        should be tokenized into a list of tokens.
    max_order: Maximum n-gram order to use when computing BLEU score.
    smooth: Whether or not to apply Lin et al. 2004 smoothing.

  Returns:
    3-Tuple with the BLEU score, n-gram precisions, geometric mean of n-gram
    precisions and brevity penalty.
  """
  matches_by_order = [0] * max_order
  possible_matches_by_order = [0] * max_order
  reference_length = 0
  translation_length = 0
  for (references, translation) in zip(reference_corpus,
                                       translation_corpus):
    reference_length += min(len(r) for r in references)
    translation_length += len(translation)

    merged_ref_ngram_counts = collections.Counter()
    for reference in references:
      merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
    translation_ngram_counts = _get_ngrams(translation, max_order)
    overlap = translation_ngram_counts & merged_ref_ngram_counts
    for ngram in overlap:
      matches_by_order[len(ngram)-1] += overlap[ngram]
    for order in range(1, max_order+1):
      possible_matches = len(translation) - order + 1
      if possible_matches > 0:
        possible_matches_by_order[order-1] += possible_matches

  precisions = [0] * max_order
  for i in range(0, max_order):
    if smooth:
      precisions[i] = ((matches_by_order[i] + 1.) /
                       (possible_matches_by_order[i] + 1.))
    else:
      if possible_matches_by_order[i] > 0:
        precisions[i] = (float(matches_by_order[i]) /
                         possible_matches_by_order[i])
      else:
        precisions[i] = 0.0

  if min(precisions) > 0:
    p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
    geo_mean = math.exp(p_log_sum)
  else:
    geo_mean = 0

  ratio = float(translation_length) / reference_length

  if ratio > 1.0:
    bp = 1.
  else:
    bp = math.exp(1 - 1. / ratio)

  bleu = geo_mean * bp

  return (bleu, precisions, bp, ratio, translation_length, reference_length)


def _bleu(ref_file, trans_file, subword_option=None):
    max_order = 4
    smooth = True
    ref_files = [ref_file]
    reference_text = []
    for reference_filename in ref_files:
        with open(reference_filename) as fh:
            reference_text.append(fh.readlines())
    per_segment_references = []
    for references in zip(*reference_text):
        reference_list = []
        for reference in references:
            reference_list.append(reference.strip().split())
        per_segment_references.append(reference_list)
    translations = []
    with open(trans_file) as fh:
        for line in fh:
            translations.append(line.strip().split())
    bleu_score, _, _, _, _, _ = compute_bleu(per_segment_references, translations, max_order, smooth)
    return round(100 * bleu_score,2)




from io import open
import sentencepiece as spm
import jieba

class JIEBATokenizer():
    r"""
    Jieba Tokenizer
    """
    def __init__(self, model_file, max_len=None):
        self.max_len = max_len if max_len is not None else int(1e12)
        self.encoder = {}
        self.sp = spm.SentencePieceProcessor(model_file=model_file)

        for i in range(self.sp.get_piece_size()):
            self.encoder[self.sp.id_to_piece(i)] = i
        self.translator = str.maketrans(" \n", "\u2582\u2583")

        self.eod_id = self.encoder['<eod>']
        self.eot_id = self.encoder['<eot>']
        self.pad_id = self.encoder['<pad>']
        
    @property
    def vocab_size(self):
        return len(self.encoder)

    def __len__(self):
        return len(self.encoder) + len(self.special_tokens)

    @property
    def eod(self):
        return self.eod_id

    def tokenize(self, text):
        """ Tokenize a string. """
        seg_list = [x.translate(self.translator) for x in jieba.cut(text, cut_all=False)]
        return seg_list

    def convert_tokens_to_ids(self, tokens):
        new_seg = " ".join(tokens)
        return self.sp.encode(new_seg)

    def convert_ids_to_tokens(self, ids):
        return self.sp.id_to_piece(ids)

    def process_tokens(self, text):
        text = text.replace(' ', '').replace('\u2582', ' ').replace('\u2583', '\n')
        return text

    def encode(self, text):
        res = self.tokenize(text)
        return res

    def decode(self, tokens):
        text = self.sp.decode(tokens)
        return self.process_tokens(text)
    
#     @property
#     def vocab_size(self):
#         return len(self.encoder)

#     def __len__(self):
#         return len(self.encoder) + len(self.special_tokens)

#     @property
#     def eod(self):
#         return self.eod_id

#     def tokenize(self, text):
#         """ Tokenize a string. """
#         seg_list = [x.translate(self.translator) for x in jieba.cut(text, cut_all=False)]
#         new_seg = " ".join(seg_list)
#         return self.sp.encode(new_seg)

#     def convert_tokens_to_ids(self, tokens):
#         return tokens

#     def convert_ids_to_tokens(self, ids):
#         return self.decode(ids)


#     def encode(self, text):
#         res = self.tokenize(text)
#         return res

#     def decode(self, tokens):
#         text = self.sp.decode(tokens)
#         text = text.replace(' ', '').replace('\u2582', ' ').replace('\u2583', '\n')
#         return text


# tokenizer = JIEBATokenizer('/home/ma-user/work/notebook_code/pangu_ckpt/tokenizer/vocab.model')
from transformers import GPT2Tokenizer, AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("/home/ma-user/work/notebook_code/openi/chatglm/", trust_remote_code=True)


None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
# https://github.com/microsoft/CodeXGLUE/blob/main/Text-Code/text-to-code/evaluator/predictions.txt

import json
with open('mbpp_results_level.json', 'r') as f:
    examples = json.load(f)

ans_txt = []
pre_txt = []

for item in examples:
    idx = item['idx']
    answers = item['ans']
    try:
        prediction = item['predict'].strip()  # should be text
    except:
        print("Skipped")
        print('----------------------------')
        continue
    
    ans_txt.append(' '.join(tokenizer.tokenize(answers))) 
    pre_txt.append(' '.join(tokenizer.tokenize(prediction))) 

    with open('ans.txt', 'w') as f, open('pre.txt', 'w') as f1:
        f.writelines(ans_txt)
        f1.writelines(pre_txt)
    
bleu_score = round(_bleu("ans.txt", 'pre.txt'), 2)   #传入两个 txt
print(f"BLEU: {bleu_score}")

BLEU: 23.42
