#### 测试Generator ZH

In [None]:
import torch
from torch.nn.utils.rnn import pad_sequence

def gen_pred_collate(batch_data, gen_tokenizer, num_copies=1):
    """Collate raw samples into one padded prompt batch (Chinese prompt).

    Every sample's ``text1`` is wrapped as ``<bos>“{text1}”的相似句是“`` and
    tokenized. The last token produced by the tokenizer is dropped so the
    prompt does not end with the end-of-sequence marker.

    Args:
        batch_data: Iterable of mappings, each providing a ``'text1'`` string.
        gen_tokenizer: Callable tokenizer. ``gen_tokenizer(text,
            return_tensors='pt').input_ids`` is expected to be a
            ``(1, seq_len)`` tensor; ``gen_tokenizer.pad_token_id`` is used
            as the padding value.
        num_copies: Number of times each prompt is repeated in the batch.
            Defaults to 1 (no duplication), which matches the previous
            hard-coded behaviour.

    Returns:
        dict with ``'input_ids'`` (batch-first tensor, rows padded to the
        longest prompt) and ``'length_tensor'`` (unpadded length per row).
    """
    input_ids, length_list = [], []
    for item in batch_data:
        prompt = '<bos>“' + item['text1'] + '”的相似句是“'
        encoded = gen_tokenizer(prompt, return_tensors='pt').input_ids
        # squeeze(0) removes only the batch axis; a bare squeeze() would also
        # collapse a one-token sequence to 0-d and break the slice below.
        cur_input_ids = encoded.squeeze(0)[:-1]  # must not end with <eos>

        # Repeat each sample `num_copies` times.
        input_ids.extend([cur_input_ids] * num_copies)
        length_list.extend([cur_input_ids.size(0)] * num_copies)

    input_ids = pad_sequence(
        input_ids, batch_first=True,
        padding_value=gen_tokenizer.pad_token_id)
    length_tensor = torch.tensor(length_list)

    return {
        'input_ids': input_ids,
        'length_tensor': length_tensor,
    }
    
# hyper parameters
# NOTE(review): `all_cycle` is not referenced in the visible cells — presumably
# the index of the last training cycle; confirm before relying on it.
all_cycle = 8
# Name of the test-set directory appended to the test-data root when loading.
data_name = 'afqmc'
# Checkpoint directory name appended to the checkpoint root in `Config`.
ckpt_name = 'afqmc0'

In [None]:
from transformers import T5Tokenizer
from torch.utils.data import DataLoader

import sys, glob, datasets
sys.path.append('/cognitive_comp/wutong/similarity_generation/')
from data_utlis.sim_gen_dataset import SimGanDataset


# Test split chosen by `data_name`, read from the shared test-data root.
data = datasets.load_from_disk(
    '/cognitive_comp/wutong/source/sim_data/sim_test_data/' + data_name
)

# SentencePiece-backed tokenizer; <|endoftext|> serves as both EOS and PAD,
# and <bos> is registered afterwards as the BOS special token.
gen_tokenizer = T5Tokenizer.from_pretrained(
    '/cognitive_comp/wutong/source/model_base/chinese_sentencepiece/cog-pretrain.model',
    extra_ids=0,
    pad_token='<|endoftext|>',
    eos_token='<|endoftext|>',
)
gen_tokenizer.add_special_tokens({'bos_token': '<bos>'})

predict_dataset = SimGanDataset(data)


def collate_fn(batch_data):
    """Adapt `gen_pred_collate` to the one-argument signature DataLoader expects."""
    return gen_pred_collate(batch_data, gen_tokenizer)


# Order is preserved (shuffle=False) so outputs line up with the dataset rows.
dataloader = DataLoader(
    predict_dataset,
    collate_fn=collate_fn,
    batch_size=100,
    shuffle=False,
    num_workers=8,
    pin_memory=True,
)

In [None]:
import numpy as np
import sys, evaluate, torch
sys.path.append('/cognitive_comp/wutong/similarity_generation/')
from data_utlis.sample_sequence import sample_sequence_batch
from model_utils.sim_gen_model import Generator

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

class Config:
    """Settings object passed to ``Generator`` for the Chinese evaluation run."""

    # Language flag, and the cycle index that the evaluation loop overwrites
    # before each ``Generator(config)`` call.
    chinese = 1
    cycle = 0

    # Files for the base model (weights and architecture config).
    txl_model_path = '/cognitive_comp/wutong/source/model_base/model_zh/txl_zh_5.0B.pt'
    txl_config_path = '/cognitive_comp/wutong/similarity_generation/model_utils/txl_5B_config.json'

    # Checkpoint directory, selected by the module-level `ckpt_name`.
    ckpt_model_path = '/cognitive_comp/wutong/similarity_generation/all_checkpoints/' + ckpt_name

# Generate a similar sentence for every test prompt with the generator of each
# listed cycle, then score the generations with the `perplexity` metric.
config = Config()
perplexity = evaluate.load("perplexity", module_type="metric")
raw_texts, sim_texts, mean_perplexity = [], [], []
for cycle in [0, 8]:
    config.cycle = cycle
    generator = Generator(config)
    generator.half().eval().cuda()

    # Decoded "prompt + continuation" strings for this cycle only.
    sim_sent_list = []
    for batch in dataloader:
        torch.cuda.empty_cache()
        # top_k=1 makes decoding greedy.
        # NOTE(review): end_token_id=50000 is presumably the tokenizer's
        # <|endoftext|> id — confirm against gen_tokenizer.eos_token_id.
        output_dict = sample_sequence_batch(
            model=generator.gen, context_tokens_tensor=batch['input_ids'].cuda(),
            context_length_tensor=batch['length_tensor'], repetition_penalty=1.0, max_out_seq=200,
            end_token_id=50000, temperature=1.0, top_k=1, top_p=0.0,
        )

        sim_sent_list.extend(
            gen_tokenizer.batch_decode(output_dict['ids_list'], skip_special_tokens=True))

    raw_text, sim_text = [], []
    for decoded in sim_sent_list:
        parts = decoded.replace(' ', '').split('”的相似句是“')
        if len(parts) < 2:
            # The separator did not survive decoding; skip this sample instead
            # of failing with IndexError on parts[1].
            continue
        source = parts[0][1:]      # drop the opening quote
        generated = parts[1][:-1]  # drop the last character (closing quote)
        if source and generated:
            raw_text.append(source)
            sim_text.append(generated)

    raw_texts.append(raw_text)
    sim_texts.append(sim_text)

    # NOTE(review): perplexity is computed with model_id='gpt2' on Chinese
    # text — confirm this scorer is the intended one.
    mean_perplexity.append(perplexity.compute(input_texts=sim_text, model_id='gpt2')['mean_perplexity'])
    print(mean_perplexity)


In [None]:
from evaluate import load
# Mean BERTScore F1 per evaluated cycle: the generated sentences are the
# predictions, the original sentences are the references.
bertscore = load("bertscore")
res_list = []
for raw_text, sim_text in zip(raw_texts, sim_texts):
    f1_scores = bertscore.compute(
        predictions=sim_text, references=raw_text, lang="zh")['f1']
    res_list.append(np.mean(f1_scores))
print(res_list)


#### 测试Generator EN

In [None]:
import torch
from torch.nn.utils.rnn import pad_sequence


def gen_pred_collate(batch_data, gen_tokenizer, num_copies=1):
    """Collate raw samples into one padded prompt batch (English prompt).

    Every sample's ``text1`` is wrapped as ``"{text1}" is similar to "`` and
    tokenized. The first token produced by the tokenizer is dropped.

    The previous version branched on the global ``config.chinese`` but both
    branches were identical, so the branch (and the hidden dependency on a
    global defined in a later cell) is removed.

    Args:
        batch_data: Iterable of mappings, each providing a ``'text1'`` string.
        gen_tokenizer: Callable tokenizer. ``gen_tokenizer(text,
            return_tensors='pt').input_ids`` is expected to be a
            ``(1, seq_len)`` tensor; ``gen_tokenizer.pad_token_id`` is used
            as the padding value.
        num_copies: Number of times each prompt is repeated in the batch.
            Defaults to 1 (no duplication), which matches the previous
            hard-coded behaviour.

    Returns:
        dict with ``'input_ids'`` (batch-first tensor, rows padded to the
        longest prompt) and ``'length_tensor'`` (unpadded length per row).
    """
    input_ids, length_list = [], []
    for item in batch_data:
        prompt = '"' + item['text1'] + '" is similar to "'
        encoded = gen_tokenizer(prompt, return_tensors='pt').input_ids
        # squeeze(0) removes only the batch axis; a bare squeeze() would also
        # collapse a one-token sequence to 0-d and break the slice below.
        cur_input_ids = encoded.squeeze(0)[1:]  # strip the leading <bos>

        # Repeat each sample `num_copies` times.
        input_ids.extend([cur_input_ids] * num_copies)
        length_list.extend([cur_input_ids.size(0)] * num_copies)

    input_ids = pad_sequence(
        input_ids, batch_first=True,
        padding_value=gen_tokenizer.pad_token_id)
    length_tensor = torch.tensor(length_list)

    return {
        'input_ids': input_ids,
        'length_tensor': length_tensor,
    }

In [None]:
from transformers import GPT2Tokenizer
from torch.utils.data import DataLoader

import sys, glob, datasets
sys.path.append('/cognitive_comp/wutong/similarity_generation/')
from data_utlis.sim_gen_dataset import SimGanDataset


# MRPC test split and the tokenizer stored alongside the opt-2.7b model files.
data = datasets.load_from_disk(
    '/cognitive_comp/wutong/source/sim_data/sim_test_data/mrpc'
)

gen_tokenizer = GPT2Tokenizer.from_pretrained(
    '/cognitive_comp/wutong/source/model_base/model_en/opt-2.7b'
)

predict_dataset = SimGanDataset(data)


def collate_fn(batch_data):
    """Adapt `gen_pred_collate` to the one-argument signature DataLoader expects."""
    return gen_pred_collate(batch_data, gen_tokenizer)


# Order is preserved (shuffle=False) so outputs line up with the dataset rows.
dataloader = DataLoader(
    predict_dataset,
    collate_fn=collate_fn,
    batch_size=300,
    shuffle=False,
    num_workers=8,
    pin_memory=True,
)

In [None]:
import numpy as np
import sys, evaluate, torch
from tqdm import tqdm
sys.path.append('/cognitive_comp/wutong/similarity_generation/')
from model_utils.sim_gen_model import Generator_EN
from data_utlis.sample_sequence import sample_sequence_batch_en

import os
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

class Config:
    """Settings object passed to ``Generator_EN`` for the English evaluation run."""

    # Language flag, and the cycle index that the evaluation loop overwrites
    # before each ``Generator_EN(config)`` call.
    chinese = 0
    cycle = 0

    # Base model: directory plus model name.
    opt_model_path = '/cognitive_comp/wutong/source/model_base/model_en/'
    opt_name = 'opt-2.7b'

    # Checkpoint directory for this experiment.
    ckpt_model_path = '/cognitive_comp/wutong/similarity_generation/all_checkpoints/mrpc0'

# Generate a similar sentence for every test prompt with the generator of each
# listed cycle, then score the generations with the `perplexity` metric.
config = Config()
perplexity = evaluate.load("perplexity", module_type="metric")
raw_texts, sim_texts, mean_perplexity, sim_sent_list = [], [], [], []
for idx in [0, 8]:
    config.cycle = idx
    generator = Generator_EN(config)
    generator.half().eval().cuda()

    # Start from an empty list for every cycle; otherwise the second cycle
    # would be scored together with the first cycle's generations.
    sim_sent_list = []
    for batch in dataloader:
        torch.cuda.empty_cache()
        # top_k=1 makes decoding greedy.
        output_dict = sample_sequence_batch_en(
            model=generator.gen, context_tokens_tensor=batch['input_ids'].cuda(),
            context_length_tensor=batch['length_tensor'], repetition_penalty=1.0, max_out_seq=100,
            end_token_id=gen_tokenizer.eos_token_id, temperature=1.0, top_k=1, top_p=0.0,
        )

        # Decode inside the batch loop: decoding after the loop kept only the
        # output of the last batch.
        sim_sent_list.extend(
            gen_tokenizer.batch_decode(output_dict['ids_list'], skip_special_tokens=True))

    raw_text, sim_text = [], []
    for item in tqdm(sim_sent_list):
        item = item.replace('\n', '').split('" is similar to "')
        if len(item) >= 2:
            source = item[0][1:]  # drop the opening quote
            raw_text.append(source)
            if '"' in source:
                # The source itself contains a quote: keep the continuation
                # and only strip its last character.
                sim_text.append(item[1][:-1])
            else:
                # Otherwise cut the continuation at its first closing quote.
                sim_text.append(item[1].split('"')[0])

    raw_texts.append(raw_text)
    sim_texts.append(sim_text)

    mean_perplexity.append(perplexity.compute(input_texts=sim_text, model_id='gpt2')['mean_perplexity'])
    print(mean_perplexity)

In [None]:
import datasets, evaluate
from tqdm import tqdm


# Perplexity of the non-empty `text2` entries stored for each saved cycle.
perplexity = evaluate.load("perplexity", module_type="metric")
for i in [0, 8]:
    sim_text = []
    data = datasets.load_from_disk(
        '/cognitive_comp/wutong/similarity_generation/ipynb/afqmc/data_cycle_' + str(i))
    for j in tqdm(range(data.num_rows)):
        generated = data[j]['text2']
        if generated:
            sim_text.append(generated)
    print(len(sim_text))
    # NOTE(review): model_id='gpt2' is used to score afqmc data — confirm
    # this scorer is the intended one.
    print(perplexity.compute(input_texts=sim_text, model_id='gpt2')['mean_perplexity'])
    

In [None]:
import numpy as np

# Mean BERTScore F1 between `text1` (reference) and `text2` (prediction) for
# each saved cycle; rows where either side is empty are skipped.
res_list = []
bertscore = evaluate.load("bertscore", module_type="metric")
for i in [0, 8]:
    references, predictions = [], []
    data = datasets.load_from_disk(
        '/cognitive_comp/wutong/similarity_generation/ipynb/afqmc/data_cycle_' + str(i))
    for j in tqdm(range(data.num_rows)):
        row = data[j]
        if row['text1'] and row['text2']:
            references.append(row['text1'])
            predictions.append(row['text2'])
    print(len(references))
    f1_scores = bertscore.compute(
        predictions=predictions, references=references, lang="zh")['f1']
    res_list.append(np.mean(f1_scores))
print(res_list)

In [None]:
def get_dict(tokens, ngram, gdict=None):
    """Count n-gram frequencies and store them in a dict.

    Each window of ``ngram`` consecutive tokens is concatenated with no
    separator and used as the key. When ``gdict`` is given, counts are
    accumulated into that same dict (it is mutated and returned); otherwise a
    new dict is created.
    """
    counts = {} if gdict is None else gdict
    window_count = len(tokens) - ngram + 1
    for start in range(window_count):
        key = "".join(tokens[start:start + ngram])
        seen = counts.get(key)
        counts[key] = 1 if seen is None else seen + 1
    return counts


def calc_distinct_ngram(pair_list, ngram):
    """Return the ratio of distinct n-grams to total n-grams over all predictions.

    Args:
        pair_list: Iterable of ``(pred_tokens, gold_tokens)`` pairs; only the
            predicted tokens are used.
        ngram: Size of the n-grams to count.

    Returns:
        ``distinct / total`` as a float, or ``0.0`` when the predictions
        contain no n-gram of that size (previously a ZeroDivisionError).
    """
    pred_dict = {}
    for predict_tokens, _ in pair_list:
        # Accumulate the counts of every prediction into one shared dict.
        get_dict(predict_tokens, ngram, pred_dict)

    ngram_total = sum(pred_dict.values())
    if not ngram_total:
        return 0.0
    # Each key is one distinct n-gram.
    return len(pred_dict) / ngram_total


def calc_distinct(pair_list):
    """Return the distinct-n ratios for n-gram orders 3 and 4, in that order.

    NOTE(review): callers name the two results distinct1/distinct2, while the
    orders used here are 3 and 4 — confirm which orders are intended.
    """
    return [calc_distinct_ngram(pair_list, order) for order in (3, 4)]


import math


def count(pred_tokens, gold_tokens, ngram, result):
    """Accumulate the clipped n-gram match counts used for BLEU's p_n.

    Args:
        pred_tokens: Tokens of the predicted sentence.
        gold_tokens: Tokens of the reference sentence.
        ngram: Size of the n-grams to compare.
        result: Two-element mutable sequence ``[cover_count, total_count]``;
            it is updated in place and nothing is returned.
    """
    pred_dict = get_dict(pred_tokens, ngram)
    gold_dict = get_dict(gold_tokens, ngram)
    cur_cover_count = 0
    cur_total_count = 0
    for token, freq in pred_dict.items():
        # Clip each predicted n-gram's count by its count in the reference;
        # n-grams absent from the reference contribute 0.
        cur_cover_count += min(freq, gold_dict.get(token, 0))
        cur_total_count += freq
    result[0] += cur_cover_count
    result[1] += cur_total_count


def calc_bp(pair_list):
    """Compute the corpus-level BLEU brevity penalty.

    Args:
        pair_list: Iterable of ``(pred_tokens, gold_tokens)`` pairs.

    Returns:
        ``1`` when the predictions are in total at least as long as the
        references, ``exp(1 - r / c)`` when they are shorter, and ``0.0`` when
        the predictions are empty while the references are not (the limit of
        the formula; previously a ZeroDivisionError).
    """
    c_count = 0.0  # total predicted length
    r_count = 0.0  # total reference length
    for pred_tokens, gold_tokens in pair_list:
        c_count += len(pred_tokens)
        r_count += len(gold_tokens)

    if c_count >= r_count:
        return 1
    if c_count == 0:
        return 0.0
    return math.exp(1 - r_count / c_count)


def calc_cover_rate(pair_list, ngram):
    """Compute the corpus-level clipped n-gram precision.

    Args:
        pair_list: Iterable of ``(pred_tokens, gold_tokens)`` pairs.
        ngram: Size of the n-grams to compare.

    Returns:
        Matched n-grams divided by predicted n-grams, or ``0.0`` when the
        predictions contain no n-gram of that size (previously a
        ZeroDivisionError).
    """
    result = [0.0, 0.0]  # [cover_count, total_count]
    for pred_tokens, gold_tokens in pair_list:
        count(pred_tokens, gold_tokens, ngram, result)

    cover_count, total_count = result
    if not total_count:
        return 0.0
    return cover_count / total_count


def calc_bleu(pair_list):
    """Return ``[bleu1, bleu2, bleu3]`` for a list of (pred, gold) token pairs.

    BLEU-k is the brevity penalty times the geometric mean of the n-gram
    cover rates for orders 1..k. An entry is 0 when the cover rate of its own
    highest order is not positive.
    """
    bp = calc_bp(pair_list)
    rates = [calc_cover_rate(pair_list, order) for order in (1, 2, 3)]

    scores = []
    for order in (1, 2, 3):
        if rates[order - 1] > 0:
            # Sum the logs left to right over orders 1..order.
            log_total = math.log(rates[0])
            for lower_rate in rates[1:order]:
                log_total += math.log(lower_rate)
            scores.append(bp * math.exp(log_total / order))
        else:
            scores.append(0)
    return scores


import jieba

# Score BLEU and distinct-n for every evaluated cycle, using jieba word
# segmentation on both the original and the generated sentence.
dist1, dist2 = [], []
bleu1_list, bleu2_list, bleu3_list = [], [], []
for raw_text, sim_text in zip(raw_texts, sim_texts):
    sents = []
    # Only the first 10 pairs are scored, as before. Slicing instead of
    # indexing with range(10) avoids an IndexError when fewer than 10 pairs
    # survived the filtering.
    for gold_sentence, pred_sentence in zip(raw_text[:10], sim_text[:10]):
        gold_tokens = " ".join(jieba.cut(gold_sentence)).strip().split(" ")
        pred_tokens = " ".join(jieba.cut(pred_sentence)).strip().split(" ")
        sents.append([pred_tokens, gold_tokens])

    bleu1, bleu2, bleu3 = calc_bleu(sents)
    distinct1, distinct2 = calc_distinct(sents)

    dist1.append(distinct1)
    dist2.append(distinct2)
    bleu1_list.append(bleu1)
    bleu2_list.append(bleu2)
    bleu3_list.append(bleu3)

print(dist1)
print(dist2)
print(bleu1_list)
print(bleu2_list)
print(bleu3_list)

#### Sample ZH

In [None]:
from transformers import T5Tokenizer


# SentencePiece-backed tokenizer; <|endoftext|> serves as both EOS and PAD,
# and <bos> is registered afterwards as the BOS special token.
gen_tokenizer = T5Tokenizer.from_pretrained(
    '/cognitive_comp/wutong/source/model_base/chinese_sentencepiece/cog-pretrain.model',
    extra_ids=0,
    pad_token='<|endoftext|>',
    eos_token='<|endoftext|>',
)
gen_tokenizer.add_special_tokens({'bos_token': '<bos>'})

In [None]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
import json
import sys
sys.path.append('/cognitive_comp/wutong/similarity_generation/')
from model_utils.gpt2_modeling import GPT2Model


with open('/cognitive_comp/wutong/similarity_generation/model_utils/txl_5B_config.json', 'r') as f:
    txl_config = json.load(f)

# Every constructor argument is read from the JSON config under the same name.
gen = GPT2Model(**{
    name: txl_config[name]
    for name in (
        'num_layers',
        'vocab_size',
        'hidden_size',
        'num_attention_heads',
        'embedding_dropout_prob',
        'attention_dropout_prob',
        'output_dropout_prob',
        'max_sequence_length',
        'max_memory_length',
        'checkpoint_activations',
        'checkpoint_num_layers',
        'parallel_output',
        'relative_encoding',
    )
})


def load_model(cycle):
    """Load the generator weights of one training cycle into the global ``gen``.

    Args:
        cycle: Cycle index used to build the checkpoint file name
            (``generator_cycle_{cycle}.ckpt``).

    Returns:
        The module-level ``gen`` object, after ``load_state_dict`` has been
        called on it. The same object is returned on every call.
    """
    # torch is not imported in this cell; import it here so the function does
    # not depend on an earlier cell having run.
    import torch

    prefix = 'module.generator.gen.'
    ckpt_path = '/cognitive_comp/wutong/similarity_generation/all_checkpoints/new_exp6'
    pt_path = ckpt_path +\
        f'/generator_cycle_{cycle}.ckpt/checkpoint/mp_rank_00_model_states.pt'
    state_dict = torch.load(pt_path, map_location='cpu')['module']

    # Keep only the generator's parameters and strip the wrapper prefix.
    # startswith() matches the positional strip below; the previous `in` test
    # would have mangled a key that contained the prefix anywhere else.
    new_dict = {
        k[len(prefix):]: v
        for k, v in state_dict.items()
        if k.startswith(prefix)
    }
    if not new_dict:
        # No prefixed keys: load the state dict as-is.
        new_dict = state_dict
    gen.load_state_dict(new_dict)
    print('The Generator Transformer-XL Load Successfully !\n')

    return gen

In [None]:
from torch.nn.utils.rnn import pad_sequence
import sys
sys.path.append('/cognitive_comp/wutong/similarity_generation/')
from data_utlis.sample_sequence import sample_sequence_batch


torch.cuda.empty_cache()
# Disabled batch variant: builds prompts from rows 80-109 of the qqp test set
# and pads them with id 50000.
# input_ids, length_list = [], []
# data = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/sim_test_data/qqp')
# for idx in range(80, 110):
#     cur_input_ids = gen_tokenizer(
#         '<bos>“' + data[idx]['text1'] + '”的相似句是“', return_tensors='pt'
#     ).input_ids.squeeze()[:-1]  # must not end with <eos>

#     length = [cur_input_ids.size(0)]
#     cur_input_ids = [cur_input_ids]

#     length_list.extend(length)
#     input_ids.extend(cur_input_ids)

# input_ids = pad_sequence(
#     [x for x in input_ids], batch_first=True, padding_value=50000)
# length_tensor = torch.tensor(length_list)

# Single hand-written prompt; the last token of the encoding is sliced off.
# NOTE(review): unlike the disabled batch variant above, this prompt has no
# '<bos>' prefix — confirm that is intended.
input_ids = gen_tokenizer(
        '“透明质酸具有优异的生物相容性和主动肿瘤靶向性, 可被透明质酸酶降解。”的相似句是“', return_tensors='pt').input_ids[:, :-1]  # must not end with <eos>
length_tensor = torch.tensor([input_ids.size(1)])
print('input_ids', input_ids.size())

# First run: load cycle 1 and sample with top_p=0.95 (top_k disabled with 0).
gen = load_model(1).half().eval().cuda()
output_dict = sample_sequence_batch(
    model=gen, context_tokens_tensor=input_ids.cuda(),
    context_length_tensor=length_tensor, repetition_penalty=1.0, max_out_seq=200,
    end_token_id=50000, temperature=1.0, top_k=0, top_p=0.95,
)
print(gen_tokenizer.batch_decode(output_dict['ids_list'], skip_special_tokens=True))

# Second run: load_model returns the module-level `gen`, so the discarded
# return value below still refers to the object passed as `model=gen`.
# NOTE(review): both runs load cycle 1 — presumably the second was meant to
# compare another cycle; confirm.
torch.cuda.empty_cache()
load_model(1).half().cuda().eval()
output_dict = sample_sequence_batch(
    model=gen, context_tokens_tensor=input_ids.cuda(),
    context_length_tensor=length_tensor, repetition_penalty=1.0, max_out_seq=200,
    end_token_id=50000, temperature=1.0, top_k=0, top_p=0.95,
)
print(gen_tokenizer.batch_decode(output_dict['ids_list'], skip_special_tokens=True))

#### Sample EN

In [None]:
import os, torch
from transformers import OPTForCausalLM, GPT2Tokenizer

os.environ['CUDA_VISIBLE_DEVICES'] = '6'

# Build the OPT model from the local model directory, then overwrite its
# weights with the ones stored under the 'module' entry of the .pt file.
model = OPTForCausalLM.from_pretrained('/cognitive_comp/wutong/source/model_base/model_en/opt-2.7b')
state_dict = torch.load('/cognitive_comp/wutong/source/model_base/model_en/opt-2.7b.pt',
                        map_location='cpu')['module']

# Keep only the generator's parameters and strip the wrapper prefix.
# startswith() matches the positional strip; the previous `in` test would have
# mangled a key that contained the prefix anywhere else.
prefix = 'module.generator.gen.'
new_dict = {
    k[len(prefix):]: v
    for k, v in state_dict.items()
    if k.startswith(prefix)
}
if not new_dict:
    # No prefixed keys: load the state dict as-is.
    new_dict = state_dict
model.load_state_dict(new_dict)
model = model.to('cuda')

tokenizer = GPT2Tokenizer.from_pretrained('/cognitive_comp/wutong/source/model_base/model_en/opt-2.7b')

In [None]:
import datasets

# Keep the first 5000 rows of the PAWS training set and save them as a
# separate temporary dataset.
train_paws = datasets.load_from_disk(
    '/cognitive_comp/wutong/source/sim_data/sim_train_data/paws_train_ds'
).select(range(5000))
train_paws.save_to_disk(
    '/cognitive_comp/wutong/source/sim_data/similarity_data_en/paws_tmp'
)

In [None]:
# Build prompts from the positive pairs (score == 1) among the first 10 rows
# and keep their `text2` as the human-written reference paraphrase.
prompt, text = [], []
for i in range(10):
    row = train_paws[i]
    if row['score'] == 1:
        prompt.append('"' + row['text1'] + '" is similar to "')
        text.append(row['text2'])

# A decoder-only model continues from the last position of each row, so pad on
# the left and pass the attention mask; otherwise shorter prompts are
# continued from padding tokens.
tokenizer.padding_side = 'left'
inputs = tokenizer.batch_encode_plus(prompt, padding=True, return_tensors='pt')
generate_ids = model.generate(
    inputs.input_ids.cuda(),
    attention_mask=inputs.attention_mask.cuda(),
    # top_p only takes effect when sampling is enabled.
    do_sample=True,
    top_p=0.8,
    max_length=200,
    repetition_penalty=1.0,
)
res = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)

In [None]:
# Print each generated text followed by its reference paraphrase.
for i, generated in enumerate(res):
    print(generated)
    print(text[i])
    print('\n')
    print('\n')