### 加载模型

In [None]:
from transformers import AutoTokenizer, AutoModel

# Load the OPT-350M tokenizer and the bare (headless) model from one checkpoint id.
opt_checkpoint = "facebook/opt-350m"
tokenizer, model = (
    AutoTokenizer.from_pretrained(opt_checkpoint),
    AutoModel.from_pretrained(opt_checkpoint),
)

In [None]:
# Store the model weights and the tokenizer files side by side in one local
# directory; the return values are discarded so the notebook prints nothing.
local_opt_dir = "/cognitive_comp/wutong/source/model_base/model_en/opt_350m.pt"
_ = model.save_pretrained(local_opt_dir)
_ = tokenizer.save_pretrained(local_opt_dir)

In [None]:
# Add up the element count of every parameter tensor of `model` and show it.
total_num = 0
for param in model.parameters():
    total_num += param.numel()
print(total_num)

### 加载数据集

##### QQP

In [None]:
import json
import pandas as pd
from tqdm import tqdm

# Export every duplicate (is_duplicate == 1) QQP question pair from the train
# and dev TSV files as JSON lines with the keys text1 / text2.
qqp_train = pd.read_csv('/cognitive_comp/wutong/source/sim_data/raw_data_en/QQP/qqp_train.tsv', sep='\t')
qqp_dev = pd.read_csv('/cognitive_comp/wutong/source/sim_data/raw_data_en/QQP/qqp_dev.tsv', sep='\t')
with open('/cognitive_comp/wutong/source/sim_data/raw_data_en/qqp_sim_data.json', 'w', encoding='utf-8') as wp:
    for frame in (qqp_train, qqp_dev):
        # Missing questions are read as NaN, and str(NaN) is 'nan' rather than "",
        # so they have to be dropped explicitly. Both columns are checked
        # (the previous version tested question1 twice and never question2).
        pairs = frame[frame['is_duplicate'] == 1].dropna(subset=['question1', 'question2'])
        for q1, q2 in tqdm(zip(pairs['question1'], pairs['question2']), total=len(pairs)):
            q1, q2 = str(q1), str(q2)
            if q1 == "" or q2 == "":
                continue
            wp.write(json.dumps({'text1': q1,
                                 'text2': q2,
                                 # 'score' is omitted on purpose: only positive pairs are kept.
                                 }, ensure_ascii=False) + '\n')

In [None]:
import datasets

# Turn the QQP JSON-lines file into an Arrow dataset with two string columns.
path = '/cognitive_comp/wutong/source/sim_data/raw_data_en/qqp_sim_data.json'
feats = datasets.Features({
    column: datasets.Value('string') for column in ("text1", "text2")
})
loaded = datasets.load_dataset(
    'json',
    data_files=path,
    cache_dir='/cognitive_comp/wutong/source/data_base/huggingface-cache',
    features=feats,
)
ds = loaded['train']  # the rows are read back from the 'train' split
ds.save_to_disk('/cognitive_comp/wutong/source/sim_data/similarity_data_en/qqp_sim_data')

In [None]:
import datasets

# Reload the QQP pairs, hold out 8% as a test split (fixed seed) and save both parts.
qqp_data = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/similarity_data_en/qqp_sim_data')
split_data = qqp_data.train_test_split(seed=42, test_size=0.08)
for split_name, target_dir in (
    ('train', '/cognitive_comp/wutong/source/sim_data/similarity_data_en/qqp_train'),
    ('test', '/cognitive_comp/wutong/source/sim_data/similarity_data_en/qqp_test'),
):
    split_data[split_name].save_to_disk(target_dir)

In [None]:
import json
import pandas as pd
from tqdm import tqdm

# Export every QQP question pair (train + dev) together with its is_duplicate
# label as JSON lines with the keys text1 / text2 / score.
qqp_train = pd.read_csv('/cognitive_comp/wutong/source/sim_data/raw_data_en/QQP/qqp_train.tsv', sep='\t')
qqp_dev = pd.read_csv('/cognitive_comp/wutong/source/sim_data/raw_data_en/QQP/qqp_dev.tsv', sep='\t')
with open('/cognitive_comp/wutong/source/sim_data/raw_data_en/qqp_sim_data.json', 'w', encoding='utf-8') as wp:
    for frame in (qqp_train, qqp_dev):
        # Missing questions are read as NaN, and str(NaN) is 'nan' rather than "",
        # so they have to be dropped explicitly. Both columns are checked
        # (the previous version tested question1 twice and never question2).
        pairs = frame.dropna(subset=['question1', 'question2'])
        rows = zip(pairs['question1'], pairs['question2'], pairs['is_duplicate'])
        for q1, q2, label in tqdm(rows, total=len(pairs)):
            q1, q2 = str(q1), str(q2)
            if q1 == "" or q2 == "":
                continue
            wp.write(json.dumps({'text1': q1,
                                 'text2': q2,
                                 'score': int(label),
                                 }, ensure_ascii=False) + '\n')

In [None]:
import datasets

# Load the labelled QQP JSON lines with an explicit schema (two strings + int8 label)
# and store the result as an on-disk dataset.
path = '/cognitive_comp/wutong/source/sim_data/raw_data_en/qqp_sim_data.json'
schema = {
    "text1": datasets.Value('string'),
    "text2": datasets.Value('string'),
    "score": datasets.Value("int8"),
}
feats = datasets.Features(schema)
load_kwargs = dict(
    data_files=path,
    cache_dir='/cognitive_comp/wutong/source/data_base/huggingface-cache',
    features=feats,
)
ds = datasets.load_dataset('json', **load_kwargs)['train']
ds.save_to_disk('/cognitive_comp/wutong/source/sim_data/similarity_data_en/qqp_sim_data')

##### MRPC

In [None]:
import json
from tqdm import tqdm

# Convert the tab-separated MRPC test file into JSON lines (text1 / text2 / score).
# Both handles live in one `with`, so the output is closed even when a row fails
# to parse, and the output is only created once the input could be opened.
with open('/cognitive_comp/wutong/source/sim_data/raw_data_en/MRPC/msr_paraphrase_test.txt', 'r') as rp, \
        open('/cognitive_comp/wutong/source/sim_data/raw_data_en/MRPC/mrpc_dev.json', 'w', encoding='utf-8') as wp:
    lines = rp.readlines()
    for line in tqdm(lines):
        # Strip the line terminator first; otherwise it ends up inside text2.
        line_list = line.rstrip('\r\n').split('\t')
        if len(line_list) < 3:
            continue  # blank or truncated row: nothing to export
        wp.write(json.dumps({'text1': str(line_list[-2]),
                             'text2': str(line_list[-1]),
                             # Only the last character of the first field is read as the label.
                             'score': int(line_list[0][-1]),
                            }, ensure_ascii=False) + '\n')

In [None]:
import datasets


# Read the MRPC dev JSON lines with a fixed schema and save them as a dataset.
path = '/cognitive_comp/wutong/source/sim_data/raw_data_en/MRPC/mrpc_dev.json'
feats = datasets.Features(
    {
        "text1": datasets.Value('string'),
        "text2": datasets.Value('string'),
        "score": datasets.Value('int8'),
    }
)
dataset_dict = datasets.load_dataset(
    'json',
    features=feats,
    data_files=path,
    cache_dir='/cognitive_comp/wutong/source/data_base/huggingface-cache',
)
ds = dataset_dict['train']
ds.save_to_disk('/cognitive_comp/wutong/source/sim_data/similarity_data_en/mrpc_dev')

In [None]:
import datasets, json
from tqdm import tqdm

train_mrpc = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/sim_train_data/mrpc_train_ds')
# dev_mrpc = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/sim_test_data/mrpc')

# Flatten every training pair into two single-sentence JSON lines
# ({"sentence": ...}), flushing after each pair.
with open('/cognitive_comp/wutong/source/sim_data/similarity_data_en/mrpc_data.json', 'w') as wp:
    for pair in tqdm(train_mrpc, total=train_mrpc.num_rows):
        for column in ('text1', 'text2'):
            wp.write(json.dumps({'sentence': pair[column]}, ensure_ascii=False) + '\n')
        wp.flush()
    # The dev split is currently not exported; enable together with dev_mrpc above:
    # for pair in tqdm(dev_mrpc, total=dev_mrpc.num_rows):
    #     for column in ('text1', 'text2'):
    #         wp.write(json.dumps({'sentence': pair[column]}, ensure_ascii=False) + '\n')
    #     wp.flush()

In [None]:
import os

# Load the flattened MRPC sentences (schema inferred from the file) and save them
# as the dataset used for prediction.
sentence_splits = datasets.load_dataset(
    'json',
    cache_dir='/cognitive_comp/wutong/source/data_base/huggingface-cache',
    data_files='/cognitive_comp/wutong/source/sim_data/similarity_data_en/mrpc_data.json',
)
ds = sentence_splits['train']
target_dir = os.path.join('/cognitive_comp/wutong/source/sim_data/predict_sentences/mrpc_sentence')
ds.save_to_disk(target_dir)

##### PAWS

In [None]:
from datasets import load_dataset

# Fetch the English configuration of PAWS-X and keep a local copy of all splits.
paws_raw_dir = '/cognitive_comp/wutong/source/sim_data/raw_data_en/paws'
dataset = load_dataset("paws-x", "en")
dataset.save_to_disk(paws_raw_dir)

In [None]:
import datasets, json
from tqdm import tqdm

paws = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/raw_data_en/paws')
# train + validation are merged for training; the original test split serves as dev.
paws_val = paws['test']
paws_train = datasets.concatenate_datasets([paws['train'], paws['validation']])

# Write each part as JSON lines, renaming sentence1/sentence2/label to text1/text2/score.
for part, out_path in (
    (paws_train, '/cognitive_comp/wutong/source/sim_data/similarity_data_en/paws-x_train.json'),
    (paws_val, '/cognitive_comp/wutong/source/sim_data/similarity_data_en/paws-x_dev.json'),
):
    with open(out_path, 'w') as wp:
        for row in tqdm(part, total=part.num_rows):
            record = {
                'text1': str(row['sentence1']),
                'text2': str(row['sentence2']),
                'score': int(row['label']),
            }
            wp.write(json.dumps(record, ensure_ascii=False) + '\n')

In [None]:
# Convert one PAWS JSON-lines file into a dataset. The cell is toggled by hand:
# for the training part, read '/cognitive_comp/wutong/source/sim_data/similarity_data_en/paws-x_train.json'
# and save to '/cognitive_comp/wutong/source/sim_data/sim_train_data/paws_train_ds' instead.
path = '/cognitive_comp/wutong/source/sim_data/similarity_data_en/paws-x_dev.json'
string_col, label_col = datasets.Value('string'), datasets.Value('int8')
feats = datasets.Features({"text1": string_col, "text2": string_col, "score": label_col})
paws_splits = datasets.load_dataset(
    'json',
    data_files=path,
    features=feats,
    cache_dir='/cognitive_comp/wutong/source/data_base/huggingface-cache',
)
ds = paws_splits['train']
ds.save_to_disk('/cognitive_comp/wutong/source/sim_data/sim_test_data/paws')

In [None]:
import datasets, json
from tqdm import tqdm

train_paws = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/sim_train_data/paws_train_ds')
dev_paws = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/sim_test_data/paws')

# Write every sentence of both parts (train first, then dev) as its own
# {"sentence": ...} JSON line, flushing after each pair.
with open('/cognitive_comp/wutong/source/sim_data/similarity_data_en/paws_data.json', 'w') as wp:
    for part in (train_paws, dev_paws):
        for pair in tqdm(part, total=part.num_rows):
            for column in ('text1', 'text2'):
                wp.write(json.dumps({'sentence': pair[column]}, ensure_ascii=False) + '\n')
            wp.flush()

In [None]:
import os

# Load the flattened PAWS sentences (schema inferred from the file) and save them
# as the dataset used for prediction.
sentence_splits = datasets.load_dataset(
    'json',
    cache_dir='/cognitive_comp/wutong/source/data_base/huggingface-cache',
    data_files='/cognitive_comp/wutong/source/sim_data/similarity_data_en/paws_data.json',
)
ds = sentence_splits['train']
target_dir = os.path.join('/cognitive_comp/wutong/source/sim_data/predict_sentences/paws_sentence')
ds.save_to_disk(target_dir)

In [None]:
import datasets, json
from tqdm import tqdm

train_paws = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/sim_train_data/paws_train_ds')
dev_paws = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/sim_test_data/paws')
paws = datasets.concatenate_datasets([train_paws, dev_paws])

# Keep only pairs labelled 1 whose two sides are both non-empty; the label itself is not written.
with open('/cognitive_comp/wutong/source/sim_data/raw_data_en/paws_sim_data.json', 'w') as wp:
    for row in tqdm(paws, total=paws.num_rows):
        if row['score'] != 1:
            continue
        first, second = str(row['text1']), str(row['text2'])
        if first == "" or second == "":
            continue
        wp.write(json.dumps({'text1': first, 'text2': second}, ensure_ascii=False) + '\n')

In [None]:
import datasets


# Convert the positive PAWS pairs (JSON lines) into a two-column string dataset.
path = '/cognitive_comp/wutong/source/sim_data/raw_data_en/paws_sim_data.json'
feats = datasets.Features(dict.fromkeys(("text1", "text2"), datasets.Value('string')))
pair_splits = datasets.load_dataset(
    'json',
    features=feats,
    data_files=path,
    cache_dir='/cognitive_comp/wutong/source/data_base/huggingface-cache',
)
ds = pair_splits['train']
ds.save_to_disk('/cognitive_comp/wutong/source/sim_data/similarity_data_en/paws_sim_data')

##### STS-B

In [None]:
import json
from tqdm import tqdm

# Collect the STS-B sentence pairs (train + dev) whose score is at least 4.0
# and write them as JSON lines with the keys text1 / text2.
# Columns are addressed from the end of each row: [-3], [-2] are the sentences, [-1] the score.
with open('/cognitive_comp/wutong/source/sim_data/raw_data_en/STS-B/sts-b.json', 'w', encoding='utf-8') as wp:
    for tsv_path in ('/cognitive_comp/wutong/source/sim_data/raw_data_en/STS-B/train.tsv',
                     '/cognitive_comp/wutong/source/sim_data/raw_data_en/STS-B/dev.tsv'):
        with open(tsv_path, 'r') as rp:
            lines = rp.readlines()
        for line in tqdm(lines):
            # Remove only the line terminator. Slicing off the last character
            # unconditionally corrupts the score of a final line without newline.
            line_list = line.rstrip('\r\n').split('\t')
            if len(line_list) < 3:
                continue  # blank or truncated row
            if float(line_list[-1]) >= 4.0:
                wp.write(json.dumps({'text1': str(line_list[-3]),
                                     'text2': str(line_list[-2]),
                                     }, ensure_ascii=False) + '\n')

In [None]:
import datasets


# Load the filtered STS-B pairs with an explicit string schema and save them to disk.
path = '/cognitive_comp/wutong/source/sim_data/raw_data_en/STS-B/sts-b.json'
text_type = datasets.Value('string')
feats = datasets.Features({"text1": text_type, "text2": text_type})
stsb_splits = datasets.load_dataset(
    'json',
    cache_dir='/cognitive_comp/wutong/source/data_base/huggingface-cache',
    data_files=path,
    features=feats,
)
ds = stsb_splits['train']
ds.save_to_disk('/cognitive_comp/wutong/source/sim_data/similarity_data_en/sts-b_data')

##### WikiText

In [None]:
import datasets


# Open the locally stored WikiText dataset; the bare name on the last line makes
# the notebook display its summary.
wikitext_dir = '/cognitive_comp/wutong/source/sim_data/raw_data_en/wikitext'
wikitext = datasets.load_from_disk(wikitext_dir)
wikitext

In [None]:
import json
from tqdm import tqdm


# Dump every non-empty WikiText row that contains no '=' character as a JSON line;
# text2 is always the constant 'general'.
with open('/cognitive_comp/wutong/source/sim_data/raw_data_en/wikitext.json', 'w') as wp:
    for row in tqdm(wikitext, total=wikitext.num_rows):
        text = row['text']
        if len(text) == 0 or '=' in text:
            continue
        record = {'text1': str(text), 'text2': 'general'}
        wp.write(json.dumps(record, ensure_ascii=False) + '\n')

In [None]:
import datasets


# Convert the WikiText JSON lines into a dataset with two string columns.
path = '/cognitive_comp/wutong/source/sim_data/raw_data_en/wikitext.json'
feats = datasets.Features(
    {name: datasets.Value('string') for name in ("text1", "text2")}
)
wiki_splits = datasets.load_dataset(
    'json',
    data_files=path,
    features=feats,
    cache_dir='/cognitive_comp/wutong/source/data_base/huggingface-cache',
)
ds = wiki_splits['train']
ds.save_to_disk('/cognitive_comp/wutong/source/sim_data/similarity_data_en/wikitext_data')

In [None]:
import datasets, json
from tqdm import tqdm

# Second filtering pass: rewrite wikitext.json, keeping only rows of at least
# 50 characters that contain neither '<unk>' nor '@'.
wiki_data = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/similarity_data_en/wikitext_data')
with open('/cognitive_comp/wutong/source/sim_data/raw_data_en/wikitext.json', 'w') as wp:
    for row in tqdm(wiki_data, total=wiki_data.num_rows):
        text = row['text1']
        if len(text) < 50 or '<unk>' in text or '@' in text:
            continue
        record = {'text1': str(text), 'text2': 'general'}
        wp.write(json.dumps(record, ensure_ascii=False) + '\n')

In [None]:
# Re-import the filtered WikiText JSON lines and overwrite the stored dataset.
path = '/cognitive_comp/wutong/source/sim_data/raw_data_en/wikitext.json'
column_type = datasets.Value('string')
feats = datasets.Features({"text1": column_type, "text2": column_type})
json_options = {
    'data_files': path,
    'cache_dir': '/cognitive_comp/wutong/source/data_base/huggingface-cache',
    'features': feats,
}
ds = datasets.load_dataset('json', **json_options)['train']
ds.save_to_disk('/cognitive_comp/wutong/source/sim_data/similarity_data_en/wikitext_data')

##### 合并数据集

In [None]:
import datasets

# Load the four text1/text2 corpora, concatenate them in a fixed order,
# shuffle with a fixed seed and store the merged pre-training set.
wikitext = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/similarity_data_en/wikitext_data')
qqp = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/similarity_data_en/qqp_sim_data')
sts_b = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/similarity_data_en/sts-b_data')
paws = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/similarity_data_en/paws_sim_data')

corpora = [wikitext, qqp, sts_b, paws]
pretrain_data = datasets.concatenate_datasets(corpora).shuffle(seed=42)
pretrain_data.save_to_disk('/cognitive_comp/wutong/source/sim_data/similarity_data_en/pretrain_data')

In [None]:
import datasets


# Reload the merged pre-training set and hold out 3% for validation.
# The seed is fixed (as for the other split and shuffles in this notebook)
# so that re-running the cell reproduces the same train/validation membership.
pretrain_data = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/similarity_data_en/pretrain_data')
pretrain_data = pretrain_data.train_test_split(test_size=0.03, seed=42)
pretrain_data

In [None]:
# Persist the two parts of the split: 'train' -> pre_train, 'test' -> pre_val.
for split_name, target_dir in (
    ('train', '/cognitive_comp/wutong/source/sim_data/similarity_data_en/pre_train'),
    ('test', '/cognitive_comp/wutong/source/sim_data/similarity_data_en/pre_val'),
):
    pretrain_data[split_name].save_to_disk(target_dir)

In [None]:
import datasets

# Build the labelled set used for the PAWS experiment from QQP and both MRPC parts.
# PAWS itself is left out (its two load lines are disabled below).
qqp = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/similarity_data_en/qqp_sim_data')

# train_paws = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/sim_train_data/paws_train_ds')
# dev_paws = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/sim_test_data/paws')

train_mrpc = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/sim_train_data/mrpc_train_ds')
dev_mrpc = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/sim_test_data/mrpc')

labelled_parts = [qqp, train_mrpc, dev_mrpc]
data4paws = datasets.concatenate_datasets(labelled_parts).shuffle(seed=42)
data4paws.save_to_disk('/cognitive_comp/wutong/source/sim_data/similarity_data_en/labeled4paws')

In [None]:
# Hold out 3% of the labelled data as a test split and save both parts.
# The seed is fixed (as for the shuffle of this dataset) so the split is reproducible.
data4paws = data4paws.train_test_split(test_size=0.03, seed=42)
data4paws['train'].save_to_disk('/cognitive_comp/wutong/source/sim_data/similarity_data_en/labeled_train_paws')
data4paws['test'].save_to_disk('/cognitive_comp/wutong/source/sim_data/similarity_data_en/labeled_test_paws')

#### 测试生成结果

In [None]:
import os, torch
from transformers import OPTForCausalLM, GPT2Tokenizer

# Restrict the process to GPU 1. This only has an effect if no CUDA call has been
# made yet in this process.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

# Build the OPT-2.7B architecture, then overwrite its weights with the ones
# stored under the 'module' entry of the training checkpoint.
model = OPTForCausalLM.from_pretrained('/cognitive_comp/wutong/source/model_base/model_en/opt-2.7b')
state_dict = torch.load('/cognitive_comp/wutong/source/model_base/model_en/opt-2.7b.pt',
                        map_location='cpu')['module']

# Keep only the generator weights and drop their wrapper prefix. The prefix is
# matched at the START of the key, because the slice below removes exactly that
# many leading characters (a substring match elsewhere would yield a mangled key).
gen_prefix = 'module.generator.gen.'
new_dict = {
    key[len(gen_prefix):]: value
    for key, value in state_dict.items()
    if key.startswith(gen_prefix)
}
if not new_dict:
    # No key carries the prefix: use the checkpoint as it is.
    new_dict = state_dict
model.load_state_dict(new_dict)
model.to('cuda')

tokenizer = GPT2Tokenizer.from_pretrained('/cognitive_comp/wutong/source/model_base/model_en/opt-2.7b')

In [None]:
# Two prompts of the form '"<sentence>" is similar to "' that the model should complete.
prompt = ["\"In Paris , in October 1560 , he secretly met the English ambassador , Nicolas Throckmorton , asking him for a passport to return to England through Scotland .\" is similar to \"", 
          "\"The NBA season of 1975 -- 76 was the 30th season of the National Basketball Association .\" is similar to \""]
# The prompts differ in length, so the batch is padded. A decoder-only model
# continues from the last position of each row: pad on the left so that position
# holds a real token, and pass the attention mask so the padding is ignored.
tokenizer.padding_side = 'left'
inputs = tokenizer(prompt, return_tensors="pt", padding=True)
# Generate
generate_ids = model.generate(
    input_ids=inputs.input_ids.cuda(),
    attention_mask=inputs.attention_mask.cuda(),
    do_sample=True, top_p=0.8, max_length=200, num_return_sequences=1)
tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)

### 杂

##### 相似句数据增广

In [None]:
import datasets, json
from tqdm import tqdm

# Flatten the labelled pairs of train_0.json into one {"sentence": ...} JSON line
# per sentence. Both handles live in one `with`, so the output is closed even if a
# line fails to parse, and it is only created once the input could be opened.
with open('/cognitive_comp/wutong/train_0.json', 'r', encoding='utf-8') as rp, \
        open('/cognitive_comp/wutong/bustm.json', 'w', encoding='utf-8') as wp:
    lines = rp.readlines()
    for line in tqdm(lines):
        data = json.loads(line)
        wp.write(json.dumps({'sentence': data['sentence1']}, ensure_ascii=False) + '\n')
        wp.write(json.dumps({'sentence': data['sentence2']}, ensure_ascii=False) + '\n')
        wp.flush()

In [None]:
# Convert the single-column sentence file into an on-disk dataset.
path = '/cognitive_comp/wutong/bustm.json'
feats = datasets.Features({"sentence": datasets.Value('string')})
sentence_splits = datasets.load_dataset(
    'json',
    features=feats,
    cache_dir='/cognitive_comp/wutong/source/data_base/huggingface-cache',
    data_files=path,
)
ds = sentence_splits['train']
ds.save_to_disk('/cognitive_comp/wutong/bustm')

In [None]:
import torch
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence

def gen_pred_collate(batch_data, gen_tokenizer, num_copies=2):
    """Collate a batch of samples into padded generation prompts.

    Each sample with a non-empty ``text2`` is turned into the prompt
    ``<bos>“{text2}”的相似句是“``, tokenized, and repeated ``num_copies`` times
    in the batch. Samples whose ``text2`` is empty are skipped.

    Args:
        batch_data: iterable of mappings that have a ``'text2'`` entry.
        gen_tokenizer: callable tokenizer; called with ``return_tensors='pt'``,
            it must return an object with ``input_ids`` and expose ``pad_token_id``.
        num_copies: how many times every prompt is repeated (default 2, the
            previously hard-coded value).

    Returns:
        dict with ``'input_ids'`` (prompts padded to the longest one with
        ``pad_token_id``, batch first) and ``'length_tensor'`` (unpadded length
        of every row). If no sample is usable, both tensors are empty.
    """
    input_ids, length_list = [], []
    for item in batch_data:
        if not item['text2']:
            continue
        # The last token is dropped: the prompt must not end with <eos>.
        cur_input_ids = gen_tokenizer(
            '<bos>“' + item['text2'] + '”的相似句是“', return_tensors='pt'
        ).input_ids.squeeze()[:-1]

        # Repeat every sample `num_copies` times.
        length_list.extend([cur_input_ids.size(0)] * num_copies)
        input_ids.extend([cur_input_ids] * num_copies)

    if not input_ids:
        # pad_sequence raises on an empty list; hand back an empty batch instead.
        return {
            'input_ids': torch.empty((0, 0), dtype=torch.long),
            'length_tensor': torch.empty((0,), dtype=torch.long),
        }

    padded_ids = pad_sequence(
        input_ids, batch_first=True,
        padding_value=gen_tokenizer.pad_token_id)
    length_tensor = torch.tensor(length_list)

    return {
        'input_ids': padded_ids,
        'length_tensor': length_tensor,
    }

In [None]:
from transformers import T5Tokenizer
from torch.utils.data import DataLoader

import sys, glob, datasets
sys.path.append('/cognitive_comp/wutong/similarity_generation/')
from data_utlis.sim_gen_dataset import SimGanDataset

# Iterative augmentation: round k reads the pairs stored as bustm{k-1}, lets the
# generator continue a prompt built from each `text2`, and stores the resulting
# pairs as bustm{k} (JSON lines plus an on-disk dataset).
#
# Everything that does not depend on the round (imports, GPU selection, tokenizer,
# config, generator weights) is set up once instead of once per round.
import json
import os

import evaluate
import numpy as np
import torch

from data_utlis.sample_sequence import sample_sequence_batch
from model_utils.sim_gen_model import Generator

# Only takes effect if no CUDA call has been made yet in this process.
os.environ['CUDA_VISIBLE_DEVICES'] = '4'


class Config:
    cycle = 0
    chinese = 1
    txl_config_path = '/cognitive_comp/wutong/similarity_generation/model_utils/txl_5B_config.json'
    txl_model_path = '/cognitive_comp/wutong/source/model_base/model_zh/txl_zh_5.0B.pt'
    ckpt_model_path = '/cognitive_comp/wutong/similarity_generation/all_checkpoints/chip0'


config = Config()
config.cycle = 10  # the only cycle value that was ever used here

gen_tokenizer = T5Tokenizer.from_pretrained(
    '/cognitive_comp/wutong/source/model_base/chinese_sentencepiece/cog-pretrain.model',
    eos_token='<|endoftext|>',
    pad_token='<|endoftext|>',
    extra_ids=0)
gen_tokenizer.add_special_tokens({'bos_token': '<bos>'})


def collate_fn(batch_data):
    return gen_pred_collate(batch_data, gen_tokenizer)


# The config is identical for every round, so the generator is built once.
generator = Generator(config)
generator.half().eval().cuda()

for k in range(1, 21):
    num = k
    data = datasets.load_from_disk(f'/cognitive_comp/wutong/bustm{num-1}')

    predict_dataset = SimGanDataset(data)
    dataloader = DataLoader(
        dataset=predict_dataset,
        batch_size=128,
        shuffle=False,
        num_workers=8,
        pin_memory=True,
        collate_fn=collate_fn,
    )

    sim_sent_list = []
    # Pure inference: nothing in this loop calls backward(), so gradient
    # bookkeeping is switched off.
    with torch.no_grad():
        for batch in dataloader:
            torch.cuda.empty_cache()
            output_dict = sample_sequence_batch(
                model=generator.gen, context_tokens_tensor=batch['input_ids'].cuda(),
                context_length_tensor=batch['length_tensor'], repetition_penalty=1.0, max_out_seq=200,
                end_token_id=50000, temperature=1.0, top_k=0, top_p=0.5,
            )

            sim_sent_list.extend(
                gen_tokenizer.batch_decode(output_dict['ids_list'], skip_special_tokens=True))

    path = f'/cognitive_comp/wutong/bustm{num}.json'
    with open(path, 'w', encoding='utf-8') as wp:
        for item in tqdm(sim_sent_list):
            # Expected decoded form: <quote>source”的相似句是“generated<quote>.
            parts = item.replace(' ', '').split('”的相似句是“')
            if len(parts) != 2:
                continue
            # Drop the first character of the source side and the last character
            # of the generated side (presumably the surrounding quotes - TODO confirm).
            source_text, generated_text = parts[0][1:], parts[1][:-1]
            if source_text != generated_text:
                wp.write(json.dumps({'text1': source_text,
                                     'text2': generated_text}, ensure_ascii=False) + '\n')

    feats = datasets.Features({"text1": datasets.Value('string'),
                               "text2": datasets.Value('string'),
                               })
    ds = (datasets.load_dataset('json', data_files=path,
                                cache_dir='/cognitive_comp/wutong/source/data_base/huggingface-cache',
                                features=feats)['train'])
    ds.save_to_disk(f'/cognitive_comp/wutong/bustm{num}')


In [None]:
# Load the datasets bustm0 .. bustm7, concatenate them in order and save the union.
data_list = [
    datasets.load_from_disk('/cognitive_comp/wutong/bustm' + str(round_no))
    for round_no in range(8)
]
bustm = datasets.concatenate_datasets(data_list)
bustm.save_to_disk('/cognitive_comp/wutong/bustm0_')

In [None]:
# Write every pair of `bustm` whose two sides are both non-empty as a
# sentence1 / sentence2 JSON line.
with open(f'/cognitive_comp/wutong/gen_data.json', 'w') as wp:
    for item in tqdm(bustm, total=bustm.num_rows):
        if not (item['text1'] and item['text2']):
            continue
        record = {'sentence1': item['text1'], 'sentence2': item['text2']}
        wp.write(json.dumps(record, ensure_ascii=False) + '\n')

#### Bert Score

In [None]:
import datasets, os
from bert_score import score

os.environ['CUDA_VISIBLE_DEVICES'] = '1'

# For the listed cycles, score the generated sentences (text2) against their
# sources (text1) with BERTScore and print the mean F1.
for i in [0, 8]:  # [0, 10]
    data = datasets.load_from_disk('/cognitive_comp/wutong/similarity_generation/consistency/afqmc/data_cycle_' + str(i))
    row_ids = range(data.num_rows)
    raw_text = [data[j]['text1'] for j in row_ids]
    sim_text = [data[j]['text2'] for j in row_ids]
    P, R, F1 = score(sim_text, raw_text, lang="zh", verbose=True)
    print(F1.mean())

#### Perplexity

##### en

In [None]:
import datasets, os, evaluate
import numpy as np
from tqdm import tqdm

os.environ['CUDA_VISIBLE_DEVICES'] = '1'


# Mean GPT-2 perplexity of the non-empty generated sentences (text2) per cycle.
perplexity = evaluate.load("perplexity", module_type="metric")
for i in [0, 8]:
    ppl = []
    data = datasets.load_from_disk('/cognitive_comp/wutong/similarity_generation/consistency/mrpc/data_cycle_' + str(i))
    sim_text = [
        row['text2']
        for row in tqdm(data, total=data.num_rows)
        if row['text2']
    ]
    result = perplexity.compute(input_texts=sim_text, model_id='gpt2')
    print(result['mean_perplexity'])

##### zh

In [None]:
import os
from transformers import GPT2Tokenizer,GPT2LMHeadModel

os.environ['CUDA_VISIBLE_DEVICES'] = '1'


# Load the Wenzhong GPT-2 language model and its tokenizer, move the model to
# the GPU and switch it to eval mode.
wenzhong_checkpoint = 'IDEA-CCNL/Wenzhong-GPT2-110M'
tokenizer = GPT2Tokenizer.from_pretrained(wenzhong_checkpoint)
model = GPT2LMHeadModel.from_pretrained(wenzhong_checkpoint)
model.to('cuda').eval()

def gpt_ppl(sent):
    """Return the perplexity of ``sent`` under the module-level ``model``.

    The sentence is tokenized with the module-level ``tokenizer`` and fed to
    the model with the input ids as labels; the perplexity is ``exp`` of the
    loss the model returns.

    Args:
        sent: text to score. Callers in this notebook skip empty strings.

    Returns:
        The perplexity as a NumPy float.
    """
    # Imported here so the function does not depend on another cell having
    # brought `np` / `torch` into the notebook namespace first.
    import numpy as np
    import torch

    inputs = tokenizer(sent, return_tensors='pt')
    input_ids = inputs["input_ids"].cuda()
    # Scoring only: no autograd graph is needed for the loss.
    with torch.no_grad():
        loss = model(input_ids=input_ids,
                     attention_mask=inputs["attention_mask"].cuda(),
                     labels=input_ids).loss
    ppl = np.exp(loss.item())

    return ppl

In [None]:
import datasets
import numpy as np
from tqdm import tqdm


# For the listed cycles, average gpt_ppl over all non-empty generated sentences (text2).
for i in [0, 10]:
    sim_text = []
    data = datasets.load_from_disk('/cognitive_comp/wutong/similarity_generation/consistency/qqp/data_cycle_' + str(i))
    ppl = [
        gpt_ppl(row['text2'])
        for row in tqdm(data, total=data.num_rows)
        if row['text2']
    ]
    print(np.array(ppl).mean())

#### Consistency

In [None]:
import os, torch
from transformers import BertTokenizer, BertModel

os.environ['CUDA_VISIBLE_DEVICES'] = '1'


# Sentence encoder used for the consistency metric.
# Alternatives noted by the author: princeton-nlp/sup-simcse-bert-base-uncased / SimCSE-bert-base
simcse_checkpoint = "princeton-nlp/sup-simcse-bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(simcse_checkpoint)
model = BertModel.from_pretrained(simcse_checkpoint)
model.to('cuda').eval()

def get_emb(sent_list):
    """Encode sentences with the module-level ``model`` and return its pooler output.

    Args:
        sent_list: a list of sentences, or a single sentence string.

    Returns:
        For a list: a 2-D tensor with one row per sentence, also when the list
        has exactly one element (squeezing that case made callers iterate over
        the vector's components instead of over sentences). For a single
        string: a 1-D vector, as before.
    """
    torch.cuda.empty_cache()
    # truncation=True cuts inputs that are longer than the model accepts.
    inputs = tokenizer(sent_list, padding=True, truncation=True, return_tensors="pt")
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        outputs = model(input_ids=inputs["input_ids"].cuda(),
                        attention_mask=inputs["attention_mask"].cuda()
                    ).pooler_output

    if isinstance(sent_list, str):
        return outputs.squeeze(0)
    return outputs

In [None]:
import datasets
import numpy as np
import torch.nn.functional as F

from tqdm import tqdm
from scipy.stats import wasserstein_distance, entropy


# For every cycle: embed source (text1) and generated (text2) sentences, take the
# cosine similarity of each pair and compare the similarity values with the stored
# `prob` values via scipy's `entropy(prob, cos_sim)` (relative entropy).
all_w, all_e, all_kl, all_h = [], [], [], []
for i in range(9):  # [0, 10]
    sim_text, raw_text = [], []
    cos_sim, prob = [], []
    data = datasets.load_from_disk('/cognitive_comp/wutong/similarity_generation/consistency/mrpc/data_cycle_' + str(i))
    for j in tqdm(range(data.num_rows)):
        row = data[j]
        if row['text1'] and row['text2']:
            prob.append(row['prob'])
            raw_text.append(row['text1'])
            sim_text.append(row['text2'])

        # Embed once 100 usable pairs are pending, and once more for the remainder
        # on the last row. Nothing is sent to the encoder when no pair is pending
        # (an empty batch makes the tokenizer fail), and a dataset with a single
        # row is handled as well.
        is_last_row = j == data.num_rows - 1
        if raw_text and (len(raw_text) >= 100 or is_last_row):
            # atleast_2d keeps one row per sentence even for a one-element batch.
            emb1 = np.atleast_2d(np.array(get_emb(raw_text).tolist()))
            emb2 = np.atleast_2d(np.array(get_emb(sim_text).tolist()))
            for e1, e2 in zip(emb1, emb2):
                cos_sim.append(e1.dot(e2) / (np.linalg.norm(e1) * np.linalg.norm(e2)))
            sim_text, raw_text = [], []

    # Drop pairs whose similarity is NaN or not positive; `prob` is filtered in
    # lockstep so both lists stay aligned.
    kept = [(p, c) for p, c in zip(prob, cos_sim) if not (np.isnan(c) or c <= 0)]
    prob = [p for p, _ in kept]
    cos_sim = [c for _, c in kept]

    print(len(cos_sim))

    # e_d = np.sqrt(np.sum(np.square(np.array(cos_sim) - np.array(prob))))
    # all_e.append(e_d)

    kl = entropy(prob, cos_sim)
    all_kl.append(kl)

    # w_d = wasserstein_distance(cos_sim, prob)
    # all_w.append(w_d)

    # h_d = 1 / np.sqrt(2) * np.linalg.norm(np.sqrt(cos_sim) - np.sqrt(prob))
    # all_h.append(h_d)

    # print(all_e)
    print(all_kl)
    # print(all_w)
    # print(all_h)


In [None]:
from transformers import AutoTokenizer
# Load the tokenizer stored in the local ERNIE directory.
pt_path = '/cognitive_comp/wutong/source/model_base/pretrained_zh/ernie_base_mc'
dis_tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=pt_path,
)


In [None]:
# Encode a deliberately over-long text ('hello:' repeated 500 times) with
# truncation enabled and max_length=512.
overlong_text = 'hello:' * 500
res = dis_tokenizer.encode_plus(
    overlong_text,
    truncation=True,
    padding="longest",
    max_length=512,
)

In [None]:
import json
from tqdm import tqdm

# Flatten the unlabeled pair file into one {"sentence": ...} JSON line per sentence.
# Both handles live in one `with`, so the output is closed even if a line fails to parse.
with open('/cognitive_comp/wutong/source/sim_data/raw_data/bustm/unlabeled.json', 'r', encoding='utf-8') as f, \
        open('/cognitive_comp/wutong/source/sim_data/raw_data/bustm/new_unlabel.json', 'w', encoding='utf-8') as wp:
    lines = f.readlines()
    for line in tqdm(lines):
        if not line.strip():
            continue  # ignore blank lines
        # SECURITY: this used to be eval(line), which executes whatever the file
        # contains. Each line is parsed as JSON instead.
        # NOTE(review): assumes the file is JSON lines, as its name suggests; if
        # it holds Python dict literals, use ast.literal_eval here instead.
        line = json.loads(line)
        wp.write(json.dumps({'sentence': line['sentence1']}, ensure_ascii=False) + '\n')
        wp.write(json.dumps({'sentence': line['sentence2']}, ensure_ascii=False) + '\n')

In [None]:
import random, datasets

# Load the flattened sentence file, draw 10 distinct indices from 0..59 and
# select those rows as a subset.
unlabel_path = '/cognitive_comp/wutong/source/sim_data/raw_data/bustm/new_unlabel.json'
test_ds = datasets.Dataset.from_json(unlabel_path)

candidate_indices = range(60)
random_list = random.sample(candidate_indices, 10)
data = test_ds.select(random_list)

In [None]:
# Print each sampled row twice: looked up in the full dataset by its original
# index, and at the matching position of the selected subset.
for position in range(10):
    original_index = random_list[position]
    print(test_ds[original_index])
    print(data[position])