In [None]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '2'
import torch
from transformers import AutoTokenizer, AlbertTokenizer
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence


def discriminator_collate_fn(batch_data, tokenizer):
    """Collate sentence-pair samples into one padded batch for the discriminator.

    For every sample, ``text1`` and ``text2`` are joined with the literal
    string ``'[SEP]'`` and tokenized as a single sequence. The resulting id
    sequences are right-padded to the longest one in the batch.

    Args:
        batch_data: iterable of mappings with the keys ``'text1'``, ``'text2'``
            and ``'score'`` (``score`` must be accepted by ``int()``).
        tokenizer: object callable as ``tokenizer(text, return_tensors='pt')``
            whose result exposes an ``input_ids`` tensor, and which has a
            ``pad_token_id`` attribute.

    Returns:
        dict with
        ``'dis_text_input_ids'``: padded id tensor of shape (batch, longest sequence),
        ``'labels'``: 1-D ``torch.long`` tensor with one label per sample.
    """
    sequences, labels = [], []
    for item in batch_data:
        dis_text = item['text1'] + '[SEP]' + item['text2']
        encoded = tokenizer(dis_text, return_tensors='pt')

        # reshape(-1) instead of a bare squeeze(): squeeze() would also drop the
        # sequence axis of a one-token encoding, leaving a 0-d tensor that
        # pad_sequence cannot handle.
        sequences.append(encoded.input_ids.reshape(-1))
        labels.append(torch.tensor(int(item['score']), dtype=torch.long))

    dis_text_input_ids = pad_sequence(sequences,
                                      batch_first=True,
                                      padding_value=tokenizer.pad_token_id)

    return {
        'dis_text_input_ids': dis_text_input_ids,
        'labels': torch.stack(labels),
    }


class Config:
    """Settings object handed to ``Discriminator(config)`` by the evaluation loop.

    How each field is interpreted is decided inside
    ``model_utils.sim_gen_model``, which is not visible from this notebook;
    the grouping below is only a reading aid.
    """

    # --- run selection -------------------------------------------------
    data_name = 'mrpc'  # appended to the test-data directory when loading
    cycle = 0           # overwritten by the evaluation loop for every round
    zero_shot = 0
    chinese = 0         # presumably 0 selects the English setup -- TODO confirm

    # --- model loading switches ----------------------------------------
    pretrain_dis = False
    warm_up_model = True

    # --- backbone names (joined onto the pretrained_* directories) ------
    discriminator_zh = 'albert_xxlarge'  # roformer_large / roberta_large
    discriminator_en = 'albert_xxlarge'

    # --- filesystem locations ------------------------------------------
    pretrained_zh = '/cognitive_comp/wutong/source/model_base/pretrained_zh/'
    pretrained_en = '/cognitive_comp/wutong/source/model_base/pretrained_en/'
    # Alternative checkpoint directory:
    # ckpt_model_path = '/cognitive_comp/wutong/similarity_generation/all_checkpoints/new_exp7'
    ckpt_model_path = '/cognitive_comp/wutong/similarity_generation/experiments/lightning_logs/checkpoints/2'

class SimGanDataset(Dataset):
    """Map-style ``Dataset`` that exposes an indexable collection unchanged.

    ``data`` only has to support ``data[index]`` and ``len(data)``; samples are
    returned exactly as stored, so any batching logic lives in the collate
    function given to the ``DataLoader``.
    """

    def __init__(self, data) -> None:
        super().__init__()
        self.data = data

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        # len() rather than .num_rows: a Hugging Face Dataset reports its row
        # count through len() as well, and plain lists/tuples now work too.
        return len(self.data)

In [None]:
from sklearn.metrics import f1_score, accuracy_score
import sys, datasets
from torch.utils.data import DataLoader

sys.path.append('/cognitive_comp/wutong/similarity_generation/')
from model_utils.sim_gen_model import Discriminator


config = Config()

# Evaluation split, chosen by config.data_name.
test_data_root = '/cognitive_comp/wutong/source/sim_data/sim_test_data/'
data = datasets.load_from_disk(test_data_root + config.data_name)
dataset = SimGanDataset(data)

# English ALBERT tokenizer; the commented line is the Chinese-backbone alternative.
dis_tokenizer = AlbertTokenizer.from_pretrained(config.pretrained_en + config.discriminator_en)
# dis_tokenizer = AutoTokenizer.from_pretrained(config.pretrained_zh + config.discriminator_zh)


def collate_fn(batch_data):
    """One-argument collate for DataLoader that binds the notebook-level tokenizer."""
    return discriminator_collate_fn(batch_data, dis_tokenizer)


# shuffle=False keeps the sample order identical for every evaluated checkpoint.
dataloader = DataLoader(
    dataset,
    batch_size=512,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
    collate_fn=collate_fn,
)

# Evaluate the discriminator of every training cycle on the full test split.
# f1_result / acc_result receive one score per cycle.
pred_result = []
f1_result, acc_result = [], []
for idx in range(11):
    config.cycle = idx
    discriminator = Discriminator(config)
    discriminator.cuda().eval()

    # Gather labels and predictions for the WHOLE split and score them once.
    # F1 is not additive over batches, so the mean of per-batch F1 values is
    # not the F1 of the dataset, and a short final batch would be weighted
    # like a full one in the accuracy mean.
    all_labels, all_preds = [], []
    with torch.no_grad():
        for batch in dataloader:
            logits = discriminator.forward(
                batch['dis_text_input_ids'].cuda(),
                None
            )
            all_preds.extend(torch.argmax(logits, dim=1).tolist())
            # Labels are only needed on the host; no GPU round trip.
            all_labels.extend(batch['labels'].tolist())

    cycle_f1 = f1_score(all_labels, all_preds)
    cycle_acc = accuracy_score(all_labels, all_preds)
    print(cycle_f1)
    f1_result.append(cycle_f1)
    print(cycle_acc)
    acc_result.append(cycle_acc)

    # Hand cached blocks back once per cycle instead of once per batch.
    torch.cuda.empty_cache()

print(f1_result)
print(acc_result)

In [None]:
from torch.nn.utils.rnn import pad_sequence

import sys
sys.path.append('/cognitive_comp/wutong/similarity_generation/')
from data_utlis.sim_data_collate import padding_dis_mask


def dis_pred_collate(batch_data, tokenizer):
    """Collate pre-tokenized discriminator samples into one padded batch.

    Every sample must provide ``input_ids``, ``token_type_ids``,
    ``attention_mask``, ``position_ids``, ``clslabels_mask``, ``label_idx``
    (tensors) plus ``sentence1``, ``sentence2`` and ``label``.

    Padding values: ``tokenizer.pad_token_id`` for ``input_ids``, 0 for
    ``token_type_ids`` and ``position_ids``, -10000 for ``clslabels_mask``.
    The attention masks are padded by ``padding_dis_mask`` to the longest
    first dimension in the batch (its exact output layout is defined in
    ``data_utlis.sim_data_collate`` -- not visible here).

    Returns:
        dict of the padded tensors, the stacked ``label_idx`` and the
        unmodified python lists ``sentence1``, ``sentence2`` and ``labels``.
    """
    samples = list(batch_data)

    def column(key):
        # Values of one field across the batch, in batch order.
        return [sample[key] for sample in samples]

    masks = column('attention_mask')
    longest = max((mask.size(0) for mask in masks), default=0)

    return {
        "input_ids": pad_sequence(column('input_ids'), batch_first=True,
                                  padding_value=tokenizer.pad_token_id),
        "token_type_ids": pad_sequence(column('token_type_ids'), batch_first=True,
                                       padding_value=0),
        "attention_mask": padding_dis_mask(masks, longest),
        "position_ids": pad_sequence(column('position_ids'), batch_first=True,
                                     padding_value=0),
        "clslabels_mask": pad_sequence(column('clslabels_mask'), batch_first=True,
                                       padding_value=-10000),
        'label_idx': torch.stack(column('label_idx')),
        'sentence1': column('sentence1'),
        'sentence2': column('sentence2'),
        'labels': column('label'),
    }

In [None]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

import datasets, torch
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from sklearn.metrics import f1_score, accuracy_score

from data_utlis.sim_gen_dataset import SimGanDataset, preprocess
from model_utils.sim_gen_model import Discriminator


class Config:
    """Settings for the BUSTM evaluation run, handed to ``Discriminator(config, tokenizer)``.

    Field semantics beyond what this notebook reads (``data_path``,
    ``dis_model_path``, ``cycle``) are defined by the model code, which is
    not visible here.
    """

    # --- run state -------------------------------------------------------
    warm_up_model = True
    cycle = 0  # overwritten by the evaluation loop for every round

    # --- inputs ----------------------------------------------------------
    dis_model_path = '/cognitive_comp/wutong/source/model_base/pretrained_zh/macbert_large_mc'
    data_path = '/cognitive_comp/wutong/source/sim_data/raw_data/bustm'

    # --- checkpoints -----------------------------------------------------
    ckpt_model_path = '/cognitive_comp/wutong/similarity_generation/experiments/lightning_logs/checkpoints'
    dis_ckpt_path = '/cognitive_comp/wutong/finetune_large.bin'
    
config = Config()
dis_tokenizer = AutoTokenizer.from_pretrained(config.dis_model_path)

# Public test split: load the JSON file, run the project-level preprocess step,
# then wrap it in the project's dataset class in test mode.
test_data = datasets.Dataset.from_json(config.data_path + '/test_public.json').map(preprocess)
test_dataset = SimGanDataset(data=test_data, tokenizer=dis_tokenizer, test=True)


def collate_fn(batch_data):
    """One-argument collate for DataLoader that binds the notebook-level tokenizer."""
    return dis_pred_collate(batch_data, dis_tokenizer)


# shuffle=False keeps the sample order identical for every evaluated checkpoint.
dataloader = DataLoader(
    test_dataset,
    batch_size=512,
    shuffle=False,
    num_workers=4,
    pin_memory=True,
    collate_fn=collate_fn,
)

# Evaluate the discriminator of every training cycle on the full test split.
# f1_result / acc_result receive one score per cycle.
pred_result = []
f1_result, acc_result = [], []
for idx in range(4):
    config.cycle = idx
    discriminator = Discriminator(config, dis_tokenizer)
    discriminator.cuda().eval()

    # Start from empty lists for every cycle. Creating them once outside the
    # loop made each cycle's score include the predictions of all previous
    # cycles.
    all_labels, all_preds = [], []
    with torch.no_grad():
        for batch in dataloader:
            prob = discriminator.forward(
                input_ids=batch['input_ids'].cuda(),
                attention_mask=batch['attention_mask'].cuda(),
                token_type_ids=batch['token_type_ids'].cuda(),
                position_ids=batch['position_ids'].cuda(),
                clslabels_mask=batch['clslabels_mask'].cuda(),
                bt_label_idx=batch['label_idx'].cuda()
            )

            all_labels.extend(batch['labels'])
            all_preds.extend(torch.argmax(prob, dim=-1).tolist())

    cycle_f1 = f1_score(all_labels, all_preds)
    cycle_acc = accuracy_score(all_labels, all_preds)
    print(cycle_f1)
    f1_result.append(cycle_f1)
    print(cycle_acc)
    acc_result.append(cycle_acc)

    # Hand cached blocks back once per cycle instead of once per batch.
    torch.cuda.empty_cache()

print(f1_result)
print(acc_result)

In [None]:
# roberta = paws:[0.8997369931329687] [0.903875942887931]
#          mrpc = [0.7673407401588641] [0.6276403356481481]
# xlnet = paws:[0.47952997670125214] [0.5578023976293103] // does not work even with es=3
#        mrpc = [0.7941365210413075] [0.659014343584656]
# bert = paws:[0.9213888861666291] [0.9253771551724138]
#       mrpc = [0.8156460192358714] [0.7269112723214286]
# electra = paws:[0.9214410094402226] [0.9246868265086207]
#          mrpc = [0.7939015098372849] [0.658526062334656]
# xlm-roberta = paws:[0.905323910226371] [0.9117894665948276]
#              mrpc = [0.8426940116719521] [0.7911861359126984]
# albert = paws:[0.9604555945805979] [0.963934536637931]
#         mrpc = [0.8893473320445198] [0.8592251570767195]

In [None]:
# electra = afqmc:[0.545895310635966] [0.7228772095959596]
#          chip = [0.849673048762821] [0.8430739182692307]
#          qqp = [0.7180762308328926] [0.781982421875]
# macbert = afqmc:[0.6111182068285741] [0.7516611426767676]
#          chip:[0.8593980022419215] [0.8554875300480769]
#          qqp = [0.7294041926803511] [0.781982421875]
# roformer = afqmc:[0.6418719522930292] [0.7750946969696969]
#          chip:[0.8415617903407737] [0.8429987980769231]
#          qqp = [0.765667422645637] [0.813232421875]
# structbert = afqmc:[0.6056075382357595] [0.7497001262626263]
#             chip:[0.8516500083224742] [0.8484825721153846]
#             qqp:[0.7732818709313707] [0.813720703125]
# xlnet = afqmc:[0.5031211435150875] [0.6857796717171717]
#        chip:[0.8296630108013876] [0.8177396334134616]
#        qqp:[0.6496849405633153] [0.73095703125]
# albert = afqmc:[0.5686514435362098] [0.7508680555555556]
#         chip:[0.8632537912808637] [0.8587552584134616]
#         qqp:[0.6600652114652226] [0.747802734375]
# roberta = afqmc:[0.5729197585634695] [0.7502130681818182]
#          chip:[0.8692697270032463] [0.8669057992788461]
#          qqp:[0.7458100776127146] [0.7978515625]

#### 生成csv

In [None]:
import pandas as pd
 
# Flat list of hand-collected values to export. The literal 0 entries split the
# list into runs of 9, 11, 11 and 9 values -- presumably separators between four
# per-dataset series; TODO confirm.
y = [0.3450724267046834, 0.43526217574723775, 0.33945017097133845, 0.3619290824702441, 0.36383696270018284, 
0.39280227933930073, 0.36088015890524094, 0.3230967660316284, 0.2617301088737617, 0, 0.18178095085881757, 0.0892337549256072, 0.07488227572939889, 0.07151414141036311, 0.07504186251877931, 
0.05583403570053498, 0.05289366441930157, 0.03416507607136132, 0.033453614692229725, 0.024714469986622287, 
0.02888675736874282, 0, 0.2189011242828891, 0.14067707286676562, 0.2031573829354798, 0.12241785405091402, 0.13518911602506856, 
0.12585420520606475, 0.0899325954908943, 0.12480431998217825, 0.12156096303645424, 0.090277737478651, 
0.050300214886722844, 0, 0.3523108767950005, 0.11316448387455255, 0.030861984236866754, 0.021433970858802387, 0.011469380308480093, 
0.012914158734129873, 0.016765391879763763, 0.012811650941711477, 0.02138641281263294]
# Single unnamed column; to_csv also writes the default integer index.
test = pd.DataFrame(data=y)
print(test)
# Written to the current working directory with GBK encoding.
test.to_csv('test.csv',encoding='gbk')

In [None]:
#encoding=utf-8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
 
def _plot_f1_panel(ax, title, selective_trust, fully_trust, no_label):
    """Draw the three per-round F1 curves of one dataset on ``ax``.

    Args:
        ax: matplotlib Axes to draw on.
        title: panel title (the dataset name).
        selective_trust: F1 per round, plotted as 'Proposed Method'.
        fully_trust: F1 per round, plotted as 'No Selection Mechanism'.
        no_label: F1 per round, plotted as 'No Pretraining Data'.

    The x axis is the round index 0..len(values)-1 of each curve.
    """
    curves = (
        (selective_trust, dict(label='Proposed Method', color='red', linestyle='-', marker='o')),
        (fully_trust, dict(label='No Selection Mechanism', color='blue', linestyle=':', marker='*')),
        (no_label, dict(label='No Pretraining Data', color='green', linestyle='--', marker='^')),
    )
    for values, look in curves:
        ax.plot(np.arange(len(values)), values,
                markerfacecolor='black', markersize=10, **look)

    # Axis titles and ticks
    ax.set_title(title, color='black', fontsize=25)
    ax.set_xlabel('Round Nums', color='black', fontsize=15)
    ax.set_ylabel('F1 Score', color='black', fontsize=15)
    ax.tick_params(axis='both', labelcolor='black', labelsize=15, width=3, color='black')

    # Horizontal grid lines only, then the legend
    ax.yaxis.grid(True, linestyle='-.')
    ax.legend(loc='best', fontsize=15)


# One entry per panel: (title, selective_trust, fully_trust, no_label).
# AFQMC and MRPC have 9 rounds, CHIP-STS and Chinese-QQP have 11.
f1_panels = [
    ('AFQMC',
     np.array([0.38254154474255286, 0.3960984181095649, 0.44978116229809234, 0.45989012915572847, 0.45709906383664045,
               0.4801106211003655, 0.504149500468291, 0.506782448928266, 0.5136386072545487]),
     np.array([0.38254154474255286, 0.46877269447755715, 0.46909745565333594, 0.4692954930270959, 0.46966238165481183,
               0.46911981674991676, 0.4690417776913679, 0.47011498891802805, 0.4687900941179161]),
     np.array([0.38254154474255286, 0.4064452662911879, 0.44236206320170124, 0.47367297728223673, 0.4850168321537159,
               0.5012746902756866, 0.5037796076160921, 0.5026058825825503, 0.5021541312154869])),
    ('CHIP-STS',
     np.array([0.5881660637636877, 0.6289374431681133, 0.6723658508141568, 0.713794292460992, 0.7145139426994398,
               0.740641428621894, 0.7408486799912206, 0.7665794132256334, 0.7630026406183327, 0.7666930433539052,
               0.774214486289817]),
     np.array([0.5881660637636877, 0.6636617726976335, 0.6642206660203174, 0.6639590979377188, 0.6641740286236856,
               0.6642811330314553, 0.6640371291044285, 0.6638241591952845, 0.6638179210213015, 0.6642828177708464,
               0.6640151542979859]),
     np.array([0.5881660637636877, 0.6358490369533671, 0.6480864578022787, 0.651020791966919, 0.6564546334248053,
               0.6734860331918114, 0.6902199014502423, 0.6888101552952934, 0.688709687421092, 0.6940691606531408,
               0.6962635170769439])),
    ('Chinese-QQP',
     np.array([0.5787733523556429, 0.600761073931724, 0.5857334315782603, 0.6030196371638206, 0.613082687657,
               0.6446848063163415, 0.664392721223111, 0.6387986670145489, 0.6545834340943931, 0.6807849710717174,
               0.7050992611635148]),
     np.array([0.5787733523556429, 0.6374456960733875, 0.6312069203979489, 0.6361256846512542, 0.6368882523259844,
               0.6334402436786893, 0.6385384809511449, 0.634539659014934, 0.632161278274643, 0.6362743578965781,
               0.6344880601932359]),
     np.array([0.5787733523556429, 0.5904298590861632, 0.6394423836030231, 0.6477267406003373, 0.6715233623036275,
               0.66688813607648, 0.6831579329962527, 0.6851558182653499, 0.6811772008040317, 0.676038318180333,
               0.6885035291405497])),
    ('MRPC',
     np.array([0.685425149429463, 0.7547010093663151, 0.7662849329866745, 0.8300382835835954, 0.8389952050899818,
               0.8424136200685073, 0.8450697319911847, 0.8432245404107888, 0.8461271960949839]),
     np.array([0.685425149429463, 0.7955473204511441, 0.7939015098372849, 0.7939015098372849, 0.7939015098372849,
               0.7941354097281316, 0.7941354097281316, 0.7941354097281316, 0.7941354097281316]),
     np.array([0.685425149429463, 0.6818982417546661, 0.7375619526379604, 0.8209849067032067, 0.834055656344813,
               0.8345211380648245, 0.8368184565131172, 0.8292048620492494, 0.8321404582284176])),
]

fig, axes = plt.subplots(ncols=4, figsize=(28,7))  # , nrows=2
for ax, (title, selective_trust, fully_trust, no_label) in zip(axes, f1_panels):
    _plot_f1_panel(ax, title, selective_trust, fully_trust, no_label)

# NOTE(review): the KL-divergence cell saves to the same './filename.svg', so
# running both cells leaves only the later figure on disk.
plt.savefig('./filename.svg', format='svg')
plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
 
fig, ax = plt.subplots(figsize=(10,10))

# Round indices: 9 rounds for AFQMC / MRPC, 11 rounds for CHIP-STS / Chinese-QQP.
x1 = np.arange(9)
x2 = np.arange(11)

# Per-round values, plotted on the 'KL Divergence' axis below.
afqmc = np.array([0.28654564444219904, 0.35972071429313923, 0.29298804289042746, 0.3306807832380508, 0.3324986684131944,
                  0.3645114140726299, 0.3424742080541955, 0.3140620108209938, 0.2628001582167687])
chip = np.array([0.16206089395187703, 0.09569190730008093, 0.09556990179221979, 0.09420757335900724, 0.10161252788187716,
                 0.09115370391518327, 0.08399135543848252, 0.07265377248154187, 0.07835081026023685, 0.0701899316398505,
                 0.0809814560033198])
qqp = np.array([0.18226184063713147, 0.121416109122202, 0.17365331953532936, 0.10583140891885817, 0.11715967163923266,
                0.11437480329256752, 0.08495643617900697, 0.11638218546801168, 0.11427012829847398, 0.08989371306790174,
                0.05884738531972761])
mrpc = np.array([0.22290342796420745, 0.07787502813333516, 0.02161599728930512, 0.01768073187058273, 0.011070382003902497,
                 0.0153173071317978, 0.01596686939883061, 0.014495716325335358, 0.021032058399025413])

# (x values, y values, per-curve look); marker face and size are shared.
kl_curves = (
    (x1, afqmc, dict(label='AFQMC', color='orange', linestyle='-.', marker='s')),
    (x2, chip, dict(label='CHIP-STS', color='blue', linestyle=':', marker='*')),
    (x2, qqp, dict(label='Chinese-QQP', color='green', linestyle='--', marker='^')),
    (x1, mrpc, dict(label='MRPC', color='red', linestyle='-', marker='o')),
)
for rounds, values, look in kl_curves:
    ax.plot(rounds, values, markerfacecolor='black', markersize='10', **look)

# Axis labels and ticks
ax.set_xlabel('Round Nums', color='black', fontsize=20)
ax.set_ylabel('KL Divergence', color='black', fontsize=20)
ax.tick_params(axis='both', labelcolor='black', labelsize=20, width=3, color='black')

# Horizontal grid lines only
# ax.grid(True, linestyle='-.')
ax.yaxis.grid(True, linestyle='-.')
# Legend
legend = ax.legend(loc='best', fontsize=20)

plt.savefig('./filename.svg', format='svg')
plt.show()

#### 处理数据集(json->datasets)

##### 处理AFQMC数据集

In [None]:
import datasets

# Convert the AFQMC test JSON file into an on-disk Arrow dataset.
path = '/cognitive_comp/wutong/source/sim_data/raw_data/AFQMC/afqmc_test.json'
hf_cache_dir = '/cognitive_comp/wutong/source/data_base/huggingface-cache'

# No "score" column is declared for this split; only the sentence pair and a row id.
feats = datasets.Features(
    {
        "text1": datasets.Value('string'),
        "text2": datasets.Value('string'),
        # "score": datasets.Value('int8'),
        "id": datasets.Value('int64'),
    }
)

# The json loader puts everything from data_files into the 'train' split.
loaded = datasets.load_dataset('json', data_files=path, cache_dir=hf_cache_dir, features=feats)
ds = loaded['train']
ds.save_to_disk('/cognitive_comp/wutong/source/sim_data/similarity_data/afqmc_test')

In [None]:
import datasets

# Append the AFQMC training set to the existing labeled pool and persist the union.
labeled_path, afqmc_path = (
    '/cognitive_comp/wutong/source/data_base/similarity_data/labeled_data',
    '/cognitive_comp/wutong/source/data_base/similarity_data/afqmc_train',
)
label_ds, afqmc_ds = (datasets.load_from_disk(src) for src in (labeled_path, afqmc_path))

# Row order of the result: all rows of label_ds, then all rows of afqmc_ds.
label_afqmc_ds = datasets.concatenate_datasets([label_ds, afqmc_ds])
label_afqmc_ds.save_to_disk('/cognitive_comp/wutong/source/data_base/similarity_data/labeled_afqmc_ds')

In [None]:
import datasets, json
from tqdm import tqdm

# Dump every AFQMC sentence (train + test, shuffled with a fixed seed) as one
# JSON object per line: {"sentence": ...}.
train_afqmc = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/sim_train_data/afqmc_train_ds')
# dev_afqmc = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/sim_test_data/afqmc')
test_afqmc = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/similarity_data/afqmc_test')
afqmc = datasets.concatenate_datasets([train_afqmc, test_afqmc])
afqmc = afqmc.shuffle(seed=42)

# Explicit UTF-8: ensure_ascii=False writes raw non-ASCII text, so the file
# encoding must not depend on the machine's locale.
with open('/cognitive_comp/wutong/source/sim_data/similarity_data/afqmc_train_ds.json', 'w',
          encoding='utf-8') as wp:
    for idx in tqdm(range(afqmc.num_rows)):
        row = afqmc[idx]  # fetch the row once instead of once per field
        for key in ('text1', 'text2'):
            wp.write(json.dumps({'sentence': row[key]}, ensure_ascii=False) + '\n')
# The with-block flushes and closes the file on exit.

In [None]:
# Re-load the one-sentence-per-line AFQMC dump and persist it as an Arrow dataset.
sentence_json = '/cognitive_comp/wutong/source/sim_data/similarity_data/afqmc_train_ds.json'
hf_cache_dir = '/cognitive_comp/wutong/source/data_base/huggingface-cache'

# The json loader puts everything from data_files into the 'train' split.
loaded = datasets.load_dataset('json', data_files=sentence_json, cache_dir=hf_cache_dir)
ds = loaded['train']
ds.save_to_disk('/cognitive_comp/wutong/source/sim_data/predict_sentences/afqmc_sentence')

##### 处理QQP数据集

In [None]:
import datasets, glob
from concurrent.futures import ProcessPoolExecutor

# Schema shared by every translated shard: a sentence pair plus an int8 score.
feats = datasets.Features(
    {
        "text1": datasets.Value('string'),
        "text2": datasets.Value('string'),
        "score": datasets.Value('int8'),
    }
)


def _generate_cache_arrow(index, path):
    """Load one JSON shard with the ``feats`` schema and save it as an Arrow dataset.

    Args:
        index: shard number; the output directory is named ``0{index}``.
        path: JSON file to load.

    Returns:
        A status string reporting that the shard was saved.
    """
    print('saving dataset shard {}'.format(index))
    loaded = datasets.load_dataset(
        'json',
        data_files=path,
        cache_dir='/cognitive_comp/wutong/source/data_base/huggingface-cache',
        features=feats,
    )
    shard_dir = os.path.join(
        '/cognitive_comp/wutong/source/sim_data/translate_data/translate_cache_data',
        f'0{index}',
    )
    loaded['train'].save_to_disk(shard_dir)
    return 'saving dataset shard {} done'.format(index)


def generate_cache_arrow(num_proc=1) -> None:
    """Convert every translated JSON shard to an Arrow dataset in worker processes.

    Args:
        num_proc: number of worker processes in the pool.

    Each file found under ``translate_json_data`` is submitted to
    ``_generate_cache_arrow`` together with its position in the sorted file
    list; the status string of every job is printed once all jobs finished.
    A failing job re-raises its exception here via ``future.result()``.
    """
    # sorted(): glob returns paths in arbitrary order, which would make the
    # shard-number -> source-file mapping change from run to run.
    data_dict_paths = sorted(
        glob.glob('/cognitive_comp/wutong/source/sim_data/translate_data/translate_json_data/*'))
    print(data_dict_paths)

    # The context manager waits for all jobs and shuts the pool down even if
    # submitting one of them raises.
    with ProcessPoolExecutor(max_workers=num_proc) as pool:
        futures = [pool.submit(_generate_cache_arrow, index, path)
                   for index, path in enumerate(data_dict_paths)]

    for future in futures:
        print(future.result(), flush=True)


generate_cache_arrow()

In [None]:
import glob
# Merge all cached Arrow shards into a single QQP dataset.
# sorted(): glob order is arbitrary, so without it the row order of the merged
# dataset could differ between runs.
cache_dict_paths = sorted(
    glob.glob('/cognitive_comp/wutong/source/sim_data/translate_data/translate_cache_data/*'))
sim_ds_list = [datasets.load_from_disk(shard_path) for shard_path in cache_dict_paths]
sim_dataset = datasets.concatenate_datasets(sim_ds_list)
sim_dataset.save_to_disk('/cognitive_comp/wutong/source/sim_data/translate_data/qqp_data')

In [None]:
# Split the merged QQP data 60/40 into train and test and persist both parts.
sim_dataset = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/translate_data/qqp_data')
# Fixed seed (the same value the shuffles in this notebook use) so that
# re-running the cell reproduces the same split.
split_ds = sim_dataset.train_test_split(test_size=0.4, seed=42)
split_ds['train'].save_to_disk('/cognitive_comp/wutong/source/sim_data/sim_train_data/qqp_train_ds')
split_ds['test'].save_to_disk('/cognitive_comp/wutong/source/sim_data/sim_test_data/qqp')

In [None]:
import datasets, json
from tqdm import tqdm

# Dump every sentence of the QQP training split as one JSON object per line:
# {"sentence": ...}.
train_qqp = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/sim_train_data/qqp_train_ds')

# Explicit UTF-8: ensure_ascii=False writes raw non-ASCII text, so the file
# encoding must not depend on the machine's locale.
with open('/cognitive_comp/wutong/source/sim_data/similarity_data/qqp_train_data.json', 'w',
          encoding='utf-8') as wp:
    for idx in tqdm(range(train_qqp.num_rows)):
        row = train_qqp[idx]  # fetch the row once instead of once per field
        for key in ('text1', 'text2'):
            wp.write(json.dumps({'sentence': row[key]}, ensure_ascii=False) + '\n')
# The with-block flushes and closes the file on exit.

In [None]:
# Re-load the one-sentence-per-line QQP dump and persist it as an Arrow dataset.
# The json loader puts everything from data_files into the 'train' split.
ds = (datasets.load_dataset('json', data_files='/cognitive_comp/wutong/source/sim_data/similarity_data/qqp_train_data.json',
                            cache_dir='/cognitive_comp/wutong/source/data_base/huggingface-cache')['train'])
# The target is a single literal path; wrapping it in os.path.join() added
# nothing and made this cell depend on `os` being imported by another cell.
ds.save_to_disk('/cognitive_comp/wutong/source/sim_data/predict_sentences/qqp_sentence')

##### 处理CHIP数据集

In [None]:
import json
from tqdm import tqdm


# Rewrite the raw CHIP test file (a single JSON document) as one JSON object
# per line that keeps only the sentence pair.
# Explicit UTF-8 on both sides: JSON text is UTF-8, and ensure_ascii=False
# writes raw non-ASCII characters, so neither may depend on the locale.
with open('/cognitive_comp/wutong/source/sim_data/raw_data/test.json', 'r', encoding='utf-8') as rp:
    data = json.load(rp)

with open('/cognitive_comp/wutong/source/sim_data/raw_data/chip_test.json', 'w', encoding='utf-8') as wp:
    for item in tqdm(data):
        wp.write(json.dumps({'text1': item['text1'],
                             'text2': item['text2'],
                            #  'score': item['label']
                             },
                            ensure_ascii=False) + '\n')
# Both with-blocks close their files on exit.

In [None]:
import datasets

# Convert the CHIP test JSON file into an on-disk Arrow dataset.
# NOTE(review): the conversion cell writes 'raw_data/chip_test.json' while this
# cell reads from 'raw_data/CHIP/chip_test.json' -- confirm the file is moved
# in between.
path = '/cognitive_comp/wutong/source/sim_data/raw_data/CHIP/chip_test.json'
hf_cache_dir = '/cognitive_comp/wutong/source/data_base/huggingface-cache'

# No "score" column is declared for this split; only the sentence pair.
feats = datasets.Features(
    {
        "text1": datasets.Value('string'),
        "text2": datasets.Value('string'),
        # "score": datasets.Value('int8')
    }
)

# The json loader puts everything from data_files into the 'train' split.
loaded = datasets.load_dataset('json', data_files=path, cache_dir=hf_cache_dir, features=feats)
ds = loaded['train']
ds.save_to_disk('/cognitive_comp/wutong/source/sim_data/similarity_data/chip_test')

In [None]:
import datasets, json
from tqdm import tqdm

# Dump every CHIP sentence (train + test, shuffled with a fixed seed) as one
# JSON object per line: {"sentence": ...}.
train_chip = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/sim_train_data/chip_train_ds')
# dev_chip = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/sim_test_data/chip')
test_chip = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/similarity_data/chip_test')
chip = datasets.concatenate_datasets([train_chip, test_chip])
chip = chip.shuffle(seed=42)

# Explicit UTF-8: ensure_ascii=False writes raw non-ASCII text, so the file
# encoding must not depend on the machine's locale.
with open('/cognitive_comp/wutong/source/sim_data/similarity_data/chip_train_ds.json', 'w',
          encoding='utf-8') as wp:
    for idx in tqdm(range(chip.num_rows)):
        row = chip[idx]  # fetch the row once instead of once per field
        for key in ('text1', 'text2'):
            wp.write(json.dumps({'sentence': row[key]}, ensure_ascii=False) + '\n')
# The with-block flushes and closes the file on exit.

In [None]:
# Re-load the one-sentence-per-line CHIP dump and persist it as an Arrow dataset.
sentence_json = '/cognitive_comp/wutong/source/sim_data/similarity_data/chip_train_ds.json'
hf_cache_dir = '/cognitive_comp/wutong/source/data_base/huggingface-cache'

# The json loader puts everything from data_files into the 'train' split.
loaded = datasets.load_dataset('json', data_files=sentence_json, cache_dir=hf_cache_dir)
ds = loaded['train']
ds.save_to_disk('/cognitive_comp/wutong/source/sim_data/predict_sentences/chip_sentence')

##### 处理OPPO数据集

In [None]:
import json
from tqdm import tqdm

# test, train, dev: 50000 167173 10000
# Split the raw OPPO file (one JSON document with 'train' / 'dev' / 'test'
# lists) into three files with one JSON object per line.
# NOTE(review): 'oppp.json' looks like a typo for 'oppo.json' -- left unchanged
# because the file name on disk cannot be checked from here.
with open('/cognitive_comp/wutong/source/sim_data/raw_data/oppp.json', 'r', encoding='utf-8') as rp:
    data = json.load(rp)

# (split name, whether the split's 'label' is written out as 'score');
# the test split is written without a score.
for split, with_label in (('train', True), ('dev', True), ('test', False)):
    out_path = f'/cognitive_comp/wutong/source/sim_data/raw_data/oppo_{split}.json'
    # Explicit UTF-8: ensure_ascii=False writes raw non-ASCII text, so the
    # file encoding must not depend on the machine's locale.
    with open(out_path, 'w', encoding='utf-8') as wp:
        for item in tqdm(data[split]):
            record = {'text1': item['q1'], 'text2': item['q2']}
            if with_label:
                record['score'] = item['label']
            wp.write(json.dumps(record, ensure_ascii=False) + '\n')

In [None]:
import datasets

# Convert the OPPO test JSON file into an on-disk Arrow dataset.
path = '/cognitive_comp/wutong/source/sim_data/raw_data/oppo_test.json'
hf_cache_dir = '/cognitive_comp/wutong/source/data_base/huggingface-cache'

# No "score" column is declared for this split; only the sentence pair.
feats = datasets.Features(
    {
        "text1": datasets.Value('string'),
        "text2": datasets.Value('string'),
        # "score": datasets.Value('int8')
    }
)

# The json loader puts everything from data_files into the 'train' split.
loaded = datasets.load_dataset('json', data_files=path, cache_dir=hf_cache_dir, features=feats)
ds = loaded['train']

# Alternative targets used for the train / dev conversions:
# ds.save_to_disk('/cognitive_comp/wutong/source/sim_data/sim_train_data/oppo_train')
# ds.save_to_disk('/cognitive_comp/wutong/source/sim_data/sim_test_data/oppo')
ds.save_to_disk('/cognitive_comp/wutong/source/sim_data/similarity_data/oppo_test')

In [None]:
import datasets, json
from tqdm import tqdm

# Dump every OPPO sentence (train + dev + test, shuffled with a fixed seed) as
# one JSON object per line: {"sentence": ...}.
train_oppo = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/sim_train_data/oppo_train_ds')
dev_oppo = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/sim_test_data/oppo')
test_oppo = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/similarity_data/oppo_test')
oppo = datasets.concatenate_datasets([train_oppo, dev_oppo, test_oppo])
oppo = oppo.shuffle(seed=42)

# Explicit UTF-8: ensure_ascii=False writes raw non-ASCII text, so the file
# encoding must not depend on the machine's locale.
with open('/cognitive_comp/wutong/source/sim_data/similarity_data/oppo_train_ds.json', 'w',
          encoding='utf-8') as wp:
    for idx in tqdm(range(oppo.num_rows)):
        row = oppo[idx]  # fetch the row once instead of once per field
        for key in ('text1', 'text2'):
            wp.write(json.dumps({'sentence': row[key]}, ensure_ascii=False) + '\n')
# The with-block flushes and closes the file on exit.

In [None]:
# Re-load the one-sentence-per-line OPPO dump and persist it as an Arrow dataset.
sentence_json = '/cognitive_comp/wutong/source/sim_data/similarity_data/oppo_train_ds.json'
hf_cache_dir = '/cognitive_comp/wutong/source/data_base/huggingface-cache'

# The json loader puts everything from data_files into the 'train' split.
loaded = datasets.load_dataset('json', data_files=sentence_json, cache_dir=hf_cache_dir)
ds = loaded['train']
ds.save_to_disk('/cognitive_comp/wutong/source/sim_data/predict_sentences/oppo_sentence')  # 454346

##### 处理PAWS数据集

In [None]:
import json
import pandas as pd
from tqdm import tqdm

# Rewrite the PAWS test TSV as one JSON object per line with the sentence pair
# taken from the first two columns.
# NOTE(review): the frame is named `paws_train` although it holds the *test*
# file; the name is kept in case other cells refer to it.
paws_train = pd.read_csv('/cognitive_comp/wutong/source/sim_data/raw_data/paws_test.tsv', sep='\t', header=None)

# Explicit UTF-8: ensure_ascii=False writes raw non-ASCII text, so the file
# encoding must not depend on the machine's locale.
with open('/cognitive_comp/wutong/source/sim_data/raw_data/paws_test.json', 'w', encoding='utf-8') as wp:
    # Walk both columns in lock-step instead of chained frame[col][idx] lookups.
    for text1, text2 in tqdm(zip(paws_train[0], paws_train[1]), total=len(paws_train)):
        wp.write(json.dumps({'text1': str(text1),
                             'text2': str(text2),
                            #  'score': int(paws_train[2][idx]),
                             }, ensure_ascii=False) + '\n')
# The with-block closes the file on exit.


In [None]:
import datasets

# Convert the PAWS test JSON file into an on-disk Arrow dataset.
path = '/cognitive_comp/wutong/source/sim_data/raw_data/paws_test.json'
hf_cache_dir = '/cognitive_comp/wutong/source/data_base/huggingface-cache'

# No "score" column is declared for this split; only the sentence pair.
feats = datasets.Features(
    {
        "text1": datasets.Value('string'),
        "text2": datasets.Value('string'),
        # "score": datasets.Value('int8')
    }
)

# The json loader puts everything from data_files into the 'train' split.
loaded = datasets.load_dataset('json', data_files=path, cache_dir=hf_cache_dir, features=feats)
ds = loaded['train']

# Alternative targets used for the train / dev conversions:
# ds.save_to_disk('/cognitive_comp/wutong/source/sim_data/sim_train_data/paws_train')
# ds.save_to_disk('/cognitive_comp/wutong/source/sim_data/sim_test_data/paws')
ds.save_to_disk('/cognitive_comp/wutong/source/sim_data/similarity_data/paws_test')

In [None]:
import datasets, json
from tqdm import tqdm

# Pool the three saved PAWS datasets, shuffle them with a fixed seed, and dump
# every sentence as JSON lines ({"sentence": ...}); text1 and text2 of each pair
# become two consecutive lines.
train_paws = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/sim_train_data/paws_train')
dev_paws = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/sim_test_data/paws')
test_paws = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/similarity_data/paws_test')
paws = datasets.concatenate_datasets([train_paws, dev_paws, test_paws])
paws = paws.shuffle(seed=42)

# encoding='utf-8' is explicit because ensure_ascii=False writes raw non-ASCII
# characters; without it the platform's locale encoding would be used.
with open('/cognitive_comp/wutong/source/sim_data/similarity_data/paws_train_ds.json', 'w',
          encoding='utf-8') as wp:
    for idx in tqdm(range(paws.num_rows)):
        row = paws[idx]  # fetch the row once instead of once per field
        for field in ('text1', 'text2'):
            wp.write(json.dumps({'sentence': row[field]}, ensure_ascii=False) + '\n')
# The `with` block flushes and closes the file on exit, so neither a per-row
# flush() nor a trailing close() is needed.

In [None]:
# Re-read the PAWS JSON-lines sentence file through the `json` dataset builder
# and persist its 'train' split in the on-disk dataset format.
paws_sentence_splits = datasets.load_dataset(
    'json',
    data_files='/cognitive_comp/wutong/source/sim_data/similarity_data/paws_train_ds.json',
    cache_dir='/cognitive_comp/wutong/source/data_base/huggingface-cache',
)
ds = paws_sentence_splits['train']
ds.save_to_disk('/cognitive_comp/wutong/source/sim_data/predict_sentences/paws_sentence')

##### 合并数据集

In [None]:
import datasets, glob


# Merge every dataset found under sim_cache_data with the train+dev data of
# AFQMC, CHIP and OPPO, then save the concatenation as one dataset.
#
# glob.glob() returns matches in arbitrary order, so the paths are sorted to make
# the row order of the merged dataset reproducible across runs and machines.
cache_dict_paths = sorted(glob.glob('/cognitive_comp/wutong/source/sim_data/similarity_data/sim_cache_data/*'))
ds = []
for path in cache_dict_paths:
    ds.append(datasets.load_from_disk(path))

# For each corpus below, the train and dev parts are joined before being added.
afqmc_train = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/sim_train_data/afqmc_train_ds')
afqmc_dev = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/sim_test_data/afqmc')
afqmc = datasets.concatenate_datasets([afqmc_train, afqmc_dev])
ds.append(afqmc)

chip_train = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/sim_train_data/chip_train_ds')
chip_dev = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/sim_test_data/chip')
chip = datasets.concatenate_datasets([chip_train, chip_dev])
ds.append(chip)

oppo_train = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/sim_train_data/oppo_train_ds')
oppo_dev = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/sim_test_data/oppo')
oppo = datasets.concatenate_datasets([oppo_train, oppo_dev])
ds.append(oppo)

print(len(ds))  # how many datasets go into the merge
label_ds = datasets.concatenate_datasets(ds)
label_ds.save_to_disk('/cognitive_comp/wutong/source/sim_data/similarity_data/labeled4paws')

In [None]:
import datasets

# Load the saved dataset; as the last expression of the cell its repr is displayed.
# NOTE(review): 'labeled_train_chip' is written by the train/test split cell that
# comes after this one, so on a fresh top-to-bottom run this path may not exist
# yet -- confirm the intended cell order.
datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/similarity_data/labeled_train_chip')

In [None]:
import datasets

# Hold out 2% of the saved 'labeled4chip' dataset (seeded split) and write the
# train and test parts to their own directories.
labeled_ds = datasets.load_from_disk('/cognitive_comp/wutong/source/sim_data/similarity_data/labeled4chip')
split_data = labeled_ds.train_test_split(seed=42, test_size=0.02)
for split_name, target_dir in (
    ('train', '/cognitive_comp/wutong/source/sim_data/similarity_data/labeled_train_chip'),
    ('test', '/cognitive_comp/wutong/source/sim_data/similarity_data/labeled_test_chip'),
):
    split_data[split_name].save_to_disk(target_dir)
# Print the full dataset next to the split for a quick size comparison.
print(labeled_ds, split_data)

#### 其他Test

In [None]:
from bert_score import score

# One candidate/reference pair scored with bert_score (lang="zh").
cands = ['我们都曾经年轻过']
refs = ['我们都年少']

# score() yields three values, unpacked here as P, R and F1.
bert_scores = score(cands, refs, lang="zh", verbose=True)
P, R, F1 = bert_scores

# Report the mean of F1 over all pairs.
print(f"System level F1 score = {F1.mean():.3f}")

In [None]:
import requests
# Send two Chinese sentences to the "davae" HTTP endpoint and print what it returns.
# Alternative endpoint, currently disabled:
# url="http://192.168.190.2:6631/davae"
url="http://192.168.52.173:23628/davae"
sents = ['当回首往事的时候，他不会因为虚度年华而悔恨，也不会因为碌碌无为而羞耻', '我心里那高兴劲啊,好像有一股甜滋滋清凉凉的风,掠过我的心头!']
# Earlier, fuller payload kept for reference:
# json={ 'sent_inputs':sents, 'std_scale':0.5, 'delta_z':0.1, 'sent_goodcases':None, 'sent_badcases':None, 'augm_num':1, 'batch_size': 64, 'temperature': 1.0, 'top_k': 0,'top_p': 0.9,'max_out_length':128}
response = requests.post(
    url,
    json={
        'sent_inputs': sents,
        'top_p': 0.95,
        # 'sent_badcases': sents,
        'std_scale': 1.5,
        'augm_num': 2,
    },
    # Without a timeout, requests waits indefinitely on an unresponsive host:
    # allow 10 s to connect and up to 10 min for the service to answer.
    timeout=(10, 600),
)
# Fail here with a clear HTTP error instead of later on a non-JSON body or a
# missing key.
response.raise_for_status()
result = response.json()

print(result['generated_sentence'],  result['time'])

In [None]:
import requests
# Send the same sentence ten times to the "simgen" HTTP endpoint and print the
# original sentences, the generated ones and the reported time.
url="http://192.168.52.151:42345/simgen"
sents = ['第九届新马领导人非正式峰会周二将在布城首相署举行。'] * 10
response = requests.post(
    url,
    json={
        'sent_inputs': sents,
        'top_p': 0.9,
        'repetition_penalty': 1.0,
        'max_out_length': 128,
    },
    # Without a timeout, requests waits indefinitely on an unresponsive host:
    # allow 10 s to connect and up to 10 min for the service to answer.
    timeout=(10, 600),
)
# Fail here with a clear HTTP error instead of later on a non-JSON body or a
# missing key.
response.raise_for_status()
result = response.json()

print(result['origin_sentence'])
print(result['generated_sentence'])
print(result['time'])