In [1]:
# # 移动文件
# import shutil
# shutil.move("data/data101045/words.vector.gz","./")

In [2]:
! pip install -U synonyms

Looking in indexes: https://mirror.baidu.com/pypi/simple/
Requirement already up-to-date: synonyms in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (3.16.0)


In [3]:
! cp 'words.vector.gz' '/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/synonyms/data'

In [4]:
import synonyms

[jieba] default dict file path ../data/vocab.txt
[jieba] default dict file path ../data/vocab.txt
[jieba] load default dict ../data/vocab.txt ...
[jieba] load default dict ../data/vocab.txt ...
>> Synonyms load wordseg dict [/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/synonyms/data/vocab.txt] ... 
>> Synonyms on loading stopwords [/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/synonyms/data/stopwords.txt] ...
[Synonyms] on loading vectors [/opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages/synonyms/data/words.vector.gz] ...


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


## EDA（Easy Data Augmentation）

![EDA3](https://img-blog.csdnimg.cn/50c22b4212714b509ce053ff921d6bdd.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

### 对于训练集中的给定句子，随机选择并执行以下操作之一：
* 同义词替换（SR）：从句子中随机选择 n 个不是停用词的词。 用随机选择的同义词之一替换这些单词中的每一个。
* 随机插入 (RI)：在句子中随机找到一个词，并找出其同义词，且该同义词不是停用词。 将该同义词插入句子中的随机位置。 这样做n次。
* 随机交换（RS）：随机选择句子中的两个单词并交换它们的位置。 这样做n次。
* 随机删除（RD）：以概率 p 随机删除句子中的每个单词。

In [5]:
# 读取停用词表
import random
import re
from random import shuffle
# Load the stop-word list (one entry per line, surrounding whitespace removed).
# The context manager closes the file handle as soon as the set is built,
# instead of leaving an open handle around for the garbage collector.
with open('baidu_stopwords.txt', 'r', encoding='utf8') as stopword_file:
    stop_words = {line.strip() for line in stopword_file}

In [6]:
def get_synonym(word):
    """Return the candidate synonyms of ``word``, excluding ``word`` itself.

    The candidates are the first element of ``synonyms.nearby(word)``
    (presumably the word list of a ``(words, scores)`` pair -- confirm against
    the installed ``synonyms`` version).

    Duplicates are removed while keeping the order in which the library
    returned them. Going through a ``set`` would make the order depend on
    string hash randomisation, so a seeded ``random.choice`` over the result
    would still differ between interpreter runs.

    Args:
        word: the token to look up.

    Returns:
        list: unique candidate words, never containing ``word``.
    """
    candidates = synonyms.nearby(word)[0]
    # dict.fromkeys de-duplicates and preserves first-seen order.
    return [candidate for candidate in dict.fromkeys(candidates) if candidate != word]

def synonym_replacement(words, n):
    """Replace up to ``n`` distinct non-stop-words of ``words`` with a synonym.

    Every occurrence of a chosen word is replaced by the same randomly picked
    synonym. Words for which ``get_synonym`` returns nothing are skipped and do
    not count towards ``n``.

    Args:
        words: list of tokens (the list itself is not modified).
        n: number of distinct words to replace.

    Returns:
        list: the new token list. A synonym that contains spaces ends up as
        several tokens because the result is re-joined and re-split on ' '.
    """
    new_words = words.copy()
    # Unique non-stop-words in first-seen order. An order-preserving dedupe
    # (instead of set()) keeps the seeded shuffle below reproducible across
    # interpreter runs, since set order depends on string hash randomisation.
    random_word_list = list(dict.fromkeys(word for word in words if word not in stop_words))
    random.shuffle(random_word_list)

    num_replaced = 0
    for random_word in random_word_list:
        synonym_words = get_synonym(random_word)
        if len(synonym_words) >= 1:
            synonym = random.choice(synonym_words)
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1

        # Checked after the attempt, so at least one candidate is always tried.
        if num_replaced >= n:
            break

    # Normalise multi-word synonyms ('film star') into separate tokens.
    sentence = ' '.join(new_words)
    new_words = sentence.split(' ')

    return new_words

In [7]:
# 使用eda进行数据增强
def eda(sentence, alpha_sr=0.1, num_aug=9):
    """Generate augmented variants of ``sentence`` via synonym replacement.

    The sentence is segmented with ``synonyms.seg`` and each variant is produced
    by an independent call to ``synonym_replacement``.

    Args:
        sentence: raw input text.
        alpha_sr: fraction of tokens to replace per variant; at least one
            replacement is always requested.
        num_aug: number of augmented sentences to generate.

    Returns:
        list: ``num_aug`` strings whose tokens are joined by single spaces.
    """
    tokens = synonyms.seg(sentence)[0]
    replace_count = max(1, int(alpha_sr * len(tokens)))
    return [' '.join(synonym_replacement(tokens, replace_count)) for _ in range(num_aug)]

In [8]:
# Illustration of a corner case in synonym replacement: the chosen synonym can
# itself contain a space (a multi-word synonym such as 'film star').
sentence = ['in', 'war', 'one', 'must', 'be', 'a', 'good', 'actor']
word = 'actor'
example_synonyms = ['actress', 'film star', 'performer', 'comedian', 'entertainer']
# Right after substitution the multi-word synonym occupies a single list slot ...
new_sentence = ['in', 'war', 'one', 'must', 'be', 'a', 'good', 'film star']
# ... and joining on ' ' then splitting again (as synonym_replacement does before
# returning) turns it into two separate tokens.
new_sentence = ['in', 'war', 'one', 'must', 'be', 'a', 'good', 'film', 'star']

In [9]:
# Demo: augment one news sentence with the defaults (alpha_sr=0.1, num_aug=9);
# the cell output is the list of augmented, space-separated sentences.
eda('9月15日以来，台积电、高通、三星等华为的重要合作伙伴，只要没有美国的相关许可证，都无法供应芯片给华为，而中芯国际等国产芯片企业，也因采用美国技术，而无法供货给华为。目前华为部分型号的手机产品出现货少的现象，若该形势持续下去，华为手机业务将遭受重创。')

['9 月底 15 日 以来 ， GT5316SB0 、 高通 、 三星 等 OPPO 的 重要 合作伙伴 ， 只要 没有 美国 的 相关 许可证 ， 都 无法 供应 芯片 给 OPPO ， 而 中芯国际 等 国产 芯片 跨国公司 ， 也 因 采用 美国 技术 ， 而 无法 供货 给 OPPO 。 目前 OPPO 部分 型号 的 手机 品类 出现 货 缺 的 现象 ， 若 该 形势 持续 下去 ， OPPO 手机 业务 将 遭受 重挫 。',
 '9 月 15 日 以来 ， 台积电 、 高通 、 三星 等 中兴通讯 的 重要 合作伙伴 ， 只要 没有 美国 的 有关 许可证 ， 全都 无法 供货 微处理器 给 中兴通讯 ， 而 中芯国际 等 国产 微处理器 企业 ， 也 因 采用 美国 技术 ， 而 无法 供货 给 中兴通讯 。 目前 中兴通讯 部分 型号 的 手机 产品 出现 货 少 的 现象 ， 若 该 形势 持续 下去 ， 中兴通讯 手机 业务 将 遭致 挫败 。',
 '9 月 15 日 以来 ， 台积电 、 Qualcomm 、 三星 等 华为 的 重要 合作伙伴 ， 只要 没有 美国 的 相关 许可证 ， 都 无法 供货 积体电路 给 华为 ， 而 中芯国际 等 国产 积体电路 企业 ， 也 因 采用 美国 技术开发 ， 而 无法 供货 给 华为 。 目前 华为 部分 型号 的 手机 产品 出现 货品 少 的 现象 ， 若 该 形势 持续增长 下去 ， 华为 手机 产品销售 将 遭受 重创 。',
 '9 月 15 日 以来 ， 富士康 、 高通 、 三星 等 华为 的 重要 合作伙伴 ， 只要 没有 美国 的 有关 营业执照 ， 都 无法 供应 芯片 给 华为 ， 而 中芯国际 等 国产 芯片 企业 ， 也 因 采用 美国 技术 ， 而 无法 供货商 给 华为 。 目前 华为 部分 改进型 的 手机 产品 出现 货 太少 的 乱象 ， 若 该 形势 持续 下去 ， 华为 手机 业务 将 遭受 重创 。',
 '9 月 15 日 以来 ， 台积电 、 德州仪器 、 三星 等 华为 的 重要 合作伙伴 ， 只要 没有 美国 的 相关 许可证 ， 即使 无法 供应 芯片 给 华为 ， 而 中芯国际 等 国产 芯片 跨国公司 ， 也 因 采用 美国 技术 ， 

![UDA5](https://img-blog.csdnimg.cn/9d10da70d1d0467e93ef5bb1267ac87f.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

![在这里插入图片描述](https://img-blog.csdnimg.cn/88a3abe95bbd4e369fe4d085533c9c35.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

In [10]:
from bucket_sampler import SortedSampler, BucketBatchSampler
from EMA import *

In [11]:
import random
import numpy as np
import torch
# Central experiment configuration; every later cell reads its settings from here.
config = dict(
    # --- data and model locations ---
    train_file_path='data/data100821/train.json',
    dev_file_path='data/data100821/dev.json',
    test_file_path='data/data100821/test.json',
    output_path='.',                 # directory checkpoints are written into
    model_path='data/data94445',     # pretrained weights and tokenizer files
    # --- optimisation ---
    batch_size=16,                   # scaled by the GPU count below
    num_epochs=1,
    max_seq_len=64,                  # collators truncate longer inputs to this length
    learning_rate=2e-5,
    weight_decay=0.01,
    # --- bucketed batching ---
    use_bucket=True,
    bucket_multiplier=200,           # passed as bucket_size_multiplier
    # --- UDA ---
    unsup_data_ratio=1.5,            # unsupervised batch size = batch_size * ratio
    uda_softmax_temp=0.4,            # temperature used to sharpen the target distribution
    uda_confidence_threshold=0.8,    # minimum max-probability for an unsupervised example to count
    # --- runtime ---
    device='cuda',                   # downgraded to 'cpu' below when CUDA is unavailable
    n_gpus=0,
    logging_step=400,                # evaluate on dev every this many steps
    ema_start_step=500,
    ema_start=False,                 # runtime flag, flipped by train() once EMA starts
    seed=2021,
)

if torch.cuda.is_available():
    # Keep the per-GPU batch size constant when several GPUs are used.
    config['n_gpus'] = torch.cuda.device_count()
    config['batch_size'] *= config['n_gpus']
else:
    config['device'] = 'cpu'

def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch (CPU and all CUDA devices) RNGs.

    Args:
        seed: integer seed applied to every generator.

    Returns:
        The same ``seed``, so a notebook cell ending in this call displays it.
    """
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all)
    for apply_seed in seeders:
        apply_seed(seed)
    return seed

# Seed the Python, NumPy and PyTorch RNGs from the configured seed.
seed_everything(config['seed'])

2021

In [12]:
! pip install transformers==4.0.1

Looking in indexes: https://mirror.baidu.com/pypi/simple/


In [13]:
from transformers import BertTokenizer
# Load the tokenizer from the same local directory that later supplies the model weights.
tokenizer = BertTokenizer.from_pretrained(config['model_path'])

In [14]:
def build_bert_inputs(inputs, label, sentence_a, sentence_b, tokenizer):
    """Encode one sentence pair and append its features to ``inputs``.

    Args:
        inputs: mapping of feature name -> list (e.g. ``defaultdict(list)``),
            extended in place.
        label: label stored alongside the encoded pair.
        sentence_a: first segment of the pair.
        sentence_b: second segment of the pair.
        tokenizer: object exposing ``encode_plus``.
    """
    encoded = tokenizer.encode_plus(
        sentence_a,
        sentence_b,
        add_special_tokens=True,
        return_token_type_ids=True,
        return_attention_mask=True,
    )
    for field in ('input_ids', 'token_type_ids', 'attention_mask'):
        inputs[field].append(encoded[field])
    inputs['labels'].append(label)

## 对偶数据增强
### a-b对，变成b-a对, 把两个句子换顺序
### 我们的无监督数据增强就是用的对偶数据增强
### BERT 输入 a，b两个句子，现在输入以b,a作为输入，增强样本

In [15]:
import pandas as pd
from tqdm import tqdm
import json
def parse_data(path, data_type='train'):
    """Read a JSON-lines sentence-pair file into a DataFrame.

    Each non-blank line must be a JSON object with ``sentence1`` and
    ``sentence2`` keys and, unless ``data_type == 'test'``, a ``label`` key
    convertible to ``int``. Test rows get a placeholder label of 0.

    Blank lines (for example a trailing newline at the end of the file) are
    skipped rather than handed to ``json.loads``, which would raise on them.

    Args:
        path: path of the JSON-lines file (read as UTF-8).
        data_type: 'train', 'dev' or 'test'; also used in the progress-bar text.

    Returns:
        pandas.DataFrame with columns ``text_a``, ``text_b`` and ``labels``.
    """
    sentence_a = []
    sentence_b = []
    labels = []
    with open(path, 'r', encoding='utf8') as f:
        for line in tqdm(f.readlines(), desc=f'Reading {data_type} data'):
            if not line.strip():
                continue
            record = json.loads(line)
            sentence_a.append(record['sentence1'])
            sentence_b.append(record['sentence2'])
            labels.append(int(record['label']) if data_type != 'test' else 0)
    df = pd.DataFrame({'text_a': sentence_a, 'text_b': sentence_b, 'labels': labels},
                      columns=['text_a', 'text_b', 'labels'])
    return df

In [16]:
def build_unsup_bert_inputs(inputs, label, sentence_a, sentence_b, tokenizer):
    """Encode a sentence pair in both orders and append the paired features.

    For every feature a tuple ``(a-b encoding, b-a encoding)`` is appended, so
    the swapped order can serve as the augmented view of the same example.

    Args:
        inputs: mapping of feature name -> list, extended in place.
        label: label stored alongside the pair.
        sentence_a: first sentence.
        sentence_b: second sentence.
        tokenizer: object exposing ``encode_plus``.
    """
    encode_options = dict(add_special_tokens=True,
                          return_token_type_ids=True,
                          return_attention_mask=True)
    ab_encoded = tokenizer.encode_plus(sentence_a, sentence_b, **encode_options)
    ba_encoded = tokenizer.encode_plus(sentence_b, sentence_a, **encode_options)
    for field in ('input_ids', 'token_type_ids', 'attention_mask'):
        inputs[field].append((ab_encoded[field], ba_encoded[field]))
    inputs['labels'].append(label)

In [17]:
from collections import defaultdict
def read_data(config, tokenizer):
    """Load the train/dev/test files and tokenise them into model inputs.

    Args:
        config: dict providing ``train_file_path``, ``dev_file_path`` and
            ``test_file_path``.
        tokenizer: tokenizer handed to the ``build_*_inputs`` helpers.

    Returns:
        dict with keys ``'train'``, ``'dev'``, ``'test'`` and ``'unsup_data'``.
        Each value maps feature names to lists. Test pairs are stored in both
        orders (a-b followed by b-a), so the test split holds two entries per
        row. ``'unsup_data'`` pools the pairs of all three splits, each stored
        as an (a-b, b-a) tuple.
    """
    data_df = {
        'train': parse_data(config['train_file_path'], data_type='train'),
        'dev': parse_data(config['dev_file_path'], data_type='dev'),
        'test': parse_data(config['test_file_path'], data_type='test'),
    }
    processed_data = {}

    # Shared across splits: the consistency loss does not use labels, so
    # train, dev and test pairs are all pooled here.
    unsup_data = defaultdict(list)

    for data_type, df in data_df.items():
        inputs = defaultdict(list)
        for _, row in tqdm(df.iterrows(), desc=f'Preprocessing {data_type} data', total=len(df)):
            # Access columns by label: integer keys on a string-labelled Series
            # are deprecated positional lookups in recent pandas releases.
            label = 0 if data_type == 'test' else row['labels']
            sentence_a, sentence_b = row['text_a'], row['text_b']
            build_bert_inputs(inputs, label, sentence_a, sentence_b, tokenizer)

            if data_type.startswith('test'):
                # Also store the swapped order for the test split.
                build_bert_inputs(inputs, label, sentence_b, sentence_a, tokenizer)

            build_unsup_bert_inputs(unsup_data, label, sentence_a, sentence_b, tokenizer)

        processed_data[data_type] = inputs

    processed_data['unsup_data'] = unsup_data

    return processed_data

In [18]:
# Tokenise all three splits once; `data` also carries the pooled 'unsup_data' entry.
data = read_data(config, tokenizer)

Reading train data: 100%|██████████| 34334/34334 [00:00<00:00, 259237.08it/s]
Reading dev data: 100%|██████████| 4316/4316 [00:00<00:00, 269488.43it/s]
Reading test data: 100%|██████████| 3861/3861 [00:00<00:00, 265531.05it/s]
Preprocessing train data: 100%|██████████| 34334/34334 [00:43<00:00, 795.18it/s]
Preprocessing dev data: 100%|██████████| 4316/4316 [00:05<00:00, 800.04it/s]
Preprocessing test data: 100%|██████████| 3861/3861 [00:06<00:00, 623.95it/s]


In [19]:
from torch.utils.data import Dataset
class AFQMCDataset(Dataset):
    """Map-style dataset over pre-tokenised sentence pairs.

    ``data_dict`` maps ``'input_ids'``, ``'token_type_ids'``,
    ``'attention_mask'`` and ``'labels'`` to parallel, index-aligned lists.
    """

    _FIELDS = ('input_ids', 'token_type_ids', 'attention_mask', 'labels')

    def __init__(self, data_dict):
        super().__init__()
        self.data_dict = data_dict

    def __getitem__(self, index):
        """Return ``(input_ids, token_type_ids, attention_mask, label)`` for one example."""
        return tuple(self.data_dict[field][index] for field in self._FIELDS)

    def __len__(self):
        """Number of examples, taken from the ``input_ids`` list."""
        return len(self.data_dict['input_ids'])

In [20]:
class Collator:
    """Batch collator that pads (with zeros) or truncates to a common length."""

    def __init__(self, max_seq_len, tokenizer):
        self.max_seq_len = max_seq_len
        self.tokenizer = tokenizer

    def pad_and_truncate(self, input_ids_list, token_type_ids_list,
                         attention_mask_list, labels_list, max_seq_len):
        """Stack variable-length features into ``(batch, max_seq_len)`` long tensors.

        Sequences no longer than ``max_seq_len`` are right-padded with zeros.
        Longer ones are cut to ``max_seq_len``, with the last kept input id
        replaced by the tokenizer's SEP id.

        Returns:
            tuple ``(input_ids, token_type_ids, attention_mask, labels)``.
        """
        batch_size = len(input_ids_list)
        input_ids = torch.zeros((batch_size, max_seq_len), dtype=torch.long)
        token_type_ids = torch.zeros_like(input_ids)
        attention_mask = torch.zeros_like(input_ids)

        for row, ids in enumerate(input_ids_list):
            length = len(ids)
            if length > max_seq_len:
                # Truncate, but keep a trailing SEP token.
                clipped = ids[:max_seq_len - 1] + [self.tokenizer.sep_token_id]
                input_ids[row] = torch.tensor(clipped, dtype=torch.long)
                token_type_ids[row] = torch.tensor(token_type_ids_list[row][:max_seq_len],
                                                   dtype=torch.long)
                attention_mask[row] = torch.tensor(attention_mask_list[row][:max_seq_len],
                                                   dtype=torch.long)
            else:
                # Fill the prefix; the remainder stays zero.
                input_ids[row, :length] = torch.tensor(ids, dtype=torch.long)
                token_type_ids[row, :length] = torch.tensor(token_type_ids_list[row],
                                                            dtype=torch.long)
                attention_mask[row, :length] = torch.tensor(attention_mask_list[row],
                                                            dtype=torch.long)

        labels = torch.tensor(labels_list, dtype=torch.long)
        return input_ids, token_type_ids, attention_mask, labels

    def __call__(self, examples):
        """Collate ``(input_ids, token_type_ids, attention_mask, label)`` tuples into a dict."""
        input_ids_list, token_type_ids_list, attention_mask_list, labels_list = zip(*examples)
        # Pad only to the longest sequence in this batch, capped at the global limit.
        longest = max(map(len, input_ids_list))
        max_seq_len = min(longest, self.max_seq_len)

        tensors = self.pad_and_truncate(input_ids_list, token_type_ids_list,
                                        attention_mask_list, labels_list, max_seq_len)
        names = ('input_ids', 'token_type_ids', 'attention_mask', 'labels')
        return dict(zip(names, tensors))

In [21]:
# Module-level collator instance. Note that build_dataloader creates its own
# Collator, so nothing visible in this notebook uses this global.
collate_fn = Collator(config['max_seq_len'], tokenizer)

In [22]:
class UnsupAFQMCDataset(Dataset):
    """Dataset for UDA: each example holds both orderings of one sentence pair.

    Every feature list in ``data_dict`` stores ``(a-b encoding, b-a encoding)``
    tuples, as produced by ``build_unsup_bert_inputs``.
    """

    _FEATURES = ('input_ids', 'token_type_ids', 'attention_mask')

    def __init__(self, data_dict):
        super().__init__()
        self.data_dict = data_dict

    def __getitem__(self, index):
        """Return the three a-b features, then the three b-a features, then the label."""
        pairs = [self.data_dict[name][index] for name in self._FEATURES]
        label = self.data_dict['labels'][index]
        ab_view = tuple(pair[0] for pair in pairs)
        ba_view = tuple(pair[1] for pair in pairs)
        return ab_view + ba_view + (label,)

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.data_dict['input_ids'])

In [23]:
class UnsupCollator(Collator):
    """Collator for ``UnsupAFQMCDataset``: pads the a-b and b-a views to one length."""

    def __init__(self, max_seq_len, tokenizer):
        super().__init__(max_seq_len, tokenizer)

    def __call__(self, examples):
        """Collate 7-tuples (3 a-b features, 3 b-a features, label) into a dict.

        The target length is derived from the a-b input ids only and applied to
        both views.
        """
        (ab_ids, ab_types, ab_mask,
         ba_ids, ba_types, ba_mask,
         labels_list) = zip(*examples)

        longest_ab = max(map(len, ab_ids))
        max_seq_len = min(longest_ab, self.max_seq_len)

        views = (
            ('ab', (ab_ids, ab_types, ab_mask)),
            ('ba', (ba_ids, ba_types, ba_mask)),
        )
        data_dict = {}
        for prefix, (ids, types, mask) in views:
            padded_ids, padded_types, padded_mask, labels = self.pad_and_truncate(
                ids, types, mask, labels_list, max_seq_len
            )
            data_dict[f'{prefix}_input_ids'] = padded_ids
            data_dict[f'{prefix}_token_type_ids'] = padded_types
            data_dict[f'{prefix}_attention_mask'] = padded_mask
        data_dict['labels'] = labels

        return data_dict

In [24]:
from torch.utils.data import DataLoader, RandomSampler
def build_dataloader(config, data, tokenizer):
    """Create the unsupervised, train, dev and test DataLoaders.

    With ``config['use_bucket']`` set, the train and unsupervised loaders draw
    their batches from a ``BucketBatchSampler`` keyed on input-id length;
    otherwise they are plain shuffled loaders. Dev and test are never shuffled.
    The unsupervised batch size is ``int(batch_size * unsup_data_ratio)``.

    Args:
        config: experiment configuration dict.
        data: output of ``read_data``.
        tokenizer: tokenizer passed to the collators.

    Returns:
        tuple ``(unsup_dataloader, train_dataloader, dev_dataloader, test_dataloader)``.
    """
    sup_batch_size = config['batch_size']
    unsup_batch_size = int(config['batch_size'] * config['unsup_data_ratio'])

    train_dataset = AFQMCDataset(data['train'])
    dev_dataset = AFQMCDataset(data['dev'])
    test_dataset = AFQMCDataset(data['test'])
    unsup_dataset = UnsupAFQMCDataset(data['unsup_data'])

    collate_fn = Collator(config['max_seq_len'], tokenizer)
    unsup_collate_fn = UnsupCollator(config['max_seq_len'], tokenizer)

    def make_training_loader(dataset, batch_size, collator):
        # Shuffled loader; bucketed by the length of the first item field when enabled.
        if config['use_bucket']:
            batch_sampler = BucketBatchSampler(
                RandomSampler(dataset),
                batch_size=batch_size,
                drop_last=False,
                sort_key=lambda idx: len(dataset[idx][0]),
                bucket_size_multiplier=config['bucket_multiplier'],
            )
            return DataLoader(dataset=dataset, batch_sampler=batch_sampler,
                              num_workers=4, collate_fn=collator)
        return DataLoader(dataset=dataset, batch_size=batch_size,
                          shuffle=True, num_workers=4, collate_fn=collator)

    def make_eval_loader(dataset):
        # Deterministic order for dev / test.
        return DataLoader(dataset=dataset, batch_size=sup_batch_size,
                          shuffle=False, num_workers=4, collate_fn=collate_fn)

    train_dataloader = make_training_loader(train_dataset, sup_batch_size, collate_fn)
    unsup_dataloader = make_training_loader(unsup_dataset, unsup_batch_size, unsup_collate_fn)
    dev_dataloader = make_eval_loader(dev_dataset)
    test_dataloader = make_eval_loader(test_dataset)

    return unsup_dataloader, train_dataloader, dev_dataloader, test_dataloader

In [25]:
# Build all four loaders; note the unsupervised loader comes first in the returned tuple.
unsup_dataloader, train_dataloader, dev_dataloader, test_dataloader = build_dataloader(config, data, tokenizer)

In [26]:
# evaluation 
from sklearn import metrics
def evaluation(config, model, val_dataloader):
    """Run ``model`` over ``val_dataloader`` and compute loss, F1 and accuracy.

    The model is switched to eval mode and left there; the caller is
    responsible for calling ``model.train()`` afterwards.

    Args:
        config: dict providing ``device`` and ``n_gpus``.
        model: model whose forward accepts ``mode='val'`` and then returns
            ``(loss, logits, ...)``.
        val_dataloader: loader yielding dicts of tensors including ``'labels'``.

    Returns:
        tuple ``(avg_val_loss, f1, acc)``; the loss is averaged over batches.
    """
    model.eval()
    all_preds, all_labels = [], []
    total_loss = 0.
    progress = tqdm(val_dataloader, desc='Evaluation', total=len(val_dataloader))

    with torch.no_grad():
        for batch in progress:
            all_labels.append(batch['labels'])

            model_inputs = {}
            for name, tensor in batch.items():
                model_inputs[name] = tensor.to(config['device'])
            # 'val' mode makes the model return the loss in front of the logits.
            model_inputs['mode'] = 'val'
            outputs = model(**model_inputs)
            loss, logits = outputs[:2]

            if config['n_gpus'] > 1:
                # Reduce the per-replica losses to a scalar.
                loss = loss.mean()

            total_loss += loss.item()
            all_preds.append(logits.argmax(dim=-1).detach().cpu())

    avg_val_loss = total_loss / len(val_dataloader)
    y_true = torch.cat(all_labels, dim=0).numpy()
    y_pred = torch.cat(all_preds, dim=0).numpy()
    f1 = metrics.f1_score(y_true, y_pred)
    acc = metrics.accuracy_score(y_true, y_pred)
    return avg_val_loss, f1, acc

In [27]:
# 复写 BertForSequenceClassification
from transformers import BertForSequenceClassification
import torch.nn as nn
class BertForAFQMC(BertForSequenceClassification):
    """BERT sequence classifier whose forward only computes a loss in 'val' mode.

    During training the loss is assembled outside the model, so ``forward``
    returns the logits alone unless ``mode == 'val'``.
    """

    def forward(self,
                input_ids,
                token_type_ids,
                attention_mask,
                labels=None,
                mode='train'):
        """Classify a batch of encoded sentence pairs.

        Args:
            input_ids, token_type_ids, attention_mask: encoded inputs.
            labels: gold labels; required when ``mode == 'val'`` and ignored
                otherwise.
            mode: ``'val'`` prepends the mean cross-entropy loss to the output.

        Returns:
            ``(logits,)``, or ``(loss, logits)`` when ``mode == 'val'``.
        """
        encoder_outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            output_hidden_states=True
        )

        # Index 1 of the encoder output is the pooled representation.
        logits = self.classifier(self.dropout(encoder_outputs[1]))

        if mode != 'val':
            return (logits,)

        loss = nn.CrossEntropyLoss()(logits, labels.view(-1))
        return (loss, logits)

![在这里插入图片描述](https://img-blog.csdnimg.cn/88a3abe95bbd4e369fe4d085533c9c35.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

![在这里插入图片描述](https://img-blog.csdnimg.cn/38a2b12d76094f17819ae918b25f3c71.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

![addtitional Training1](https://img-blog.csdnimg.cn/5916fe8ae028469bb877d15a1ac566de.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

### 用于无监督训练数据中
基于置信度的 MASK：实验发现，屏蔽（mask）当前模型不自信的样本很有帮助。总结来说，无监督数据只保留置信度 > $\beta$ 的样本（即模型足够自信的样本）来计算无监督损失；对应代码中 `grad_data` 里的 `ba_unsup_value` 部分，其掩码由 ab 方向预测的最大概率是否超过阈值决定。

![UDA5](https://img-blog.csdnimg.cn/1ddf28077b88449aa84e0391149467e4.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

在半监督学习中，经常会遇到未标记数据量和标记数据量存在巨大差异的情况\
模型通常很快会在标记数据上过拟合，同时在未标记数据欠拟合。\
为了解决这个问题，引入一种技术，训练信号退火（TSA）. 它随着训练的进行逐渐释放。\

这是一种 MASK：只有"正确类别的预测概率 $\le \eta$"的监督样本才参与损失计算。\
$\eta=1$ 代表所有监督数据都参与训练；\
$\eta=0.5$ 代表只有一部分监督数据（模型还不够自信的样本）参与训练。

In [28]:
def get_tsa_threshold(total_steps, global_steps, num_classes=2, scale=5):
    """Training Signal Annealing threshold with the exponential schedule.

    Computes ``alpha * (1 - 1/K) + 1/K`` where
    ``alpha = exp((global_steps / total_steps - 1) * scale)`` and ``K`` is
    ``num_classes``. The threshold therefore rises from roughly ``1/K`` at the
    start of training to exactly 1 at ``global_steps == total_steps``. With
    the defaults this is identical to ``exp((t/T - 1) * 5) / 2 + 0.5``.

    Args:
        total_steps: planned number of optimisation steps (must be positive).
        global_steps: number of steps taken so far.
        num_classes: number of target classes ``K``; defaults to binary.
        scale: sharpness of the exponential schedule.

    Returns:
        The threshold as a NumPy float.

    Raises:
        ValueError: if ``total_steps`` is not positive.
    """
    if total_steps <= 0:
        raise ValueError(f'total_steps must be positive, got {total_steps}')
    alpha = np.exp((global_steps / total_steps - 1) * scale)
    floor = 1 / num_classes
    return alpha * (1 - floor) + floor

![在这里插入图片描述](https://img-blog.csdnimg.cn/b21349f49ba446b698e6f27823983fc5.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

![ce](https://img-blog.csdnimg.cn/52f18386dbea423f846611c558aa24c7.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

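$$CE(p, q) = -\sum_{i=1}^{N} p(x_{i})\log q(x_{i})$$
$$E(p) = -\sum_{i=1}^{N} p(x_{i})\log p(x_{i})$$
p-真实分布（已知） q-预测分布
$$KLDiv(p||q) = \sum_{i=1}^{N}p(x_{i})(\log p(x_{i})- \log q(x_{i}))$$
$$ = \sum_{i=1}^{N} p(x_{i})\log p(x_{i}) - \sum_{i=1}^{N} p(x_{i})\log q(x_{i}) $$
$$ = -\sum_{i=1}^{N} p(x_{i})\log q(x_{i}) - \left(-\sum_{i=1}^{N} p(x_{i})\log p(x_{i})\right)$$
$$ = CE(p, q)-E(p)$$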

![cross entropy](https://img-blog.csdnimg.cn/4572c78d76624c49b01b96a1cba42279.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

![KL](https://img-blog.csdnimg.cn/189e4bc953904c199afbc7e6a11e5d9a.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

![在这里插入图片描述](https://img-blog.csdnimg.cn/88a3abe95bbd4e369fe4d085533c9c35.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

![在这里插入图片描述](https://img-blog.csdnimg.cn/38a2b12d76094f17819ae918b25f3c71.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

In [29]:
# 返回 grad_data：需要计算梯度，需要进行反向传播的数据
# 返回 no_grad_data: 不需要计算梯度，不需要进行反向传播的数据
def get_data(sup_batch, unsup_batch, config):
    """Split one supervised + one unsupervised batch into grad / no-grad inputs.

    The supervised batch and the b-a view of the unsupervised batch are
    right-padded with zeros to a common length and stacked along the batch
    dimension (supervised rows first). The a-b view is passed through
    unchanged and is meant for the gradient-free forward pass.

    Args:
        sup_batch: dict with ``input_ids``, ``token_type_ids``,
            ``attention_mask`` and ``labels`` tensors.
        unsup_batch: dict with the same three features under ``ab_`` and
            ``ba_`` prefixes.
        config: dict providing ``device``.

    Returns:
        tuple ``(grad_data, no_grad_data)`` of dicts of tensors on
        ``config['device']``. ``grad_data['labels']`` covers the supervised
        rows only; ``no_grad_data`` has no labels.
    """
    device = config['device']
    sup_len = sup_batch['input_ids'].size(1)
    unsup_len = unsup_batch['ba_input_ids'].size(1)
    target_len = max(sup_len, unsup_len)

    def right_pad(tensor, current_len):
        # Append zero columns until the sequence dimension equals target_len.
        filler = torch.zeros((tensor.size(0), target_len - current_len), dtype=tensor.dtype)
        return torch.cat([tensor, filler], dim=-1)

    grad_data, no_grad_data = {}, {}
    for name, sup_tensor in sup_batch.items():
        if name == 'labels':
            grad_data[name] = sup_tensor.to(device)
            continue

        ba_tensor = unsup_batch[f'ba_{name}']
        ab_tensor = unsup_batch[f'ab_{name}']

        # Pad whichever side is shorter.
        if sup_len == target_len:
            ba_tensor = right_pad(ba_tensor, unsup_len)
        else:
            sup_tensor = right_pad(sup_tensor, sup_len)

        grad_data[name] = torch.cat([sup_tensor, ba_tensor], dim=0).to(device)
        no_grad_data[name] = ab_tensor.to(device)

    return grad_data, no_grad_data

In [30]:
import torch.nn as nn
# Demo of temperature sharpening on random logits (2 rows, 3 classes).
logits = torch.randn(2,3)
print(logits)
# Plain softmax over the class dimension.
t_softmax = torch.softmax(logits, dim=1)
print(t_softmax)
# Dividing the logits by a temperature below 1 (here 0.4) before the softmax
# pushes each row towards its largest entry, as the printed output shows.
t_sharpen = torch.softmax(logits/0.4, dim=1)
print(t_sharpen)

tensor([[ 2.2871,  0.6413, -0.8615],
        [-0.3649, -0.6931,  0.9023]])
tensor([[0.8092, 0.1561, 0.0347],
        [0.1897, 0.1366, 0.6737]])
tensor([[9.8356e-01, 1.6068e-02, 3.7519e-04],
        [3.9675e-02, 1.7467e-02, 9.4286e-01]])


In [31]:
 # 无监督数据 (ab) 只需要正向传播
def forward_no_grad(no_grad_data, config, model):
    """Gradient-free forward pass producing the UDA targets and confidence mask.

    Args:
        no_grad_data: keyword inputs for ``model`` (the a-b view of the
            unsupervised batch).
        config: dict providing ``uda_softmax_temp`` and
            ``uda_confidence_threshold``.
        model: callable whose first output is the logits.

    Returns:
        tuple ``(unsup_loss_mask, no_grad_probs)``: a float mask that is 1.0
        where the largest sharpened probability strictly exceeds the
        confidence threshold, and the sharpened probabilities themselves.
    """
    temperature = config['uda_softmax_temp']
    threshold = config['uda_confidence_threshold']
    with torch.no_grad():
        logits = model(**no_grad_data)[0]
        # Sharpen: dividing by the temperature before the softmax.
        no_grad_probs = torch.softmax(logits / temperature, dim=-1)
        # Per-example confidence = largest class probability, shape [B].
        confidence = no_grad_probs.max(dim=-1)[0]
        unsup_loss_mask = confidence.gt(threshold).float()
    return unsup_loss_mask, no_grad_probs

![在这里插入图片描述](https://img-blog.csdnimg.cn/b21349f49ba446b698e6f27823983fc5.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3dlaXhpbl80MTI4NzA2MA==,size_16,color_FFFFFF,t_70)

In [32]:
def forward_with_grad(unsup_loss_mask, unsup_probs, config, cur_bs, 
    model, grad_data, total_steps, global_steps):
    """Forward pass with gradients; returns the combined UDA training loss.

    ``grad_data`` holds the supervised rows followed by ``cur_bs`` unsupervised
    (b-a) rows. The supervised part contributes a cross-entropy loss masked by
    the TSA threshold; the unsupervised part contributes a KL consistency loss
    against ``unsup_probs``, masked by ``unsup_loss_mask``.

    Args:
        unsup_loss_mask: float mask over the unsupervised rows.
        unsup_probs: target probabilities for the unsupervised rows.
        config: dict providing ``n_gpus``.
        cur_bs: number of unsupervised rows at the end of the batch.
        model: callable whose first output is the logits.
        grad_data: keyword inputs for ``model``, including ``'labels'``.
        total_steps: planned number of training steps.
        global_steps: steps taken so far.

    Returns:
        tuple ``(loss, tsa_threshold, unsup_loss, sup_loss)``.
    """
    # The threshold grows towards 1 as training progresses, so more and more
    # supervised examples are allowed to contribute.
    tsa_threshold = get_tsa_threshold(total_steps, global_steps)

    logits = model(**grad_data)[0]
    sup_bs = logits.size(0) - cur_bs
    sup_logits, unsup_logits = logits.split([sup_bs, cur_bs])

    # ---------------- supervised loss ----------------
    sup_labels = grad_data['labels'][:sup_bs]
    cross_entropy = nn.CrossEntropyLoss(reduction='none')
    per_example_loss = cross_entropy(sup_logits, sup_labels)

    # Probability assigned to the gold label of each supervised example.
    sup_probs = torch.softmax(sup_logits, dim=-1)
    correct_label_probs = sup_probs.gather(dim=-1, index=sup_labels.view(-1, 1))

    # Keep only examples the model is not yet over-confident about.
    sup_loss_mask = correct_label_probs.le(tsa_threshold).squeeze().float()
    per_example_loss = per_example_loss * sup_loss_mask

    # Average over the kept examples (at least 1 to avoid dividing by zero).
    kept_sup = max(sup_loss_mask.sum(), 1)
    sup_loss = per_example_loss.sum() / kept_sup

    # ---------------- unsupervised loss ----------------
    # KLDivLoss expects log-probabilities as input and probabilities as target.
    unsup_log_probs = torch.log_softmax(unsup_logits, dim=-1)
    kl_divergence = nn.KLDivLoss(reduction='none')
    per_example_kl_loss = kl_divergence(unsup_log_probs, unsup_probs).sum(dim=-1)

    # Drop the examples whose target prediction was not confident enough.
    per_example_kl_loss = per_example_kl_loss * unsup_loss_mask
    kept_unsup = max(unsup_loss_mask.sum(), 1)
    unsup_loss = per_example_kl_loss.sum() / kept_unsup

    # Both terms are summed with equal weight.
    loss = sup_loss + unsup_loss

    # Multi-GPU: reduce to scalars.
    if config['n_gpus'] > 1:
        loss = loss.mean()
        sup_loss = sup_loss.mean()
        unsup_loss = unsup_loss.mean()

    return loss, tsa_threshold, unsup_loss, sup_loss

In [33]:
from transformers import AdamW
from tqdm import trange
import os
def train(config, train_dataloader, dev_dataloader, unsup_dataloader=None):
    """Train ``BertForAFQMC`` with UDA (supervised CE + unsupervised consistency).

    One epoch is one pass over ``unsup_dataloader``; the supervised loader is
    cycled alongside it and restarted whenever it is exhausted. Every
    ``config['logging_step']`` steps the model is evaluated on the dev set and
    a checkpoint is saved when accuracy improves. EMA tracking starts at the
    first logging step at or after ``config['ema_start_step']``; from then on
    evaluation and checkpoints use the EMA (shadow) weights.

    Args:
        config: experiment configuration dict. ``config['ema_start']`` is set
            to True once EMA starts. If it is already True on entry (for
            example when this function is called a second time with the same
            dict), EMA tracking begins immediately.
        train_dataloader: supervised training loader.
        dev_dataloader: validation loader.
        unsup_dataloader: unsupervised loader; required, because it drives the
            training loop.

    Returns:
        tuple ``(model, best_model_path)``: the model with its last-step
        weights, and the directory of the best checkpoint ('' if none saved).

    Raises:
        ValueError: if ``unsup_dataloader`` is None.
    """
    if unsup_dataloader is None:
        raise ValueError('train() requires an unsup_dataloader: it drives the training loop')

    model = BertForAFQMC.from_pretrained(config['model_path'])

    optimizer = AdamW(model.parameters(), lr=config['learning_rate'], weight_decay=config['weight_decay'])
    model.to(config['device'])
    # The unsupervised loader covers train + dev + test pairs, so it defines the step count.
    total_steps = len(unsup_dataloader) * config['num_epochs']
    epoch_iterator = trange(config['num_epochs'])
    global_steps = 0
    train_loss = 0.
    logging_loss = 0.
    best_acc = 0.
    best_model_path = ''

    if config['n_gpus'] > 1:
        model = nn.DataParallel(model)

    def unwrap(wrapped):
        # Strip the DataParallel wrapper, if any.
        return wrapped.module if hasattr(wrapped, 'module') else wrapped

    # `ema` is always bound, so a config whose 'ema_start' flag is already True
    # can no longer reach ema.update() before an EMA object exists.
    ema = EMA(unwrap(model), decay=0.999) if config['ema_start'] else None

    train_iterator = iter(train_dataloader)
    for _ in epoch_iterator:
        unsup_iterator = tqdm(unsup_dataloader, desc='Training', total=len(unsup_dataloader))
        model.train()
        for unsup_batch in unsup_iterator:
            cur_bs = unsup_batch['ab_input_ids'].size(0)
            try:
                sup_batch = next(train_iterator)
            except StopIteration:
                # Supervised data is smaller: restart it and keep going.
                train_iterator = iter(train_dataloader)
                sup_batch = next(train_iterator)

            # grad_data: supervised rows + b-a view (back-propagated);
            # no_grad_data: a-b view (forward only).
            grad_data, no_grad_data = get_data(sup_batch, unsup_batch, config)

            # Targets and confidence mask from the a-b view.
            unsup_loss_mask, unsup_probs = forward_no_grad(no_grad_data, config, model)

            loss, tsa_threshold, unsup_loss, sup_loss = forward_with_grad(
                unsup_loss_mask, unsup_probs, config, cur_bs, model, grad_data, total_steps, global_steps
            )

            model.zero_grad()
            loss.backward()

            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            if ema is not None:
                ema.update()

            train_loss += loss.item()
            global_steps += 1

            unsup_iterator.set_postfix_str(f'running training loss: {loss.item():.4f}')

            if global_steps % config['logging_step'] == 0:
                if ema is None and global_steps >= config['ema_start_step']:
                    print('\n>>> EMA starting ...')
                    config['ema_start'] = True
                    ema = EMA(unwrap(model), decay=0.999)

                print_train_loss = (train_loss - logging_loss) / config['logging_step']
                logging_loss = train_loss

                # Evaluate (and checkpoint) with the EMA weights when available.
                if ema is not None:
                    ema.apply_shadow()
                val_loss, f1, acc = evaluation(config, model, dev_dataloader)

                print_log = f'\n>>> training loss: {print_train_loss:.6f}, valid loss: {val_loss:.6f}, '

                if acc > best_acc:
                    model_save_path = os.path.join(config['output_path'],
                                                   f'checkpoint-{global_steps}-{acc:.6f}')
                    unwrap(model).save_pretrained(model_save_path)
                    best_acc = acc
                    best_model_path = model_save_path

                print_log += f'valid f1: {f1:.6f}, valid acc: {acc:.6f}'

                print(print_log)
                model.train()
                # Put the raw training weights back before continuing.
                if ema is not None:
                    ema.restore()

    return model, best_model_path

In [34]:
# Run UDA training; `model` holds the last-step weights, `best_model_path` the best dev checkpoint.
model, best_model_path = train(config, train_dataloader, dev_dataloader, unsup_dataloader)

Some weights of the model checkpoint at data/data94445 were not used when initializing BertForAFQMC: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForAFQMC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForAFQMC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForAFQMC were not initialized from the model checkpoint at data/data94445 and are newly initialized: ['classifier.weight', 'class


>>> training loss: 0.972990, valid loss: 0.644075, valid f1: 0.368159, valid acc: 0.646895



Training:  23%|██▎       | 401/1772 [01:01<32:24,  1.42s/it, running training loss: 1.0375][A
Training:  23%|██▎       | 402/1772 [01:01<23:31,  1.03s/it, running training loss: 1.0375][A
Training:  23%|██▎       | 402/1772 [01:01<23:31,  1.03s/it, running training loss: 0.8650][A
Training:  23%|██▎       | 403/1772 [01:01<17:22,  1.31it/s, running training loss: 0.8650][A
Training:  23%|██▎       | 403/1772 [01:01<17:22,  1.31it/s, running training loss: 0.9297][A
Training:  23%|██▎       | 404/1772 [01:01<13:01,  1.75it/s, running training loss: 0.9297][A
Training:  23%|██▎       | 404/1772 [01:02<13:01,  1.75it/s, running training loss: 0.7594][A
Training:  23%|██▎       | 405/1772 [01:02<09:55,  2.29it/s, running training loss: 0.7594][A
Training:  23%|██▎       | 405/1772 [01:02<09:55,  2.29it/s, running training loss: 1.0619][A
Training:  23%|██▎       | 406/1772 [01:02<07:46,  2.93it/s, running training loss: 1.0619][A
Training:  23%|██▎       | 406/1772 [01:02<07:46,


>>> EMA starting ...




Evaluation:   0%|          | 1/270 [00:00<02:39,  1.68it/s][A[A

Evaluation:   2%|▏         | 6/270 [00:00<01:51,  2.37it/s][A[A

Evaluation:   5%|▍         | 13/270 [00:00<01:17,  3.33it/s][A[A

Evaluation:   7%|▋         | 19/270 [00:00<00:54,  4.63it/s][A[A

Evaluation:   9%|▉         | 25/270 [00:01<00:38,  6.39it/s][A[A

Evaluation:  11%|█▏        | 31/270 [00:01<00:27,  8.68it/s][A[A

Evaluation:  14%|█▎        | 37/270 [00:01<00:19, 11.67it/s][A[A

Evaluation:  16%|█▌        | 43/270 [00:01<00:14, 15.28it/s][A[A

Evaluation:  18%|█▊        | 49/270 [00:01<00:11, 19.59it/s][A[A

Evaluation:  20%|██        | 55/270 [00:01<00:08, 24.37it/s][A[A

Evaluation:  23%|██▎       | 61/270 [00:01<00:07, 29.31it/s][A[A

Evaluation:  25%|██▍       | 67/270 [00:01<00:05, 34.28it/s][A[A

Evaluation:  27%|██▋       | 73/270 [00:01<00:05, 39.27it/s][A[A

Evaluation:  29%|██▉       | 79/270 [00:01<00:04, 43.69it/s][A[A

Evaluation:  31%|███▏      | 85/270 [00:02<00:03


>>> training loss: 1.054030, valid loss: 0.626248, valid f1: 0.374742, valid acc: 0.648981



Training:  45%|████▌     | 801/1772 [02:01<23:12,  1.43s/it, running training loss: 1.0178][A
Training:  45%|████▌     | 802/1772 [02:01<16:55,  1.05s/it, running training loss: 1.0178][A
Training:  45%|████▌     | 802/1772 [02:02<16:55,  1.05s/it, running training loss: 1.0446][A
Training:  45%|████▌     | 803/1772 [02:02<12:31,  1.29it/s, running training loss: 1.0446][A
Training:  45%|████▌     | 803/1772 [02:02<12:31,  1.29it/s, running training loss: 1.1339][A
Training:  45%|████▌     | 804/1772 [02:02<09:32,  1.69it/s, running training loss: 1.1339][A
Training:  45%|████▌     | 804/1772 [02:02<09:32,  1.69it/s, running training loss: 1.1925][A
Training:  45%|████▌     | 805/1772 [02:02<07:22,  2.18it/s, running training loss: 1.1925][A
Training:  45%|████▌     | 805/1772 [02:02<07:22,  2.18it/s, running training loss: 0.9109][A
Training:  45%|████▌     | 806/1772 [02:02<06:08,  2.62it/s, running training loss: 0.9109][A
Training:  45%|████▌     | 806/1772 [02:02<06:08,


>>> training loss: 1.023801, valid loss: 0.611119, valid f1: 0.403929, valid acc: 0.648517



Training:  68%|██████▊   | 1201/1772 [03:06<12:26,  1.31s/it, running training loss: 0.8841][A
Training:  68%|██████▊   | 1202/1772 [03:06<09:11,  1.03it/s, running training loss: 0.8841][A
Training:  68%|██████▊   | 1202/1772 [03:06<09:11,  1.03it/s, running training loss: 0.8821][A
Training:  68%|██████▊   | 1203/1772 [03:06<06:48,  1.39it/s, running training loss: 0.8821][A
Training:  68%|██████▊   | 1203/1772 [03:06<06:48,  1.39it/s, running training loss: 1.0406][A
Training:  68%|██████▊   | 1204/1772 [03:06<05:09,  1.84it/s, running training loss: 1.0406][A
Training:  68%|██████▊   | 1204/1772 [03:06<05:09,  1.84it/s, running training loss: 0.9105][A
Training:  68%|██████▊   | 1205/1772 [03:06<04:03,  2.32it/s, running training loss: 0.9105][A
Training:  68%|██████▊   | 1205/1772 [03:07<04:03,  2.32it/s, running training loss: 0.9547][A
Training:  68%|██████▊   | 1206/1772 [03:07<03:26,  2.73it/s, running training loss: 0.9547][A
Training:  68%|██████▊   | 1206/1772 [0


>>> training loss: 0.999916, valid loss: 0.589353, valid f1: 0.236436, valid acc: 0.690222



Training:  90%|█████████ | 1601/1772 [04:11<04:06,  1.44s/it, running training loss: 0.6819][A
Training:  90%|█████████ | 1602/1772 [04:11<03:00,  1.06s/it, running training loss: 0.6819][A
Training:  90%|█████████ | 1602/1772 [04:11<03:00,  1.06s/it, running training loss: 1.2535][A
Training:  90%|█████████ | 1603/1772 [04:11<02:13,  1.27it/s, running training loss: 1.2535][A
Training:  90%|█████████ | 1603/1772 [04:12<02:13,  1.27it/s, running training loss: 0.9168][A
Training:  91%|█████████ | 1604/1772 [04:12<01:40,  1.68it/s, running training loss: 0.9168][A
Training:  91%|█████████ | 1604/1772 [04:12<01:40,  1.68it/s, running training loss: 0.9584][A
Training:  91%|█████████ | 1605/1772 [04:12<01:17,  2.15it/s, running training loss: 0.9584][A
Training:  91%|█████████ | 1605/1772 [04:12<01:17,  2.15it/s, running training loss: 0.8801][A
Training:  91%|█████████ | 1606/1772 [04:12<01:01,  2.71it/s, running training loss: 0.8801][A
Training:  91%|█████████ | 1606/1772 [0

In [35]:
from zipfile import ZipFile
def predict(config, model, test_dataloader, group_size=2):
    """Run inference on the test set and write the submission files.

    For every batch the model is called with the batch tensors moved to
    ``config['device']``; the first element of the model output is taken as the
    logits, softmax is applied over the last dimension, and the probability at
    class index 1 is collected.  Consecutive predictions are then averaged in
    groups of ``group_size`` (the test loader is expected to yield
    ``group_size`` rows per output line), and the result is written to
    ``submission.tsv`` and zipped into ``submission.zip`` under
    ``config['output_path']``.

    Args:
        config: Mapping that provides at least ``'device'`` and ``'output_path'``.
        model: Model callable as ``model(**batch)`` whose first output is the logits.
        test_dataloader: Sized iterable of dict batches mapping names to tensors.
        group_size: Number of consecutive predictions averaged into one output
            row.  Defaults to 2, which reproduces the previous hard-coded
            behaviour; use 1 to write the raw per-row probabilities.

    Raises:
        ValueError: If ``group_size`` is smaller than 1, or if the number of
            predictions is not a multiple of ``group_size``.
    """
    if group_size < 1:
        raise ValueError(f'group_size must be >= 1, got {group_size}')

    test_iterator = tqdm(test_dataloader, desc='Predicting', total=len(test_dataloader))
    batch_probs = []

    model.eval()
    with torch.no_grad():
        for batch in test_iterator:
            batch_cuda = {name: tensor.to(config['device']) for name, tensor in batch.items()}

            logits = model(**batch_cuda)[0]
            probs = torch.softmax(logits, dim=-1)

            # Keep only the probability of class index 1, moved back to the CPU.
            batch_probs.append(probs[:, 1].detach().cpu())

    test_preds = torch.cat(batch_probs)

    # Fail early with a clear message instead of the opaque "stack expects each
    # tensor to be equal size" error that split()+stack() raised on a ragged tail.
    if test_preds.numel() % group_size != 0:
        raise ValueError(
            f'number of predictions ({test_preds.numel()}) is not a multiple of '
            f'group_size ({group_size})'
        )
    # One row per group of consecutive predictions, then the mean of each row.
    test_preds = test_preds.reshape(-1, group_size).mean(dim=1).numpy()

    # Make sure the target directory exists before writing (an empty path means
    # the current working directory, which needs no creation).
    if config['output_path']:
        os.makedirs(config['output_path'], exist_ok=True)

    submission_path = os.path.join(config['output_path'], 'submission.tsv')
    test_df = pd.DataFrame(data={'prediction': test_preds})
    test_df.to_csv(submission_path, index=False, header=False, encoding='utf8', sep='\t')
    with ZipFile(os.path.join(config['output_path'], 'submission.zip'), 'w') as myzip:
        myzip.write(submission_path, 'submission.tsv')

In [36]:
# Write test-set predictions to submission.tsv and submission.zip under
# config['output_path'].
# NOTE(review): this passes the in-memory `model` returned by train();
# `best_model_path` is not loaded before predicting -- confirm that is intended.
predict(config, model, test_dataloader)

Predicting: 100%|██████████| 483/483 [00:08<00:00, 55.56it/s]
