In [1]:
import os
import json
import random
import math
import logging
import torch
import numpy as np
import pandas as pd
from transformers import BertTokenizer
from transformers import BertModel
from tqdm import tqdm
logger = logging.getLogger(__name__)

In [2]:
def set_seed(seed):
    """
    Seed every random number generator this notebook relies on.

    :param seed: value handed to Python's ``random``, NumPy and PyTorch.
    :return: None
    """
    # CPU-side generators: Python, NumPy, then PyTorch.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return

    # GPU-side generators: current device first, then every device.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Ask cuDNN for deterministic kernels and switch off its autotuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [3]:
class InputExample:
    """One raw sample: its split name, text, and optional id / label."""

    def __init__(self, set_type, text, id_=None, label=None):
        # Attribute order is kept: set_type, id_, text, label.
        self.set_type, self.id_, self.text, self.label = set_type, id_, text, label
        
class BaseFeature:
    """Holds the three inputs that are fed to BERT."""

    def __init__(self, token_ids, attention_masks, token_type_ids):
        # BERT inputs
        (self.token_ids,
         self.attention_masks,
         self.token_type_ids) = token_ids, attention_masks, token_type_ids
        
class BertFeature(BaseFeature):
    """BERT inputs for one sample, plus its optional id and label."""

    def __init__(self, token_ids, attention_masks, token_type_ids,
                 id_=None, label=None):
        # The parent class stores the three BERT inputs.
        super().__init__(token_ids, attention_masks, token_type_ids)
        self.label, self.id_ = label, id_

        
class Processor:
    """Reads a tab-separated data file and wraps its rows as InputExample objects."""

    @staticmethod
    def read_data(file_path):
        """Load ``file_path`` as a tab-separated table and return the DataFrame."""
        return pd.read_csv(file_path, sep="\t")

    @staticmethod
    def get_examples(file_path):
        """
        Build one InputExample per row of ``file_path``.

        A file with a ``label`` column is treated as the 'train' split and yields
        (label, comment) examples. Any other file is treated as the 'test' split
        and yields (id, comment) examples whose label is None.
        """
        frame = Processor.read_data(file_path)

        if 'label' in frame.columns.tolist():
            return [
                InputExample(set_type='train', text=comment, label=target)
                for target, comment in zip(frame.label, frame.comment)
            ]

        return [
            InputExample(set_type='test', id_=row_id, text=comment, label=None)
            for row_id, comment in zip(frame.id, frame.comment)
        ]
    
def convert_input_example(example: "InputExample", tokenizer: "BertTokenizer",
                        max_seq_len):
    """
    Tokenize one example into a fixed-length BertFeature.

    The tokenizer truncates and pads to exactly ``max_seq_len``. The previous
    version passed the deprecated ``pad_to_max_length=True`` together with
    ``padding=True`` and then padded by hand with a hard-coded 0. Padding is now
    delegated to the tokenizer via ``padding='max_length'``, so the tokenizer's
    own pad id is used.

    :param example: object exposing ``text``, ``id_`` and ``label``.
    :param tokenizer: tokenizer exposing ``encode_plus``.
    :param max_seq_len: length every returned sequence is padded/truncated to.
    :return: BertFeature holding input ids, attention mask, token type ids,
             and the example's id and label.
    """
    encode_dict = tokenizer.encode_plus(text=example.text,
                                        max_length=max_seq_len,
                                        padding='max_length',
                                        truncation=True,
                                        return_token_type_ids=True,
                                        return_attention_mask=True)

    return BertFeature(
        # bert inputs
        token_ids=encode_dict['input_ids'],
        attention_masks=encode_dict['attention_mask'],
        token_type_ids=encode_dict['token_type_ids'],
        id_=example.id_,
        label=example.label
    )

def convert_examples_to_features(examples, max_seq_len, bert_path):
    """
    Convert every example into a feature, dropping conversions that return None.

    :param examples: iterable of examples accepted by ``convert_input_example``.
    :param max_seq_len: sequence length forwarded to ``convert_input_example``.
    :param bert_path: directory containing ``vocab.txt`` for the tokenizer.
    :return: list of features in the same order as ``examples``.
    """
    vocab_file = os.path.join(bert_path, 'vocab.txt')
    tokenizer = BertTokenizer(vocab_file)

    converted = (
        convert_input_example(example=sample,
                              max_seq_len=max_seq_len,
                              tokenizer=tokenizer)
        for sample in examples
    )
    return [feature for feature in converted if feature is not None]


# Input locations. Processor.read_data parses these files as tab-separated text.
train_path = './data/train.csv'
test_path = './data/test.csv'
# NOTE(review): absolute, machine-specific path to the pretrained BERT directory —
# make this configurable before sharing the notebook.
bert_path = '/Users/zy/bert_wwm_ext/'

# NOTE(review): this tokenizer is not passed to convert_examples_to_features below,
# which builds its own tokenizer from bert_path/vocab.txt.
tokenizer = BertTokenizer.from_pretrained(bert_path)
# Labelled split: rows -> InputExample -> BertFeature, sequence length 156.
train_examples = Processor.get_examples(train_path)
train_features = convert_examples_to_features(examples=train_examples, max_seq_len=156, bert_path=bert_path)

# Unlabelled split, converted with the same sequence length.
test_examples = Processor.get_examples(test_path)
test_features = convert_examples_to_features(examples=test_examples, max_seq_len=156, bert_path=bert_path)

In [19]:
class args():
    # Notebook-wide configuration, read as class attributes by the cells below.
    
    # Data and pretrained-model locations.
    train_path = './data/train.csv'
    test_path = './data/test.csv'
    sample_path = './data/sample.csv'
    # NOTE(review): absolute, machine-specific path — make configurable before sharing.
    bert_path = '/Users/zy/bert_wwm_ext/'
    
    # hid_size is read by BaseLineClassifier to size its linear layer (hid_size * 5 inputs).
    hid_size = 768
    seed = 666
    train_epochs = 1
    train_batch_size = 4
    test_batch_size = 4
    # Separate learning rates for the BERT encoder and the classifier head
    # (see build_optimizer_scheduler).
    bert_lr = 1e-5
    classifier_lr = 1e-4
    adam_epsilon = 1e-12
    # NOTE(review): build_optimizer_scheduler uses ceil(warmup_proportion * t_total) warmup
    # steps, so a value of 1 makes warmup span every scheduled step — confirm this is intended.
    warmup_proportion = 1
    weight_decay = 0.01
    # NOTE(review): train_model reads `attack_train`, not `attack_train_mode`;
    # this attribute is not read anywhere in the visible cells.
    attack_train_mode = None
    max_grad_norm = 5
    
    # Adversarial training switch read by train_model: None, 'fgm' or 'pgd'.
    attack_train = None
    # Loss selector read by train_model: 'ce', 'focalloss' or 'labelsmooth'.
    loss_fn = 'ce'
    accumulation_steps = 1
    
    # eval_model saves checkpoints to model_save_path + model_save_name + '_seed-..._bs-....bin'.
    model_save_path = './best_models/'
    model_save_name = 'baseline'
    
    # use_gpu selects the device in train_model / eval_model / test_model.
    use_gpu = False
    mul_gpu = False
    
    
# Seed all RNGs before any model or sampler is created.
set_seed(args.seed)

In [5]:
from utils.dataset_utils import ComDataset
from models.layers import MaskedGlobalMaxPool1D, MaskedGlobalAveragePooling1D
import torch 
from torch import nn
from torch.utils.data import DataLoader, RandomSampler

In [6]:
# Wrap the pre-computed features in the project's ComDataset (imported from utils.dataset_utils).
train_dataset = ComDataset(train_features, 'train')
test_dataset = ComDataset(test_features, 'test')

In [9]:
# Training batches are drawn in random order.
train_sampler = RandomSampler(train_dataset)
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=args.train_batch_size,
                          sampler=train_sampler)

# NOTE(review): the dev loader is the training loader itself, so eval_model scores
# the model on training batches — confirm whether a held-out split was intended.
dev_loader = train_loader

# test_sampler = RandomSampler(test_dataset)
# Test batches are not shuffled.
test_loader = DataLoader(dataset=test_dataset,
                          batch_size=args.test_batch_size,
                          shuffle=False)

In [10]:
# Peek at the first training batch only: print its keys and keep its tensors as
# globals (token_ids, attention_masks, token_type_ids, labels). A later cell feeds
# these globals to the model as a smoke test.
for i in train_loader:
    print(i.keys())
    token_ids = i['token_ids']
    attention_masks = i['attention_masks']
    token_type_ids = i['token_type_ids']
    labels = i['labels']
    
    break

dict_keys(['token_ids', 'attention_masks', 'token_type_ids', 'labels'])


In [11]:
class BaseModel(nn.Module):
    """Base module that loads a pretrained BERT encoder from a local directory."""

    def __init__(self, bert_path):
        """
        :param bert_path: directory holding the pretrained weights and ``config.json``.
        :raises AssertionError: if ``bert_path`` or its ``config.json`` does not exist.
        """
        super(BaseModel, self).__init__()
        config_path = os.path.join(bert_path, 'config.json')

        assert os.path.exists(bert_path) and os.path.exists(config_path), 'pretrained bert file does not exist'
        # output_hidden_states=True makes the encoder return every layer's hidden states.
        self.bert_module = BertModel.from_pretrained(bert_path, output_hidden_states=True)

        self.bert_config = self.bert_module.config

    @staticmethod
    def _init_weights(blocks, **kwargs):
        """
        Re-initialise the Linear / Embedding / LayerNorm modules found in ``blocks``.

        Linear biases are zeroed (Linear weights are left untouched). Embedding
        weights are drawn from N(0, initializer_range). LayerNorm weights are set
        to 1 and biases to 0.

        :param blocks: iterable of ``nn.Module`` objects to walk with ``.modules()``.
        :param kwargs: optional ``initializer_range`` (default 0.02), the std used
                       for every Embedding.
        """
        # Read the option once. Popping it inside the loop applied the caller's value
        # to the first Embedding only and silently used 0.02 for the rest.
        initializer_range = kwargs.get('initializer_range', 0.02)

        for block in blocks:
            for module in block.modules():
                if isinstance(module, nn.Linear):
                    if module.bias is not None:
                        nn.init.zeros_(module.bias)
                elif isinstance(module, nn.Embedding):
                    nn.init.normal_(module.weight, mean=0, std=initializer_range)
                elif isinstance(module, nn.LayerNorm):
                    # LayerNorm(elementwise_affine=False) has no weight / bias to reset.
                    if module.weight is not None:
                        nn.init.ones_(module.weight)
                    if module.bias is not None:
                        nn.init.zeros_(module.bias)

In [12]:
class BaseLineClassifier(nn.Module):
    """
    Classification head over BERT outputs.

    Concatenates the last two hidden layers, pools them with masked average and
    masked max pooling, appends the pooler output, and averages the logits of
    five dropout samples passed through one shared linear layer.
    """

    def __init__(self, args, n_class=2):
        super().__init__()

        self.avg_pooling = MaskedGlobalAveragePooling1D()
        self.max_pooling = MaskedGlobalMaxPool1D()
        self.dropout = nn.Dropout(0.2)  # args.mt_dropout_prob[0]
        self.dropouts = nn.ModuleList(nn.Dropout(0.5) for _ in range(5))

        # avg pool (2 * hid) + max pool (2 * hid) + pooler output (hid) = 5 * hid
        self.classifier = nn.Linear(args.hid_size * 5, n_class)

    def forward(self, all_hidden_states, pooler_output, input_mask):
        final_layer, penultimate_layer = all_hidden_states[-1], all_hidden_states[-2]
        token_repr = torch.cat((final_layer, penultimate_layer), dim=2)

        pooled = torch.cat(
            (
                self.avg_pooling(token_repr, input_mask),
                self.max_pooling(token_repr, input_mask),
                pooler_output,
            ),
            dim=1,
        )

        # One logit tensor per dropout mask, summed in order and then averaged.
        sample_logits = [self.classifier(drop(pooled)) for drop in self.dropouts]
        h = sample_logits[0]
        for extra in sample_logits[1:]:
            h = h + extra

        return h / len(self.dropouts)

In [13]:
class ForWardModel(BaseModel):
    """BERT encoder (loaded by BaseModel) followed by a caller-supplied classifier head."""

    def __init__(self, args, classifier=None):
        # The encoder is loaded first; the classifier check runs afterwards.
        super().__init__(args.bert_path)
        assert classifier is not None, '检查分类器是否正常输入。'

        self.classifier = classifier

    def forward(self, token_ids, attention_masks, token_type_ids):
        bert_out = self.bert_module(
            input_ids=token_ids,
            attention_mask=attention_masks,
            token_type_ids=token_type_ids,
        )
        # The head receives all hidden states, the pooler output and the attention mask.
        return self.classifier(
            bert_out.hidden_states,
            bert_out.pooler_output,
            attention_masks,
        )

# Build the head and the full model, then run one forward pass as a smoke test.
# token_ids / attention_masks / token_type_ids are the globals set by the
# first-batch peek cell above.
classifier = BaseLineClassifier(args)
model = ForWardModel(args, classifier)
h = model(token_ids, attention_masks, token_type_ids)

In [21]:
from transformers import AdamW, get_linear_schedule_with_warmup
from utils.attack_train_utils import FGM, PGD
from models.losses import FocalLoss, LabelSmoothingCrossEntropy
from sklearn.metrics import accuracy_score, f1_score
import math
# NOTE(review): train_model computes its own step count and passes it to
# build_optimizer_scheduler, so this module-level value is not used by it.
t_total=5000

In [15]:
def build_optimizer_scheduler(model, t_total):
    """
    Create the AdamW optimizer and the linear warmup/decay scheduler.

    Parameters are split into four groups, in this order: encoder with weight
    decay, encoder without, classifier with, classifier without. Hyperparameters
    are read from the global ``args``.

    :param model: model exposing ``bert_module`` and ``classifier``, directly or
                  through a ``.module`` wrapper attribute.
    :param t_total: total number of scheduler steps.
    :return: (optimizer, scheduler)
    """
    core = model.module if hasattr(model, "module") else model
    no_decay = ("bias", "LayerNorm.weight")

    def _is_decay_exempt(param_name):
        return any(marker in param_name for marker in no_decay)

    # Differential learning rates: the encoder uses bert_lr, the head uses classifier_lr.
    lr_by_part = (
        (list(core.bert_module.named_parameters()), args.bert_lr),
        (list(core.classifier.named_parameters()), args.classifier_lr),
    )

    optimizer_grouped_parameters = []
    for named_params, group_lr in lr_by_part:
        decayed = [param for name, param in named_params if not _is_decay_exempt(name)]
        exempt = [param for name, param in named_params if _is_decay_exempt(name)]
        optimizer_grouped_parameters.append(
            {"params": decayed, "weight_decay": args.weight_decay, 'lr': group_lr})
        optimizer_grouped_parameters.append(
            {"params": exempt, "weight_decay": 0.0, 'lr': group_lr})

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.bert_lr, eps=args.adam_epsilon)
    warmup_steps = math.ceil(args.warmup_proportion * t_total)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=t_total,
    )

    return optimizer, scheduler

In [16]:
def metric(y_true, y_pred):
    """
    Return (accuracy, macro-F1) for a batch.

    Any input with more than one dimension is reduced to class ids with an
    argmax over axis 1 before scoring.
    """
    def _to_class_ids(values):
        if len(values.shape) > 1:
            return np.argmax(values, axis=1)
        return values

    y_pred = _to_class_ids(y_pred)
    y_true = _to_class_ids(y_true)

    return (
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, average='macro'),
    )

In [17]:
# args model train_loader

def train_model(args, model, train_loader, max_steps_per_epoch=12):
    """
    Train ``model`` on ``train_loader`` for ``args.train_epochs`` epochs.

    Fixes relative to the previous version:
      * the GPU device is created with 'cuda' ('gpu' is rejected by torch.device)
        and the model is moved to the selected device;
      * the scheduler is sized for all epochs, not just one;
      * the trailing, incomplete accumulation group of an epoch now triggers an
        optimizer step (the old ``one_epoch_steps + 1`` comparison never matched);
      * ``pgd_k`` is read from ``args`` instead of being an undefined name.

    :param args: config object exposing loss_fn, use_gpu, accumulation_steps,
                 train_epochs, attack_train, max_grad_norm and optionally pgd_k.
    :param model: module called as model(token_ids=, attention_masks=, token_type_ids=).
    :param train_loader: sized iterable of dict batches with the keys 'token_ids',
                         'attention_masks', 'token_type_ids' and 'labels'.
    :param max_steps_per_epoch: positive cap on batches processed per epoch. The
                 default of 12 reproduces the previous hard-coded debug cut-off
                 (``step > 10``); pass None to train on every batch.
    :return: None
    """
    loss_name = args.loss_fn.lower()
    assert loss_name in ['ce', 'focalloss', 'labelsmooth'], '请输入正确的lossfn，ex: ce, focalloss, labelsmooth'

    if loss_name == 'ce':
        loss_fn = torch.nn.CrossEntropyLoss()
    elif loss_name == 'focalloss':
        loss_fn = FocalLoss()
    else:
        loss_fn = LabelSmoothingCrossEntropy()

    device = torch.device('cuda' if args.use_gpu else 'cpu')
    # Batches are moved to `device` below, so the parameters must live there too.
    model.to(device)

    train_f1 = []
    train_acc = []
    one_epoch_steps = len(train_loader)
    # Index (1-based) of the last batch that will be processed in an epoch.
    if max_steps_per_epoch is None:
        last_step = one_epoch_steps
    else:
        last_step = min(one_epoch_steps, max_steps_per_epoch)

    # The scheduler must cover every optimizer update across all epochs.
    updates_per_epoch = math.ceil(one_epoch_steps / args.accumulation_steps)
    optimizer, scheduler = build_optimizer_scheduler(model, updates_per_epoch * args.train_epochs)

    model.zero_grad()
    model.train()

    attack_train_mode = args.attack_train
    if attack_train_mode is not None:
        attack_train_mode = attack_train_mode.lower()
    fgm = FGM(model=model) if attack_train_mode == 'fgm' else None
    pgd = PGD(model=model) if attack_train_mode == 'pgd' else None
    # Number of PGD attack iterations; 3 is an assumed default — TODO confirm.
    pgd_k = getattr(args, 'pgd_k', 3)

    for _ in range(args.train_epochs):
        tk0 = tqdm(train_loader, total=one_epoch_steps)
        for step, batch_data in enumerate(tk0):
            token_ids = batch_data['token_ids'].to(device)
            attention_masks = batch_data['attention_masks'].to(device)
            token_type_ids = batch_data['token_type_ids'].to(device)
            labels = batch_data['labels'].to(device)

            outputs = model(token_ids=token_ids,
                            attention_masks=attention_masks,
                            token_type_ids=token_type_ids)

            loss = loss_fn(outputs, labels)
            loss = loss.mean() / args.accumulation_steps
            loss.backward()

            if fgm is not None:
                # FGM: perturb, accumulate the adversarial gradient, then restore.
                fgm.attack()
                outputs = model(token_ids=token_ids,
                                attention_masks=attention_masks,
                                token_type_ids=token_type_ids)
                loss_adv = loss_fn(outputs, labels)
                loss_adv = loss_adv.mean() / args.accumulation_steps
                loss_adv.backward()
                fgm.restore()

            elif pgd is not None:  # PGD combined with gradient accumulation may be problematic
                pgd.backup_grad()

                for _t in range(pgd_k):
                    pgd.attack(is_first_attack=(_t == 0))
                    if _t != pgd_k - 1:
                        model.zero_grad()
                    else:
                        pgd.restore_grad()
                    outputs = model(token_ids=token_ids,
                                    attention_masks=attention_masks,
                                    token_type_ids=token_type_ids)
                    loss_adv = loss_fn(outputs, labels)
                    loss_adv = loss_adv.mean()
                    loss_adv.backward()
                pgd.restore()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

            # Update on every full accumulation group and on the epoch's last batch,
            # so a trailing partial group is not silently dropped.
            if (step + 1) % args.accumulation_steps == 0 or (step + 1) == last_step:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()

            labels = labels.cpu().detach().numpy()
            outputs = outputs.cpu().detach().numpy()
            acc, f1 = metric(labels, outputs)
            train_f1.append(f1)
            train_acc.append(acc)
            torch.cuda.empty_cache()
            tk0.set_postfix(train_f1=np.mean(train_f1),
                            train_acc=np.mean(train_acc))

            if step + 1 >= last_step:
                break

    # swa(swa_raw_model, args.output_dir, swa_start=args.swa_start)

    # clear cuda cache to avoid OOM
    torch.cuda.empty_cache()

    logger.info('Train done')
        
    
# train_model(args, model, train_loader)

In [18]:
def eval_model(args, model, best_f1, max_steps=12):
    """
    Score ``model`` on the global ``dev_loader`` and checkpoint it if F1 improved.

    Fixes relative to the previous version:
      * metrics go to ``dev_f1`` / ``dev_acc`` (the old code appended to the
        undefined ``train_f1`` / ``train_acc``, so it raised NameError and the
        improvement check could never pass);
      * ``model.eval()`` runs on every device, not only on the CPU branch;
      * inference runs under ``torch.no_grad()``;
      * the checkpoint directory is created if missing;
      * the mean F1 is returned so callers can track the best score.

    :param args: config exposing use_gpu, model_save_name, model_save_path, seed
                 and train_batch_size.
    :param model: module called as model(token_ids=, attention_masks=, token_type_ids=).
    :param best_f1: best F1 so far; the model is saved only when the new mean
                    F1 is strictly greater.
    :param max_steps: positive cap on evaluated batches. The default of 12
                 reproduces the previous hard-coded cut-off (``step > 10``);
                 pass None to evaluate every batch.
    :return: mean of the per-batch macro-F1 scores (NaN when no batch was evaluated).
    """
    device = torch.device('cuda' if args.use_gpu else 'cpu')
    model.to(device)
    model.eval()

    dev_f1 = []
    dev_acc = []
    one_epoch_steps = len(dev_loader)

    tk0 = tqdm(dev_loader, total=one_epoch_steps)
    with torch.no_grad():
        for step, batch_data in enumerate(tk0):
            token_ids = batch_data['token_ids'].to(device)
            attention_masks = batch_data['attention_masks'].to(device)
            token_type_ids = batch_data['token_type_ids'].to(device)
            labels = batch_data['labels'].to(device)

            outputs = model(token_ids=token_ids,
                            attention_masks=attention_masks,
                            token_type_ids=token_type_ids)

            labels = labels.cpu().detach().numpy()
            outputs = outputs.cpu().detach().numpy()
            acc, f1 = metric(labels, outputs)
            dev_f1.append(f1)
            dev_acc.append(acc)
            tk0.set_postfix(dev_f1=np.mean(dev_f1),
                            dev_acc=np.mean(dev_acc))

            if max_steps is not None and step + 1 >= max_steps:
                break

    torch.cuda.empty_cache()

    # NaN compares False against any best_f1, so an empty loader never saves.
    mean_f1 = float(np.mean(dev_f1)) if dev_f1 else float('nan')

    if mean_f1 > best_f1:
        model_save_allname = args.model_save_name + '_seed-' + str(args.seed) + '_bs-' + str(args.train_batch_size)
        model_save_dir = args.model_save_path + model_save_allname + '.bin'
        save_parent = os.path.dirname(model_save_dir)
        if save_parent:
            os.makedirs(save_parent, exist_ok=True)
        torch.save(model.state_dict(), model_save_dir)

    return mean_f1
        
        
# eval_model(args, model, best_f1=0)

In [30]:
def test_model(args, model, test_loader):
    if args.use_gpu:
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
        model.eval()
    model.eval()
    one_epoch_steps = len(test_loader)
    test_preds = []
    tk0 = tqdm(test_loader, total=one_epoch_steps)
    for step, batch_data in enumerate(tk0):
        token_ids = batch_data['token_ids'].to(device)
        attention_masks = batch_data['attention_masks'].to(device)
        token_type_ids = batch_data['token_type_ids'].to(device)

        logits = model(token_ids=token_ids, 
                        attention_masks=attention_masks, 
                        token_type_ids=token_type_ids)
        preds = torch.argmax(logits, dim=-1).cpu().detach().numpy() 
        test_preds.append(preds)
        test_preds = np.hstack(test_preds)
    return test_preds

# test_preds = test_model(args, model, test_loader)

  2%|▏         | 11/500 [00:17<12:38,  1.55s/it]


In [31]:
# NOTE(review): `test_preds` is only assigned by the commented-out test_model call
# in the previous cell, so this cell fails on a fresh kernel unless that call is run.
np.hstack(test_preds)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1])