In [1]:
import json 
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
import random
import os
import time
from sklearn.model_selection import *
from sklearn.metrics import f1_score
from transformers import *

In [2]:
# Experiment-wide settings read by the cells below.
CFG = dict(
    # Cross-validation and reproducibility.
    fold_num=5,
    seed=42,
    # Pretrained checkpoint name and the tokenizer's max_length.
    model='hfl/chinese-macbert-base',
    max_len=512,
    # Optimisation settings.
    epochs=4,
    train_bs=16,
    valid_bs=16,
    lr=1e-5,
    num_workers=4,
    accum_iter=1,
    weight_decay=1e-6,
    # GPU index handed to torch.cuda.set_device.
    device=1,
)

In [3]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED``, and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN autotune and pick kernels per run, which
    # works against the deterministic flag above; keep it off for
    # reproducible results.
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['seed'])

# Select the configured GPU only when CUDA is present: calling
# torch.cuda.set_device unconditionally raises on a CPU-only host, which
# would make the 'cpu' fallback below unreachable.
if torch.cuda.is_available():
    torch.cuda.set_device(CFG['device'])
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
# Read the training and test tables from CSV files in the working directory.
train_df =  pd.read_csv('train.csv')
test_df =  pd.read_csv('test.csv')

In [5]:
# Replace every missing value with -1. For the label columns this is the
# sentinel that the loss (ignore_index=-1) and the evaluation mask (label > -1)
# treat as "no label". Rebinding instead of inplace=True keeps the cell free
# of in-place mutation.
train_df = train_df.fillna(-1)

In [6]:
# Fold both labels into one stratification key: 3 * (labelA + 1) + labelB + 1.
# With -1 as the "missing" sentinel, each (labelA, labelB) combination in
# {-1, 0, 1} x {-1, 0, 1} maps to a distinct value.
train_df['stratify'] = (train_df['labelA'] + 1).mul(3).add(train_df['labelB']).add(1)

In [7]:
# Display the prepared training frame, including the new 'stratify' column.
train_df

Unnamed: 0,source,target,labelA,labelB,type,len1,len2,stratify
0,谁能打破科比81分纪录？奥尼尔给出5个候选人，补充利拉德比尔！,NBA现役能入名人堂的球星很多，但是能被立铜像只有2人,0.0,-1.0,ss,31,27,3.0
1,请扩散！明天，黄金埠这些地方会停电！,生活｜这几个地方注意啦！1月12日有部分线路停电检修,0.0,-1.0,ss,18,26,3.0
2,居家健身增强免疫！（三）,原来是背影杀手#你愿意和我做朋友吗#户外健身,0.0,-1.0,ss,12,22,3.0
3,太极拳罗师傅被洋人打伤，叶问霸气复仇？,非常完美：感动！女嘉宾告白赵杰被拒绝，尹康霸气上台挽留！,0.0,-1.0,ss,19,30,3.0
4,恭喜哈登！篮网因祸得福，29+7超巨大概率复出，3换4交易方案出炉,三英缺席杜兰特，篮网迎战湖人NBA重头大戏！,1.0,-1.0,ss,33,22,6.0
...,...,...,...,...,...,...,...,...
69573,凯尔特人是苏超的霸主，不过随着格拉斯哥流浪者本赛季的强势回归，球队本赛季遭遇了前所未有的挑战...,019，利物浦vs曼特斯特联 利物浦上赛季遥遥领先的拿到了冠军，本以为利物浦是准备做百分...,-1.0,0.0,ll,341,1517,1.0
69574,谁是中国影史最另类的导演？ 答案只有一个：姜文。 能够切中时代脉搏，真正做到商业与艺术的完...,最近，有眼尖的网友发现，在电视剧版《天官赐福》豆瓣页面，导演一栏竟然是姜文。 虽然这大概率...,-1.0,1.0,ll,3635,2210,2.0
69575,“建设海南自由贸易港，打造我国面向太平洋、印度洋的重要开放门户，是国家重大战略。”近日，中国...,法国：禽类及其相关产品暂停输华 近日，法国官方通报该国家禽发生H5N8亚型高致病性禽流感疫情...,-1.0,0.0,ll,1658,1806,1.0
69576,[辉瑞和Moderna现在承诺，到7月底前将在美国分别生产3亿剂疫苗，并在欧洲和其他地区生产...,国际在线报道（记者高俊雅）：南非总统拉马福萨2月28号宣布，随着该国每日新增病例的持续下降以...,-1.0,0.0,ll,1658,759,1.0


In [8]:
# Load the tokenizer for the checkpoint named in CFG['model'].
# collate_fn reads this module-level `tokenizer` when batching.
tokenizer = BertTokenizer.from_pretrained(CFG['model'])

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=109540.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=2.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=112.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=19.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=268961.0), HTML(value='')))




In [9]:
class MyDataset(Dataset):
    """Dataset over a DataFrame of text pairs carrying two labels.

    Reads the ``source``, ``target``, ``labelA`` and ``labelB`` columns of
    the wrapped frame.
    """

    def __init__(self, dataframe):
        self.df = dataframe

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(source, target, labelA, labelB)`` for positional row ``idx``.

        Both texts are coerced with ``str``; labels are returned as stored.
        """
        frame = self.df
        texts = tuple(str(frame[col].values[idx]) for col in ('source', 'target'))
        labels = tuple(frame[col].values[idx] for col in ('labelA', 'labelB'))
        return texts + labels

In [10]:
def collate_fn(data):
    """Tokenize a list of samples into one padded batch.

    Uses the module-level ``tokenizer`` and ``CFG['max_len']``. Every pair is
    truncated and padded to exactly ``max_len`` tokens.

    Args:
        data: Sequence of ``(text1, text2, label1, label2)`` samples.

    Returns:
        ``(input_ids, attention_mask, token_type_ids, label1, label2)`` where
        the two label entries are ``torch.LongTensor``s.
    """
    first_texts, second_texts, labels_a, labels_b = [], [], [], []
    for sample in data:
        first_texts.append(sample[0])
        second_texts.append(sample[1])
        labels_a.append(sample[2])
        labels_b.append(sample[3])

    encoded = tokenizer(
        first_texts,
        text_pair=second_texts,
        padding='max_length',
        truncation=True,
        max_length=CFG['max_len'],
        return_tensors='pt',
    )
    return (
        encoded['input_ids'],
        encoded['attention_mask'],
        encoded['token_type_ids'],
        torch.LongTensor(labels_a),
        torch.LongTensor(labels_b),
    )

In [11]:
class Model(nn.Module):
    """BERT encoder with two independent 2-way linear classification heads."""

    def __init__(self, CFG):
        """Load the checkpoint named by ``CFG['model']`` and build both heads."""
        super().__init__()
        self.bert = BertModel.from_pretrained(CFG['model'])
        hidden_size = self.bert.config.hidden_size
        self.fc1 = nn.Linear(hidden_size, 2)
        self.fc2 = nn.Linear(hidden_size, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the logits of both heads, computed from one shared encoding."""
        encoder_out = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Element 1 of the encoder output feeds both heads
        # (presumably BertModel's pooled output -- confirm against the
        # installed transformers version).
        pooled = encoder_out[1]
        return self.fc1(pooled), self.fc2(pooled)

In [12]:
class AverageMeter:
    """Tracks the latest value and a weighted running average.

    Attributes set by ``reset``/``update``: ``val`` (last value), ``sum``
    (weighted total), ``count`` (total weight) and ``avg`` (``sum / count``).
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Zero all statistics."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running average."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
        
        
def train_model(model, train_loader):
    """Run one mixed-precision training epoch and return the mean loss.

    Relies on the module-level ``optimizer``, ``scheduler``, ``scaler``,
    ``criterion`` and ``device`` that the training cell creates for each fold.

    Args:
        model: Network returning two logit tensors, one per task.
        train_loader: Iterable yielding ``(input_ids, attention_mask,
            token_type_ids, label1, label2)`` batches.

    Returns:
        Sample-weighted average of the per-batch loss over the epoch.
    """

    def task_loss(logits, target):
        """Return ``criterion(logits, target)``, or a zero if nothing is labelled.

        A mean-reduced loss with ``ignore_index`` averages over the
        non-ignored targets only, so a batch where every target is ignored
        can evaluate to NaN (0/0). One NaN would poison the running average
        for the rest of the epoch. In that case return a zero that is still
        attached to the graph, so ``backward`` works and the head receives no
        gradient.
        """
        ignore_index = getattr(criterion, 'ignore_index', -100)
        if bool((target != ignore_index).any()):
            return criterion(logits, target)
        return logits.float().sum() * 0.0

    model.train()

    losses = AverageMeter()

    optimizer.zero_grad()

    tk = tqdm(train_loader, total=len(train_loader), position=0, leave=True)
    for step, batch in enumerate(tk):
        input_ids, attention_mask, token_type_ids, y1, y2 = batch

        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        token_type_ids = token_type_ids.to(device)
        y1, y2 = y1.to(device), y2.to(device)

        with autocast():
            output1, output2 = model(input_ids, attention_mask, token_type_ids)
            # Both tasks contribute equally to the total loss.
            loss = task_loss(output1, y1) / 2 + task_loss(output2, y2) / 2

        # Scale the loss before backward so fp16 gradients do not underflow.
        scaler.scale(loss).backward()

        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        # The schedule advances once per batch, not once per epoch.
        scheduler.step()

        lr = optimizer.param_groups[-1]['lr']

        losses.update(loss.item(), y1.size(0))
        tk.set_postfix(loss=losses.avg, lr=lr)

    return losses.avg


def test_model(model, val_loader):
    """Evaluate ``model`` on ``val_loader``; return ``(mean_loss, mean_f1)``.

    Relies on the module-level ``criterion`` and ``device``. Labels equal to
    -1 are treated as missing and are excluded from the F1 computation.

    Args:
        model: Network returning two logit tensors, one per task.
        val_loader: Iterable yielding ``(input_ids, attention_mask,
            token_type_ids, label1, label2)`` batches.

    Returns:
        Tuple of the sample-weighted average loss and the mean of the two
        tasks' best F1 scores over the thresholds 0.4, 0.5 and 0.6.
    """

    def task_loss(logits, target):
        """Return ``criterion(logits, target)``, or zero if nothing is labelled.

        A mean-reduced loss with ``ignore_index`` averages over non-ignored
        targets only, so an all-ignored batch can evaluate to NaN (0/0) and
        would poison the running average. Unshuffled validation batches are
        especially prone to containing labels for only one task.
        """
        ignore_index = getattr(criterion, 'ignore_index', -100)
        if bool((target != ignore_index).any()):
            return criterion(logits, target)
        return logits.float().sum() * 0.0

    def best_f1(y_truth, y_pred):
        """Print ``[threshold, F1]`` pairs sorted by F1 and return the best F1."""
        if len(y_truth) == 0:
            # No labelled sample for this task: F1 is undefined, so report 0
            # rather than handing empty arrays to f1_score.
            return 0.0
        # Explicit arrays, so the threshold comparison does not depend on
        # NumPy's reflected operator being applied to a Python list.
        y_truth = np.asarray(y_truth)
        y_pred = np.asarray(y_pred)
        thresholds = []
        for thresh in np.arange(0.4, 0.61, 0.1):
            thresh = np.round(thresh, 2)
            res = f1_score(y_truth, (y_pred >= thresh).astype(int))
            thresholds.append([thresh, res])
        thresholds.sort(key=lambda x: x[1], reverse=True)
        print(thresholds)
        return thresholds[0][1]

    model.eval()

    losses = AverageMeter()

    y_truth1, y_pred1 = [], []
    y_truth2, y_pred2 = [], []

    with torch.no_grad():
        tk = tqdm(val_loader, total=len(val_loader), position=0, leave=True)
        for step, (input_ids, attention_mask, token_type_ids, y1, y2) in enumerate(tk):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            token_type_ids = token_type_ids.to(device)
            y1, y2 = y1.to(device), y2.to(device)

            output1, output2 = model(input_ids, attention_mask, token_type_ids)

            loss = task_loss(output1, y1) / 2 + task_loss(output2, y2) / 2

            losses.update(loss.item(), y1.size(0))
            tk.set_postfix(loss=losses.avg)

            # Keep only rows that actually carry a label for each task and
            # store the predicted probability of class 1.
            idx1 = y1 > -1
            if idx1.any():
                y_truth1.extend(y1[idx1].cpu().numpy())
                y_pred1.extend(output1[idx1].softmax(1)[:, 1].cpu().numpy())
            idx2 = y2 > -1
            if idx2.any():
                y_truth2.extend(y2[idx2].cpu().numpy())
                y_pred2.extend(output2[idx2].softmax(1)[:, 1].cpu().numpy())

    f1 = (best_f1(y_truth1, y_pred1) + best_f1(y_truth2, y_pred2)) / 2

    return losses.avg, f1

In [None]:
seed_everything(CFG['seed'])

# Stratify on the combined label key so each fold keeps the label mix.
splitter = StratifiedKFold(n_splits=CFG['fold_num'], shuffle=True, random_state=CFG['seed'])
folds = splitter.split(np.arange(train_df.shape[0]), train_df['stratify'].values)

# Best validation F1 of each fold.
cv = []

for fold, (trn_idx, val_idx) in enumerate(folds):

    print(fold)

    # split() yields positional indices, so select rows with iloc; loc would
    # only be equivalent while the frame keeps a default 0..n-1 index.
    train = train_df.iloc[trn_idx]
    val = train_df.iloc[val_idx]

    train_set = MyDataset(train)
    val_set = MyDataset(val)

    train_loader = DataLoader(train_set, batch_size=CFG['train_bs'], collate_fn=collate_fn, shuffle=True, num_workers=CFG['num_workers'])
    val_loader = DataLoader(val_set, batch_size=CFG['valid_bs'], collate_fn=collate_fn, shuffle=False, num_workers=CFG['num_workers'])

    steps_per_epoch = len(train_loader)

    best_f1 = 0

    # Fresh model and optimisation state for every fold. train_model and
    # test_model read optimizer/criterion/scaler/scheduler as module-level names.
    model =  Model(CFG).to(device)

    scaler = GradScaler()
    optimizer = AdamW(model.parameters(), lr=CFG['lr'], weight_decay=CFG['weight_decay'])
    # Labels equal to -1 mark "no label for this task" and are skipped by the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=-1)
    # The scheduler is stepped once per batch; warm up over the first 5% of
    # all steps. The step counts are integers by API contract.
    total_steps = CFG['epochs'] * steps_per_epoch
    warmup_steps = int(0.05 * total_steps)
    scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)

    for epoch in range(CFG['epochs']):

        print('epoch:',epoch)
        time.sleep(0.2)

        train_loss = train_model(model, train_loader)
        val_loss, val_f1 = test_model(model, val_loader)

        print(val_f1)
        # Checkpoint only when this epoch improves the fold's validation F1.
        if val_f1 > best_f1:
            best_f1 = val_f1
            torch.save(model.state_dict(), '{}_fold_{}.pt'.format(CFG['model'].split('/')[-1], fold))

    cv.append(best_f1)

0


HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1306488754.0), HTML(value='')))


epoch: 0


100%|██████████| 3479/3479 [44:18<00:00,  1.31it/s, loss=0.336, lr=8.95e-6]
100%|██████████| 870/870 [08:29<00:00,  1.71it/s, loss=0.153]


[[0.4, 0.7954806587514363], [0.5, 0.782537405205985], [0.6, 0.7581896551724138]]
[[0.4, 0.6612779060816013], [0.5, 0.6562631800927878], [0.6, 0.6356517733763243]]
0.7283792824165187
epoch: 1


100%|██████████| 3479/3479 [44:18<00:00,  1.31it/s, loss=0.257, lr=5.41e-6]
100%|██████████| 870/870 [08:29<00:00,  1.71it/s, loss=0.155]


[[0.4, 0.7929444778512728], [0.5, 0.779746835443038], [0.6, 0.7588028169014086]]
[[0.4, 0.6367384333486028], [0.5, 0.6001003512293026], [0.6, 0.5572519083969466]]
0.7148414555999378
epoch: 2


100%|██████████| 3479/3479 [44:15<00:00,  1.31it/s, loss=0.164, lr=1.61e-6]
100%|██████████| 870/870 [08:29<00:00,  1.71it/s, loss=0.195]


[[0.4, 0.7857142857142856], [0.5, 0.7842053307008885], [0.6, 0.7812563118561906]]
[[0.4, 0.6471782379212342], [0.5, 0.6388415672913118], [0.6, 0.630963096309631]]
0.7164462618177598
epoch: 3


 70%|███████   | 2437/3479 [31:02<13:12,  1.31it/s, loss=0.093, lr=1.53e-7] 

In [None]:
# Best validation F1 recorded for each fold by the training cell.
cv

In [None]:
# Mean of the per-fold best validation F1 scores.
np.mean(cv)