In [1]:
import json 
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
import random
import os
import time
from sklearn.model_selection import *
from sklearn.metrics import f1_score
from transformers import *
import matplotlib.pyplot as plt

  'The interface of "soundfile" backend is planned to change in 0.8.0 to '


In [2]:
CFG = {
    'fold_num': 5,
    'seed': 42,
    'model': 'nghuyong/ernie-1.0', 
#     'model': 'hfl/chinese-macbert-large',
    'max_len': 512, 
    'train_bs': 32, 
    'valid_bs': 32,
    'num_workers': 0,
    'device': 0,
}

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(CFG['seed'])

torch.cuda.set_device(CFG['device'])
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
test_df =  pd.read_csv('test_final.csv')
test_df

Unnamed: 0,source,target,id,type,len1,len2
0,龙虎榜：1.4亿资金抢筹邦讯技术机构净买这17股,2021年03月08日龙虎榜机构买入和卖出的个股一览,ss0_a,ss,24,26
1,意难忘：阿水仗义，宁愿去坐牢也不肯出卖蔡进炮,意难忘：有人要找王胜志的麻烦，蔡进炮的面子都不好用,ss1_a,ss,22,27
2,置办年货少不了买买买，但小心别被花花绿绿的食品包装迷了眼。 挑选有诀窍，我们给你来支招。新的...,置办年货少不了买买买，但小心别被花花绿绿的食品包装迷了眼。挑选有诀窍，小编给你来支招～新的一...,ss2_a,ss,72,82
3,唐仁健：乡村建设不能搞形式主义、官僚主义,部长通道|唐仁健:我国粮食生产保持“17连丰”,ss3_a,ss,20,23
4,华谊兄弟：定增股票申请收到深交所第二轮审核问询函,精测电子(300567.SZ)申请定增股票获中证监注册批准,ss4_a,ss,24,29
...,...,...,...,...,...,...
41475,北京时间2月25日消息，2020-2021赛季西甲结束一场第一轮的补赛，巴萨主场3-0轻取埃...,3-0！巴萨在主场轻松大胜埃尔切，多赛一轮反超塞维利亚重返第三，同时距离榜首还差5分，重新看...,ll7582_b,ll,624,735
41476,中超联赛还有一个月就要打响了，每支队伍都在忙碌地筹备着新赛季的事项，深圳佳兆业队也开始了他们...,导读：现阶段关于深足的消息不可为不多，先是被爆料有意引进已经停止运营的江苏队队长吴曦，日前又...,ll7583_b,ll,1597,790
41477,3月23日0—24时，31个省（自治区、直辖市）和新疆生产建设兵团报告新增确诊病例10例，均...,中新网3月23日电据国家卫健委网站消息，3月22日0—24时，31个省(自治区、直辖市)和新...,ll7584_b,ll,1370,572
41478,【CNMO新闻】中国家电及消费电子博览会（AWE）日前在上海虹桥国家会展中心隆重开幕。作为人...,3月23日，以“新十年·智竞未来”为主题的中国家电及消费电子博览会（AWE）在上海虹桥国家会...,ll7585_b,ll,420,1460


In [5]:
test_df['task'] = test_df['id'].apply(lambda x:{'a':0,'b':1}[x.split('_')[-1]])

In [6]:
test_df

Unnamed: 0,source,target,id,type,len1,len2,task
0,龙虎榜：1.4亿资金抢筹邦讯技术机构净买这17股,2021年03月08日龙虎榜机构买入和卖出的个股一览,ss0_a,ss,24,26,0
1,意难忘：阿水仗义，宁愿去坐牢也不肯出卖蔡进炮,意难忘：有人要找王胜志的麻烦，蔡进炮的面子都不好用,ss1_a,ss,22,27,0
2,置办年货少不了买买买，但小心别被花花绿绿的食品包装迷了眼。 挑选有诀窍，我们给你来支招。新的...,置办年货少不了买买买，但小心别被花花绿绿的食品包装迷了眼。挑选有诀窍，小编给你来支招～新的一...,ss2_a,ss,72,82,0
3,唐仁健：乡村建设不能搞形式主义、官僚主义,部长通道|唐仁健:我国粮食生产保持“17连丰”,ss3_a,ss,20,23,0
4,华谊兄弟：定增股票申请收到深交所第二轮审核问询函,精测电子(300567.SZ)申请定增股票获中证监注册批准,ss4_a,ss,24,29,0
...,...,...,...,...,...,...,...
41475,北京时间2月25日消息，2020-2021赛季西甲结束一场第一轮的补赛，巴萨主场3-0轻取埃...,3-0！巴萨在主场轻松大胜埃尔切，多赛一轮反超塞维利亚重返第三，同时距离榜首还差5分，重新看...,ll7582_b,ll,624,735,1
41476,中超联赛还有一个月就要打响了，每支队伍都在忙碌地筹备着新赛季的事项，深圳佳兆业队也开始了他们...,导读：现阶段关于深足的消息不可为不多，先是被爆料有意引进已经停止运营的江苏队队长吴曦，日前又...,ll7583_b,ll,1597,790,1
41477,3月23日0—24时，31个省（自治区、直辖市）和新疆生产建设兵团报告新增确诊病例10例，均...,中新网3月23日电据国家卫健委网站消息，3月22日0—24时，31个省(自治区、直辖市)和新...,ll7584_b,ll,1370,572,1
41478,【CNMO新闻】中国家电及消费电子博览会（AWE）日前在上海虹桥国家会展中心隆重开幕。作为人...,3月23日，以“新十年·智竞未来”为主题的中国家电及消费电子博览会（AWE）在上海虹桥国家会...,ll7585_b,ll,420,1460,1


In [7]:
class MyDataset(Dataset):
    def __init__(self, dataframe):
        self.df = dataframe
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        text1 = self.df.source.values[idx]
        text2 = self.df.target.values[idx]
        
        task = self.df.task.values[idx]
        
        return text1, text2, task

In [8]:
def collate_fn(data):
    input_ids, attention_mask, token_type_ids = [], [], []
    text = tokenizer([x[0] for x in data], text_pair=[x[1] for x in data], padding='max_length', truncation=True, max_length=CFG['max_len'], return_tensors='pt')
    input_ids = text['input_ids']
    attention_mask = text['attention_mask']
    token_type_ids = text['token_type_ids']
    label = torch.LongTensor([x[2] for x in data])
    return input_ids, attention_mask, token_type_ids, label

In [9]:
class Model(nn.Module):

    def __init__(self, CFG):
        super(Model, self).__init__()
        self.bert = BertModel.from_pretrained(CFG['model'])
        self.fc1 = nn.Linear(self.bert.config.hidden_size, 2)
        self.fc2 = nn.Linear(self.bert.config.hidden_size, 2)
 
    def forward(self, input_ids, attention_mask, token_type_ids):
        text = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)[1]
        y1 = self.fc1(text)
        y2 = self.fc2(text)
        return y1, y2

In [10]:
test_set = MyDataset(test_df)
test_loader = DataLoader(test_set, batch_size=CFG['valid_bs'], collate_fn=collate_fn, shuffle=False, num_workers=CFG['num_workers'])

In [30]:
prediction = []

In [31]:
CFG = {
    'fold_num': 5,
    'seed': 42,
    'model': 'nghuyong/ernie-1.0', 
    'max_len': 512, 
    'train_bs': 32, 
    'valid_bs': 32,
    'num_workers': 0,
    'device': 0,
}
tokenizer = BertTokenizer.from_pretrained(CFG['model'])


models = []
model =  Model(CFG).to(device)

for i in range(2,3):
    model.load_state_dict(torch.load('models/{}_full_epoch_{}.pt'.format(CFG['model'].split('/')[-1], i),map_location=device))
    model.eval()
    models.append(model)


    
y_pred = []

with torch.no_grad():

    tk = tqdm(test_loader, total=len(test_loader), position=0, leave=True)
    
    for step, (input_ids, attention_mask, token_type_ids, tasks) in enumerate(tk):
        
        input_ids, attention_mask, token_type_ids = input_ids.to(device), attention_mask.to(device), token_type_ids.to(device)
        
        outputs = []
        
        for i in range(len(models)):
            output1, output2 = models[i](input_ids, attention_mask, token_type_ids)
            output = torch.zeros_like(output1)
            
            for j in range(len(tasks)):
                task = tasks[j]

                if task == 0:
                    output[j] = output1[j]
                else:
                    output[j] = output2[j]

            outputs.append(output.softmax(1)[:,1].cpu().numpy())

        output = np.mean(outputs, 0)
        
        y_pred.extend(output)

y_pred = np.array(y_pred)

prediction.append(y_pred)

100%|██████████| 1297/1297 [15:06<00:00,  1.43it/s]


In [32]:
CFG = {
    'fold_num': 5,
    'seed': 42,
    'model': 'hfl/chinese-macbert-base',
    'max_len': 512, 
    'train_bs': 32, 
    'valid_bs': 32,
    'num_workers': 0,
    'device': 0,
}
tokenizer = BertTokenizer.from_pretrained(CFG['model'])

models = []
model =  Model(CFG).to(device)

for i in range(2,3):
    model.load_state_dict(torch.load('models/{}_full_epoch_{}.pt'.format(CFG['model'].split('/')[-1], i),map_location=device))
    model.eval()
    models.append(model)


    
y_pred = []

with torch.no_grad():

    tk = tqdm(test_loader, total=len(test_loader), position=0, leave=True)
    
    for step, (input_ids, attention_mask, token_type_ids, tasks) in enumerate(tk):
        
        input_ids, attention_mask, token_type_ids = input_ids.to(device), attention_mask.to(device), token_type_ids.to(device)
        
        outputs = []
        
        for i in range(len(models)):
            output1, output2 = models[i](input_ids, attention_mask, token_type_ids)
            output = torch.zeros_like(output1)
            
            for j in range(len(tasks)):
                task = tasks[j]

                if task == 0:
                    output[j] = output1[j]
                else:
                    output[j] = output2[j]

            outputs.append(output.softmax(1)[:,1].cpu().numpy())

        output = np.mean(outputs, 0)
        
        y_pred.extend(output)

y_pred = np.array(y_pred)

prediction.append(y_pred)

100%|██████████| 1297/1297 [15:25<00:00,  1.40it/s]


In [33]:
CFG = {
    'fold_num': 5,
    'seed': 42,
    'model': 'hfl/chinese-macbert-large',
    'max_len': 512, 
    'train_bs': 16, 
    'valid_bs': 16,
    'num_workers': 0,
    'device': 0,
}
tokenizer = BertTokenizer.from_pretrained(CFG['model'])

models = []
model =  Model(CFG).to(device)

for i in range(2,3):
    model.load_state_dict(torch.load('models/{}_full_epoch_{}.pt'.format(CFG['model'].split('/')[-1], i),map_location=device))
    model.eval()
    models.append(model)


    
y_pred = []

with torch.no_grad():

    tk = tqdm(test_loader, total=len(test_loader), position=0, leave=True)
    
    for step, (input_ids, attention_mask, token_type_ids, tasks) in enumerate(tk):
        
        input_ids, attention_mask, token_type_ids = input_ids.to(device), attention_mask.to(device), token_type_ids.to(device)
        
        outputs = []
        
        for i in range(len(models)):
            output1, output2 = models[i](input_ids, attention_mask, token_type_ids)
            output = torch.zeros_like(output1)
            
            for j in range(len(tasks)):
                task = tasks[j]

                if task == 0:
                    output[j] = output1[j]
                else:
                    output[j] = output2[j]

            outputs.append(output.softmax(1)[:,1].cpu().numpy())

        output = np.mean(outputs, 0)
        
        y_pred.extend(output)

y_pred = np.array(y_pred)

prediction.append(y_pred)

100%|██████████| 1297/1297 [32:27<00:00,  1.50s/it]


In [34]:
prediction_ensemble = np.mean(prediction, 0)

In [35]:
test_df['prob'] = prediction_ensemble

In [44]:
test_df.loc[test_df['task']==0,'label'] = (test_df[test_df['task']==0]['prob']>0.44).astype('int') 
test_df.loc[test_df['task']==1,'label'] = (test_df[test_df['task']==1]['prob']>0.37).astype('int') 

In [45]:
sub = pd.read_csv('data/rematch_test_with_id/test_sample_submission.csv')
sub = pd.merge(sub[['id']],test_df)[['id','label']]

sub

Unnamed: 0,id,label
0,ll0_a,0.0
1,ll1_a,1.0
2,ll2_a,1.0
3,ll3_a,1.0
4,ll4_a,1.0
...,...,...
41475,ll7582_b,1.0
41476,ll7583_b,0.0
41477,ll7584_b,1.0
41478,ll7585_b,1.0


In [46]:
sub.to_csv('sub5.csv',index=False)