In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np 
import pandas as pd
import datetime
import pkg_resources
import seaborn as sns
import time
import scipy.stats as stats
import gc
import re
import os
import operator 
import sys
import logging
import shutil
import pickle
import torch
import torch.nn as nn
import torch.utils.data
import torch.nn.functional as F
import warnings
warnings.filterwarnings(action='once')
%load_ext autoreload
%autoreload 2
%matplotlib inline
from apex import amp
from sklearn import metrics
from sklearn import model_selection
from nltk.stem import PorterStemmer
from sklearn.metrics import roc_auc_score
from tqdm import tqdm, tqdm_notebook
from IPython.core.interactiveshell import InteractiveShell
from pytorch_pretrained_bert import BertConfig
from pytorch_pretrained_bert import convert_tf_checkpoint_to_pytorch
from pytorch_pretrained_bert import BertTokenizer, BertAdam
from pytorch_pretrained_bert.modeling import BertPreTrainedModel, BertModel
InteractiveShell.ast_node_interactivity = "all"

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  _config = json.load(open(_config_path))
  return f(*args, **kwds)


# 配置信息

In [3]:
# Runtime settings and hyper-parameters shared by the cells below.
device = torch.device('cuda')    # target device for batches moved with .to(device)
MAX_SEQUENCE_LENGTH = 220        # tokens per comment, including [CLS] and [SEP]
SEED = 567                       # seed applied to the numpy and torch RNGs
EPOCHS = 2                       # passes over train_loader in the training loop
lr = 1e-5                        # learning rate handed to BertAdam
batch_size = 48                  # training batch size
Data_dir = "../data"             # directory holding train.csv and test.csv
BERT_MODEL_PATH = '../models/uncased_L-12_H-768_A-12/'  # source of the BertTokenizer vocabulary

# 读入数据

In [4]:
# Load the training split from train.csv inside Data_dir.
train_df = pd.read_csv(os.path.join(Data_dir,"train.csv"))

# 模型

In [5]:
# bert后连接分类层
class BertForSequenceClassification(BertPreTrainedModel):
    """BERT encoder followed by two small feed-forward heads.

    The pooled BERT output is fed through two parallel
    ``Linear -> ReLU -> Dropout`` branches. The first branch produces a single
    "main" logit, the second produces ``num_labels - 1`` auxiliary logits; the
    two are concatenated into a ``(batch, num_labels)`` logits tensor.

    Args:
        config: BERT configuration; ``config.hidden_size`` sizes the heads.
        num_labels: total number of output logits (1 main + auxiliary ones).
    """

    def __init__(self, config, num_labels):
        super(BertForSequenceClassification, self).__init__(config)
        self.num_labels = num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(0.15)
        # Two independent hidden projections: one per output head.
        self.linear1 = nn.Linear(config.hidden_size, config.hidden_size//2)
        self.linear2 = nn.Linear(config.hidden_size, config.hidden_size//2)
        self.linear_out = nn.Linear(config.hidden_size//2, 1)
        self.linear_aux_out = nn.Linear(config.hidden_size//2, num_labels-1)
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        """Run the encoder and both heads.

        Returns:
            The ``(batch, num_labels)`` logits when ``labels`` is None,
            otherwise the cross-entropy loss of those logits against ``labels``.
        """
        _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
        h_conc_linear1 = self.dropout(F.relu(self.linear1(pooled_output)))
        h_conc_linear2 = self.dropout(F.relu(self.linear2(pooled_output)))

        result = self.linear_out(h_conc_linear1)
        aux_result = self.linear_aux_out(h_conc_linear2)
        logits = torch.cat([result, aux_result], 1)

        if labels is not None:
            # `CrossEntropyLoss` is not imported as a bare name in this notebook;
            # reference it through `nn` so this branch does not raise NameError.
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            return loss
        else:
            return logits

# 按照评测要求使用修改的loss

In [6]:
# Auxiliary targets used alongside the main binary target.
y_aux_train = train_df[['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']]
identity_columns = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness']

# Boolean masks reused by the weight terms below (missing identity scores count as 0).
identity_values = train_df[identity_columns].fillna(0).values
is_toxic = train_df['target'].values >= 0.5
is_non_toxic = train_df['target'].values < 0.5
has_identity = (identity_values >= 0.5).sum(axis=1).astype(bool)
# True when AT LEAST ONE identity column is below 0.5.
# NOTE(review): this is true for almost every row, so the "Background Positive,
# Subgroup Negative" term is close to `is_toxic` alone - confirm this is intended.
lacks_some_identity = (identity_values < 0.5).sum(axis=1).astype(bool)

# Each term contributes 1/4 to the per-sample weight.
# `np.int` was removed in NumPy 1.24; the builtin `int` is the same dtype.

# Overall
weights = np.ones((len(train_df),)) / 4

# Subgroup
weights += has_identity.astype(int) / 4

# Background Positive, Subgroup Negative
weights += (is_toxic & lacks_some_identity).astype(int) / 4

# Background Negative, Subgroup Positive
weights += (is_non_toxic & has_identity).astype(int) / 4

# Scale factor applied to the weighted main loss in custom_loss.
loss_weight = 1.0 / weights.mean()

# 训练过程中使用focal loss和自定义的loss

In [7]:
def focal_loss_sigmoid(logit, labels, weights, gamma=2):
    """Per-sample weighted binary focal loss computed from raw logits.

    Args:
        logit: raw (pre-sigmoid) scores, one per row.
        labels: targets with the same shape as ``logit``.
        weights: per-sample weights; reshaped to ``(len(weights), 1)``.
        gamma: focusing exponent applied to the mis-classification probability.

    Returns:
        Scalar tensor: the summed loss divided by ``len(labels)``.

    Differences from a naive ``log(sigmoid(x))`` formulation:
      * log-probabilities come from ``logsigmoid`` so a saturated sigmoid
        cannot yield ``0 * log(0) = NaN``;
      * tensors follow ``logit.device`` instead of being forced onto CUDA.
    """
    y_pred = torch.sigmoid(logit)
    log_p = F.logsigmoid(logit)          # log(sigmoid(x))
    log_not_p = F.logsigmoid(-logit)     # log(1 - sigmoid(x))

    # Detached copy of the weights: they scale the loss but carry no gradient.
    sample_w = weights.detach().reshape(len(weights), 1).to(logit.device)
    # Rows whose label is exactly 0 use (1 - w) as alpha, so the negative term
    # below, which multiplies by (1 - alpha), ends up weighted by w.
    alpha = torch.where(labels == 0, 1 - sample_w, sample_w)

    L = -labels * alpha * ((1 - y_pred) ** gamma) * log_p - \
        (1 - labels) * (1 - alpha) * (y_pred ** gamma) * log_not_p
    return L.sum() / len(labels)

In [8]:
def custom_loss(data, targets):
    """Total training loss: weighted focal loss on the main logit plus BCE on the rest.

    ``data`` holds the model logits (column 0 is the main logit, the remaining
    columns are auxiliary). ``targets`` holds, in order: the main label
    (column 0), the per-sample weight (column 1) and the auxiliary targets
    (columns 2 onwards).
    """
    main_logit, aux_logits = data[:, :1], data[:, 1:]
    main_label = targets[:, :1]
    sample_weight = targets[:, 1:2]
    aux_targets = targets[:, 2:]

    main_term = focal_loss_sigmoid(main_logit, main_label, sample_weight)
    aux_criterion = nn.BCEWithLogitsLoss()
    aux_term = aux_criterion(aux_logits, aux_targets)
    return (main_term * loss_weight) + aux_term

# 将文本转换成BERT向量

In [9]:
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None, do_lower_case=True)


def convert_lines(example, max_seq_length, tokenizer):
    """Tokenise each text into a fixed-length row of BERT token ids.

    Every row is ``[CLS] + tokens + [SEP]`` followed by zero padding, for a
    total of ``max_seq_length`` ids. Texts with more tokens than fit are cut
    off; the number of truncated texts is printed once at the end.

    Returns:
        A numpy array with one row of token ids per input text.
    """
    token_budget = max_seq_length - 2   # room left after [CLS] and [SEP]
    truncated = 0
    rows = []
    for text in tqdm_notebook(example):
        pieces = tokenizer.tokenize(text)
        if len(pieces) > token_budget:
            pieces = pieces[:token_budget]
            truncated += 1
        ids = tokenizer.convert_tokens_to_ids(["[CLS]"] + pieces + ["[SEP]"])
        padding = [0] * (token_budget - len(pieces))
        rows.append(ids + padding)
    print(truncated)
    return np.array(rows)

In [10]:
print('loaded %d records' % len(train_df))
# Fill missing comments BEFORE casting to str: astype(str) turns NaN into the
# literal string "nan", after which a fillna("DUMMY_VALUE") can never match.
# Filling first gives the same placeholder the test-set cell uses.
train_df['comment_text'] = train_df['comment_text'].fillna("DUMMY_VALUE").astype(str)
sequences = convert_lines(train_df["comment_text"], MAX_SEQUENCE_LENGTH, tokenizer)
# Remaining missing values (e.g. identity scores) are treated as 0 from here on.
train_df = train_df.fillna(0)

loaded 1804874 records


# 构造训练数据

In [12]:
x_train = sequences
# Binarise the fractional target at 0.5 and pair it with the per-sample weight.
# (`np.int` was removed in NumPy 1.24; the builtin `int` is the same dtype.)
y_train = np.vstack([(train_df['target'].values >= 0.5).astype(int), weights]).T
x_train_torch = torch.tensor(x_train, dtype=torch.long)
# Column layout: [binary target, sample weight, *y_aux_train columns].
y_train_torch = torch.tensor(np.hstack([y_train, y_aux_train]), dtype=torch.float)

# 分桶加快训练

In [13]:
from torch.utils import data
from tqdm import tqdm_notebook as tqdm

class LenMatchBatchSampler(data.BatchSampler):
    """Batch sampler that groups indices by the number of zeros in each sample.

    For every index drawn from the wrapped sampler, the zeros in the first
    tensor of ``self.sampler.data_source[idx]`` are counted (presumably the
    padding of a token-id row - verify against the dataset). Indices whose
    zero count falls into the same ``BUCKET_WIDTH``-wide range share a bucket,
    and a bucket is emitted as a batch as soon as it holds ``batch_size``
    indices. Whatever is left at the end is emitted in mixed batches.
    """

    NUM_BUCKETS = 100   # zero counts up to NUM_BUCKETS * BUCKET_WIDTH - 1 are supported
    BUCKET_WIDTH = 64   # zero-count range covered by one bucket

    def __iter__(self):
        # One independent list per bucket. (`[[]] * n` would alias a single
        # list across all slots.)
        buckets = [[] for _ in range(self.NUM_BUCKETS)]
        yielded = 0

        for idx in self.sampler:
            count_zeros = torch.sum(self.sampler.data_source[idx][0] == 0)
            bucket_id = int(count_zeros) // self.BUCKET_WIDTH
            buckets[bucket_id].append(idx)

            if len(buckets[bucket_id]) == self.batch_size:
                yield list(buckets[bucket_id])
                yielded += 1
                buckets[bucket_id] = []

        # Flush partially filled buckets, in bucket order, into mixed batches.
        batch = []
        leftover = [idx for bucket in buckets for idx in bucket]

        for idx in leftover:
            batch.append(idx)
            if len(batch) == self.batch_size:
                yielded += 1
                yield batch
                batch = []

        if len(batch) > 0 and not self.drop_last:
            yielded += 1
            yield batch

        # Sanity check against BatchSampler.__len__.
        assert len(self) == yielded, "produced an inccorect number of batches. expected %i, but yielded %i" %(len(self), yielded)

def trim_tensors(tsrs):
    """Cut trailing all-padding columns off a batch of tensors.

    The first tensor is treated as a zero-padded 2-D matrix: the largest
    per-row count of non-zero entries gives the number of columns to keep.
    Only tensors that have the same number of columns as the first one are
    sliced; tensors of any other width (e.g. a label matrix) are returned
    untouched. Previously every tensor was sliced, which silently dropped
    label columns whenever the longest sequence was shorter than the label
    width.

    Args:
        tsrs: sequence of tensors; ``tsrs[0]`` must be 2-D.

    Returns:
        A list of tensors when trimming happened, otherwise ``tsrs`` unchanged
        (batches whose longest row has 2 or fewer non-zero entries are left
        as they are).
    """
    seq_width = tsrs[0].shape[1]
    max_len = int(torch.max(torch.sum((tsrs[0] != 0), 1)))
    if max_len > 2:
        tsrs = [
            tsr[:, :max_len] if tsr.dim() > 1 and tsr.shape[1] == seq_width else tsr
            for tsr in tsrs
        ]
    return tsrs

# Pair each token-id row with its target row.
train_dataset = data.TensorDataset(x_train_torch, y_train_torch)

# Indices are drawn in random order, then grouped by LenMatchBatchSampler.
ran_sampler = data.RandomSampler(train_dataset)
len_sampler = LenMatchBatchSampler(
    ran_sampler,
    batch_size=batch_size,
    drop_last=False,
)
train_loader = data.DataLoader(train_dataset, batch_sampler=len_sampler)

# 训练模型

In [20]:
# Adjust to the GPUs available in your environment.
# NOTE(review): this only takes effect if CUDA has not yet been initialised in
# this process - confirm nothing touched the GPU in an earlier cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3,4"

accumulation_steps=1
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

model = BertForSequenceClassification.from_pretrained("../working", cache_dir=None, num_labels=7)
model.cuda()
model.zero_grad()

param_optimizer = list(model.named_parameters())

# Layer-wise learning rate setting: the base rate `lr` is multiplied by 0.95
# for every encoder layer further away from the output. Embeddings share the
# rate of encoder layer 0; everything else uses the full base rate.
params_embeddings = {'params':[], 'lr': lr * (0.95**11)}
params_layers = [{'params':[],'lr': lr * (0.95**(11-i))} for i in range(0,12)]
params_others = {'params':[], 'lr': lr * (0.95**0)}
for name, para in param_optimizer:
    if 'embedding' in name:
        params_embeddings['params'].append(para)
    elif 'layer' in name:
        # The first number in the parameter name is used as the encoder layer index.
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        layer_num = int(re.findall(r'\d+', name)[0])
        params_layers[layer_num]['params'].append(para)
    else:
        params_others['params'].append(para)
optimizer_grouped_parameters = [params_embeddings] + params_layers + [params_others]

num_train_optimization_steps = int(EPOCHS*len(train_dataset)/batch_size/accumulation_steps)
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=lr,
                     weight_decay=0.0005,
                     warmup=0.05,
                     t_total=num_train_optimization_steps)

model, optimizer = amp.initialize(model, optimizer, opt_level="O1",verbosity=0)
model = nn.DataParallel(model, device_ids=[0,1,2])
# Resume from a previously saved state dict; it is loaded into the
# DataParallel wrapper, not the bare model.
model.load_state_dict(torch.load("../pc/zy/epoch_add_" + str(0) + "_bert_pytorch_state.bin"))

<torch._C.Generator at 0x7f4f0b3452b0>

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): FusedLayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): FusedLayerNorm(torch.Size([768]), eps=1e-12, eleme

In [21]:
# Put the model in training mode and mark every parameter as trainable.
model = model.train()
for param in model.parameters():
    param.requires_grad = True

In [22]:
tq = tqdm_notebook(range(EPOCHS))
try:
    for epoch in tq:
        avg_loss = 0.
        avg_accuracy = 0.
        lossf = None  # exponentially smoothed loss shown on the inner progress bar
        optimizer.zero_grad()

        train_iter = tqdm_notebook(enumerate(train_loader),total=len(train_loader),leave=False)
        for i, batch in train_iter:
            # Epoch 0 trains on batches 0..11999 only; later epochs skip
            # batches 0..12000 and train on the remainder.
            # NOTE(review): train_loader draws indices through a RandomSampler,
            # so the batches skipped in later epochs are not the ones seen in
            # epoch 0 - confirm this is the intended resume behaviour.
            if i>=12000 and epoch==0:
                break
            if i<=12000 and epoch>0:
                continue
            # Trim only batches that are actually used.
            tsrs = trim_tensors(batch)
            x_batch, y_batch = tuple(t.to(device) for t in tsrs)
            y_pred = model(x_batch, attention_mask=(x_batch>0), labels=None)
            loss = custom_loss(y_pred, y_batch)
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            if (i+1) % accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
            loss_value = loss.item()
            # `is None` rather than truthiness: a smoothed loss of exactly 0.0
            # must not restart the running average.
            if lossf is None:
                lossf = loss_value
            else:
                lossf = 0.98*lossf + 0.02*loss_value
            train_iter.set_postfix(loss = lossf)
            # Averages are normalised by the full loader length, including skipped batches.
            avg_loss += loss_value / len(train_loader)
            avg_accuracy += torch.mean(((torch.sigmoid(y_pred[:,0])>=0.5) == (y_batch[:,0]>=0.5)).to(torch.float)).item()/len(train_loader)
        tq.set_postfix(avg_loss=avg_loss,avg_accuracy=avg_accuracy)
finally:
    # Runs on normal completion and on interruption, so a checkpoint is always written.
    print("always excute")
    torch.save(model.state_dict(), "../pc/zy/new_epoch_add_" + str(epoch) + "_bert_pytorch_state.bin")

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, max=37602), HTML(value='')))

HBox(children=(IntProgress(value=0, max=37602), HTML(value='')))

always excute


# 保存模型

In [15]:
# Serialise the whole `model` object (at this point the nn.DataParallel wrapper), not just its weights.
torch.save(model, '../pc/zy/new_bert_pytorch.bin')

  "type " + obj.__name__ + ". It won't be checked "


In [16]:
# Save only the parameters/buffers (state dict) of `model`.
torch.save(model.state_dict(), '../pc/zy/new_bert_pytorch_dict.bin')

# 读入并处理测试集数据

In [17]:
# Freeze every parameter and switch the model to evaluation mode for inference.
for param in model.parameters():
    param.requires_grad = False
model.eval()

# Load the test split from test.csv inside Data_dir.
test_df = pd.read_csv(os.path.join(Data_dir,"test.csv"))

DataParallel(
  (module): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): FusedLayerNorm(torch.Size([768]), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
           

In [18]:
# Rebuild the tokenizer and turn the test comments into padded token-id rows;
# missing comments are replaced by a placeholder string first.
tokenizer = BertTokenizer.from_pretrained(
    BERT_MODEL_PATH,
    cache_dir=None,
    do_lower_case=True,
)
test_sequences = convert_lines(
    test_df["comment_text"].fillna("DUMMY_VALUE"),
    MAX_SEQUENCE_LENGTH,
    tokenizer,
)

HBox(children=(IntProgress(value=0, max=97320), HTML(value='')))


2191


In [19]:
# Batch size for inference; also used to compute each batch's slot in test_preds.
TEST_BATCH_SIZE = 512

test_preds = np.zeros((len(test_sequences)))
test = torch.utils.data.TensorDataset(torch.tensor(test_sequences, dtype=torch.long))
test_loader = torch.utils.data.DataLoader(test, batch_size=TEST_BATCH_SIZE, shuffle=False)

tk0 = tqdm_notebook(test_loader)
# No gradients are needed for prediction.
with torch.no_grad():
    for i, (x_batch,) in enumerate(tk0):
        x_batch = x_batch.to(device)
        pred = model(x_batch, attention_mask=(x_batch>0), labels=None)
        # Keep only the main logit (column 0); the sigmoid is applied later.
        start = i * TEST_BATCH_SIZE
        test_preds[start:start + len(x_batch)] = pred[:,0].detach().cpu().numpy()

HBox(children=(IntProgress(value=0, max=191), HTML(value='')))




# 测试并保存submission文件

In [20]:
# Map the stored logits to probabilities and write them next to the test ids.
test_preds_sigmoid = torch.sigmoid(torch.tensor(test_preds)).numpy()
submission = pd.DataFrame({
    'id': test_df['id'],
    'prediction': test_preds_sigmoid,
})
submission.to_csv(
    '../pc/zy/new_toxic_alldata_epoch_4_submission.csv',
    index=False,
)

# 模型集成

In [21]:
import pandas as pd
import numpy as np
# Prediction columns of the two submissions to be blended (BERT first, LSTM second).
result_bert = np.array(
    pd.read_csv('../out/toxic_nopredata_epoch_0.7_newloss/submission.csv')['prediction']
)
result_lstm = np.array(
    pd.read_csv('../out/lstm_ensemble_6/submission.csv')['prediction']
)
result_list = [result_bert, result_lstm]

In [22]:
# Unweighted blend: element-wise mean across the arrays in result_list (displayed, not stored).
np.mean(result_list, axis=0)

In [23]:
# Weighted blend: 0.6 for the first entry of result_list (result_bert), 0.4 for the second (result_lstm); displayed, not stored.
np.average(result_list, axis=0, weights=[0.6,0.4])