In [1]:
import numpy as np
import pandas as pd

from tqdm import tqdm, tqdm_notebook

from scipy import stats
from sklearn.model_selection import GroupKFold

import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.utils.data
from transformers import *

import os
import re
import math
import random
from matplotlib import pyplot as plt
import warnings
from math import floor, ceil

warnings.filterwarnings('ignore')
device = torch.device('cuda')
torch.backends.cudnn.benchmark=True

%matplotlib inline

In [2]:
EXP = "20200207_xlnet_10fold"

# constant
DATA_ROOT = "../input/"
RESULT = "../output/"
N_FOLD = 10
BS = 8
SEED = 42

# parameter
n_epoch = 3
learning_rate = 8e-5
max_grad_norm = 1.0

# model
pretrained_weights = "xlnet-base-cased"

# Param Dict
transformer_models_dict = {
    'xlnet-base-cased': (XLNetForSequenceClassification, XLNetTokenizer)
}

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

def spearman_corr(y_true, y_pred):
    if np.ndim(y_pred) == 2:
        corr = np.nan_to_num([stats.spearmanr(y_true[:, i], y_pred[:, i])[0] for i in range(y_true.shape[1])]).mean()
    else:
        corr = stats.spearmanr(y_true, y_pred)[0]
    return corr
  
def calc_each_spearman(valid_y, valid_pred):
    lst = []
    for idx in range(30):
        spearman = spearman_corr(valid_y[:,idx], valid_pred[:,idx])
        lst.append(spearman)
    df = pd.DataFrame(lst).T
    df.columns = class_names
    return df

In [4]:
def _get_segments(tokens, max_seq_length):
    """Segments: 0 for the first sequence, 1 for the second"""
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    first_sep = True
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        if token == "<sep>":
            if first_sep:
                first_sep = False 
            else:
                current_segment_id = 1
    return [0] * (max_seq_length - len(tokens)) + segments

def _get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab"""
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids =  [5] * (max_seq_length-len(token_ids)) + token_ids
    return input_ids

def _trim_input(title, question, answer, max_sequence_length=512-1, 
                t_max_len=70-1, q_max_len=219, a_max_len=219):

    t = tokenizer.tokenize(title)
    q = tokenizer.tokenize(question)
    a = tokenizer.tokenize(answer)
    
    t_len = len(t)
    q_len = len(q)
    a_len = len(a)

    if (t_len+q_len+a_len+4) > max_sequence_length:
        
        if t_max_len > t_len:
            t_new_len = t_len
            a_max_len = a_max_len + floor((t_max_len - t_len)/2)
            q_max_len = q_max_len + ceil((t_max_len - t_len)/2)
        else:
            t_new_len = t_max_len
      
        if a_max_len > a_len:
            a_new_len = a_len 
            q_new_len = q_max_len + (a_max_len - a_len)
        elif q_max_len > q_len:
            a_new_len = a_max_len + (q_max_len - q_len)
            q_new_len = q_len
        else:
            a_new_len = a_max_len
            q_new_len = q_max_len
            
            
        if t_new_len+a_new_len+q_new_len+4 != max_sequence_length:
            raise ValueError("New sequence length should be %d, but is %d" 
                             % (max_sequence_length, (t_new_len+a_new_len+q_new_len+4)))

        t = t[:t_new_len]
        q = norm_token_length(q, q_new_len)
        a = norm_token_length(a, a_new_len)
    
    return t, q, a

def norm_token_length(tokens, l):
    if len(tokens) > l:
        head = l//2
        tail = l - head
        return tokens[:head] + tokens[-tail:]
    else:
        return tokens[:l]

def _convert_to_bert_inputs(title, question, answer, cate, max_sequence_length=512):
    """Converts tokenized input to ids, masks and segments for BERT"""
    stoken = [cate] + title + ["<sep>"] + question + ["<sep>"] + answer + ["<sep>", "<cls>"]

    input_ids = _get_ids(stoken, tokenizer, max_sequence_length)
    input_segments = _get_segments(stoken, max_sequence_length)
    
    try:
      cls_index = input_segments.index(5) - 1
    except ValueError:
      cls_index = -1
    input_segments[cls_index] = 2

    return [input_ids, input_segments]

def convert_row(row):
    c = f"[{row['category']}]"
    t, q, a = row["question_title"], row["question_body"], row["answer"]
    t, q, a = _trim_input(t, q, a)
    ids, segments = _convert_to_bert_inputs(t, q, a, c)
    return np.array([[ids, segments]])

In [5]:
train = pd.read_csv(DATA_ROOT + '/train.csv').fillna(' ')
sub = pd.read_csv(DATA_ROOT + '/sample_submission.csv').fillna(' ')

model_class, tokenizer_class = transformer_models_dict[pretrained_weights]
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)

categories = train["category"].unique().tolist()
categories = [f"[{c}]" for c in categories]
tokenizer.add_tokens(categories)

tokenizer.added_tokens_encoder

{'[LIFE_ARTS]': 32000,
 '[CULTURE]': 32001,
 '[SCIENCE]': 32002,
 '[STACKOVERFLOW]': 32003,
 '[TECHNOLOGY]': 32004}

In [6]:
%%time
X = train.apply(convert_row, axis=1).values
X = np.vstack(X).reshape((len(X), 1024))
assert X.shape == (6079, 1024)

CPU times: user 20.5 s, sys: 64.1 ms, total: 20.6 s
Wall time: 20.6 s


In [7]:
class_names = list(sub.columns[1:])
y = train[class_names].values

lst = []
for idx in range(30):
    t = pd.DataFrame(y[:,idx])[0]
    w_df = (1-t.value_counts()/len(t)).reset_index()
    w_dic = {row["index"]: row[0] for _, row in w_df.iterrows()}
    w = t.map(w_dic).values
    lst.append(w)
weights = np.vstack(lst).T

import copy
y_true = copy.deepcopy(y)
y = np.hstack([y, weights])

In [8]:
def custom_loss(data, targets):
    mse = nn.MSELoss(reduction="none")(data[:,:30].sigmoid(), targets[:,:30])
    bce = nn.BCEWithLogitsLoss(reduction='none')(data[:,:30], targets[:,:30])
    w =  targets[:,30:]
    loss = (mse*w).sum() + bce.sum()
    return loss

class CustomXLNet(XLNetForSequenceClassification):
    def __init__(self, config):
        super(CustomXLNet, self).__init__(config)  
        config.num_labels = N_BERT_LABEL
        config.output_hidden_states = True
        self.n_use_layer = 4
        self.n_labels = config.num_labels
        self.transformer = XLNetModel(config)

        self.dense1 = nn.Linear(768*self.n_use_layer, 768*self.n_use_layer)
        self.dense2 = nn.Linear(768*self.n_use_layer, 768*self.n_use_layer)
        self.logits_proj = nn.Linear(768*self.n_use_layer, config.num_labels)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        mems=None,
        perm_mask=None,
        target_mapping=None,
        token_type_ids=None,
        input_mask=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):

        transformer_outputs = self.transformer(
            input_ids,
            attention_mask=attention_mask,
            mems=mems,
            perm_mask=perm_mask,
            target_mapping=target_mapping,
            token_type_ids=token_type_ids,
            input_mask=input_mask,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        output = torch.cat([transformer_outputs[1][-1*(i+1)][:,-1] for i in range(self.n_use_layer)], dim=1)
        output = self.dense1(output)
        output = self.dense2(output)
        
        logits = self.logits_proj(output)

        outputs = (logits,) + transformer_outputs[1:]

        return outputs

In [None]:
N_BERT_LABEL = 30
gkf = GroupKFold(n_splits=N_FOLD).split(X=train["question_body"], groups=train["question_body"])

spearman_scores = []
best_spearman_lst = []
losses_lst = []
epoch_spearman_lst = []
lr_lst_lst = []
each_speaman_dfs = []
for fold, (train_idx, valid_idx) in enumerate(gkf):
  if fold in [0, 1]:
    continue

  seed_everything(SEED)

  # Load Model
  config = XLNetConfig.from_pretrained(pretrained_weights)
  model = CustomXLNet.from_pretrained(pretrained_weights, config=config)

  model = model.to(device)
  model.resize_token_embeddings(len(tokenizer))
  model = model.train()
  
  # optimizer setting
  param_optimizer = list(model.named_parameters())
  no_decay = ['layer_norm.weight', 'bias', 'gamma', 'beta']
  optimizer_grouped_parameters = []
  max_lrs = []
  for param in param_optimizer:
    if any(n in param[0] for n in no_decay):
      weight_decay = 0.0
    else:
      weight_decay = 0.01
    if param[0].find("transformer.layer") != -1:
      n_diff_last = 11 - int(param[0].split(".")[2])
      lr = learning_rate*0.9**n_diff_last
    elif "embeddings" in param[0]:
      lr = learning_rate*0.9**11
    else:
      lr = learning_rate
    max_lrs.append(lr)
    d = {"params": param[1], "weight_decay": weight_decay}
    optimizer_grouped_parameters.append(d)
  optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, correct_bias=True)

  # train valid split
  train_x = X[train_idx]
  valid_x = X[valid_idx]
  train_y = y[train_idx]
  valid_y = y[valid_idx]
  
  # set loader  
  train_dataset = torch.utils.data.TensorDataset(torch.tensor(train_x, dtype=torch.long), torch.tensor(train_y, dtype=torch.float))
  train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BS, shuffle=True)
  valid_dataset = torch.utils.data.TensorDataset(torch.tensor(valid_x, dtype=torch.long), torch.tensor(valid_y, dtype=torch.float))
  valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=BS, shuffle=False)

  # set schedueler
  num_training_steps = len(train_loader)*n_epoch
  scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=max_lrs, total_steps=num_training_steps)

  model.zero_grad()
  optimizer.zero_grad()
    
  best_spearman = 0
  losses = []
  epoch_spearman = []
  lr_lst = []
  for epoch in range(n_epoch):
    lr = np.array([param_group["lr"] for param_group in optimizer.param_groups]).mean()
    tk0 = tqdm_notebook(enumerate(train_loader), total=len(train_loader), leave=False)
    for i, (x_batch, y_batch) in tk0:
      input_ids = x_batch[:, :512]
      token_ids = x_batch[:, 512:]
      y_pred = model(input_ids.to(device), attention_mask=(input_ids != 5).int().to(device), token_type_ids=token_ids.to(device))
      loss = custom_loss(y_pred[0], y_batch.to(device))
      loss.backward()
      torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm) 
      optimizer.step()
      optimizer.zero_grad()
      scheduler.step()
      lr_lst.append(np.array([param_group["lr"] for param_group in optimizer.param_groups]).mean())
      losses.append(float(loss))

    # epoch validation
    for param in model.parameters():
      param.requires_grad=False
    model.eval()

    lst = []
    sum_loss = 0
    for i, (x_batch, y_batch)  in enumerate(valid_loader):
      input_ids = x_batch[:, :512]
      token_ids = x_batch[:, 512:]
      with torch.no_grad():
        y_pred = model(input_ids.to(device), attention_mask=(input_ids != 5).int().to(device), token_type_ids=token_ids.to(device))
        loss = custom_loss(y_pred[0], y_batch.to(device))

      lst.append(y_pred[0].sigmoid().cpu().squeeze().numpy())
      sum_loss += loss.cpu().squeeze().numpy()
    valid_pred = np.vstack(lst)
    ave_loss = sum_loss/len(valid_loader)

    spearman_score = spearman_corr(valid_y[:,:N_BERT_LABEL], valid_pred)  
    epoch_spearman.append(spearman_score)
    
    for param in model.parameters():
      param.requires_grad=True
    model.train()
        
    if best_spearman <= spearman_score:
      torch.save(model.state_dict(), f"{pretrained_weights}_f{fold}_best")
      best_spearman = spearman_score
      each_speaman_df = calc_each_spearman(valid_y[:,:N_BERT_LABEL], valid_pred)
      display(each_speaman_df)

    print(f"fold-{fold} epoch {epoch}: {spearman_score} / loss avg: {ave_loss}")
    
  best_spearman_lst.append(best_spearman)
  losses_lst.append(losses)
  epoch_spearman_lst.append(epoch_spearman)
  lr_lst_lst.append(lr_lst)
  each_speaman_dfs.append(each_speaman_df)

  torch.cuda.empty_cache()

HBox(children=(FloatProgress(value=0.0, max=684.0), HTML(value='')))



Unnamed: 0,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,question_not_really_a_question,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,0.330455,0.431154,0.36326,0.206197,0.211705,0.408385,0.044134,0.413786,0.483175,0.041417,...,0.480429,0.129866,0.325859,0.101423,0.129613,0.263596,0.694358,0.196922,0.63164,0.144523


fold-2 epoch 0: 0.3253452203244118 / loss avg: 113.55420032300447


HBox(children=(FloatProgress(value=0.0, max=684.0), HTML(value='')))

In [None]:
out_path = f"{RESULT}/{EXP}/"
!mkdir out_path

model.save_pretrained(out_path)
tokenizer.save_pretrained(out_path)

pd.DataFrame(best_spearman_lst, columns=["best_spearman"]).to_csv(f"{out_path}/best_spearman.csv")

for fold, (best_spearman,losses, epoch_spearman, lr_lst) in enumerate(zip(best_spearman_lst, losses_lst, epoch_spearman_lst, lr_lst_lst)):
    print(f"Fold-{fold} Best Spearman: {best_spearman}")

    pd.DataFrame(losses, columns=["loss"]).plot();
    plt.savefig(f'{out_path}/loss_f{fold}.png')
    plt.show()

    pd.DataFrame(epoch_spearman, columns=["spearman"]).plot();
    plt.savefig(f'{out_path}/spearman_f{fold}.png')
    plt.show()

    pd.DataFrame(lr_lst, columns=["lr"]).plot();
    plt.savefig(f'{out_path}/lr_f{fold}.png')
    plt.show()