In [1]:
# Install from kaggle datasets for code competition

# Install transformers 2.3.0
! pip install -q ../input/sacremoses/sacremoses-master/
! pip install -q ../input/transformers-2-3-0/

# Install Nvidia Apex
! pip install -q -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" /kaggle/input/nvidia-apex/apex-880ab92

  cmdoptions.check_install_build_global(options)
Processing /kaggle/input/nvidia-apex/apex-880ab92
Skipping bdist_wheel for apex, due to binaries being disabled for it.
Installing collected packages: apex
  Running setup.py install for apex ... [?25l- \ | / - \ | / - \ | / - \ | / - \ | / - done
[?25hSuccessfully installed apex-0.1


In [2]:
from __future__ import absolute_import, division, print_function

import os, sys, re, gc, pickle, operator, shutil, copy, random
import time, datetime
from collections import namedtuple

from math import floor, ceil
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.utils import shuffle

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, Dataset, DataLoader, Sampler

from apex import amp
import transformers
from transformers import *

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

print(f'torch version: {torch.__version__}')
print(f'transformers version: {transformers.__version__}')

torch version: 1.2.0
transformers version: 2.3.0


# Configuration

In [3]:
DATA_DIR = "/kaggle/input/google-quest-challenge/"
VOCAB_PATH = '/kaggle/input/bert-pretrained-models/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12/'
MODEL_PATH = '/kaggle/input/bert-pytorch/'
output_model_file = "quest_bert_models.pt"

SEED = 2019
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

MAX_SEQUENCE_LENGTH = 512
batch_size = 8
grad_accumulation_steps = 2

epochs_for_sched = 3
checkpoint_iter = 470
lr = 1e-4
warmup_proportion = 0.2

In [4]:
# Seed for randomness in pytorch
def seed_everything(seed=SEED):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything()

# Datasets

In [5]:
def trim_and_tokenize(title, question, answer, max_sequence_length, tokenizer,
                      trunc_mode='head', t_max_len=18, q_max_len=245, a_max_len=244):
    """
    trunc_mode:
    - head: truncate sequence from head
    - tail: truncate sequence from tail
    - mix: concatenate truncated sequences from head and tail 6:4 for each
    """
    
    assert trunc_mode in {"head", "tail", "mix"}
    need_trunc = False

    tq_sep = tokenizer.tokenize("Details:")
    t = tokenizer.tokenize(title)
    q = tokenizer.tokenize(question)
    a = tokenizer.tokenize(answer)
    
    t_len = len(t)
    q_len = len(q)
    a_len = len(a)

    if (t_len+q_len+a_len+5) > max_sequence_length:
        need_trunc = True
        
        if t_max_len > t_len:
            t_new_len = t_len
            a_max_len = a_max_len + floor((t_max_len - t_len)/2)
            q_max_len = q_max_len + ceil((t_max_len - t_len)/2)
        else:
            t_new_len = t_max_len
      
        if a_max_len > a_len:
            a_new_len = a_len 
            q_new_len = q_max_len + (a_max_len - a_len)
        elif q_max_len > q_len:
            a_new_len = a_max_len + (q_max_len - q_len)
            q_new_len = q_len
        else:
            a_new_len = a_max_len
            q_new_len = q_max_len
            
        if t_new_len+a_new_len+q_new_len+5 != max_sequence_length:
            raise ValueError("New sequence length should be %d, but is %d" 
                             % (max_sequence_length, (t_new_len+a_new_len+q_new_len+5)))
        
        if trunc_mode == "head":
            t = t[:t_new_len]
            q = q[:q_new_len]
            a = a[:a_new_len]
        if trunc_mode == "tail":
            t = t[-t_new_len:]
            q = q[-q_new_len:]
            a = a[-a_new_len:]
        if trunc_mode == "mix":
            def trunc_seq(seq, seq_max_len, trunc_ratio=0.6):
                maj_len = int(seq_max_len * trunc_ratio)
                return seq[:maj_len] + seq[-(seq_max_len-maj_len):]
            t = trunc_seq(t, t_new_len)
            q = trunc_seq(q, q_new_len)
            a = trunc_seq(a, a_new_len)
    
    return t, tq_sep, q, a, need_trunc


# Tokenizing the lines to BERT token
def convert_lines(df, columns, max_sequence_length, tokenizer, trunc_mode='head',
                  misc_trunc=False, target=None, sample_weighting=False):
    """
    trunc_mode:
    - head: truncate sequence from head
    - tail: truncate sequence from tail
    - mix: concatenate truncated sequences from head and tail 6:4 for each
    
    misc_trunc: use miscellaneous truncation or not
    
    sample_weighting: when misc_trunc=True, weighting on different truncated samples
    """
    
    all_tokens = []
    segment_ids = []   # representing segmentation of sentence A and B
    if target is not None:
        labels = []
    
    for ind, (_, instance) in enumerate(df[columns].iterrows()):
        
        title, question, answer = instance.question_title, instance.question_body, instance.answer
        t, tq_sep, q, a, need_trunc = trim_and_tokenize(title, question, answer,
                                                        max_sequence_length, tokenizer, trunc_mode=trunc_mode)
        tokens = ["[CLS]"] + t + tq_sep + q + ["[SEP]"] + a + ["[SEP]"]
        all_tokens.append(tokenizer.convert_tokens_to_ids(tokens))
        segment_ids.append([0]*(len(t)+len(tq_sep)+len(q)+2) + [1]*(len(a)+1))
        if target is not None:
            if sample_weighting:
                labels.append(np.concatenate([target[ind], [3./7]]))
            else:
                labels.append(target[ind])
        
        if need_trunc and misc_trunc:
            t, tq_sep, q, a, _ = trim_and_tokenize(title, question, answer,
                                                   max_sequence_length, tokenizer, trunc_mode='tail')
            tokens = ["[CLS]"] + t + tq_sep + q + ["[SEP]"] + a + ["[SEP]"]
            all_tokens.append(tokenizer.convert_tokens_to_ids(tokens))
            segment_ids.append([0]*(len(t)+len(tq_sep)+len(q)+2) + [1]*(len(a)+1))

            t, tq_sep, q, a, _ = trim_and_tokenize(title, question, answer,
                                                   max_sequence_length, tokenizer, trunc_mode='mix')
            tokens = ["[CLS]"] + t + tq_sep + q + ["[SEP]"] + a + ["[SEP]"]
            all_tokens.append(tokenizer.convert_tokens_to_ids(tokens))
            segment_ids.append([0]*(len(t)+len(tq_sep)+len(q)+2) + [1]*(len(a)+1))

            if target is not None:
                if sample_weighting:
                    labels.extend([np.concatenate([target[ind], [2./7]])] * 2)
                else:
                    labels.extend([target[ind]] * 2)
    
    if target is not None:
        return np.array(all_tokens), np.array(segment_ids), np.array(labels)
    return np.array(all_tokens), np.array(segment_ids)

In [6]:
# Prepare dataset and dataloader

class QuestQAs(Dataset):

    def __init__(self, tokenized_comments, segment_ids, targets=None, split=None, maxlen=256):
        self.comments = tokenized_comments
        self.segment_ids = segment_ids
        self.targets = targets
        self.split = split
        assert self.split in {'train', 'valid', 'test'}
        self.maxlen = maxlen

    def __getitem__(self, index):
        comment = self.comments[index]
        segment_id = self.segment_ids[index]
        if self.targets is not None:
            target = self.targets[index]
            return comment, segment_id, torch.FloatTensor(target)
        else:
            return comment, segment_id

    def __len__(self):
        return len(self.comments)

    def get_lens(self):
        lengths = np.fromiter(
            ((min(self.maxlen, len(seq))) for seq in self.comments),
            dtype=np.int32)
        return lengths

    def collate_fn(self, batch):
        """
        Collate function for sequence bucketing
        Note: this need not be defined in this Class, can be standalone.

        :param batch: an iterable of N sets from __getitem__()
        :return: a tensor of comments, and targets
        """

        if self.split in ('train', 'valid'):
            comments, segment_ids, targets = zip(*batch)
        else:
            comments, segment_ids = zip(*batch)

        lengths = [len(c) for c in comments]
        maxlen = max(lengths)
        padded_comments, padded_seg_ids = [], []
        for i, (c, s) in enumerate(zip(comments, segment_ids)):
            padded_comments.append(c+[0]*(maxlen - lengths[i]))
            padded_seg_ids.append(s +[0]*(maxlen - lengths[i]))

        if self.split in ('train', 'valid'):
            return torch.LongTensor(padded_comments), torch.LongTensor(padded_seg_ids), torch.stack(targets)
        else:
            return torch.LongTensor(padded_comments), torch.LongTensor(padded_seg_ids)


class BucketSampler(Sampler):

    def __init__(self, data_source, sort_lens, bucket_size=None, batch_size=1024, shuffle_data=True):
        super().__init__(data_source)
        self.shuffle = shuffle_data
        self.batch_size = batch_size
        self.sort_lens = sort_lens
        self.bucket_size = bucket_size if bucket_size is not None else len(sort_lens)
        self.weights = None

        if not shuffle_data:
            self.index = self.prepare_buckets()
        else:
            self.index = None

    def set_weights(self, weights):
        assert weights >= 0
        total = np.sum(weights)
        if total != 1:
            weights = weights / total
        self.weights = weights

    def __iter__(self):
        indices = None
        if self.weights is not None:
            total = len(self.sort_lens)
            indices = np.random.choice(total, (total,), p=self.weights)
        if self.shuffle:
            self.index = self.prepare_buckets(indices)
        return iter(self.index)

    def get_reverse_indexes(self):
        indexes = np.zeros((len(self.index),), dtype=np.int32)
        for i, j in enumerate(self.index):
            indexes[j] = i
        return indexes

    def __len__(self):
        return len(self.sort_lens)

    def prepare_buckets(self, indices=None):
        lengths = - self.sort_lens
        assert self.bucket_size % self.batch_size == 0 or self.bucket_size == len(lengths)

        if indices is None:
            if self.shuffle:
                indices = shuffle(np.arange(len(lengths), dtype=np.int32))
                lengths = lengths[indices]
            else:
                indices = np.arange(len(lengths), dtype=np.int32)

        #  bucket iterator
        def divide_chunks(l, n):
            if n == len(l):
                yield np.arange(len(l), dtype=np.int32), l
            else:
                # looping till length l
                for i in range(0, len(l), n):
                    data = l[i:i + n]
                    yield np.arange(i, i + len(data), dtype=np.int32), data

        new_indices = []
        extra_batch_idx = None
        for chunk_index, chunk in divide_chunks(lengths, self.bucket_size):
            # sort indices in bucket by descending order of length
            indices_sorted = chunk_index[np.argsort(chunk)]

            batch_idxes = []
            for _, batch_idx in divide_chunks(indices_sorted, self.batch_size):
                if len(batch_idx) == self.batch_size:
                    batch_idxes.append(batch_idx.tolist())
                else:
                    assert extra_batch_idx is None
                    assert batch_idx is not None
                    extra_batch_idx = batch_idx.tolist()

            # shuffling batches within buckets
            if self.shuffle:
                batch_idxes = shuffle(batch_idxes)
            for batch_idx in batch_idxes:
                new_indices.extend(batch_idx)

        if extra_batch_idx is not None:
            new_indices.extend(extra_batch_idx)

        if not self.shuffle:
            self.original_indices = np.argsort(indices_sorted).tolist()
        return indices[new_indices]


def prepare_loader(x, seg_ids, y=None, batch_size=None, split=None):
    assert split in {'train', 'valid', 'test'}
    dataset = QuestQAs(x, seg_ids, y, split, MAX_SEQUENCE_LENGTH)
    if split == 'train':
        sampler = BucketSampler(dataset, dataset.get_lens(),
                                bucket_size=batch_size*20, batch_size=batch_size)
        return DataLoader(dataset, batch_size=batch_size, sampler=sampler,
                          collate_fn=dataset.collate_fn)
    else:
        sampler = BucketSampler(dataset, dataset.get_lens(),
                                batch_size=batch_size, shuffle_data=False)
        return DataLoader(dataset, batch_size=batch_size, sampler=sampler,
                          collate_fn=dataset.collate_fn), sampler.original_indices

In [7]:
def train_val_split(train_df):
    kf = GroupKFold(n_splits=5)
#     kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
    cv_indices = list(kf.split(train_df.question_body, groups=train_df.question_body))
#     cv_indices = list(kf.split(train_df))
    return cv_indices


def load_data():
    train_df = pd.read_csv(DATA_DIR+'train.csv')
    output_cols = list(train_df.columns[11:])
    input_cols = list(train_df.columns[[1,2,5]])
    
    train_tars = train_df[output_cols].values.astype('float32')
    
    return train_tars, train_df, input_cols


def load_and_preproc():
    train_df = pd.read_csv(DATA_DIR+'train.csv')
    output_cols = list(train_df.columns[11:])
    input_cols = list(train_df.columns[[1,2,5]])
    
    train_tars = train_df[output_cols].values.astype('float32')
    train_seq, train_seg_ids = convert_lines(train_df, input_cols, MAX_SEQUENCE_LENGTH, tokenizer)

    return train_seq, train_seg_ids, train_tars, train_df

In [8]:
tokenizer = BertTokenizer.from_pretrained(VOCAB_PATH + 'vocab.txt')

t0 = time.time()
print('Loading and tokenizing...')
train_tars, train_df, input_cols = load_data()
# train_seq, train_seg_ids, train_tars, train_df = load_and_preproc()
cv_indices = train_val_split(train_df)
print('tokenizing complete in {:.0f} seconds.'.format(time.time()-t0))

Loading and tokenizing...
tokenizing complete in 0 seconds.


# Model

In [9]:
# Classifier
class GQuestNet(BertPreTrainedModel):
    def __init__(self, config, num_labels):
        super(GQuestNet, self).__init__(config)
        self.bert = BertModel.from_pretrained(MODEL_PATH+'bert-base-uncased-pytorch_model.bin', config=config)
        self.dropouts = nn.ModuleList([nn.Dropout(0.3) for _ in range(5)])
        self.classifier = nn.Linear(config.hidden_size*4, num_labels)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        _, _, encoded_layers = self.bert(input_ids, attention_mask, token_type_ids)
        seq_op1 = encoded_layers[-1]
        seq_op2 = encoded_layers[-2]
        avg_pool1 = torch.mean(seq_op1, 1)
        max_pool1, _ = torch.max(seq_op1, 1)
        avg_pool2 = torch.mean(seq_op2, 1)
        max_pool2, _ = torch.max(seq_op2, 1)
        pooled_output = torch.cat((avg_pool1, max_pool1, avg_pool2, max_pool2), 1)
        for i, dropout in enumerate(self.dropouts):
            if i == 0:
                h = self.classifier(dropout(pooled_output))
            else:
                h += self.classifier(dropout(pooled_output))
        return h / len(self.dropouts)


class TruncLoss(nn.Module):
    def forward(self, pred_scores, labels):
        loss = 0
        for i in range(pred_scores.size(1)):
            loss += nn.BCEWithLogitsLoss(weight=labels[:,-1])(pred_scores[:,i], labels[:,i])
        return loss


# Build model and optimizer
def model_optimizer_init(ft_lrs, num_labels=30):
    print("Building model and optimizer...")
    cfg = BertConfig.from_pretrained(MODEL_PATH + 'bert-base-uncased-config.json')
    cfg.output_hidden_states = True
    model = GQuestNet(cfg, num_labels=num_labels)

    params_bert = list(model.bert.parameters())
    params_cls = list(model.classifier.parameters())

    optimizer_grouped_parameters = [
        {'params': params_bert, 'lr':ft_lrs[0]},
        {'params': params_cls, 'lr':ft_lrs[1]}
        ]
    
    optimizer = optim.AdamW(optimizer_grouped_parameters)
    
    return model, optimizer

In [10]:
def model_test():
    x = torch.randint(10, (8, 256))
    cfg = BertConfig.from_pretrained(MODEL_PATH + 'bert-base-uncased-config.json')
    cfg.output_hidden_states = True
    model = GQuestNet(cfg, num_labels=30)
    print(model(x).size())

model_test()

torch.Size([8, 30])


# Solver

In [11]:
# Compute metrics
def compute_rho(labels, preds):
    rhos = []
    for col_label, col_pred in zip(labels.T, preds.T):
        rhos.append(
            spearmanr(col_label, col_pred + np.random.normal(0, 1e-7, col_pred.shape[0])).correlation)
    return np.mean(rhos)


# Functions for the training process
class NetSolver(object):

    def __init__(self, model, criterion, optimizer, scheduler, print_freq, filepath):
        self.model = model
        self.criterion = criterion
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.print_freq = print_freq
        self.filepath = filepath

        self.model = self.model.to(device)
        self.criterion = self.criterion.to(device)
        self._reset()

    def _reset(self):
        """Set up some book-keeping variables for optimization.
        """
        self.best_val_loss = 1e4
        self.best_val_rho = 0.
        self.loss_history = []
        self.val_loss_history = []
        self.rho_history = []
        self.val_rho_history = []
        self.val_preds = []
        self.models = {}

    def save_checkpoint(self, iteration):
        """Save model checkpoint.
        """
#         self.models[f'ckpt_{iteration}'] = self.model.state_dict()
        self.models['ckpt_best'] = self.model.state_dict()
    
    def save_final_state(self):
        """Save final states.
        """
        state = {'model': self.models,
#                  'optimizer': self.optimizer.state_dict()
                 'optimizer': None
                 }
        torch.save(state, self.filepath)

    def forward_pass(self, x, seg_ids, y):
        """Forward pass through the network.
        """
        x, y = x.to(device=device, dtype=torch.long), y.to(device=device, dtype=torch.float)
        seg_ids = seg_ids.to(device=device, dtype=torch.long)
        scores = self.model(x, token_type_ids=seg_ids, attention_mask=(x>0))
        loss = self.criterion(scores, y)
        return loss, torch.sigmoid(scores)

    def train(self, loaders, iterations, start_time):
        """Weight of network updated by apex, grad accumulation, model checkpoint.
        """
        train_loader, val_loader = loaders
        loader = iter(train_loader)
        running_loss = 0.
        self.optimizer.zero_grad()

        # start training for iterations
        for i in range(iterations):
            self.model.train()

            try:
                x, seg_ids, y = next(loader)
            except:  # after an loader running out
                loader = iter(train_loader)
                x, seg_ids, y = next(loader)
            loss, _ = self.forward_pass(x, seg_ids, y)
            
#             loss.backward()
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()

            # gradient accumulation for larger batch size effect with smaller memory usage
            if (i+1) % grad_accumulation_steps == 0:   # Wait for several backward steps
                self.optimizer.step()                  # Now we can do an optimizer step
                self.optimizer.zero_grad()
                self.scheduler.step()

            running_loss += loss.item()
            
            # verbose and checkpoint
            if (i+1) % self.print_freq == 0 or (i+1) == iterations:
                print(f'Iteration {i+1}:')
                train_rho, _ = self.check_metric(train_loader, num_batches=50)
                print('{"metric": "Loss", "value": %.4f}' % (running_loss/(i+1),))
                print('{"metric": "Rho", "value": %.4f}' % (train_rho,))
                
                val_rho, val_loss, val_scores = self.check_metric(val_loader)
                print('{"metric": "Val. Loss", "value": %.4f}' % (val_loss,))
                print('{"metric": "Val. Rho", "value": %.4f}' % (val_rho,))
                
                self.loss_history.append(running_loss/(i+1))
                self.val_loss_history.append(val_loss)
                self.rho_history.append(train_rho)
                self.val_rho_history.append(val_rho)
                self.val_preds.append(val_scores)
                
                if val_loss < self.best_val_loss:
                    print('updating best val loss...')
                    self.best_val_loss = val_loss
                if val_rho > self.best_val_rho:
                    print('updating best val Spearman R...')
                    self.best_val_rho = val_rho
                    self.save_checkpoint(i+1)
                
                torch.cuda.empty_cache()
                print()
                
            if (time.time() - start_time) > 29000:
                break
        
        self.save_final_state()

    def check_metric(self, loader, num_batches=None):
        """Calculate metrics for validation
        """
        self.model.eval()
        targets, scores, losses = [], [], []
        with torch.no_grad():
            for t, (x, seg_ids, y) in enumerate(loader):
                l, score = self.forward_pass(x, seg_ids, y)
                targets.append(y.cpu().numpy())
                scores.append(score.cpu().numpy())
                losses.append(l.item())
                if num_batches is not None and (t+1) == num_batches:
                    break

        targets = np.concatenate(targets)
        scores = np.concatenate(scores)
        rho = compute_rho(targets, scores)
        loss = np.mean(losses)
        
        if num_batches is None:
            return rho, loss, scores
        return rho, loss

In [12]:
# lr scheduler

class OneCycleScheduler(object):
    # one-cycle scheduler
    SCHEDULES = set(['cosine', 'linear', 'linear_cosine'])

    def __init__(self, optimizer, iterations, sched_profile='cosine', max_lr=3e-3,
                 moms=(.95, .85), div_factor=25, warmup=0.3, final_div=None):

        self.optimizer = optimizer
        assert sched_profile in self.SCHEDULES
        self.sched_profile = sched_profile

        if isinstance(max_lr, list) or isinstance(max_lr, tuple):
            if len(max_lr) != len(optimizer.param_groups):
                raise ValueError("expected {} max_lr, got {}".format(
                    len(optimizer.param_groups), len(max_lr)))
            self.max_lrs = list(max_lr)
            self.init_lrs = [lr/div_factor for lr in self.max_lrs]
        else:
            self.max_lrs = [max_lr] * len(optimizer.param_groups)
            self.init_lrs = [max_lr/div_factor] * len(optimizer.param_groups)

        self.final_div = final_div
        if self.final_div is None: self.final_div = div_factor*1e4
        self.final_lrs = [lr/self.final_div for lr in self.max_lrs]
        self.moms = moms

        self.total_iteration = iterations
        self.up_iteration = int(self.total_iteration * warmup)
        self.down_iteration = self.total_iteration - self.up_iteration

        self.curr_iter = 0
        self._assign_lr_mom(self.init_lrs, [moms[0]]*len(optimizer.param_groups))

    def _assign_lr_mom(self, lrs, moms):
        for param_group, lr, mom in zip(self.optimizer.param_groups, lrs, moms):
            param_group['lr'] = lr
            param_group['betas'] = (mom, 0.999)

    def _annealing_cos(self, start, end, pct):
        cos_out = np.cos(np.pi * pct) + 1
        return end + (start-end)/2 * cos_out

    def _annealing_linear(self, start, end, pct):
        return start + pct * (end-start)
    
    def _annealing_function(self, curr_iter):
        if self.sched_profile == 'cosine':
            return self._annealing_cos
        if self.sched_profile == 'linear':
            return self._annealing_linear
        if self.sched_profile == 'linear_cosine':
            if curr_iter <= self.up_iteration:
                return self._annealing_linear
            else:
                return self._annealing_cos
    
    def step(self):
        self.curr_iter += 1
        anneal = self._annealing_function(self.curr_iter)

        if self.curr_iter <= self.up_iteration:
            pct = self.curr_iter / self.up_iteration
            curr_lrs = [anneal(min_lr, max_lr, pct) \
                            for min_lr, max_lr in zip(self.init_lrs, self.max_lrs)]
            curr_moms = [anneal(self.moms[0], self.moms[1], pct) \
                            for _ in range(len(self.optimizer.param_groups))]
        else:
            pct = (self.curr_iter-self.up_iteration) / self.down_iteration
            curr_lrs = [anneal(max_lr, final_lr, pct) \
                            for max_lr, final_lr in zip(self.max_lrs, self.final_lrs)]
            curr_moms = [anneal(self.moms[1], self.moms[0], pct) \
                            for _ in range(len(self.optimizer.param_groups))]

        self._assign_lr_mom(curr_lrs, curr_moms)

# Train

In [13]:
def run_train_and_valid():
    oof_preds = np.zeros_like(train_tars)
    logs = []

    for i, (trn_idx, val_idx) in enumerate(cv_indices):
        print(f'Fold {i+1}:')

        # prepare datasets
        x_train, seg_train, y_train = convert_lines(train_df.iloc[trn_idx], input_cols,
                                                    MAX_SEQUENCE_LENGTH, tokenizer,
                                                    misc_trunc=True, target=train_tars[trn_idx],
                                                    sample_weighting=True)
        x_val, seg_val = convert_lines(train_df.iloc[val_idx], input_cols,
                                       MAX_SEQUENCE_LENGTH, tokenizer)
        y_val = train_tars[val_idx]
        y_val = np.hstack([y_val, np.ones(len(y_val), dtype='float32')[:,None]])

        train_loader = prepare_loader(x_train, seg_train, y_train, batch_size, split='train')
        val_loader, val_original_indices = prepare_loader(x_val, seg_val, y_val, 16, split='valid')

        # initialize model and solver
        num_train_steps = int(epochs_for_sched * len(train_loader) / grad_accumulation_steps)
        ft_lrs = [0.8*lr, lr]
        model, optimizer = model_optimizer_init(ft_lrs)
        scheduler = OneCycleScheduler(optimizer, num_train_steps, sched_profile='linear', max_lr=ft_lrs,
                                      div_factor=40, warmup=warmup_proportion)
        model = model.to(device)
        criterion = TruncLoss().to(device)
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)
        solver = NetSolver(model, criterion, optimizer, scheduler, checkpoint_iter, f'fld_{i}_'+output_model_file)

        # train
        n_iter = num_train_steps * grad_accumulation_steps
        print('Start training.')
        t0 = time.time()
        solver.train((train_loader, val_loader), n_iter, t0)
        print(f'Training fold-{i+1} complete in {time.time()-t0} seconds.')

        # record validation results
        oof_preds[val_idx] += solver.val_preds[-1][val_original_indices]
        History = namedtuple('History', 'history')
        history = History(history={'loss': solver.loss_history, 'val_loss': solver.val_loss_history,
                                   'rho': solver.rho_history, 'val_rho': solver.val_rho_history})
        logs.append(history)

        # clean cache
        torch.cuda.empty_cache()
        print(str(torch.cuda.memory_allocated(device)/1e6 ) + 'M')
        print(str(torch.cuda.memory_cached(device)/1e6 ) + 'M')
        print()

    oof_rho = compute_rho(train_tars, oof_preds)
    print('{"metric": "OOF Val. Rho", "value": %.4f}' % (oof_rho,))
    
    return logs

In [14]:
logs = run_train_and_valid()

Fold 1:
Building model and optimizer...
Start training.
Iteration 470:
{"metric": "Loss", "value": 5.0344}
{"metric": "Rho", "value": 0.3255}
{"metric": "Val. Loss", "value": 11.9982}
{"metric": "Val. Rho", "value": 0.2937}
updating best val loss...
updating best val Spearman R...

Iteration 940:
{"metric": "Loss", "value": 4.6805}
{"metric": "Rho", "value": 0.3878}
{"metric": "Val. Loss", "value": 11.4788}
{"metric": "Val. Rho", "value": 0.3620}
updating best val loss...
updating best val Spearman R...

Iteration 1410:


  c /= stddev[:, None]
  c /= stddev[None, :]
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


{"metric": "Loss", "value": 4.5060}
{"metric": "Rho", "value": nan}
{"metric": "Val. Loss", "value": 11.2519}
{"metric": "Val. Rho", "value": 0.3756}
updating best val loss...
updating best val Spearman R...

Iteration 1880:
{"metric": "Loss", "value": 4.3933}
{"metric": "Rho", "value": nan}
{"metric": "Val. Loss", "value": 11.2878}
{"metric": "Val. Rho", "value": 0.3823}
updating best val Spearman R...

Iteration 2350:
{"metric": "Loss", "value": 4.2890}
{"metric": "Rho", "value": 0.4974}
{"metric": "Val. Loss", "value": 11.2489}
{"metric": "Val. Rho", "value": 0.3854}
updating best val loss...
updating best val Spearman R...

Iteration 2804:
{"metric": "Loss", "value": 4.2139}
{"metric": "Rho", "value": nan}
{"metric": "Val. Loss", "value": 11.2376}
{"metric": "Val. Rho", "value": 0.3895}
updating best val loss...
updating best val Spearman R...

Training fold-1 complete in 1161.8809950351715 seconds.
1989.543936M
3160.408064M

Fold 2:
Building model and optimizer...
Start training.


# Visualization

In [15]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [16]:
fig = make_subplots(rows=2, cols=1)

colors = [
    ('#d32f2f', '#ef5350'),
    ('#303f9f', '#5c6bc0'),
    ('#00796b', '#26a69a'),
    ('#fbc02d', '#ffeb3b'),
    ('#5d4037', '#8d6e63'),
]

for fold, history in enumerate(logs):
    
    fig.add_trace(go.Scatter(x=np.arange(epochs_for_sched),
                             y=history.history['loss'],
                             mode='lines',
                             visible='legendonly' if fold > 0 else True,
                             line=dict(color=colors[fold][0], width=2),
                             name='Train loss - Fold #{}'.format(fold)),
                 row=1, col=1)

    fig.add_trace(go.Scatter(x=np.arange(epochs_for_sched),
                             y=history.history['val_loss'],
                             mode='lines+markers',
                             visible='legendonly' if fold > 0 else True,
                             line=dict(color=colors[fold][1], width=2),
                             name='Valid loss - Fold #{}'.format(fold)),
                 row=1, col=1)
    
    fig.add_trace(go.Scatter(x=np.arange(epochs_for_sched),
                             y=history.history['val_rho'],
                             mode='lines+markers',
                             visible='legendonly' if fold > 0 else True,
                             line=dict(color=colors[fold][1], width=2),
                             name='Valid RHO - Fold #{}'.format(fold)),
                 row=2, col=1)

fig.update_layout({
  "annotations": [
    {
      "x": 0.5, 
      "y": 1.0, 
      "font": {"size": 14}, 
      "text": "Train / valid losses", 
      "xref": "paper", 
      "yref": "paper", 
      "xanchor": "center", 
      "yanchor": "bottom", 
      "showarrow": False
    }, 
    {
      "x": 0.5, 
      "y": 0.5, 
      "font": {"size": 14}, 
      "text": "Train / valid AUC", 
      "xref": "paper", 
      "yref": "paper", 
      "xanchor": "center", 
      "yanchor": "bottom", 
      "showarrow": False
    }, 
  ]
})

fig.show()