In [1]:
from __future__ import absolute_import, division, print_function

import os, sys, re, gc, pickle, operator, shutil, copy, random
import time, datetime

from math import floor, ceil
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.utils import shuffle

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, Dataset, DataLoader, Sampler

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm, tqdm_notebook
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import warnings
warnings.filterwarnings(action='once')

%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from pytorch_pretrained_bert import convert_tf_checkpoint_to_pytorch
from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification, BertAdam, BertConfig
from pytorch_pretrained_bert.modeling import BertModel, BertPreTrainedModel
# All models and batches in this notebook are moved to this device; a CUDA GPU is required.
device = torch.device('cuda')

  return f(*args, **kwds)
  return f(*args, **kwds)


In [3]:
# Input locations. DATA_DIR holds train.csv; BERT_MODEL_PATH is used to build the
# tokenizer and to read bert_config.json; each MODEL_PATH_* directory holds one
# checkpoint per fold named fld_{i}_quest_bert_models.pt.
DATA_DIR = "/kaggle/input/google-quest-challenge/"
BERT_MODEL_PATH = '/kaggle/input/bert-pretrained-models/uncased_l-12_h-768_a-12/uncased_L-12_H-768_A-12/'
MODEL_PATH_1 = '../input/gq-bert-2pool-alltrunc/'
MODEL_PATH_2 = '../input/gq-bert-2pool-exp/'
MODEL_PATH_3 = '../input/gq-bert-2catpool/'
MODEL_PATH_4 = '../input/gq-bert-2pool/'

# Default seed for seed_everything() and the KFold split.
SEED = 2019

# Token budget per example; also passed to QuestQAs as the length cap.
MAX_SEQUENCE_LENGTH = 512
# NOTE(review): the visible evaluation cells pass the literal 16 to
# prepare_loader instead of reading this variable.
batch_size = 16

In [4]:
# Seed every random number generator used in this notebook
def seed_everything(seed=SEED):
    """Seed the Python, NumPy and PyTorch random number generators.

    :param seed: integer seed; defaults to the module-level ``SEED``

    Besides seeding, cuDNN is put in deterministic mode and its auto-tuner
    (``benchmark``) is switched off, because the auto-tuner may select
    different kernels from one run to the next.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Cover every visible GPU, not only the current device.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything()

## Data preprocessing functions and PyTorch datasets

In [5]:
# Thanks to https://www.kaggle.com/akensert/bert-base-tf2-0-minimalistic
def trim_and_tokenize(title, question, answer, max_sequence_length, tokenizer,
                      trunc_mode='head', t_max_len=18, q_max_len=245, a_max_len=244):
    """Tokenize title, question and answer and trim them to a shared token budget.

    When the three token lists plus 5 extra positions exceed
    ``max_sequence_length``, each segment gets an allowance: budget the title
    does not use is split between answer and question, and budget left over by
    a short answer (or question) is handed to the other one.

    :param title: question title text
    :param question: question body text
    :param answer: answer text
    :param max_sequence_length: total length the final sequence must have after trimming
    :param tokenizer: object exposing ``tokenize(text) -> list``
    :param trunc_mode: 'head' keeps the start of a segment, 'tail' keeps the end,
        'mix' keeps the first 60% of the allowance from the start and the rest from the end
    :param t_max_len: base token allowance for the title
    :param q_max_len: base token allowance for the question
    :param a_max_len: base token allowance for the answer
    :return: ``(title_tokens, separator_tokens, question_tokens, answer_tokens, need_trunc)``
    :raises ValueError: if the allowances plus 5 do not add up to ``max_sequence_length``
    """
    assert trunc_mode in {"head", "tail", "mix"}

    def _truncate(seq, keep, head_ratio=0.6):
        # A segment that already fits its allowance is returned untouched. Without
        # this guard 'mix' would concatenate overlapping head and tail slices and
        # duplicate tokens whenever the allowance exceeds the segment length.
        if len(seq) <= keep:
            return seq
        # seq[-0:] is the whole list, so a zero allowance needs its own branch.
        if keep <= 0:
            return []
        if trunc_mode == "head":
            return seq[:keep]
        if trunc_mode == "tail":
            return seq[-keep:]
        head_len = int(keep * head_ratio)
        return seq[:head_len] + seq[-(keep - head_len):]

    tq_sep = tokenizer.tokenize("Details:")
    t = tokenizer.tokenize(title)
    q = tokenizer.tokenize(question)
    a = tokenizer.tokenize(answer)
    t_len, q_len, a_len = len(t), len(q), len(a)

    # Positions reserved besides title/question/answer. convert_lines adds [CLS],
    # two [SEP] and tq_sep; the constant presumes tq_sep is two tokens long —
    # TODO confirm for the tokenizer in use.
    n_extra = 5

    need_trunc = (t_len + q_len + a_len + n_extra) > max_sequence_length
    if not need_trunc:
        return t, tq_sep, q, a, need_trunc

    if t_max_len > t_len:
        # Short title: split its unused allowance between answer and question.
        t_new_len = t_len
        spare = t_max_len - t_len
        a_max_len = a_max_len + floor(spare/2)
        q_max_len = q_max_len + ceil(spare/2)
    else:
        t_new_len = t_max_len

    if a_max_len > a_len:
        a_new_len = a_len
        q_new_len = q_max_len + (a_max_len - a_len)
    elif q_max_len > q_len:
        a_new_len = a_max_len + (q_max_len - q_len)
        q_new_len = q_len
    else:
        a_new_len = a_max_len
        q_new_len = q_max_len

    if t_new_len+a_new_len+q_new_len+n_extra != max_sequence_length:
        raise ValueError("New sequence length should be %d, but is %d"
                         % (max_sequence_length, (t_new_len+a_new_len+q_new_len+n_extra)))

    t = _truncate(t, t_new_len)
    q = _truncate(q, q_new_len)
    a = _truncate(a, a_new_len)

    return t, tq_sep, q, a, need_trunc


# Tokenizing the lines to BERT token ids
def convert_lines(df, columns, max_sequence_length, tokenizer, trunc_mode='head', misc_trunc=False, target=None):
    """Turn the QA rows of ``df`` into BERT token ids and segment ids.

    Each row yields ``[CLS] title <"Details:" tokens> question [SEP] answer [SEP]``.
    Segment id 0 covers everything up to and including the first [SEP]; segment
    id 1 covers the answer and the final [SEP].

    :param df: DataFrame with the rows to encode
    :param columns: columns selected from ``df``; must include question_title,
        question_body and answer
    :param max_sequence_length: forwarded to trim_and_tokenize
    :param tokenizer: object exposing ``tokenize`` and ``convert_tokens_to_ids``
    :param trunc_mode: truncation mode used for the primary encoding of each row
    :param misc_trunc: when True, rows that needed truncation are additionally
        encoded with 'tail' and 'mix' truncation (two extra examples per row)
    :param target: optional per-row targets, indexed by row position in ``df``
    :return: ``(tokens, segment_ids)`` or ``(tokens, segment_ids, labels)``.
        ``tokens`` and ``segment_ids`` are 1-D object arrays whose elements are
        Python lists, whatever the individual lengths are.
    """
    all_tokens = []
    segment_ids = []   # representing segmentation of sentence A and B
    labels = [] if target is not None else None

    def _encode(title, question, answer, mode):
        # Encode one example with the given truncation mode and record it.
        t, tq_sep, q, a, need_trunc = trim_and_tokenize(title, question, answer,
                                                        max_sequence_length, tokenizer, trunc_mode=mode)
        tokens = ["[CLS]"] + t + tq_sep + q + ["[SEP]"] + a + ["[SEP]"]
        all_tokens.append(tokenizer.convert_tokens_to_ids(tokens))
        segment_ids.append([0]*(len(t)+len(tq_sep)+len(q)+2) + [1]*(len(a)+1))
        return need_trunc

    def _ragged(rows):
        # np.array() on lists of unequal length raises on recent NumPy releases and
        # silently builds a 2-D array when all lengths happen to match; filling an
        # object array element by element gives one consistent layout.
        out = np.empty(len(rows), dtype=object)
        for pos, row in enumerate(rows):
            out[pos] = row
        return out

    for ind, (_, instance) in enumerate(df[columns].iterrows()):
        title, question, answer = instance.question_title, instance.question_body, instance.answer

        need_trunc = _encode(title, question, answer, trunc_mode)
        if target is not None:
            labels.append(target[ind])

        if need_trunc and misc_trunc:
            _encode(title, question, answer, 'tail')
            _encode(title, question, answer, 'mix')
            if target is not None:
                # The two extra encodings share the row's target.
                labels.extend([target[ind], target[ind]])

    if target is not None:
        return _ragged(all_tokens), _ragged(segment_ids), np.array(labels)
    return _ragged(all_tokens), _ragged(segment_ids)

In [6]:
# Prepare dataset and dataloader

class QuestQAs(Dataset):
    """Dataset over tokenized question/answer examples.

    Each item is ``(token_ids, segment_ids)``, followed by a float tensor of
    targets when ``targets`` was supplied.
    """

    def __init__(self, tokenized_comments, segment_ids, targets=None, split=None, maxlen=256):
        """
        :param tokenized_comments: indexable collection of token-id sequences
        :param segment_ids: indexable collection of segment-id sequences aligned with the tokens
        :param targets: optional indexable collection of per-example target vectors
        :param split: one of 'train', 'valid', 'test'
        :param maxlen: cap applied to the lengths reported by get_lens()
        """
        self.comments = tokenized_comments
        self.segment_ids = segment_ids
        self.targets = targets
        self.split = split
        assert self.split in {'train', 'valid', 'test'}
        self.maxlen = maxlen

    def __getitem__(self, index):
        """Return the example at ``index``; targets are appended when available."""
        comment = self.comments[index]
        segment_id = self.segment_ids[index]
        if self.targets is not None:
            target = self.targets[index]
            return comment, segment_id, torch.FloatTensor(target)
        else:
            return comment, segment_id

    def __len__(self):
        return len(self.comments)

    def get_lens(self):
        """Return an int32 array with each sequence length clipped to ``maxlen``."""
        lengths = np.fromiter(
            ((min(self.maxlen, len(seq))) for seq in self.comments),
            dtype=np.int32)
        return lengths

    def collate_fn(self, batch):
        """
        Collate function for sequence bucketing
        Note: this need not be defined in this Class, can be standalone.

        Sequences are right-padded with 0 up to the longest sequence in the
        batch. Rows may be Python lists or NumPy arrays.

        :param batch: an iterable of N sets from __getitem__()
        :return: a tensor of comments, a tensor of segment ids, and (for the
            'train' and 'valid' splits) a tensor of stacked targets
        """

        if self.split in ('train', 'valid'):
            comments, segment_ids, targets = zip(*batch)
        else:
            comments, segment_ids = zip(*batch)

        maxlen = max(len(c) for c in comments)
        # Writing into preallocated zero tensors avoids `row + [0]*k`, which is
        # list concatenation for lists but elementwise addition for ndarrays.
        padded_comments = torch.zeros((len(comments), maxlen), dtype=torch.long)
        padded_seg_ids = torch.zeros((len(comments), maxlen), dtype=torch.long)
        for row, (c, s) in enumerate(zip(comments, segment_ids)):
            padded_comments[row, :len(c)] = torch.as_tensor(c, dtype=torch.long)
            padded_seg_ids[row, :len(c)] = torch.as_tensor(s, dtype=torch.long)

        if self.split in ('train', 'valid'):
            return padded_comments, padded_seg_ids, torch.stack(targets)
        else:
            return padded_comments, padded_seg_ids


class BucketSampler(Sampler):
    """Sampler that groups examples of similar length into the same batch.

    Examples are split into buckets of ``bucket_size``; inside a bucket they are
    sorted by decreasing length and cut into batches of ``batch_size``. An
    incomplete trailing batch is placed at the very end. With
    ``shuffle_data=True`` the order is rebuilt on every iteration; otherwise it
    is computed once and ``original_indices`` holds the permutation that maps
    outputs collected in sampler order back to dataset order.
    """

    def __init__(self, data_source, sort_lens, bucket_size=None, batch_size=1024, shuffle_data=True):
        """
        :param data_source: dataset being sampled (only forwarded to ``Sampler``)
        :param sort_lens: array of per-example lengths used for sorting
        :param bucket_size: examples per bucket; defaults to the whole dataset
        :param batch_size: examples per batch
        :param shuffle_data: rebuild a shuffled order on each iteration when True
        """
        super().__init__(data_source)
        self.shuffle = shuffle_data
        self.batch_size = batch_size
        self.sort_lens = sort_lens
        self.bucket_size = bucket_size if bucket_size is not None else len(sort_lens)
        self.weights = None

        if not shuffle_data:
            self.index = self.prepare_buckets()
        else:
            self.index = None

    def set_weights(self, weights):
        """Set per-example sampling weights; they are normalised to sum to 1."""
        weights = np.asarray(weights, dtype=np.float64)
        # A bare `assert weights >= 0` is ambiguous for arrays; check elementwise.
        assert np.all(weights >= 0)
        total = np.sum(weights)
        if total != 1:
            weights = weights / total
        self.weights = weights

    def __iter__(self):
        indices = None
        if self.weights is not None:
            # Resample example positions (with replacement) according to the weights.
            total = len(self.sort_lens)
            indices = np.random.choice(total, (total,), p=self.weights)
        if self.shuffle:
            self.index = self.prepare_buckets(indices)
        return iter(self.index)

    def get_reverse_indexes(self):
        """Return, for every dataset position, its position in the sampler order."""
        indexes = np.zeros((len(self.index),), dtype=np.int32)
        for i, j in enumerate(self.index):
            indexes[j] = i
        return indexes

    def __len__(self):
        return len(self.sort_lens)

    def prepare_buckets(self, indices=None):
        """Build the sampling order.

        :param indices: optional array of dataset positions to arrange; when
            omitted, all positions are used (shuffled first if shuffling is on)
        :return: array of dataset positions in sampling order
        """
        # Negated so that an ascending argsort yields descending lengths.
        lengths = - np.asarray(self.sort_lens)
        assert self.bucket_size % self.batch_size == 0 or self.bucket_size == len(lengths)

        if indices is None:
            if self.shuffle:
                indices = shuffle(np.arange(len(lengths), dtype=np.int32))
            else:
                indices = np.arange(len(lengths), dtype=np.int32)
        else:
            indices = np.asarray(indices)
        # Align the lengths with the (possibly shuffled or resampled) positions so
        # that sorting inside a bucket uses the length of the right example.
        lengths = lengths[indices]

        #  bucket iterator
        def divide_chunks(l, n):
            if n == len(l):
                yield np.arange(len(l), dtype=np.int32), l
            else:
                # looping till length l
                for i in range(0, len(l), n):
                    data = l[i:i + n]
                    yield np.arange(i, i + len(data), dtype=np.int32), data

        new_indices = []
        extra_batch_idx = None
        for chunk_index, chunk in divide_chunks(lengths, self.bucket_size):
            # sort indices in bucket by descending order of length
            indices_sorted = chunk_index[np.argsort(chunk)]

            batch_idxes = []
            for _, batch_idx in divide_chunks(indices_sorted, self.batch_size):
                if len(batch_idx) == self.batch_size:
                    batch_idxes.append(batch_idx.tolist())
                else:
                    # Only one incomplete batch may exist; it goes last.
                    assert extra_batch_idx is None
                    assert batch_idx is not None
                    extra_batch_idx = batch_idx.tolist()

            # shuffling batches within buckets (nothing to shuffle when the bucket
            # only contributed the incomplete batch)
            if self.shuffle and batch_idxes:
                batch_idxes = shuffle(batch_idxes)
            for batch_idx in batch_idxes:
                new_indices.extend(batch_idx)

        if extra_batch_idx is not None:
            new_indices.extend(extra_batch_idx)

        if not self.shuffle:
            # Inverse permutation of the complete order, valid for any number of
            # buckets (not just the last bucket's sort order).
            self.original_indices = np.argsort(new_indices).tolist()
        return indices[new_indices]


def prepare_loader(x, seg_ids, y=None, batch_size=None, split=None):
    """Wrap tokenized examples in a length-bucketed DataLoader.

    :param x: token-id sequences
    :param seg_ids: segment-id sequences aligned with ``x``
    :param y: optional targets
    :param batch_size: examples per batch
    :param split: 'train', 'valid' or 'test'
    :return: for 'train', the DataLoader alone (shuffled, buckets of
        ``batch_size*20``); for the other splits, a tuple of the DataLoader and
        the sampler's ``original_indices`` used to restore dataset order
    """
    assert split in {'train', 'valid', 'test'}
    dataset = QuestQAs(x, seg_ids, y, split, MAX_SEQUENCE_LENGTH)
    seq_lens = dataset.get_lens()

    if split != 'train':
        # Fixed order: one bucket over the whole set, no shuffling.
        sampler = BucketSampler(dataset, seq_lens,
                                batch_size=batch_size, shuffle_data=False)
        loader = DataLoader(dataset, batch_size=batch_size, sampler=sampler,
                            collate_fn=dataset.collate_fn)
        return loader, sampler.original_indices

    sampler = BucketSampler(dataset, seq_lens,
                            bucket_size=batch_size*20, batch_size=batch_size)
    return DataLoader(dataset, batch_size=batch_size, sampler=sampler,
                      collate_fn=dataset.collate_fn)

## Model preparation

In [7]:
class BertLastTwoClassification(BertPreTrainedModel):
    """BERT encoder with a linear head on pooled outputs of its last two layers.

    For each of the last two encoder layers the mean and the max over
    dimension 1 are taken; the four resulting vectors are concatenated in the
    order (mean last, max last, mean second-to-last, max second-to-last),
    passed through dropout and fed to the classifier. The attention mask is
    only handed to the encoder; pooling itself runs over every position.
    """

    def __init__(self, config, num_labels):
        super().__init__(config)
        self.num_labels = num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(0.3)
        # forward() concatenates four pooled vectors of width hidden_size
        self.classifier = nn.Linear(4 * config.hidden_size, num_labels)
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
        layers, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=True)
        pooled = []
        for hidden in (layers[-1], layers[-2]):
            pooled.append(hidden.mean(dim=1))
            pooled.append(hidden.max(dim=1)[0])
        features = torch.cat(pooled, dim=1)
        return self.classifier(self.dropout(features))


class BertCatLastTwoClassification(BertPreTrainedModel):
    """BERT encoder whose last two layers are concatenated before pooling.

    The outputs of the last two encoder layers are joined along the feature
    axis, then mean- and max-pooled over dimension 1; both pooled vectors are
    concatenated, passed through dropout and fed to the classifier.
    """

    def __init__(self, config, num_labels):
        super().__init__(config)
        self.num_labels = num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(0.3)
        # mean and max pooling of a 2*hidden_size wide tensor -> 4*hidden_size
        self.classifier = nn.Linear(4 * config.hidden_size, num_labels)
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
        layers, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=True)
        last_two = torch.cat([layers[-1], layers[-2]], dim=-1)   # (N, T, 2D)
        mean_feat = last_two.mean(dim=1)                          # (N, 2D)
        max_feat = last_two.max(dim=1)[0]                         # (N, 2D)
        features = torch.cat([mean_feat, max_feat], dim=1)        # (N, 4D)
        return self.classifier(self.dropout(features))

## Functions for validation

In [8]:
def compute_rho(labels, preds):
    """Mean column-wise Spearman correlation between labels and predictions.

    Gaussian noise with standard deviation 1e-7 is added to every prediction
    column before ranking, which breaks ties between equal predictions.

    :param labels: 2-D array, one column per target
    :param preds: 2-D array with the same layout as ``labels``
    :return: mean of the per-column correlations
    """
    def _column_rho(y_col, p_col):
        jitter = np.random.normal(0, 1e-7, p_col.shape[0])
        return spearmanr(y_col, p_col + jitter).correlation

    return np.mean([_column_rho(y_col, p_col) for y_col, p_col in zip(labels.T, preds.T)])


def eval_rho(labels, preds, columns):
    """Per-column Spearman correlation, keyed by column name.

    Column ``i`` of ``labels`` and ``preds`` is scored under ``columns[i]``;
    the same 1e-7 Gaussian tie-breaking noise as in compute_rho is added to
    the predictions.

    :return: dict mapping each name in ``columns`` to its correlation
    """
    scores = {}
    for i, col in enumerate(columns):
        jitter = np.random.normal(0, 1e-7, len(preds))
        scores[col] = spearmanr(labels[:, i], preds[:, i] + jitter).correlation
    return scores


def validate(val_loader, model, val_original_indices, post_indices=None):
    """Score a validation loader and print the mean Spearman rho.

    :param val_loader: loader yielding ``(token_ids, segment_ids, targets)`` batches
    :param model: model returning logits; a sigmoid is applied to its output
    :param val_original_indices: index that reorders the concatenated batch
        outputs (used for both targets and scores)
    :param post_indices: optional row indices; when given, column 19 of the
        scores is reset to 0 and set to 0.5 on those rows (presumably the
        question_type_spelling column, cf. the commented-out post-processing
        cell — TODO confirm)
    :return: the reordered score array
    """
    model.eval()
    batch_targets, batch_scores = [], []

    with torch.no_grad():
        for x, seg_ids, y in val_loader:
            x = x.to(device=device, dtype=torch.long)
            seg_ids = seg_ids.to(device=device, dtype=torch.long)
            y = y.to(device=device, dtype=torch.float)
            # Token id 0 is the padding value, so x > 0 marks the real tokens.
            logits = model(x, token_type_ids=seg_ids, attention_mask=(x>0))
            batch_targets.append(y.cpu().numpy())
            batch_scores.append(torch.sigmoid(logits).cpu().numpy())

    targets = np.concatenate(batch_targets)[val_original_indices]
    scores = np.concatenate(batch_scores)[val_original_indices]

    if post_indices is not None:
        scores[:, 19] = 0
        scores[post_indices, 19] += 0.5

    val_rho = compute_rho(targets, scores)
    print('{"metric": "Val. Rho", "value": %.4f}' % (val_rho, ))

    return scores


def eval_model(model, test_loader):
    """Run the model over a loader without targets and return sigmoid scores.

    :param model: model returning logits
    :param test_loader: loader yielding ``(token_ids, segment_ids)`` batches
    :return: array of scores in loader order (no reordering is applied here)
    """
    model.eval()
    chunks = []
    with torch.no_grad():
        for x, seg_ids in test_loader:
            x = x.to(device=device, dtype=torch.long)
            seg_ids = seg_ids.to(device=device, dtype=torch.long)
            logits = model(x, token_type_ids=seg_ids, attention_mask=(x>0))
            chunks.append(torch.sigmoid(logits).cpu().numpy())
    return np.concatenate(chunks)

## Load and preprocess the data, then run validation

In [9]:
def train_val_split(train_df):
    """Return the 5-fold (train_idx, val_idx) pairs for ``train_df``.

    Uses a shuffled KFold seeded with ``SEED``.

    NOTE(review): a GroupKFold on question_body was tried and left disabled.
    With plain KFold, rows sharing a question_body can land on both sides of a
    fold. The split presumably has to match the one used when the saved
    checkpoints were trained — confirm before changing it.
    """
    splitter = KFold(n_splits=5, shuffle=True, random_state=SEED)
    return list(splitter.split(train_df))


def load_data():
    """Read train.csv from ``DATA_DIR``.

    :return: ``(train_tars, train_df, input_cols)`` where ``train_tars`` is a
        float32 array of the columns from position 11 onwards, ``train_df`` is
        the full frame and ``input_cols`` names the columns at positions 1, 2
        and 5 (convert_lines reads question_title, question_body and answer
        from them)
    """
    train_df = pd.read_csv(DATA_DIR+'train.csv')
    all_columns = train_df.columns

    input_cols = list(all_columns[[1,2,5]])
    output_cols = list(all_columns[11:])
    train_tars = train_df[output_cols].values.astype('float32')

    return train_tars, train_df, input_cols

In [10]:
# Tokenizer, data and fold indices shared by all evaluation cells below.
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, do_lower_case=True)
train_tars, train_df, input_cols = load_data()
cv_indices = train_val_split(train_df)

# Two architectures with 30 outputs each; checkpoint weights are loaded into
# them fold by fold in the cells that follow.
bert_config = BertConfig(BERT_MODEL_PATH + 'bert_config.json')
model = BertLastTwoClassification(bert_config, num_labels=30).to(device)
model2 = BertCatLastTwoClassification(bert_config, num_labels=30).to(device)

In [11]:
%%time

# Out-of-fold evaluation of the MODEL_PATH_1 checkpoints: every validation fold
# is scored three times, once per truncation mode, with the same weights.
oof_targets = []
oof_preds_head = []
oof_preds_tail = []
oof_preds_mix = []
preds_by_mode = (('head', oof_preds_head), ('tail', oof_preds_tail), ('mix', oof_preds_mix))

for i, (trn_idx, val_idx) in enumerate(cv_indices):
    print(f'Fold {i+1} validating:')

    # Load the state dict stored under the last key of the checkpoint's 'model' entry.
    model_ckpts = torch.load(MODEL_PATH_1+f'fld_{i}_quest_bert_models.pt')['model']
    model.load_state_dict(model_ckpts[list(model_ckpts.keys())[-1]])
    y_val = train_tars[val_idx]
    oof_targets.append(y_val)
    print()

    for mode, fold_preds in preds_by_mode:
        x_val, seg_val = convert_lines(train_df.iloc[val_idx], input_cols,
                                       MAX_SEQUENCE_LENGTH, tokenizer,
                                       trunc_mode=mode)
        val_loader, val_original_indices = prepare_loader(x_val, seg_val, y_val, 16, split='valid')
        fold_preds.append(validate(val_loader, model, val_original_indices))
        torch.cuda.empty_cache()

    print()

oof_targets = np.concatenate(oof_targets)
oof_preds_head = np.concatenate(oof_preds_head)
oof_preds_tail = np.concatenate(oof_preds_tail)
oof_preds_mix = np.concatenate(oof_preds_mix)
# Plain average of the three truncation variants.
oof_preds = np.mean([oof_preds_head, oof_preds_tail, oof_preds_mix], 0)

print(str(torch.cuda.memory_allocated(device)/1e6 ) + 'M')
print(str(torch.cuda.memory_cached(device)/1e6 ) + 'M')
print()

Fold 1 validating:

{"metric": "Val. Rho", "value": 0.4049}
{"metric": "Val. Rho", "value": 0.4060}
{"metric": "Val. Rho", "value": 0.4075}

Fold 2 validating:

{"metric": "Val. Rho", "value": 0.3928}
{"metric": "Val. Rho", "value": 0.3926}
{"metric": "Val. Rho", "value": 0.3941}

Fold 3 validating:

{"metric": "Val. Rho", "value": 0.4029}
{"metric": "Val. Rho", "value": 0.4022}
{"metric": "Val. Rho", "value": 0.4038}

Fold 4 validating:

{"metric": "Val. Rho", "value": 0.3844}
{"metric": "Val. Rho", "value": 0.3835}
{"metric": "Val. Rho", "value": 0.3871}

Fold 5 validating:

{"metric": "Val. Rho", "value": 0.3959}
{"metric": "Val. Rho", "value": 0.3971}
{"metric": "Val. Rho", "value": 0.3977}

1318.291968M
1482.686464M

CPU times: user 5min 33s, sys: 1min 24s, total: 6min 57s
Wall time: 7min


In [12]:
# Compare the three truncation variants of the first model on the out-of-fold
# targets, together with their plain mean (oof_preds) and a 3:2:2
# head/tail/mix weighting. compute_rho perturbs the predictions with small
# random noise before ranking.
print(f'OOF rho of head trunc: {compute_rho(oof_targets, oof_preds_head)}')
print(f'OOF rho of tail trunc: {compute_rho(oof_targets, oof_preds_tail)}')
print(f'OOF rho of mix trunc: {compute_rho(oof_targets, oof_preds_mix)}')
print(f'OOF rho of all truncs: {compute_rho(oof_targets, oof_preds)}')
print(f'OOF rho of weighted truncs: {compute_rho(oof_targets, (3*oof_preds_head + 2*oof_preds_tail + 2*oof_preds_mix) / 7)}')

OOF rho of head trunc: 0.39220573170012807
OOF rho of tail trunc: 0.39238062891537673
OOF rho of mix trunc: 0.3940194586152391
OOF rho of all truncs: 0.39502341805226227
OOF rho of weighted truncs: 0.3949563597837244


In [13]:
# # Post processing

# col_ind = np.where(train_df.columns[11:] == 'question_type_spelling')[0][0]
# post_idx = np.where((train_df.category == "CULTURE") & \
#                     ((train_df.host == "english.stackexchange.com") | (train_df.host == "ell.stackexchange.com")))[0]

# pred_post = np.zeros(len(train_df))
# pred_post[post_idx] += 1

# oof_preds_head[:, col_ind] = pred_post
# oof_preds_tail[:, col_ind] = pred_post
# oof_preds_mix[:, col_ind] = pred_post

In [14]:
# print('After post-processing')
# print(f'OOF rho of head trunc: {compute_rho(oof_targets, oof_preds_head)}')
# print(f'OOF rho of tail trunc: {compute_rho(oof_targets, oof_preds_tail)}')
# print(f'OOF rho of mix trunc: {compute_rho(oof_targets, oof_preds_mix)}')
# print(f'OOF rho of all truncs: {compute_rho(oof_targets, np.mean([oof_preds_head, oof_preds_tail, oof_preds_mix], 0))}')
# print(f'OOF rho of weighted truncs: {compute_rho(oof_targets, (3*oof_preds_head + 2*oof_preds_tail + 2*oof_preds_mix) / 7)}')

In [15]:
%%time

# Out-of-fold predictions for the checkpoints under MODEL_PATH_2, loaded into
# `model`. Inputs use convert_lines' default truncation mode ('head').
oof_2_preds = []

for i, (trn_idx, val_idx) in enumerate(cv_indices):
    print(f'Fold {i+1} validating:')
    
    # Load the state dict stored under the last key of the checkpoint's 'model' entry.
    model_ckpts = torch.load(MODEL_PATH_2+f'fld_{i}_quest_bert_models.pt')['model']
    model.load_state_dict(model_ckpts[list(model_ckpts.keys())[-1]])
    y_val = train_tars[val_idx]
    print()
    
    x_val, seg_val = convert_lines(train_df.iloc[val_idx], input_cols,
                                   MAX_SEQUENCE_LENGTH, tokenizer)
    val_loader, val_original_indices = prepare_loader(x_val, seg_val, y_val, 16, split='valid')
    oof_2_preds.append(validate(val_loader, model, val_original_indices))
    torch.cuda.empty_cache()
    
    print()

# Folds are concatenated in cv_indices order, matching oof_targets built earlier.
oof_2_preds = np.concatenate(oof_2_preds)
    
print(str(torch.cuda.memory_allocated(device)/1e6 ) + 'M')
print(str(torch.cuda.memory_cached(device)/1e6 ) + 'M')
print()

# NOTE(review): the label says "all truncs", but only head-truncated inputs are scored in this cell.
print(f'OOF rho of all truncs: {compute_rho(oof_targets, oof_2_preds)}')

Fold 1 validating:

{"metric": "Val. Rho", "value": 0.4007}

Fold 2 validating:

{"metric": "Val. Rho", "value": 0.3858}

Fold 3 validating:

{"metric": "Val. Rho", "value": 0.3944}

Fold 4 validating:

{"metric": "Val. Rho", "value": 0.3829}

Fold 5 validating:

{"metric": "Val. Rho", "value": 0.3925}

1318.291968M
1566.572544M

OOF rho of all truncs: 0.3887391530691789
CPU times: user 1min 51s, sys: 28.9 s, total: 2min 20s
Wall time: 2min 22s


In [16]:
%%time

# Out-of-fold predictions for the checkpoints under MODEL_PATH_3, loaded into
# `model2` (BertCatLastTwoClassification). Inputs use convert_lines' default
# truncation mode ('head').
oof_3_preds = []

for i, (trn_idx, val_idx) in enumerate(cv_indices):
    print(f'Fold {i+1} validating:')
    
    # Load the state dict stored under the last key of the checkpoint's 'model' entry.
    model_ckpts = torch.load(MODEL_PATH_3+f'fld_{i}_quest_bert_models.pt')['model']
    model2.load_state_dict(model_ckpts[list(model_ckpts.keys())[-1]])
    y_val = train_tars[val_idx]
    print()
    
    x_val, seg_val = convert_lines(train_df.iloc[val_idx], input_cols,
                                   MAX_SEQUENCE_LENGTH, tokenizer)
    val_loader, val_original_indices = prepare_loader(x_val, seg_val, y_val, 16, split='valid')
    oof_3_preds.append(validate(val_loader, model2, val_original_indices))
    torch.cuda.empty_cache()
    
    print()

# Folds are concatenated in cv_indices order, matching oof_targets built earlier.
oof_3_preds = np.concatenate(oof_3_preds)
    
print(str(torch.cuda.memory_allocated(device)/1e6 ) + 'M')
print(str(torch.cuda.memory_cached(device)/1e6 ) + 'M')
print()

# NOTE(review): the label says "all truncs", but only head-truncated inputs are scored in this cell.
print(f'OOF rho of all truncs: {compute_rho(oof_targets, oof_3_preds)}')

Fold 1 validating:

{"metric": "Val. Rho", "value": 0.4015}

Fold 2 validating:

{"metric": "Val. Rho", "value": 0.3923}

Fold 3 validating:

{"metric": "Val. Rho", "value": 0.3981}

Fold 4 validating:

{"metric": "Val. Rho", "value": 0.3835}

Fold 5 validating:

{"metric": "Val. Rho", "value": 0.3921}

1318.291968M
1587.544064M

OOF rho of all truncs: 0.3913146618495562
CPU times: user 1min 53s, sys: 29 s, total: 2min 22s
Wall time: 2min 23s


In [17]:
%%time

# Out-of-fold predictions for the checkpoints under MODEL_PATH_4, loaded into
# `model`. Inputs are truncated with trunc_mode='tail'.
oof_4_preds = []

for i, (trn_idx, val_idx) in enumerate(cv_indices):
    print(f'Fold {i+1} validating:')
    
    # Load the state dict stored under the last key of the checkpoint's 'model' entry.
    model_ckpts = torch.load(MODEL_PATH_4+f'fld_{i}_quest_bert_models.pt')['model']
    model.load_state_dict(model_ckpts[list(model_ckpts.keys())[-1]])
    y_val = train_tars[val_idx]
    print()
    
    x_val, seg_val = convert_lines(train_df.iloc[val_idx], input_cols,
                                   MAX_SEQUENCE_LENGTH, tokenizer,
                                   trunc_mode='tail')
    val_loader, val_original_indices = prepare_loader(x_val, seg_val, y_val, 16, split='valid')
    oof_4_preds.append(validate(val_loader, model, val_original_indices))
    torch.cuda.empty_cache()
    
    print()

# Folds are concatenated in cv_indices order, matching oof_targets built earlier.
oof_4_preds = np.concatenate(oof_4_preds)
    
print(str(torch.cuda.memory_allocated(device)/1e6 ) + 'M')
print(str(torch.cuda.memory_cached(device)/1e6 ) + 'M')
print()

# NOTE(review): the label says "all truncs", but only tail-truncated inputs are scored in this cell.
print(f'OOF rho of all truncs: {compute_rho(oof_targets, oof_4_preds)}')

Fold 1 validating:

{"metric": "Val. Rho", "value": 0.4033}

Fold 2 validating:

{"metric": "Val. Rho", "value": 0.3859}

Fold 3 validating:

{"metric": "Val. Rho", "value": 0.3909}

Fold 4 validating:

{"metric": "Val. Rho", "value": 0.3843}

Fold 5 validating:

{"metric": "Val. Rho", "value": 0.3944}

1318.291968M
1692.401664M

OOF rho of all truncs: 0.3892913935729739
CPU times: user 1min 53s, sys: 29 s, total: 2min 22s
Wall time: 2min 23s


In [18]:
def post_processing_and_eval(y_true, y_pred):
    """Quantise each prediction column and score the result.

    For every column, step sizes ``1/max_voters`` with ``max_voters`` in
    1..199 are tried: predictions are floored to a multiple of the step and
    the step with the highest Spearman correlation against ``y_true`` is kept
    (1 when no candidate scores above 0). ``y_pred`` itself is not modified.

    Idea from https://www.kaggle.com/c/google-quest-challenge/discussion/129831

    NOTE(review): candidates are ranked with plain ``spearmanr`` while the
    returned score goes through ``compute_rho``, which adds tie-breaking noise
    to the quantised values; a noise-free mean was left commented out by the
    author.

    :return: ``(rho, list_of_max_voters)`` — the compute_rho score of the
        quantised predictions and the chosen ``max_voters`` per column
    """
    y = copy.deepcopy(y_pred)
    list_of_max_voters = []

    for col in range(y_pred.shape[1]):
        best_score, best_max_voters = 0, 1

        for max_voters in range(1, 200):
            step = 1/max_voters
            y[:, col] = (y_pred[:, col] // step) * step
            score = spearmanr(y_true[:, col], y[:, col]).correlation
            # NaN scores (e.g. a constant column) never compare greater.
            if score > best_score:
                best_score, best_max_voters = score, max_voters

        list_of_max_voters.append(best_max_voters)
        best_step = 1/best_max_voters
        y[:, col] = (y_pred[:, col] // best_step) * best_step

    return compute_rho(y_true, y), list_of_max_voters

In [19]:
# ensemble models
# The first model's truncation average is rebuilt from its parts rather than
# read from (and written back to) oof_preds, so re-running this cell always
# produces the same blend instead of blending an already blended array.
oof_1_preds = np.mean([oof_preds_head, oof_preds_tail, oof_preds_mix], 0)
oof_preds = oof_1_preds*0.3 + oof_2_preds*0.25 + oof_3_preds*0.25 + oof_4_preds*0.2
print(f'OOF rho before post-processing: {compute_rho(oof_targets, oof_preds)}')
oof_df = pd.DataFrame(oof_preds, columns=list(train_df.columns[11:]))
oof_df.to_csv('oof_cv.csv', index=False)

cv_rho, list_of_max_voters = post_processing_and_eval(oof_targets, oof_preds)
print(f'OOF rho after post-processing: {cv_rho}')
print(list_of_max_voters)

OOF rho before post-processing: 0.4064911201282894


  c /= stddev[:, None]
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


OOF rho after post-processing: 0.37910860143401703
[142, 30, 6, 33, 13, 4, 86, 23, 7, 142, 15, 9, 6, 9, 8, 7, 5, 50, 36, 149, 52, 24, 49, 81, 88, 64, 17, 54, 50, 127]


In [20]:
# Per-column rounding granularity: predictions are snapped to multiples of
# 1/raters[j] (presumably the number of rating steps of target column j — TODO confirm).
raters = np.array([18, 18,  6,  6,  6,  6, 18, 18,  6,  6,  6,  6,  6,  6,  6,  6,  6, 6,  6,  3, 18, 18, 18, 18, 18, 90,  6,  6,  6, 18])

# Min-max scale every column to [0, 1] before rounding.
mins = np.min(oof_preds, axis=0)
maxs = np.max(oof_preds, axis=0)
oof_post = (oof_preds - mins)/(maxs - mins)

# `np.float` was only an alias of the builtin float and no longer exists in
# current NumPy releases; the builtin gives the same dtype.
oof_post =  np.round(raters * oof_post).astype(float) / raters
print(f'OOF rho after post-processing: {compute_rho(oof_targets, oof_post)}')

OOF rho after post-processing: 0.3805010308746449
