In [None]:
import os
import numpy as np
import pandas as pd
import random

from transformers import (AutoConfig, AutoModel, AutoTokenizer, AdamW, 
                          get_linear_schedule_with_warmup, logging, 
                          RobertaConfig, PreTrainedModel, RobertaModel)

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, SequentialSampler, RandomSampler, DataLoader

from tqdm.notebook import tqdm

import gc; gc.enable()
from IPython.display import clear_output

from sklearn.model_selection import StratifiedKFold

logging.set_verbosity_error()

In [None]:
# --- Paths and pretrained checkpoint ---------------------------------------
INPUT_DIR = '../input/commonlitreadabilityprize'
MODEL_NAME = 'roberta-large'

# --- Tokenisation -----------------------------------------------------------
# Every encoded sentence pair is padded/truncated to this many tokens.
MAX_LENGTH = 256

# --- Optimiser (passed to AdamW in train()) ---------------------------------
LR = 2e-5
EPS = 1e-8

# --- Cross-validation -------------------------------------------------------
# SEED is the random_state of the fold split; SEEDS are the per-run training seeds.
SEED = 42
NUM_FOLDS = 5
SEEDS = [113, 71, 17, 43, 37]

# --- Training loop ----------------------------------------------------------
EPOCHS = 5
TRAIN_BATCH_SIZE = 8
VAL_BATCH_SIZE = 32

# Use the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
def set_seed(seed = 0):
    """Seed every random number generator this notebook relies on.

    Seeds NumPy's global generator, Python's ``random`` module and PyTorch
    (CPU and CUDA), switches cuDNN to deterministic mode with benchmarking
    disabled, and stores the seed in the ``PYTHONHASHSEED`` environment
    variable.

    Args:
        seed: Integer seed applied to all generators.

    Returns:
        A fresh ``np.random.RandomState`` built from the same seed.
    """
    np.random.seed(seed)
    rng = np.random.RandomState(seed)
    random.seed(seed)

    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # Favour reproducibility over speed in cuDNN kernels.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    os.environ['PYTHONHASHSEED'] = str(seed)
    return rng


# Notebook-wide default seeding, applied once when this cell runs.
seed = 1112
random_state = set_seed(seed)

In [None]:
class ContinuousStratifiedKFold(StratifiedKFold):
    """StratifiedKFold variant that accepts a continuous target.

    The target is discretised into equal-width bins and the resulting bin
    indices are used as the stratification labels.
    """

    def split(self, x, y, groups=None):
        """Split ``x`` into folds stratified on the binned values of ``y``.

        Args:
            x: Samples to split; forwarded unchanged to ``StratifiedKFold.split``.
            y: Continuous target values, one per sample.
            groups: Forwarded unchanged to ``StratifiedKFold.split``.

        Returns:
            Whatever ``StratifiedKFold.split`` returns for the binned target.
        """
        n_samples = len(y)
        # Bin count grows with log2 of the sample count: floor(log2(n) + 1).
        bin_count = int(np.floor(np.log2(n_samples) + 1))
        binned_target = pd.cut(y, bins=bin_count, labels=False)
        return super().split(x, binned_target, groups)
    
def get_data_loaders(data, fold):
    """Build the training and validation DataLoaders for one fold.

    Rows whose ``fold`` column equals ``fold`` form the validation set; all
    other rows form the training set. Each row's (``sentence1``, ``sentence2``)
    pair is tokenised with the module-level ``tokenizer`` (assigned in a later
    cell, so it must exist before this function is called) and padded or
    truncated to ``MAX_LENGTH`` tokens.

    Args:
        data: DataFrame with ``sentence1``, ``sentence2``, ``target`` and
            ``fold`` columns.
        fold: Fold number to hold out for validation.

    Returns:
        ``(dataloader_train, dataloader_val)``. Training batches are drawn in
        random order, validation batches in sequential order; every batch is
        an ``(input_ids, attention_mask, target)`` triple.
    """
    pair_columns = ['sentence1', 'sentence2']

    def _make_dataset(row_mask):
        # Select the rows, tokenise their sentence pairs and bundle with targets.
        pairs = data.loc[row_mask, pair_columns].values.astype("str").tolist()
        targets = data.loc[row_mask, 'target'].values
        encoded = tokenizer.batch_encode_plus(
            pairs,
            add_special_tokens=True,
            return_attention_mask=True,
            padding='max_length',
            truncation=True,
            max_length=MAX_LENGTH,
            return_tensors='pt',
        )
        return TensorDataset(
            encoded['input_ids'],
            encoded['attention_mask'],
            torch.tensor(targets),
        )

    dataset_train = _make_dataset(data.fold != fold)
    dataset_val = _make_dataset(data.fold == fold)

    dataloader_train = DataLoader(
        dataset_train,
        sampler=RandomSampler(dataset_train),
        batch_size=TRAIN_BATCH_SIZE,
    )
    dataloader_val = DataLoader(
        dataset_val,
        sampler=SequentialSampler(dataset_val),
        batch_size=VAL_BATCH_SIZE,
    )

    return dataloader_train, dataloader_val

In [None]:
class RobertaPreTrainedModel(PreTrainedModel):
    """Base class tying RoBERTa-style models to ``RobertaConfig``.

    Supplies the weight-initialisation rule and a helper for pruning the
    ignore-key lists used when saving / loading.
    """

    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def _init_weights(self, module):
        """Initialise one sub-module's parameters according to its type.

        Linear and Embedding weights are drawn from a normal distribution with
        standard deviation ``config.initializer_range``; Linear biases and the
        Embedding padding row are zeroed. LayerNorm is reset to weight 1,
        bias 0. Any other module type is left untouched.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return

        if isinstance(module, nn.LayerNorm):
            module.weight.data.fill_(1.0)
            module.bias.data.zero_()

    def update_keys_to_ignore(self, config, del_keys_to_ignore):
        """Drop ``del_keys_to_ignore`` from the save / load-missing ignore lists.

        Does nothing when ``config.tie_word_embeddings`` is truthy. The two
        lists are not defined in this class; presumably they are inherited
        from ``PreTrainedModel`` (verify against the installed version).
        """
        if config.tie_word_embeddings:
            return

        def _without_deleted(keys):
            return [key for key in keys if key not in del_keys_to_ignore]

        self._keys_to_ignore_on_save = _without_deleted(self._keys_to_ignore_on_save)
        self._keys_to_ignore_on_load_missing = _without_deleted(self._keys_to_ignore_on_load_missing)

class RobertaForRegression(RobertaPreTrainedModel):
    """RoBERTa encoder with a BiLSTM + pooling head producing one scalar per input.

    Pipeline: RoBERTa last hidden state -> attention-masked mean over tokens
    -> bidirectional LSTM run on that single pooled vector (sequence length 1)
    -> sliding average / max pooling over the LSTM features -> dropout
    -> linear layer with one output.

    ``forward`` returns the RMSE loss when ``labels`` is given, otherwise the
    predictions.
    """

    def __init__(self, config):
        super().__init__(config)
        self.config = config

        hidden_size = config.hidden_size
        # The BiLSTM emits 2 * hidden_size features. A pooling window of
        # hidden_size with stride 1 over them yields hidden_size + 1 values,
        # and the average- and max-pooled results are concatenated.
        # For roberta-large (hidden_size=1024) this is the original 2050.
        head_in_features = 2 * (hidden_size + 1)

        self.roberta = RobertaModel(config, add_pooling_layer=False)
        self.bi_lstm = nn.LSTM(
            input_size=hidden_size,
            hidden_size=hidden_size,
            bidirectional=True,
            batch_first=True,
        )
        self.avg_pool = nn.AvgPool1d(hidden_size, 1)
        self.max_pool = nn.MaxPool1d(hidden_size, 1)
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(head_in_features, 1)
        self.loss = nn.MSELoss()

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Run the encoder and regression head.

        Args:
            input_ids, attention_mask, token_type_ids, position_ids, head_mask,
            inputs_embeds, output_attentions, output_hidden_states, return_dict:
                Forwarded to the wrapped ``RobertaModel``. When
                ``attention_mask`` is None, every token is included in the
                mean pooling.
            labels: Optional regression targets, one per input.

        Returns:
            A scalar tensor ``sqrt(MSE(preds, labels))`` when ``labels`` is
            given; otherwise a 1-D tensor of predictions with one entry per
            input (also for a batch of one).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        last_hidden_state = outputs[0]

        if attention_mask is None:
            # No mask supplied: pool over every position instead of crashing.
            attention_mask = torch.ones(
                last_hidden_state.shape[:2], device=last_hidden_state.device
            )

        # Mean over tokens, counting only positions where the mask is non-zero.
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)  # avoid division by zero
        mean_embeddings = (sum_embeddings / sum_mask).unsqueeze(dim=1)

        lstm_out, _ = self.bi_lstm(mean_embeddings)
        pooled = torch.cat((self.avg_pool(lstm_out), self.max_pool(lstm_out)), -1)
        logits = self.linear(self.dropout(pooled).squeeze(dim=1))

        # Drop only the trailing size-1 output dimension. A second squeeze
        # would also collapse a batch of one into a 0-dim tensor.
        preds = logits.squeeze(-1)

        if labels is not None:
            loss = torch.sqrt(self.loss(preds.view(-1).float(), labels.view(-1).float()))
            return loss
        else:
            return preds

In [None]:
def generate_pair_sentences(data):
    """Turn the ``excerpt`` column into (``sentence1``, ``sentence2``) pairs.

    ``sentence1`` is the same for every row: the excerpt of the first row
    whose ``target`` equals 0, converted to ``str``. ``sentence2`` is the
    row's own excerpt. The ``excerpt`` column is then removed.

    Note: ``data`` is modified in place and the same object is returned, so
    calling this twice on one frame fails because ``excerpt`` is already gone.

    Args:
        data: DataFrame with ``excerpt`` and ``target`` columns.

    Returns:
        The same DataFrame, with ``sentence1`` / ``sentence2`` added and
        ``excerpt`` dropped.

    Raises:
        IndexError: If no row has ``target == 0``.
    """
    reference = data.loc[data.target == 0, 'excerpt'].values.astype(str).tolist()
    if not reference:
        raise IndexError("no row with target == 0 to use as the reference excerpt (sentence1)")

    data['sentence1'] = [reference[0]] * data.shape[0]
    # Straight column copy; no need to walk the frame row by row.
    data['sentence2'] = data['excerpt'].tolist()
    data.drop(columns=['excerpt'], inplace=True)
    return data

In [None]:
data = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))
data = generate_pair_sentences(data)

# Create stratified folds. NUM_FOLDS is also the range the training loop
# iterates over, so the split must use the same constant.
kf = ContinuousStratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SEED)

# split() yields positional indices, so collect them in a positional array
# rather than using them as index labels.
fold_ids = np.full(len(data), -1, dtype=int)
for f, (_, val_idx) in enumerate(kf.split(data, data.target)):
    fold_ids[val_idx] = f
assert (fold_ids >= 0).all(), "every row must belong to exactly one validation fold"
data['fold'] = fold_ids

In [None]:
def evaluate(model, val_dataloader):
    """Compute the mean per-batch validation loss.

    The model is put in eval mode for the pass, so dropout is disabled, and
    its previous train/eval mode is restored afterwards, including when an
    exception is raised.

    Args:
        model: Module that returns a scalar loss tensor when called with
            ``input_ids``, ``attention_mask`` and ``labels``.
        val_dataloader: Iterable of ``(input_ids, attention_mask, labels)``
            batches.

    Returns:
        The sum of the per-batch losses divided by the number of batches.
        Each batch counts equally, whatever its size.

    Raises:
        ZeroDivisionError: If ``val_dataloader`` has no batches.
    """
    was_training = model.training
    model.eval()
    loss_val_total = 0
    try:
        with torch.no_grad():
            for batch in val_dataloader:
                batch = tuple(b.to(DEVICE) for b in batch)
                loss = model(
                    input_ids=batch[0],
                    attention_mask=batch[1],
                    labels=batch[2],
                )
                loss_val_total += loss.item()
    finally:
        # Hand the model back in the mode the caller left it in.
        model.train(was_training)
    loss_val_avg = loss_val_total / len(val_dataloader)
    return loss_val_avg

def train(model, train_dataloader, val_dataloader):
    """Fine-tune ``model`` for ``EPOCHS`` epochs and report the best validation loss.

    Uses AdamW (``LR``, ``EPS``) with a linear learning-rate decay and no
    warm-up, stepping the scheduler once per batch. After each epoch the model
    is scored with ``evaluate`` and the losses are printed.

    Args:
        model: Module returning a scalar loss tensor when called with
            ``input_ids``, ``attention_mask`` and ``labels``.
        train_dataloader: Batches of ``(input_ids, attention_mask, labels)``
            used for optimisation.
        val_dataloader: Batches passed to ``evaluate`` after every epoch.

    Returns:
        The lowest validation loss seen over all epochs
        (``float('inf')`` if no epoch ran).
    """
    optimizer = AdamW(model.parameters(), lr=LR, eps=EPS)
    total_steps = len(train_dataloader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps
    )

    # Start from +inf so the first real validation loss always becomes the
    # best. A start value of 1 would be returned as a fake "best" whenever
    # the true loss never drops below 1.
    best_val_loss = float('inf')

    for epoch in range(EPOCHS):
        # Re-enter train mode every epoch in case evaluation left eval mode on.
        model.train()
        loss_train_total = 0
        for batch in tqdm(train_dataloader):
            model.zero_grad()
            batch = tuple(b.to(DEVICE) for b in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[2]
            }
            loss = model(**inputs)
            loss_train_total += loss.item()
            loss.backward()
            optimizer.step()
            scheduler.step()
        loss_train_avg = loss_train_total / len(train_dataloader)
        loss_val_avg = evaluate(model, val_dataloader)
        print(f'epoch:{epoch+1}/{EPOCHS} train loss={loss_train_avg}  val loss={loss_val_avg}')

        if loss_val_avg < best_val_loss:
            best_val_loss = loss_val_avg
    return best_val_loss

In [None]:
losses = []

MAX_RUNS = 2
runs = 0   # Number of (seed, fold) runs completed; controls early termination

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Holds the most recently trained model, which the save cell below writes out.
# Stays None only if no run is executed (MAX_RUNS == 0).
model = None

for i, seed in enumerate(SEEDS):
    # Termination condition
    if runs >= MAX_RUNS:
        print(f'{runs} runs termination condition reached.')
        break

    print(f'********* seed({i}) = {seed} ***********')

    for fold in range(NUM_FOLDS):
        print(f'*** fold = {fold} ***')
        set_seed(seed)

        # Start every run from the pretrained checkpoint. Reusing one model
        # across folds would let it train on rows that a later fold uses for
        # validation, making that fold's validation loss look better than it is.
        model = None   # release the previous run's weights before loading new ones
        gc.collect()
        model = RobertaForRegression.from_pretrained(MODEL_NAME)
        model.to(DEVICE)

        train_dataloader, val_dataloader = get_data_loaders(data, fold)

        loss = train(model, train_dataloader, val_dataloader)
        losses.append(loss)

        # Termination condition
        runs += 1
        if runs >= MAX_RUNS:
            break

In [None]:
# Write the current `model` (weights and config) to the Kaggle working directory.
output_dir = "/kaggle/working"
model.save_pretrained(output_dir)