In [None]:
import os
import numpy as np
import pandas as pd
import random
import copy

from transformers import (AutoConfig, AutoModel, AutoTokenizer, AdamW, 
                          get_linear_schedule_with_warmup, logging, 
                          RobertaConfig, PreTrainedModel, RobertaModel)

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, SequentialSampler, RandomSampler, DataLoader

from tqdm.notebook import tqdm

import gc; gc.enable()
from IPython.display import clear_output

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import f1_score

logging.set_verbosity_error()

In [None]:
# --- Paths and backbone -----------------------------------------------------
INPUT_DIR = '../input/commonlitreadabilityprize'  # directory that holds train.csv
MODEL_NAME = 'roberta-large'  # checkpoint name passed to from_pretrained for model and tokenizer

# --- Tokenisation / optimiser -----------------------------------------------
MAX_LENGTH = 256  # excerpts are padded/truncated to this many tokens
LR = 2e-5  # AdamW learning rate
EPS = 1e-8  # AdamW epsilon

# Seed for the StratifiedKFold split (distinct from the per-run training seeds below).
SEED = 42

# --- Cross-validation -------------------------------------------------------
NUM_FOLDS = 5  # number of folds the training loop iterates over
SEEDS = [113, 71, 17, 43, 37]  # one training seed per outer-loop pass

# --- Training loop ----------------------------------------------------------
EPOCHS = 5  # epochs per call to train(); also sizes the LR schedule
TRAIN_BATCH_SIZE = 8
VAL_BATCH_SIZE = 32

# Use the GPU when one is visible, otherwise fall back to CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def set_seed(seed=0):
    """Seed Python, NumPy and PyTorch RNGs and switch cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to every generator.

    Returns:
        A new ``np.random.RandomState`` seeded with ``seed``; it is independent
        of NumPy's global generator, which is seeded as well.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # The individual seeding calls are independent of one another.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    # Trade cuDNN auto-tuning for run-to-run reproducibility.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    return np.random.RandomState(seed)

# Seed every RNG once up front and keep the returned RandomState.
# Note: `seed` is a plain module-level name; the training loop later rebinds it
# while iterating over SEEDS, and it is unrelated to the SEED constant.
seed=1112
random_state = set_seed(seed)

In [None]:
def get_data_loaders(data, fold):
    """Build the training and validation DataLoaders for one cross-validation fold.

    Rows whose ``fold`` column equals ``fold`` form the validation set; all
    other rows form the training set. Text comes from the ``excerpt`` column
    and targets from the ``coded_target`` column (a sequence of numbers per
    row, converted to floats).

    Relies on the module-level ``tokenizer``, ``MAX_LENGTH``,
    ``TRAIN_BATCH_SIZE`` and ``VAL_BATCH_SIZE``.

    Args:
        data: DataFrame with ``excerpt``, ``coded_target`` and ``fold`` columns.
        fold: Fold id to hold out for validation.

    Returns:
        ``(dataloader_train, dataloader_val)``. The training loader samples
        randomly, the validation loader iterates in order. Each batch is
        ``(input_ids, attention_mask, targets)``.
    """

    def _as_tensor_dataset(rows):
        # Tokenise one split and bundle ids, mask and float targets together.
        texts = rows['excerpt'].tolist()
        targets = [[float(value) for value in coded] for coded in rows['coded_target'].values]
        encoded = tokenizer.batch_encode_plus(
            texts,
            add_special_tokens=True,
            return_attention_mask=True,
            padding='max_length',
            truncation=True,
            max_length=MAX_LENGTH,
            return_tensors='pt',
        )
        return TensorDataset(
            encoded['input_ids'],
            encoded['attention_mask'],
            torch.tensor(targets),
        )

    dataset_train = _as_tensor_dataset(data.loc[data.fold != fold])
    dataset_val = _as_tensor_dataset(data.loc[data.fold == fold])

    train_sampler = RandomSampler(dataset_train)
    val_sampler = SequentialSampler(dataset_val)

    dataloader_train = DataLoader(dataset_train, sampler=train_sampler, batch_size=TRAIN_BATCH_SIZE)
    dataloader_val = DataLoader(dataset_val, sampler=val_sampler, batch_size=VAL_BATCH_SIZE)

    return dataloader_train, dataloader_val

In [None]:
class RobertaPreTrainedModel(PreTrainedModel):
    """PreTrainedModel base bound to RobertaConfig with the "roberta" weight prefix.

    Supplies the per-module weight initialiser and a helper that trims the
    save/load ignore-key lists.
    """

    config_class = RobertaConfig
    base_model_prefix = "roberta"

    def _init_weights(self, module):
        """Initialise a single sub-module according to its layer type.

        Linear and Embedding weights are drawn from N(0, initializer_range);
        Linear biases and the embedding padding row are zeroed; LayerNorm is
        reset to weight 1 / bias 0. Other module types are left untouched.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            pad_row = module.padding_idx
            if pad_row is not None:
                module.weight.data[pad_row].zero_()
            return

        if isinstance(module, nn.LayerNorm):
            module.weight.data.fill_(1.0)
            module.bias.data.zero_()

    def update_keys_to_ignore(self, config, del_keys_to_ignore):
        """Drop ``del_keys_to_ignore`` from the ignore lists when embeddings are untied.

        Does nothing when ``config.tie_word_embeddings`` is truthy.
        """
        if config.tie_word_embeddings:
            return

        def _is_kept(key):
            return key not in del_keys_to_ignore

        self._keys_to_ignore_on_save = list(filter(_is_kept, self._keys_to_ignore_on_save))
        self._keys_to_ignore_on_load_missing = list(
            filter(_is_kept, self._keys_to_ignore_on_load_missing)
        )

class RobertaForMultiClass(RobertaPreTrainedModel):
    """RoBERTa encoder with masked mean pooling and a 3-output classification head.

    The head is dropout -> dense -> ReLU -> dropout -> linear(3). Training uses
    ``BCEWithLogitsLoss``, so labels must be float tensors with the same shape
    as the logits (e.g. one-hot rows of length 3).
    """

    def __init__(self, config):
        super().__init__(config)
        self.config = config

        # The built-in pooler is skipped; pooling is done manually in forward().
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.relu = nn.ReLU()
        self.out_proj = nn.Linear(config.hidden_size, 3)
        self.loss = nn.BCEWithLogitsLoss()

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Run the encoder, mean-pool the token states and apply the head.

        Args:
            input_ids, attention_mask, token_type_ids, position_ids, head_mask,
            inputs_embeds, output_attentions, output_hidden_states, return_dict:
                Forwarded unchanged to the RoBERTa encoder. When
                ``attention_mask`` is ``None`` every position is treated as a
                real token for pooling.
            labels: Optional float tensor shaped like the logits.

        Returns:
            The scalar BCE-with-logits loss when ``labels`` is given, otherwise
            the raw logits with a last dimension of 3.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # Index 0 is the last hidden state for both tuple and dict-style outputs.
        last_hidden_state = outputs[0]

        # attention_mask is optional in the signature; without this fallback the
        # pooling below would fail on `None.unsqueeze`.
        if attention_mask is None:
            attention_mask = torch.ones(
                last_hidden_state.shape[:2],
                dtype=torch.long,
                device=last_hidden_state.device,
            )

        # Masked mean over the sequence axis: padded positions contribute nothing.
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)  # guard against division by zero
        mean_embeddings = sum_embeddings / sum_mask

        mean_embeddings = self.dropout(mean_embeddings)
        mean_embeddings = self.dense(mean_embeddings)
        mean_embeddings = self.relu(mean_embeddings)
        mean_embeddings = self.dropout(mean_embeddings)
        logits = self.out_proj(mean_embeddings)

        if labels is not None:
            return self.loss(logits, labels)
        return logits

In [None]:
def create_segments(total_bins, bin_len):
    """Return ``(start, end)`` boundaries for consecutive bins of width ``bin_len``.

    The first bin is always ``(0, bin_len)``. When more than one bin is
    requested, the final bin's end is ``np.nan`` to mark it as open-ended.

    Args:
        total_bins: Number of bins to describe.
        bin_len: Width of every bin except the open-ended last one.

    Returns:
        List of ``(start, end)`` tuples, one per bin.
    """
    segments = [(0, bin_len)]
    final_position = total_bins - 2
    lower = bin_len
    for position in range(total_bins - 1):
        # The last appended bin is open-ended; the others span one bin_len.
        upper = np.nan if position == final_position else lower + bin_len
        segments.append((lower, upper))
        lower = upper
    return segments

def create_bins(df, column, num_of_bins=5):
    """Sort ``df`` by ``column`` and label each row with its rank-based bin.

    Rows are sorted ascending by ``column`` and the index is reset. Each of
    the first ``num_of_bins - 1`` bins holds ``len(df) // num_of_bins`` rows;
    the last bin takes every remaining row. Bin ids are stored as strings
    ('0', '1', ...) in a new column named ``f"{column}_segment"``.

    Args:
        df: Input DataFrame (not modified; a sorted copy is returned).
        column: Name of the column to sort and bin by.
        num_of_bins: Number of bins to create.

    Returns:
        The sorted DataFrame with the extra ``<column>_segment`` column.
    """
    bin_len = int(len(df) / num_of_bins)
    bins = create_segments(num_of_bins, bin_len)
    df = df.sort_values(column, ascending=True).reset_index(drop=True)
    column_name = column + "_segment"

    # Build the labels in an object array and attach them in one assignment.
    # Writing strings into an integer-initialised column is an incompatible-dtype
    # setitem, which newer pandas versions deprecate.
    labels = np.empty(len(df), dtype=object)
    for index, (start, end) in enumerate(bins):
        # An NaN end marks the open-ended last bin; slices are half-open so
        # neighbouring bins never overlap.
        stop = None if pd.isna(end) else int(end)
        labels[start:stop] = str(index)

    df[column_name] = labels
    return df

In [None]:
data = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))

# Sort excerpts by standard_error and tag each with one of `bins` rank-based segments.
bins = 4
data = create_bins(copy.deepcopy(data), "standard_error", bins)

# Segment '3' (the highest standard_error) is split in two by target value at
# sorted position 355; segments '0'-'2' are kept together.
bi_modal = data.loc[data['standard_error_segment'] == str(3)].sort_values(by='target').reset_index(drop=True)
G_keys = data.loc[data['standard_error_segment'].isin(['0', '1', '2']), 'target'].values
B0_keys = bi_modal[:355]['target'].values
B1_keys = bi_modal[355:]['target'].values

# Set lookups replace the per-row linear scan over the numpy arrays.
_G_key_set = set(G_keys.tolist())
_B0_key_set = set(B0_keys.tolist())

def assign_class_labels(row):
    """Map a row to class 'A', 'B' or 'C' by looking up its target value.

    'A' if the target occurs among segments '0'-'2', else 'B' if it occurs in
    the lower part of segment '3', else 'C'.
    """
    target = row['target']
    if target in _G_key_set:
        return 'A'
    if target in _B0_key_set:
        return 'B'
    return 'C'

data['class_labels'] = data.apply(assign_class_labels, axis=1)

# One-hot encode the class labels; each row stores its encoding as a list.
lb = LabelBinarizer()
lb = lb.fit(data.class_labels.values)
data['coded_target'] = lb.transform(data.class_labels.values).tolist()

# Create stratified folds. NUM_FOLDS keeps the split in step with the training
# loop, which iterates over range(NUM_FOLDS).
kf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=SEED)
for f, (t_, v_) in enumerate(kf.split(data, data.class_labels)):
    data.loc[v_, 'fold'] = f
data['fold'] = data['fold'].astype(int)

In [None]:
def evaluate(model, val_dataloader):
    """Return the mean per-batch loss of ``model`` over ``val_dataloader``.

    Puts the model in eval mode (and leaves it there) and runs every batch
    without gradient tracking. Each batch is expected to provide input ids,
    attention mask and labels at positions 0, 1 and 2.

    Args:
        model: Module that returns a loss when called with ``labels``.
        val_dataloader: Iterable of tensor batches.

    Returns:
        Sum of batch losses divided by the number of batches.
    """
    model.eval()
    running_loss = 0
    for raw_batch in val_dataloader:
        on_device = tuple(tensor.to(DEVICE) for tensor in raw_batch)
        with torch.no_grad():
            batch_loss = model(
                input_ids=on_device[0],
                attention_mask=on_device[1],
                labels=on_device[2],
            )
        running_loss += batch_loss.item()
    return running_loss / len(val_dataloader)

def train(model, train_dataloader, val_dataloader):
    """Fine-tune ``model`` for ``EPOCHS`` epochs and report the best validation loss.

    Uses AdamW (``LR``, ``EPS``) with a linear decay schedule without warm-up,
    stepping optimiser and scheduler once per batch. After each epoch the model
    is scored with ``evaluate`` and the losses are printed.

    Args:
        model: Module that returns a loss when called with ``input_ids``,
            ``attention_mask`` and ``labels``.
        train_dataloader: Batches of ``(input_ids, attention_mask, labels)``.
        val_dataloader: Validation batches in the same layout.

    Returns:
        The lowest epoch-level average validation loss (``inf`` if no epoch ran).
    """
    optimizer = AdamW(model.parameters(), lr=LR, eps=EPS)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=len(train_dataloader) * EPOCHS,
    )
    # Start from +inf so the first real validation loss always wins, even if it
    # is above 1.
    best_val_loss = float('inf')
    for epoch in range(EPOCHS):
        # evaluate() leaves the model in eval mode, so training mode (dropout)
        # has to be restored at the start of every epoch, not just the first.
        model.train()
        loss_train_total = 0
        for batch in tqdm(train_dataloader):
            model.zero_grad()
            batch = tuple(b.to(DEVICE) for b in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[2]
            }
            loss = model(**inputs)
            loss_train_total += loss.item()
            loss.backward()
            optimizer.step()
            scheduler.step()
        loss_train_avg = loss_train_total / len(train_dataloader)
        loss_val_avg = evaluate(model, val_dataloader)
        print(f'epoch:{epoch+1}/{EPOCHS} train loss={loss_train_avg}  val loss={loss_val_avg}')

        best_val_loss = min(best_val_loss, loss_val_avg)
    return best_val_loss

In [None]:
losses = []

MAX_RUNS = 2
runs = 0   # Variable to control termination condition

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = None  # rebuilt from the pretrained checkpoint for every run

for i, seed in enumerate(SEEDS):
    # Termination condition
    if runs == MAX_RUNS:
        print(f'{runs} runs termination condition reached.')
        break

    print(f'********* seed({i}) = {seed} ***********')

    for fold in range(NUM_FOLDS):
        print(f'*** fold = {fold} ***')
        # Seed before building the model so the new head's initialisation is
        # reproducible as well.
        set_seed(seed)

        # Start every run from the pretrained checkpoint. Reusing one model
        # across folds would carry weights over, so a later fold's validation
        # rows would already have been trained on in an earlier fold.
        model = None
        gc.collect()
        torch.cuda.empty_cache()
        model = RobertaForMultiClass.from_pretrained(MODEL_NAME)
        model.to(DEVICE)

        train_dataloader, val_dataloader = get_data_loaders(data, fold)

        loss = train(model, train_dataloader, val_dataloader)
        losses.append(loss)

        # Termination condition
        runs += 1
        if runs == MAX_RUNS:
            break

# After the loop, `model` is the model of the last run, whose held-out fold is
# the final value of `fold`.

In [None]:
# Score the current `model` on the validation split of fold 4 with micro-F1.
# NOTE(review): the fold id is hard-coded; confirm that fold 4 was actually held
# out for the model being scored, otherwise this score is computed on training data.
train_dataloader, val_dataloader = get_data_loaders(data, 4)

model.eval()
predictions, labels = [], []
with torch.no_grad():
    for batch in val_dataloader:
        batch = tuple(tensor.to(DEVICE) for tensor in batch)
        labels.extend(batch[2].cpu().detach().numpy())
        # No labels are passed, so the model returns logits rather than a loss.
        inputs = dict(input_ids=batch[0], attention_mask=batch[1])
        outputs = model(**inputs)
        predictions.extend(outputs.cpu().detach().numpy())

# Predicted class = index of the largest logit.
y_pred = np.argmax(predictions, axis=-1)

# Decode the one-hot labels back to class letters, then map letters to indices.
class_index = {'A': 0, 'B': 1}
one_hot_labels = np.asarray([[int(flag) for flag in row] for row in labels])
decoded_labels = lb.inverse_transform(one_hot_labels)
y_true = np.asarray([class_index.get(letter, 2) for letter in decoded_labels]).reshape(-1)

f1_score(y_true, y_pred, average='micro')

In [None]:
# Persist the current in-memory `model` via save_pretrained into the Kaggle
# working directory. Only the model is saved here; the tokenizer is not.
model.save_pretrained("/kaggle/working")