In [1]:
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm, trange

from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.calibration import calibration_curve
from scipy.special import expit
import matplotlib.pyplot as plt
import seaborn as sns

import torch
from torch import nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader
from sklearn.isotonic import IsotonicRegression

from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig, AdamW
import matplotlib.patches as mpatches

In [2]:
# Raise the transformers library's log verbosity threshold to ERROR so that
# informational/warning messages do not clutter the cell outputs below.
from transformers.utils import logging
logging.set_verbosity_error()

In [3]:
def get_ids_mask(sentences, tokenizer, max_length):
    """Tokenize sentences into a padded id tensor plus its attention mask.

    Args:
        sentences: Iterable of strings to encode.
        tokenizer: Object exposing ``encode(text, add_special_tokens=...,
            max_length=..., truncation=..., return_tensors='pt')`` that returns
            a ``(1, seq_len)`` tensor (e.g. a Hugging Face tokenizer). If it has
            a non-None ``pad_token_id`` attribute, that id is used for padding;
            otherwise 0 is used.
        max_length: Maximum number of tokens kept per sentence; longer inputs
            are truncated by the tokenizer.

    Returns:
        Tuple ``(ids, amasks)``:
            ids: integer tensor of shape ``(n_sentences, longest_sequence)``,
                right-padded with the pad id.
            amasks: float tensor of the same shape, 1.0 on real tokens and 0.0
                on padding.
    """
    tokenized = [
        tokenizer.encode(
            s,
            add_special_tokens=True,
            max_length=max_length,  # honour the caller's limit (was hard-coded to 100)
            truncation=True,
            return_tensors='pt',
        )[0]
        for s in sentences
    ]

    # Pad with the tokenizer's own pad id when it declares one; 0 is only
    # correct for vocabularies that reserve id 0 for padding.
    pad_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_id is None:
        pad_id = 0

    ids = pad_sequence(tokenized, batch_first=True, padding_value=pad_id)

    # Derive the mask from the true sequence lengths rather than from the id
    # values, so a genuine token that shares the pad id is never masked out.
    lengths = torch.tensor([seq.shape[0] for seq in tokenized])
    positions = torch.arange(ids.shape[1])
    amasks = (positions[None, :] < lengths[:, None]).float()

    return ids, amasks

def load_cm_sentences(split="train", has_labels=False):
    """Read ``{split}.csv`` and return the contents of its ``input`` column.

    Args:
        split: File stem; the file read is ``f"{split}.csv"``.
        has_labels: When true, the ``label`` column is returned as well.

    Returns:
        A list of inputs, or a ``(inputs, labels)`` tuple of lists when
        ``has_labels`` is true.
    """
    frame = pd.read_csv(f"{split}.csv")
    texts = frame['input'].tolist()

    if has_labels:
        return texts, frame['label'].tolist()

    return texts

def load_process_data(model_name, max_length, split="train", has_labels=False):
    """Load one CSV split and wrap its tokenized form in a ``TensorDataset``.

    Args:
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained``.
        max_length: Forwarded to ``get_ids_mask``.
        split: File stem forwarded to ``load_cm_sentences``.
        has_labels: Whether the split carries a ``label`` column.

    Returns:
        ``TensorDataset(ids, amasks, labels)`` when ``has_labels`` is true,
        otherwise ``TensorDataset(ids, amasks)``.
    """
    loaded = load_cm_sentences(split=split, has_labels=has_labels)

    # With labels the loader hands back a pair; without, just the sentences.
    extra_tensors = []
    if has_labels:
        sentences, raw_labels = loaded
        extra_tensors.append(torch.tensor(raw_labels))
    else:
        sentences = loaded

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    ids, amasks = get_ids_mask(sentences, tokenizer, max_length)

    return TensorDataset(ids, amasks, *extra_tensors)

In [4]:
def prepare_loaders(model_name, batch_size, max_length, add_val=False):
    """Build the ``DataLoader`` objects used for training and prediction.

    Args:
        model_name: Forwarded to ``load_process_data`` (tokenizer selection).
        batch_size: Batch size shared by every loader.
        max_length: Forwarded to ``load_process_data``.
        add_val: When true, 15% of the training set is split off at random
            into an extra ``'val'`` loader.

    Returns:
        Dict with ``'train'`` (shuffled) and ``'test'`` (ordered) loaders, plus
        ``'val'`` (ordered) when ``add_val`` is true.
    """
    train_set = load_process_data(model_name, max_length, 'train', has_labels=True)
    test_set = load_process_data(model_name, max_length, 'test')

    loaders = {}

    if add_val:
        # No generator is passed, so the split draws from torch's global RNG.
        train_set, val_set = torch.utils.data.random_split(train_set, [0.85, 0.15])
        loaders['val'] = DataLoader(val_set, batch_size=batch_size, shuffle=False)

    loaders['train'] = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    loaders['test'] = DataLoader(test_set, batch_size=batch_size, shuffle=False)

    return loaders

In [5]:
def load_model(model_name, learning_rate, weight_decay, cache_dir=None):
    """Create a single-logit sequence classifier on the GPU and its optimizer.

    Args:
        model_name: Identifier passed to the ``from_pretrained`` loaders.
        learning_rate: Learning rate for AdamW.
        weight_decay: Weight decay applied to every parameter whose name does
            not contain ``'bias'`` or ``'LayerNorm.weight'``.
        cache_dir: Optional cache directory. It is forwarded to both the config
            and the model loader, so the weights honour it as well.

    Returns:
        Tuple ``(model, optimizer)``; the model has been moved to CUDA.
    """
    # Only pass cache_dir when the caller supplied one, so the library default
    # applies otherwise.
    hub_kwargs = {} if cache_dir is None else {'cache_dir': cache_dir}

    config = AutoConfig.from_pretrained(model_name, num_labels=1, **hub_kwargs)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, config=config, **hub_kwargs
    )

    model.cuda()

    # Split the parameters in one pass: names matching `no_decay` are exempt
    # from weight decay.
    no_decay = ('bias', 'LayerNorm.weight')
    decayed, exempt = [], []
    for name, param in model.named_parameters():
        if any(nd in name for nd in no_decay):
            exempt.append(param)
        else:
            decayed.append(param)

    optimizer_grouped_parameters = [
        {'params': decayed, 'weight_decay': weight_decay},
        {'params': exempt, 'weight_decay': 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=1e-8, no_deprecation_warning=True)

    return model, optimizer

In [6]:
def dump_pred(y_pred, threshold, save_path, expected_size=2771):
    """Write class decisions and an uncertainty score per prediction to CSV.

    The uncertainty is 0 when the prediction sits at 0 or 1 and grows linearly
    to 1 as the prediction approaches ``threshold`` from either side.

    Args:
        y_pred: 1-D array-like of predicted probabilities in ``[0, 1]``.
        threshold: Decision threshold, strictly between 0 and 1. Predictions
            strictly greater than it get class 1.
        save_path: Destination CSV path. Missing parent directories are
            created.
        expected_size: Required number of predictions (default 2771, the size
            the original check enforced). Pass ``None`` to skip the check.

    Raises:
        ValueError: If the threshold is not inside ``(0, 1)``, if any
            prediction is NaN or outside ``[0, 1]``, if the number of
            predictions differs from ``expected_size``, or if a computed
            uncertainty falls outside ``[0, 1]``.
    """
    y_pred = np.asarray(y_pred)

    # A threshold of exactly 0 or 1 would make one of the scales below divide
    # by zero.
    if not 0 < threshold < 1:
        raise ValueError('Wrong threshold value!!!')

    # Phrased as "all values in range" so that NaNs (for which every
    # comparison is False) are rejected too.
    if not np.all((y_pred >= 0) & (y_pred <= 1)):
        raise ValueError('Wrong pred values!!!')

    if expected_size is not None and y_pred.shape[0] != expected_size:
        raise ValueError('Wrong pred size!!!')

    neg_scale = 1 / threshold
    pos_scale = 1 / (1 - threshold)

    df = pd.DataFrame({
        'class': (y_pred > threshold).astype('int'),
        'uncertainty': np.where(y_pred < threshold, y_pred * neg_scale, (1 - y_pred) * pos_scale)
    })

    if df['uncertainty'].min() < 0 or df['uncertainty'].max() > 1:
        raise ValueError('Wrong pred result!!!')

    # Create the destination folder up front so a missing directory does not
    # discard the result of a long training run.
    out_dir = os.path.dirname(save_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    df.to_csv(save_path, index=False)

In [7]:
class Predictor:
    """Fine-tunes a single-logit classifier and scores the test split.

    Attributes set by ``__init__``:
        model_name: Identifier forwarded to ``prepare_loaders`` / ``load_model``.
        loaders: Dict of data loaders returned by ``prepare_loaders``.
        config: Dict read for ``'batch_size'``, ``'max_length'``,
            ``'learning_rate'``, ``'weight_decay'`` and ``'gradient_acc_steps'``.
        criterion: ``nn.BCEWithLogitsLoss`` applied to the raw logits.
    """

    def __init__(self, model_name, config, n_splits=3):
        """Build the data loaders and store the training configuration.

        Args:
            model_name: Model/tokenizer identifier.
            config: Hyperparameter dict (see class docstring for the keys read).
            n_splits: Accepted for signature compatibility; not used.
        """
        self.model_name = model_name
        self.loaders = prepare_loaders(
            model_name=model_name,
            batch_size=config['batch_size'],
            max_length=config['max_length'],
        )
        self.config = config
        self.criterion = nn.BCEWithLogitsLoss()

    @staticmethod
    def _device_of(model):
        """Return the device holding the model's parameters."""
        return next(model.parameters()).device

    def _train(self, n_epochs, loader):
        """Train a freshly loaded model for ``n_epochs`` passes over ``loader``.

        Args:
            n_epochs: Number of passes over the loader.
            loader: Iterable of ``(ids, attention_mask, labels)`` batches that
                also supports ``len()``.

        Returns:
            The trained model.
        """
        model, optimizer = load_model(
            model_name=self.model_name,
            learning_rate=self.config['learning_rate'],
            weight_decay=self.config['weight_decay']
        )
        model.train()
        # Send batches wherever load_model placed the model.
        device = self._device_of(model)
        gradient_acc_steps = self.config['gradient_acc_steps']
        n_batches = len(loader)

        for _ in trange(n_epochs, desc='Epoch'):
            for step, batch in enumerate(tqdm(loader), 1):
                ids, amasks, labels = (t.to(device) for t in batch)

                output = model(ids, attention_mask=amasks)[0].squeeze(-1)
                # Scale so the gradients accumulated over a group of batches
                # average rather than sum.
                loss = self.criterion(output, labels.float()) / gradient_acc_steps

                loss.backward()

                # Step every `gradient_acc_steps` batches, and also on the last
                # batch of the epoch so leftover gradients are not carried into
                # the next one. The bound is this loader's own length; the
                # previous code read an undefined `train_dataloader` here.
                if (step % gradient_acc_steps == 0) or (step == n_batches):
                    optimizer.step()
                    optimizer.zero_grad()

        return model

    def _eval(self, model, loader):
        """Score a labelled loader.

        Args:
            model: Model returning logits as the first element of its output.
            loader: Iterable of ``(ids, attention_mask, labels)`` batches.

        Returns:
            Tuple ``(y_pred, y_true)`` of 1-D numpy arrays: sigmoid
            probabilities and the labels, in loader order.
        """
        model.eval()
        device = self._device_of(model)
        y_pred = []
        y_true = []

        with torch.no_grad():
            for batch in loader:
                ids, amasks, labels = (t.to(device) for t in batch)

                output = model(ids, attention_mask=amasks)[0].squeeze(-1)

                y_pred.append(output.cpu().numpy())
                y_true.append(labels.cpu().numpy())

        # Logits -> probabilities.
        y_pred = expit(np.hstack(y_pred))
        y_true = np.hstack(y_true)

        return y_pred, y_true

    def _predict(self, model, loader):
        """Score an unlabelled loader.

        Args:
            model: Model returning logits as the first element of its output.
            loader: Iterable of ``(ids, attention_mask)`` batches.

        Returns:
            1-D numpy array of sigmoid probabilities, in loader order.
        """
        model.eval()
        device = self._device_of(model)
        y_pred = []

        with torch.no_grad():
            for batch in loader:
                ids, amasks = (t.to(device) for t in batch)

                output = model(ids, attention_mask=amasks)[0].squeeze(-1)

                y_pred.append(output.cpu().numpy())

        # Logits -> probabilities.
        y_pred = expit(np.hstack(y_pred))

        return y_pred

    def train_predict(self, n_epochs):
        """Train on ``loaders['train']`` and return probabilities for ``loaders['test']``."""
        model = self._train(n_epochs, self.loaders['train'])

        y_pred = self._predict(model, self.loaders['test'])

        return y_pred

In [8]:
# Baseline (uncalibrated) run: fine-tune on the full training split and score
# the test split.
model_name = 'microsoft/deberta-v3-large'
# Hyperparameters read by Predictor: batch_size / max_length go to the data
# loaders, the remaining keys drive the optimizer and gradient accumulation.
config = {
    'batch_size': 16,
    'max_length': 512,
    'learning_rate': 1e-5,
    'weight_decay': 0.01,
    'gradient_acc_steps': 1
}

scorer = Predictor(model_name, config)

# Train for 4 epochs; y_pred holds one sigmoid probability per test row.
y_pred = scorer.train_predict(4)

  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"


Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/870 [00:00<?, ?it/s]

  0%|          | 0/870 [00:00<?, ?it/s]

  0%|          | 0/870 [00:00<?, ?it/s]

  0%|          | 0/870 [00:00<?, ?it/s]

In [9]:
# Write the baseline predictions using 0.5 as the decision threshold.
dump_pred(y_pred, 0.5, 'submissions/submission_0.csv')

In [10]:
class CalibratedPredictor(Predictor):
    """Predictor variant that calibrates test probabilities on a held-out split.

    The loaders are built with ``add_val=True``, so part of the training data
    is reserved as ``loaders['val']`` and used only to fit the calibrator.
    """

    def __init__(self, model_name, config, n_splits=3):
        """Store the configuration and build train/val/test loaders.

        Args:
            model_name: Model/tokenizer identifier.
            config: Hyperparameter dict; ``'batch_size'`` and ``'max_length'``
                are read here.
            n_splits: Not used.
        """
        self.model_name = model_name
        self.config = config
        self.criterion = nn.BCEWithLogitsLoss()
        self.loaders = prepare_loaders(
            model_name=model_name,
            batch_size=config['batch_size'],
            max_length=config['max_length'],
            add_val=True,
        )

    def train_predict(self, n_epochs):
        """Train, fit an isotonic calibrator on the validation split, and
        return calibrated probabilities for the test split."""
        fitted = self._train(n_epochs, self.loaders['train'])

        holdout_scores, holdout_labels = self._eval(fitted, self.loaders['val'])

        # Monotone map from raw probability to observed label frequency;
        # inputs outside the fitted range are clipped to its ends.
        iso = IsotonicRegression(y_min=0, y_max=1, out_of_bounds='clip')
        iso.fit(holdout_scores, holdout_labels)

        raw_test_scores = self._predict(fitted, self.loaders['test'])

        return iso.predict(raw_test_scores)

In [11]:
# Calibrated run: same model and hyperparameters as the baseline cell, but
# CalibratedPredictor holds out part of the training data to fit an isotonic
# calibrator before scoring the test split.
model_name = 'microsoft/deberta-v3-large'
config = {
    'batch_size': 16,
    'max_length': 512,
    'learning_rate': 1e-5,
    'weight_decay': 0.01,
    'gradient_acc_steps': 1
}

scorer = CalibratedPredictor(model_name, config)

# Train for 4 epochs; y_pred holds the calibrated test probabilities.
y_pred = scorer.train_predict(4)

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/739 [00:00<?, ?it/s]

  0%|          | 0/739 [00:00<?, ?it/s]

  0%|          | 0/739 [00:00<?, ?it/s]

  0%|          | 0/739 [00:00<?, ?it/s]

In [12]:
# Write the calibrated predictions using 0.5 as the decision threshold.
dump_pred(y_pred, 0.5, 'submissions/submission_1.csv')