In [1]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math 
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")
from IPython. display import clear_output

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

#os.system('pip install iterative-stratification==0.1.7')
#from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

os.system('pip uninstall -y transformers')
os.system('pip uninstall -y tokenizers')
os.system('python -m pip install --no-index --find-links=../input/fb3-my-pip-wheels transformers')
os.system('python -m pip install --no-index --find-links=../input/fb3-my-pip-wheels tokenizers')
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers import DataCollatorWithPadding
%env TOKENIZERS_PARALLELISM=false

clear_output()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
print('device:', device)

tokenizers.__version__: 0.12.1
transformers.__version__: 4.21.2
device: cuda


In [2]:
# Location of the competition data; the three CSV paths are derived from it.
BASE_PATH = '/kaggle/input/feedback-prize-english-language-learning'
SUBMISSION_PATH, TRAIN_PATH, TEST_PATH = (
    os.path.join(BASE_PATH, csv_name)
    for csv_name in ('sample_submission.csv', 'train.csv', 'test.csv')
)

In [3]:
class FB3Dataset(Dataset):
    """Dataset yielding ``(encoded_text, target_tensor)`` pairs.

    The ``full_text`` column of ``data`` is normalised once with
    ``preprocess`` at construction time; tokenisation with ``encode_text``
    happens lazily, one item at a time. Targets are the ``cfg.target_cols``
    columns of ``data``.
    """

    def __init__(self, cfg, data):
        self.cfg = cfg
        self.xs = preprocess(data['full_text'])
        self.ys = data[cfg.target_cols].values

    def __len__(self):
        return len(self.xs)

    def __getitem__(self, idx):
        encoded = encode_text(self.cfg, self.xs[idx])
        target = torch.tensor(self.ys[idx], dtype=torch.float)
        return encoded, target
    
def collate(inputs):
    """Trim every tensor in ``inputs`` to the longest unmasked sequence.

    The length is the largest per-row sum of ``inputs['attention_mask']``.
    ``inputs`` is modified in place and also returned.
    """
    keep = int(inputs['attention_mask'].sum(axis=1).max())
    for name in list(inputs.keys()):
        inputs[name] = inputs[name][:, :keep]
    return inputs

##################################################################################

class MeanPooling(nn.Module):
    """Average token vectors over the positions selected by the attention mask."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the hidden dimension so masked tokens contribute 0.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_sum = (last_hidden_state * mask).sum(1)
        # Clamp the token count so an all-zero mask cannot divide by zero.
        token_count = mask.sum(1).clamp(min=1e-9)
        return masked_sum / token_count

class WeightedLayerPooling(nn.Module):
    """Weighted average of the hidden states from ``layer_start`` onwards.

    When ``layer_weights`` is not supplied, one learnable weight per selected
    layer is created, each initialised to 1.
    """

    def __init__(self, num_hidden_layers, layer_start: int = 9, layer_weights = None):
        super().__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        if layer_weights is None:
            n_selected = num_hidden_layers + 1 - layer_start
            layer_weights = nn.Parameter(torch.tensor([1] * n_selected, dtype=torch.float))
        self.layer_weights = layer_weights

    def forward(self, all_hidden_states):
        # Indexing fixes the input rank to 4, with layers on the first axis.
        selected = all_hidden_states[self.layer_start:, :, :, :]
        weights = self.layer_weights[:, None, None, None].expand(selected.size())
        return (weights * selected).sum(dim=0) / self.layer_weights.sum()

class AttentionHead(nn.Module):
    """Additive attention pooling over dimension 1 of ``features``.

    Each position is scored with ``V(tanh(W(x)))``; scores are softmaxed over
    dimension 1 and used to form a weighted sum of the inputs. Extra
    positional arguments to ``forward`` are accepted and ignored, so no
    attention mask is applied.
    """

    def __init__(self, in_features, hidden_dim):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)
        self.out_features = hidden_dim

    def forward(self, features, *args):
        scores = self.V(torch.tanh(self.W(features)))
        weights = torch.softmax(scores, dim=1)
        return (weights * features).sum(dim=1)

def GlobalAveragePool1d(x):
    """Average ``x`` over its whole last dimension and drop that dimension."""
    width = x.size()[-1]
    pooled = F.avg_pool1d(x, width)
    return pooled.squeeze(-1)

def GlobalMaxPool1d(x):
    """Take the maximum of ``x`` over its whole last dimension and drop that dimension."""
    width = x.size()[-1]
    pooled = F.max_pool1d(x, width)
    return pooled.squeeze(-1)

def Conv1dReg(x, in_channels, out_channels, kernel_size, device):
    """Apply Conv1d ('same' padding, stride 1) -> BatchNorm1d -> ReLU to ``x``.

    NOTE: both layers are constructed inside this function on every call and
    no reference to them is kept, so their parameters are freshly initialised
    each time.
    """
    conv_out = nn.Conv1d(
        in_channels, out_channels, kernel_size,
        padding='same', stride=1, device=device,
    )(x)
    normed = nn.BatchNorm1d(out_channels, device=device)(conv_out)
    return F.relu(normed)

class MultiSampleDropout(nn.Module):
    """Average a classifier's output over several dropout-perturbed inputs.

    Args:
        fc: module applied to every dropout sample.
        num_dropout: number of samples drawn when ``prob_dropout`` is a
            single number.
        prob_dropout: either one dropout probability (used ``num_dropout``
            times) or an iterable of probabilities (one sample per entry).

    Dropout follows this module's train/eval mode: after ``.eval()`` the
    samples are identical and the result equals ``fc(out)``. Building a fresh
    ``nn.Dropout`` inside ``forward`` would always drop units, because a new
    module starts in training mode.
    """

    def __init__(self, fc, num_dropout, prob_dropout):
        super().__init__()
        # Kept for backward compatibility; forward() uses functional dropout.
        self.dropout = nn.Dropout
        self.num_dropout = num_dropout
        self.prob_dropout = prob_dropout
        self.classifier = fc

    def forward(self, out):
        if isinstance(self.prob_dropout, (float, int)):
            probs = [self.prob_dropout] * self.num_dropout
        else:
            probs = list(self.prob_dropout)

        fcs = [
            self.classifier(F.dropout(out, p=p, training=self.training))
            for p in probs
        ]
        return torch.mean(torch.stack(fcs, dim=0), dim=0)

# ====================================================
# Model class
# ====================================================
class FB3Model(nn.Module):
    """Transformer backbone followed by pooling head(s) and a linear regressor.

    The backbone is stored as ``self.deberta_v3``. Its outputs are reduced to
    one vector per text by one or several pooling heads and mapped to
    ``cfg.num_target`` outputs by ``self.fc``.

    Args:
        cfg: configuration object. This class reads ``model``,
            ``reinit_last_layer``, ``reinit_fc``, ``num_target``,
            ``num_dropout``, ``prob_dropout`` and ``use_dropout`` from it, plus
            ``tokenizer`` when ``pretrained`` is true.
        config_path: optional path to a serialized backbone config that is
            loaded with ``torch.load``. When ``None`` the config is obtained
            with ``AutoConfig.from_pretrained(cfg.model)`` and its dropout
            probabilities are set to zero.
        pretrained: load pretrained backbone weights and resize the token
            embeddings to ``len(cfg.tokenizer)``; otherwise only the
            architecture is built from the config.
        pool: ``'mean'``, ``'attention'``, ``'weighted'``, or several of them
            joined with ``'-'`` (e.g. ``'mean-attention'``); with several
            heads the pooled vectors are concatenated.

    Raises:
        ValueError: if ``pool`` contains an unknown pooling-head name.
    """

    _POOL_NAMES = ('mean', 'attention', 'weighted')

    def __init__(self, cfg, config_path=None, pretrained=False, pool='mean'):
        super().__init__()
        self.cfg = cfg

        # Validate before the (expensive) backbone is built. Previously an
        # unknown name left ``self.pool`` undefined and failed only on first use.
        requested = [pool] if pool in self._POOL_NAMES else pool.split('-')
        unknown = [name for name in requested if name not in self._POOL_NAMES]
        if unknown:
            raise ValueError(
                f"Unknown pooling head(s) {unknown}; expected names from {self._POOL_NAMES}")

        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            # Turn off dropouts.
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
        else:
            # NOTE(security): torch.load unpickles the file; only use trusted paths.
            self.config = torch.load(config_path)

        if pretrained:
            self.deberta_v3 = AutoModel.from_pretrained(cfg.model, config=self.config)
            # Expand embedding dim for new tokens.
            self.deberta_v3.resize_token_embeddings(len(cfg.tokenizer))
        else:
            self.deberta_v3 = AutoModel.from_config(self.config)

        if self.cfg.reinit_last_layer:
            # Re-init last layer of the backbone encoder.
            for module in self.deberta_v3.encoder.layer[-1].modules():
                self._init_weights(module)
        self.deberta_v3.gradient_checkpointing_enable()

        # Define model layers. The single-head ``fc`` is always created first
        # (and replaced below for multi-head pooling) so the sequence of
        # random initialisations matches the original construction order.
        self.pool_name = pool
        self.fc = nn.Linear(self.config.hidden_size, cfg.num_target)
        if len(requested) == 1:
            self.pool = self._pool_layer(pool)
        else:
            self.pool = nn.ModuleList([self._pool_layer(name) for name in requested])
            self.fc = nn.Linear(self.config.hidden_size * len(self.pool), cfg.num_target)

        if cfg.reinit_fc:
            self._init_weights(self.fc)

        # Multi-sample dropout shares ``self.fc`` as its classifier.
        self.multi_dropout = MultiSampleDropout(self.fc, cfg.num_dropout, cfg.prob_dropout)

    def _pool_layer(self, pool_name):
        """Build the pooling module for a single head name."""
        if pool_name == 'mean':
            return MeanPooling()
        if pool_name == 'attention':
            return AttentionHead(self.config.hidden_size, self.config.hidden_size)
        if pool_name == 'weighted':
            return WeightedLayerPooling(
                self.config.num_hidden_layers,
                layer_start=9,
                layer_weights=None)
        raise ValueError(f"Unknown pooling head {pool_name!r}")

    def _pool_feature(self, pool, pool_name, pt_outputs, attention_mask):
        """Apply one pooling head to the backbone outputs."""
        if pool_name == 'mean':
            return pool(pt_outputs.last_hidden_state, attention_mask)
        if pool_name == 'attention':
            return pool(pt_outputs.last_hidden_state)
        if pool_name == 'weighted':
            # Only this head needs every layer, so the stack is built lazily
            # instead of on every forward pass.
            all_hidden_states = torch.stack(pt_outputs.hidden_states)
            # Keep position 0 of each sequence only.
            return pool(all_hidden_states)[:, 0]
        raise ValueError(f"Unknown pooling head {pool_name!r}")

    def _init_weights(self, module):
        """Re-initialise Linear / Embedding / LayerNorm modules; others are left untouched."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, x):
        """Return the pooled text representation for the tokenised batch ``x``."""
        pt_outputs = self.deberta_v3(**x)

        if isinstance(self.pool, nn.ModuleList):
            pool_names = self.pool_name.split('-')
            pooled = [
                self._pool_feature(pool, pool_name, pt_outputs, x['attention_mask'])
                for pool_name, pool in zip(pool_names, self.pool)
            ]
            return torch.cat(pooled, dim=1)
        return self._pool_feature(self.pool, self.pool_name, pt_outputs, x['attention_mask'])

    def forward(self, x, y=None, loss_fn=None):
        """Predict the targets for ``x``; ``y`` and ``loss_fn`` are accepted but unused."""
        feature = self.feature(x)
        if self.cfg.use_dropout and self.training:
            out = self.multi_dropout(feature)
        else:
            out = self.fc(feature)
        return out

In [4]:
class Config(dict):
    """Dictionary whose items can also be read, written and deleted as attributes.

    ``cfg.key`` and ``cfg['key']`` refer to the same entry. A missing
    attribute raises ``AttributeError`` (not ``KeyError``) so that
    ``hasattr``, ``getattr(cfg, name, default)`` and ``copy.deepcopy``
    behave as they do for ordinary objects.
    """

    def __getattr__(self, name):
        # Only reached when normal attribute lookup fails.
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name) from None

    __setattr__ = dict.__setitem__

    def __delattr__(self, name):
        try:
            del self[name]
        except KeyError:
            raise AttributeError(name) from None

    def init(self, kwargs):
        """Add every ``key: value`` pair of the mapping ``kwargs`` to this config."""
        # The former ``super().init(kwargs)`` call always raised, because
        # ``dict`` has no ``init`` method.
        self.update(kwargs)

    def set(self, key, val):
        """Set ``key`` to ``val`` (the same as ``cfg[key] = val``)."""
        self[key] = val

In [5]:
def load_config(input_path, inference_weight=1):
    """Load a saved experiment configuration and its tokenizer.

    Args:
        input_path: directory containing ``CFG.json``, ``config.pth`` and a
            ``tokenizer`` sub-directory.
        inference_weight: value stored on the config as ``inference_weight``.

    Returns:
        A ``Config`` built from ``CFG.json`` with ``path``, ``config_path``,
        ``tokenizer`` and ``inference_weight`` set.
    """
    # Close the file deterministically instead of relying on garbage collection.
    with open(os.path.join(input_path, 'CFG.json'), 'r') as cfg_file:
        cfg = Config(**json.load(cfg_file))

    # Use the directory the configuration was read from. This used to be
    # hard-coded to the v114 model directory, so any other ``input_path``
    # paired one experiment's CFG.json with another experiment's weights.
    cfg.path = input_path
    cfg.config_path = os.path.join(cfg.path, 'config.pth')

    # Load tokenizer.
    cfg.tokenizer = AutoTokenizer.from_pretrained(os.path.join(cfg.path, 'tokenizer'))

    cfg.inference_weight = inference_weight
    return cfg

def load_model(cfg, fold, **model_kwargs):
    """Build an ``FB3Model`` and load the best checkpoint of ``fold`` into it.

    The checkpoint file lives in ``cfg.path`` and is named after ``cfg.model``
    (with ``/`` replaced by ``-``) and the fold number. Weights are mapped to
    CPU; ``model_kwargs`` are forwarded to ``FB3Model``.
    """
    model = FB3Model(cfg, config_path=cfg.config_path, pretrained=False, **model_kwargs)

    checkpoint_name = f"{cfg.model.replace('/', '-')}_fold{fold}_best.pth"
    checkpoint = torch.load(
        os.path.join(cfg.path, checkpoint_name),
        map_location=torch.device('cpu'))
    model.load_state_dict(checkpoint['model'])
    return model

In [6]:
def seed_everything(seed=42):
    '''
    Seed Python's `random`, NumPy and PyTorch (CPU and, if available, CUDA)
    and export PYTHONHASHSEED, so repeated runs draw the same random numbers.
    '''
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # cuDNN additionally needs deterministic kernels and no auto-tuning.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
seed_everything(seed=42)

def mc_rmse(y_true, y_pred):
    """Mean column-wise root mean squared error.

    Args:
        y_true: 2-D array-like of shape (n_samples, n_columns) with targets.
        y_pred: 2-D array-like of the same shape with predictions.

    Returns:
        ``(score, scores)`` where ``scores`` is the list of per-column RMSE
        values and ``score`` is their mean.

    Raises:
        ValueError: if the inputs are not 2-D or their shapes differ.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_true.ndim != 2 or y_true.shape != y_pred.shape:
        raise ValueError(
            f"expected two 2-D arrays of equal shape, got {y_true.shape} and {y_pred.shape}")

    # Computed with NumPy directly: the ``squared=False`` argument of
    # sklearn's mean_squared_error is deprecated and removed in newer releases.
    scores = np.sqrt(np.mean((y_true - y_pred) ** 2, axis=0)).tolist()
    score = np.mean(scores)
    return score, scores

def get_result(cfg, oof_df):
    """Print the MCRMSE of the predictions stored in ``oof_df``.

    Targets are read from the ``cfg.target_cols`` columns and predictions
    from the matching ``pred_<column>`` columns. Nothing is returned.
    """
    target_cols = cfg.target_cols
    labels = oof_df[target_cols].values
    pred_cols = [f"pred_{c}" for c in target_cols]
    preds = oof_df[pred_cols].values
    score, scores = mc_rmse(labels, preds)
    print(f'score: {score:<.6f}  scores: {scores}')

In [7]:
# ====================================================
# Dataset
# ====================================================
def encode_text(cfg, text):
    """Tokenise ``text`` with ``cfg.tokenizer`` and convert each field to a LongTensor.

    Special tokens are added. No ``max_length``, padding or truncation
    arguments are passed to the tokenizer.
    """
    encoded = cfg.tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=True,
    )
    # Replace each field in place so the tokenizer's own container is returned.
    for field in list(encoded.keys()):
        encoded[field] = torch.tensor(encoded[field], dtype=torch.long)
    return encoded

def preprocess(texts):
    """Replace line breaks in a pandas string Series with a ``<newline>`` marker.

    ``\\r\\n`` and ``\\n`` each become ``<newline>``; afterwards one pass
    replaces every doubled marker with a single one. Returns the underlying
    array of the resulting Series.
    """
    crlf_replaced = texts.str.replace(r'\r\n', '<newline>', regex=True)
    lf_replaced = crlf_replaced.str.replace(r'\n', '<newline>', regex=True)
    collapsed = lf_replaced.str.replace('<newline><newline>', '<newline>', regex=False)
    return collapsed.values

class TestDataset(Dataset):
    """Label-free dataset: each item is the encoded form of one text.

    The ``full_text`` column of ``df`` is normalised with ``preprocess`` at
    construction; ``encode_text`` runs per item on access.
    """

    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = preprocess(df['full_text'])

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = self.texts[item]
        return encode_text(self.cfg, text)

In [8]:
def inference_fn(test_loader, model, device):
    """Run ``model`` in eval mode over ``test_loader`` and stack the outputs.

    Every tensor of a batch is moved to ``device`` (the batch mapping is
    updated in place), the model is called without gradient tracking, and the
    per-batch outputs are concatenated into one NumPy array.
    """
    model.eval()
    model.to(device)

    batch_outputs = []
    for inputs in test_loader:
        for name in list(inputs.keys()):
            inputs[name] = inputs[name].to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        batch_outputs.append(y_preds.to('cpu').numpy())
    return np.concatenate(batch_outputs)

class Inferencer:
    """Runs the per-fold transformer models of one experiment setup.

    Args:
        input_path: directory passed to ``load_config``; only used when
            ``cfg`` is ``None``.
        cfg: ready-made configuration; takes precedence over ``input_path``.
        inference_weight: forwarded to ``load_config``; ignored when ``cfg``
            is given.
    """

    def __init__(self, input_path=None, cfg=None, inference_weight=1):
        if cfg is None:
            self.cfg = load_config(input_path, inference_weight)
        else:
            self.cfg = cfg
        # Read from self.cfg: the ``cfg`` argument is None on the input_path route.
        self.pool = self.cfg.pool_head

    def predict(self, test_loader, device, stat_fn=np.mean):
        """Predict with every fold in ``cfg.trn_fold`` and aggregate.

        ``stat_fn`` is called as ``stat_fn(list_of_fold_predictions, axis=0)``;
        the aggregate is clipped to [1, 5], stored on ``self.preds`` and returned.
        """
        preds = []
        start = time.time()
        print('#'*10, self.cfg.path, '#'*10)
        for fold in self.cfg.trn_fold:
            print(f'Predicting fold {fold}...')
            model = load_model(self.cfg, fold, pool=self.pool)
            pred = inference_fn(test_loader, model, device)
            preds.append(pred)
            # Release the fold's model before the next one is loaded.
            del model; gc.collect()
            torch.cuda.empty_cache()
        end = time.time() - start
        print('#'*10, f'ETA: {end:.2f}s', '#'*10, '\n')

        self.preds = stat_fn(preds, axis=0)
        self.preds = np.clip(self.preds, 1, 5)
        return self.preds

    def get_oof_result(self):
        """Print the score of ``oof_df.pkl`` stored in ``cfg.path``."""
        # NOTE(security): read_pickle unpickles the file; only use trusted paths.
        oof_df = pd.read_pickle(os.path.join(self.cfg.path, 'oof_df.pkl'))
        return get_result(self.cfg, oof_df)

    def get_text_embedding(self, data_loader, device, fold):
        """Return the pooled features of fold ``fold``'s model for every item of ``data_loader``."""
        model = load_model(self.cfg, fold, pool=self.pool)
        model.eval()
        model.to(device)

        fold_emb = []
        for inputs in data_loader:
            for k, v in inputs.items():
                inputs[k] = v.to(device)
            with torch.no_grad():
                emb = model.feature(inputs)
            fold_emb.extend(emb.to('cpu').numpy())
        fold_emb = np.array(fold_emb)
        return fold_emb

In [9]:
def _make_setup_cfg(version, pool_head, inference_weight):
    """Build the configuration of one trained setup.

    The setups share every hyper-parameter and differ only in the pooling
    head, the model directory (derived from ``version``) and the inference
    weight. A fresh ``Config`` with fresh list values is created on each call,
    so the setups never share mutable state.
    """
    path = f'../input/fb3-colab-models/{version}/'
    return Config(
        competition='FB3',
        debug=False,

        ####################

        apex=True,
        print_freq=20,
        num_workers=1,
        model='microsoft/deberta-v3-base',
        model_type='deberta_v3',
        gradient_checkpointing=True,
        scheduler='cosine', # ['linear', 'cosine']
        batch_scheduler=True,
        num_cycles=0.5,
        num_warmup_steps=0.6,

        ####################

        epochs=6,
        val_step=60,
        encoder_lr=2e-5,
        decoder_lr=2e-5,
        min_lr=1e-6,
        eps=1e-6,
        betas=(0.9, 0.999),
        batch_size=8,
        max_len=512,

        ####################

        reinit_last_layer=True,
        reinit_fc=True,

        weight_decay=0.01,
        learning_rate=2e-5,
        layerwise_learning_rate_decay=1.5,

        use_dropout=False,
        prob_dropout=[0.06, 0.08, 0.1, 0.12, 0.14],
        num_dropout=5,

        pool_head=pool_head,

        ####################

        gradient_accumulation_steps=1,
        max_grad_norm=10,
        target_cols=['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'],
        num_target=6,
        seed=42,
        n_fold=4,
        trn_fold=[0,1,2,3],
        train=True,

        path=path,
        config_path=f'{path}config.pth',
        inference_weight=inference_weight,
    )


v112_CFG = _make_setup_cfg('v112', pool_head='attention', inference_weight=0.7)
v114_CFG = _make_setup_cfg('v114', pool_head='mean-attention', inference_weight=0.3)

# Each setup uses the tokenizer saved next to its own weights.
tokenizer = AutoTokenizer.from_pretrained(os.path.join(v112_CFG.path, 'tokenizer'))
v112_CFG.tokenizer = tokenizer

tokenizer = AutoTokenizer.from_pretrained(os.path.join(v114_CFG.path, 'tokenizer'))
v114_CFG.tokenizer = tokenizer


setups = [
    v112_CFG, v114_CFG
]

In [10]:
# Build the feature matrix for the SVR stage: embed every training text with
# every fold model of every setup.
train = pd.read_csv(TRAIN_PATH)

# One loader is shared by all setups, so every model receives batches
# tokenised with v112_CFG's tokenizer. shuffle=False and drop_last=False keep
# the embedding rows in the same order as the rows of `train`.
train_dataset = TestDataset(v112_CFG, train)
train_loader = DataLoader(
    train_dataset,
    batch_size=8,
    shuffle=False,
    collate_fn=DataCollatorWithPadding(tokenizer=v112_CFG.tokenizer, padding='longest'),
    num_workers=6, 
    pin_memory=True, 
    drop_last=False)

all_train_text_emb = []
for setup in tqdm(setups):
    # infer_ = Inferencer(setup['path'], setup['inference_weight'])
    infer_ = Inferencer(cfg=setup, inference_weight=setup.inference_weight)
    
    # Text embedding for SVM
    train_text_emb = []
    for fold in infer_.cfg.trn_fold:
        print(fold)
        train_text_emb.append(infer_.get_text_embedding(train_loader, device, fold))
    # Average the embeddings of the setup's fold models element-wise.
    all_train_text_emb.append(np.mean(train_text_emb, axis=0))
# Concatenate the per-setup embeddings along the feature axis.
final_train_text_emb = np.concatenate(all_train_text_emb, axis=1)

  0%|          | 0/2 [00:00<?, ?it/s]

0
1
2
3
0
1
2
3


In [11]:
import sys
sys.path.append('../input/iterativestratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# Assign each training row to one of `svr_folds` validation folds, stratified
# jointly on all target columns.
svr_folds = 4
target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    
skf = MultilabelStratifiedKFold(n_splits=svr_folds, shuffle=True, random_state=42)
for i,(train_index, val_index) in enumerate(skf.split(train,train[target_cols])):
    # NOTE(review): the indices yielded by split() are used as index labels
    # with .loc; this presumably relies on `train` having the default
    # 0..n-1 index - confirm.
    train.loc[val_index,'fold'] = i

In [12]:
from joblib import dump, load
from sklearn.multioutput import MultiOutputRegressor
from cuml.svm import SVR
import cuml

from sklearn.metrics import mean_squared_error

# Per-fold validation scores collected by the cross-validation loop in this cell.
scores = []
def mc_rmse(y_true, y_pred):
    """Mean column-wise root mean squared error.

    Args:
        y_true: 2-D array-like of shape (n_samples, n_columns) with targets.
        y_pred: 2-D array-like of the same shape with predictions.

    Returns:
        ``(score, scores)`` where ``scores`` is the list of per-column RMSE
        values and ``score`` is their mean.

    Raises:
        ValueError: if the inputs are not 2-D or their shapes differ.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_true.ndim != 2 or y_true.shape != y_pred.shape:
        raise ValueError(
            f"expected two 2-D arrays of equal shape, got {y_true.shape} and {y_pred.shape}")

    # Computed with NumPy directly: the ``squared=False`` argument of
    # sklearn's mean_squared_error is deprecated and removed in newer releases.
    # A local name is used so the module-level ``scores`` list is not shadowed.
    column_scores = np.sqrt(np.mean((y_true - y_pred) ** 2, axis=0)).tolist()
    score = np.mean(column_scores)
    return score, column_scores


# Cross-validated SVR on the concatenated text embeddings: one multi-output
# regressor per fold is fitted on the remaining folds, scored on the held-out
# fold and saved to disk for the inference cell.
for fold in range(svr_folds):
    print('#'*25)
    print('### Fold',fold+1)
    print('#'*25)
    
    train_ = train[train['fold']!=fold]
    eval_ = train[train['fold']==fold]
    
    # NOTE(review): index labels of `train` are used as row positions in the
    # embedding matrix; presumably `train` still has its default 0..n-1
    # index - confirm.
    tr_text_feats = final_train_text_emb[list(train_.index),:]
    ev_text_feats = final_train_text_emb[list(eval_.index),:]
    
    # MultiOutputRegressor wraps the SVR so all target columns are predicted.
    clf = MultiOutputRegressor(SVR(C=1))
    clf.fit(tr_text_feats, train_[target_cols].values)
    
    ev_preds = clf.predict(ev_text_feats)
    score, _ = mc_rmse(eval_[target_cols].values, ev_preds)
    scores.append(score)
    
    print("Fold : {}  score: {}".format(fold,score))
    # Persist the fitted model; the file is written to the working directory.
    dump(clf, f'svr_{fold}.model')
    
print('#'*25)
print('Overall CV =', np.mean(scores))

#########################
### Fold 1
#########################
Fold : 0  score: 0.41851912303372973
#########################
### Fold 2
#########################
Fold : 1  score: 0.4097098036753621
#########################
### Fold 3
#########################
Fold : 2  score: 0.42114867082485113
#########################
### Fold 4
#########################
Fold : 3  score: 0.40808792034056657
#########################
Overall CV = 0.41436637946862737


In [13]:
# Embed the test texts in the same way as the training texts: every fold
# model of every setup, averaged per setup and concatenated across setups.
test = pd.read_csv(TEST_PATH)

# One loader is shared by all setups, so every model receives batches
# tokenised with v112_CFG's tokenizer. shuffle=False and drop_last=False keep
# the embedding rows in the same order as the rows of `test`.
test_dataset = TestDataset(v112_CFG, test)
test_loader = DataLoader(
    test_dataset,
    batch_size=8,
    shuffle=False,
    collate_fn=DataCollatorWithPadding(tokenizer=v112_CFG.tokenizer, padding='longest'),
    num_workers=6, 
    pin_memory=True, 
    drop_last=False)

all_test_text_emb = []
for setup in tqdm(setups):
    # infer_ = Inferencer(setup['path'], setup['inference_weight'])
    infer_ = Inferencer(cfg=setup, inference_weight=setup.inference_weight)
    
    # Text embedding for SVM
    test_text_emb = []
    for fold in infer_.cfg.trn_fold:
        print(fold)
        test_text_emb.append(infer_.get_text_embedding(test_loader, device, fold))
    # Average the embeddings of the setup's fold models element-wise.
    all_test_text_emb.append(np.mean(test_text_emb, axis=0))
# Concatenate the per-setup embeddings along the feature axis.
final_test_text_emb = np.concatenate(all_test_text_emb, axis=1)

  0%|          | 0/2 [00:00<?, ?it/s]

0
1
2
3
0
1
2
3


In [14]:
def svr_inference_fn(model_path, te_text_feats):
    """Load the model stored at ``model_path`` and return its predictions for ``te_text_feats``."""
    return load(model_path).predict(te_text_feats)

# Predict the test set with every saved fold model, average the fold
# predictions element-wise and clip the result to [1, 5].
predictions = []
for fold in range(svr_folds):
    # File name matches the one written by the cross-validation cell.
    model_path = f'svr_{fold}.model'
    preds = svr_inference_fn(model_path, final_test_text_emb)
    predictions.append(preds)
predictions = np.mean(predictions, axis=0)
predictions = np.clip(predictions, 1, 5)

In [15]:
# Write the submission file: attach the predictions to `test`, then join them
# onto the sample submission by text_id so its row order is kept.
submission = pd.read_csv(SUBMISSION_PATH)
test[target_cols] = predictions
# The placeholder target columns are dropped first so the merge does not
# produce suffixed duplicate columns.
submission = submission.drop(columns=target_cols).merge(test[['text_id'] + target_cols], on='text_id', how='left')
display(submission.head())
submission[['text_id'] + target_cols].to_csv('submission.csv', index=False)

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,3.068707,2.900846,3.222927,3.118434,2.752864,2.719249
1,000BAD50D026,2.779561,2.510198,2.840374,2.479988,2.201199,2.673922
2,00367BB2546B,3.731355,3.398375,3.550659,3.671269,3.419664,3.253832
