In [1]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math 
import string
import pickle
import random
import joblib
import itertools
from IPython. display import clear_output

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

#os.system('pip install iterative-stratification==0.1.7')
#from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
sys.path.append('../input/iterativestratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

os.system('pip uninstall -y transformers')
os.system('pip uninstall -y tokenizers')
os.system('python -m pip install --no-index --find-links=../input/fb3-my-pip-wheels transformers')
os.system('python -m pip install --no-index --find-links=../input/fb3-my-pip-wheels tokenizers')
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers import DataCollatorWithPadding
%env TOKENIZERS_PARALLELISM=false

clear_output()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
print('device:', device)

tokenizers.__version__: 0.12.1
transformers.__version__: 4.20.1
device: cuda


In [2]:
BASE_PATH = '/kaggle/input/feedback-prize-english-language-learning'
SUBMISSION_PATH = os.path.join(BASE_PATH, 'sample_submission.csv')
TRAIN_PATH = os.path.join(BASE_PATH, 'train.csv')
TEST_PATH = os.path.join(BASE_PATH, 'test.csv')

In [3]:
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
    
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class WeightedLayerPooling(nn.Module):
    def __init__(self, num_hidden_layers, layer_start: int = 9, layer_weights = None):
        super(WeightedLayerPooling, self).__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        self.layer_weights = layer_weights if layer_weights is not None \
            else nn.Parameter(
                torch.tensor([1] * (num_hidden_layers+1 - layer_start), dtype=torch.float))
        
    def forward(self, all_hidden_states):
        all_layer_embedding = all_hidden_states[self.layer_start:, :, :, :]
        weight_factor = self.layer_weights.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).expand(all_layer_embedding.size())
        weighted_average = (weight_factor*all_layer_embedding).sum(dim=0) / self.layer_weights.sum()
        return weighted_average

class AttentionHead(nn.Module):
    def __init__(self, in_features, hidden_dim):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)
        self.out_features = hidden_dim

    def forward(self, features, *args):
        att = torch.tanh(self.W(features))
        score = self.V(att)
        attention_weights = torch.softmax(score, dim=1)
        context_vector = attention_weights * features
        context_vector = torch.sum(context_vector, dim=1)

        return context_vector

class AttentionPooling(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.attention = nn.Sequential(
        nn.Linear(in_dim, in_dim),
        nn.LayerNorm(in_dim),
        nn.GELU(),
        nn.Linear(in_dim, 1),
        )

    def forward(self, last_hidden_state, attention_mask):
        w = self.attention(last_hidden_state).float()
        w[attention_mask==0]=float('-inf')
        w = torch.softmax(w,1)
        attention_embeddings = torch.sum(w * last_hidden_state, dim=1)
        return attention_embeddings
    
def GlobalAveragePool1d(x):
    return F.avg_pool1d(x, x.size()[-1]).squeeze(-1)

def GlobalMaxPool1d(x):
    return F.max_pool1d(x, x.size()[-1]).squeeze(-1)

def Conv1dReg(x, in_channels, out_channels, kernel_size, device):
    out = nn.Conv1d(in_channels, out_channels, kernel_size, padding='same', stride=1, device=device)(x)
    out = nn.BatchNorm1d(out_channels, device=device)(out)
    out = F.relu(out)
    return out

class MultiSampleDropout(nn.Module):
    def __init__(self, fc, num_dropout, prob_dropout):
        super(MultiSampleDropout, self).__init__()
        self.dropout = nn.Dropout
        self.num_dropout = num_dropout
        self.prob_dropout = prob_dropout
        self.classifier = fc
    def forward(self, out):
        if not type(self.prob_dropout) in [float, int]:            
            fcs = [self.classifier(self.dropout(p)(out)) for p in self.prob_dropout]
        else:
            fcs = [self.classifier(self.dropout(self.prob_dropout)(out)) for _ in range(self.num_dropout)]
        
        return torch.mean(torch.stack(fcs, dim=0), dim=0)

In [4]:
# ====================================================
# Model class
# ====================================================
class FB3Model(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg 
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            # Turn off dropouts.
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            #LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        
        if pretrained:
            self.deberta_v3 = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.deberta_v3 = AutoModel.from_config(self.config)

        if self.cfg.reinit_last_layer:
            # Re-init last layer of deberta.
            for module in self.deberta_v3.encoder.layer[-1:].modules():
                self._init_weights(module)
        self.deberta_v3.gradient_checkpointing_enable()

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            #nn.init.xavier_uniform_(module.weight.data, gain=1.0)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

class WMPoolModel(FB3Model):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__(cfg, config_path=config_path, pretrained=pretrained)

        # Poolings.
        self.mean_head = MeanPooling()
        self.wpool_head = WeightedLayerPooling(self.config.num_hidden_layers, layer_start=12)

        self.fc_out = nn.Linear(self.config.hidden_size, cfg.num_target)
        self._init_weights(self.fc_out)
        
        self.layer_norm = nn.LayerNorm(self.config.hidden_size)
        self.qa_output = torch.nn.Linear(self.config.hidden_size, 2)
        self.attention_head = AttentionHead(self.config.hidden_size*4, self.config.hidden_size)
        
    def forward(self, x):
        pt_out = self.deberta_v3(**x)
        all_hidden_states = torch.stack(pt_out.hidden_states)
        # Weighted pooling of last n layers.
        logits = self.wpool_head(all_hidden_states)[:, 0] # Bx768
        y_hat = self.fc_out(logits)
        return y_hat

class MultiPoolModel(FB3Model):
    def __init__(self, cfg, config_path=None, pretrained=False, pool='mean'):
        super().__init__(cfg, config_path=config_path, pretrained=pretrained)
        
        # Define model layers.
        self.pool_name = cfg.pool_head
        self.fc = nn.Linear(self.config.hidden_size, self.cfg.num_target)
        if cfg.pool_head in ['mean', 'attention', 'weighted']:
            self.pool = self._pool_layer(cfg.pool_head)
        elif '-' in cfg.pool_head:
            pools = cfg.pool_head.split('-')
            self.pool = nn.ModuleList([])
            for pool_ in pools:
                self.pool.append(self._pool_layer(pool_))
            self.fc = nn.Linear(self.config.hidden_size * len(self.pool), self.cfg.num_target)
        self._init_weights(self.fc)
        
        # Multi-sample dropout.
        self.multi_dropout = MultiSampleDropout(self.fc, self.cfg.num_dropout, self.cfg.prob_dropout)
    
    def _pool_layer(self, pool_name):
        assert pool_name in ['mean', 'attention', 'weighted']
        if pool_name == 'mean':
            pool = MeanPooling()
        elif pool_name == 'attention':
            pool = AttentionHead(self.config.hidden_size, self.config.hidden_size)
        elif pool_name == 'weighted':
            pool = WeightedLayerPooling(
                self.config.num_hidden_layers, 
                layer_start=9,
                layer_weights=None)
        return pool
    
    def _pool_feature(self, pool, pool_name, pt_outputs, attention_mask):
        assert pool_name in ['mean', 'attention', 'weighted']
        last_hidden_state = pt_outputs.last_hidden_state #batch_size x max_len x hidden_size
        all_hidden_states = torch.stack(pt_outputs.hidden_states) #num_layer x batch_size x max_len x hidden_size
        
        if pool_name == 'mean':
            pool_feature = pool(last_hidden_state, attention_mask)
        elif pool_name == 'attention':
            pool_feature = pool(last_hidden_state)
        elif pool_name == 'weighted':
            # Take the CLS token only.
            pool_feature = pool(all_hidden_states)[:, 0]
        return pool_feature

    def feature(self, x):
        pt_outputs = self.deberta_v3(**x)
        
        # Pooling feat.
        if type(self.pool) == nn.ModuleList:
            pool_features = []
            pool_names = self.pool_name.split('-')
            
            for pool_name, pool in zip(pool_names, self.pool):
                pool_features.append(self._pool_feature(pool, pool_name, pt_outputs, x['attention_mask']))
            pool_features = torch.cat(pool_features, dim=1)
        else:
            pool_features = self._pool_feature(self.pool, self.pool_name, pt_outputs, x['attention_mask'])
        return pool_features
    
    def forward(self, x):
        feature = self.feature(x)
        if self.cfg.use_dropout and self.training:
            y_hat = self.multi_dropout(feature)
        else:
            y_hat = self.fc(feature)
        return y_hat


class Attention4Model(FB3Model):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__(cfg, config_path=config_path, pretrained=pretrained)
        
        self.head = AttentionHead(self.config.hidden_size*4, self.config.hidden_size)
        self.fc_out = nn.Linear(self.config.hidden_size*4*2, self.cfg.num_target)
        self._init_weights(self.fc_out)

    def forward(self, x):
        pt_out = self.deberta_v3(**x)
        
        all_hidden_states = torch.stack(pt_out.hidden_states)
        cat_over_last_layers = torch.cat(
            (all_hidden_states[-1], all_hidden_states[-2], all_hidden_states[-3], all_hidden_states[-4]),
            -1)
        # [CLS] embedding.
        cls_pooling = cat_over_last_layers[:, 0]   
        # Concat of 4 last layers.
        head_logits = self.head(cat_over_last_layers)

        if self.cfg.use_dropout and self.training:
            y_hat = self.multi_dropout(torch.cat([head_logits, cls_pooling], -1))
        else:
            y_hat = self.fc_out(torch.cat([head_logits, cls_pooling], -1))

        return y_hat

######################################

class FB3Model1(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg 
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            # Turn off dropouts.
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            #LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)

        if self.cfg.reinit_last_layer:
            # Re-init last layer of deberta.
            for module in self.model.encoder.layer[-1:].modules():
                self._init_weights(module)
        self.model.gradient_checkpointing_enable()

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            
class WeightedAttentionModel(FB3Model1):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__(cfg, config_path=config_path, pretrained=pretrained)

        self.weighted_pool = WeightedLayerPooling(
            self.config.num_hidden_layers, layer_start=9, layer_weights=None)
        self.att_pool = AttentionPooling(self.config.hidden_size)

        self.fc_out = nn.Linear(self.config.hidden_size*2, cfg.num_target)
        self._init_weights(self.fc_out)

    def forward(self, x):
        pt_out = self.model(**x)
        hidden_states = pt_out.hidden_states
        last_hidden_state = pt_out.last_hidden_state

        x1 = self.weighted_pool(torch.stack(hidden_states))[:, 0]
        x2 = self.att_pool(last_hidden_state, x['attention_mask'])

        y_hat = self.fc_out(torch.cat([x1, x2], dim=1))
        return y_hat
    
##########
class WeightedLayerPooling_(nn.Module):
    def __init__(self, num_hidden_layers, layer_start: int = 4, layers = None, layer_weights = None):
        super(WeightedLayerPooling_, self).__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        
        if layers:
            self.layer_weights = layer_weights if layer_weights is not None \
                else nn.Parameter(
                    torch.tensor([1] * len(layers), dtype=torch.float)
                )
            self.layers = layers
        else:
            self.layer_weights = layer_weights if layer_weights is not None \
                else nn.Parameter(
                   torch.tensor([1] * (num_hidden_layers+1 - layer_start), dtype=torch.float)
                )
            self.layers = list(range(layer_start, num_hidden_layers+1))
            

    def forward(self, ft_all_layers):
        all_layer_embedding = torch.stack(ft_all_layers)
        all_layer_embedding = all_layer_embedding[self.layers, :, :, :]
        
        weight_factor = self.layer_weights.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).expand(all_layer_embedding.size())
        weighted_average = (weight_factor*all_layer_embedding).sum(dim=0) / self.layer_weights.sum()

        return weighted_average
    
class CustomModel(nn.Module):

    def __init__(self, CFG, config_path = None, pretrained = False):
        super().__init__()
        self.CFG = CFG
        self.config = AutoConfig.from_pretrained(config_path, output_hidden_states=True)
        self.model = AutoModel.from_config(self.config)
        self.pretrained = pretrained
                        
        fc_hidden_size = self.config.hidden_size
        if CFG.pooling == 'mean':
            self.pool = MeanPooling()
        elif CFG.pooling == 'max':
            self.pool = MaxPooling()
        elif CFG.pooling == 'min':
            self.pool = MinPooling()
        elif CFG.pooling == 'attention':
            self.pool = AttentionPooling(self.config.hidden_size)
        elif CFG.pooling == 'weightedlayer':
            self.pool = WeightedLayerPooling_(self.config.num_hidden_layers, layer_start = CFG.layer_start, layer_weights = None)
        elif CFG.pooling == 'weightedlayer-mean':
            self.pool = WeightedLayerPooling_(self.config.num_hidden_layers, layer_start = CFG.layer_start, layer_weights = None)
            self.mean_pool = MeanPooling()
        elif self.CFG.pooling == 'attention4':
            self.pool = AttentionHead(fc_hidden_size*4, 512)
            fc_hidden_size = fc_hidden_size*8
        self.fc = nn.Linear(fc_hidden_size, 6)
                        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        if self.CFG.pooling == 'attention4':
            all_layer_embeddings = torch.stack(outputs.hidden_states)
            cat_over_last_layers = torch.cat((all_layer_embeddings[-1], all_layer_embeddings[-2], all_layer_embeddings[-3], all_layer_embeddings[-4]), -1)
            cls_pooling = cat_over_last_layers[:, 0]
            head_logits = self.pool(cat_over_last_layers)
            feature = torch.cat([head_logits, cls_pooling], -1)
        elif self.CFG.pooling == 'weightedlayer':
            all_layer_embeddings = outputs[1]
            feature = self.pool(all_layer_embeddings)[:, 0]
        elif self.CFG.pooling == 'weightedlayer-mean':
            all_layer_embeddings = outputs[1]
            feature = self.pool(all_layer_embeddings)
            feature = self.mean_pool(feature, inputs['attention_mask'])
        else:
            last_hidden_states = outputs[0]
            feature = self.pool(last_hidden_states, inputs['attention_mask'])    
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output
    
class MeanAttentionModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg 
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            # Turn off dropouts.
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            #LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        
        if pretrained:
            self.deberta_v3 = AutoModel.from_pretrained(cfg.model, config=self.config)
            # Expand embedding dim for new tokens.
            self.deberta_v3.resize_token_embeddings(len(cfg.tokenizer))
        else:
            self.deberta_v3 = AutoModel.from_config(self.config)
            
        self.deberta_v3.gradient_checkpointing_enable()
        
        # Define model layers.
        self.fc = nn.Linear(self.config.hidden_size, 6)

        if cfg.pool == 'mean':
            self.pool = MeanPooling()
        elif cfg.pool == 'attention':
            self.pool = AttentionHead(self.config.hidden_size, self.config.hidden_size)
        elif cfg.pool == 'mean-attention':
            self.pool = nn.ModuleList([
                MeanPooling(),
                AttentionHead(self.config.hidden_size, self.config.hidden_size)
            ])
            self.fc = nn.Linear(self.config.hidden_size * len(self.pool), 6)
        elif cfg.pool == 'mean-attention-with-mask':
            self.pool = nn.ModuleList([
                MeanPooling(),
                AttentionPooling(self.config.hidden_size)
            ])
            self.fc = nn.Linear(self.config.hidden_size * len(self.pool), 6)
        # Re-init weights.
        self._init_weights(self.fc)
        
        # Multi-sample dropout.
        self.multi_dropout = MultiSampleDropout(self.fc, cfg.num_dropout, cfg.prob_dropout)
        
    def global_avg_pool(x):
        return torch.mean(x.view(x.size(0), x.size(1), -1), dim=-1)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
    
    def feature(self, x):
        pt_outputs = self.deberta_v3(**x)
        last_hidden_states = pt_outputs[0] # N x max_len x 768
        # Pooling feat.
        if type(self.pool) == nn.ModuleList:
            pool_feature = [pool(last_hidden_states, x['attention_mask']) for pool in self.pool]
            pool_feature = torch.cat(pool_feature, dim=1)
        else:
            pool_feature = self.pool(last_hidden_states, x['attention_mask']) # N x 768
        return pool_feature
    
    def forward(self, x, y=None, loss_fn=None):
        feature = self.feature(x)
        out = self.fc(feature)
        return out

In [5]:
class Config(dict):
    __getattr__ = dict.__getitem__
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__
    
    def init(self, kwargs):
        super().init(kwargs)

        for k, v in kwargs.items():
            setattr(self, k, v)

    def set(self, key, val):
        self[key] = val
        setattr(self, key, val)

In [6]:
def seed_everything(seed=42):
    '''
    Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.
    '''
    random.seed(seed)
    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        # When running on the CuDNN backend, two further options must be set
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
seed_everything(seed=42)

def mc_rmse(y_true, y_pred):
    scores = []
    ncols = y_true.shape[1]
    
    for n in range(ncols):
        yn_true = y_true[:, n]
        yn_pred = y_pred[:, n]
        rmse_ = mean_squared_error(yn_true, yn_pred, squared=False)
        scores.append(rmse_)
    score = np.mean(scores) 
    return score, scores

target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
def get_result(cfg, oof_df):
    labels = oof_df[target_cols].values
    preds = oof_df[[f"pred_{c}" for c in target_cols]].values
    score, scores = mc_rmse(labels, preds)
    print(f'score: {score:<.6f}  scores: {scores}')

In [7]:
def encode_text(cfg, text):
    if cfg.pretrained:
        inputs = cfg.tokenizer(
            text,
            None,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=cfg.max_len,
            return_token_type_ids=True,
            return_tensors='pt'
        )
        inputs = {k:v.squeeze(0) for k,v in inputs.items()}
    else:
        if 'roberta' in cfg.model:
            inputs = cfg.tokenizer.encode_plus(
                text,
                return_tensors = None,
                add_special_tokens = True,
                max_length = cfg.max_len,
                pad_to_max_length = True,
                truncation = True)
        elif '512' in cfg.name or '768' in cfg.name:
            inputs = cfg.tokenizer.encode_plus(
                text,
                return_tensors = None,
                add_special_tokens = True,
                max_length = cfg.max_len,
                truncation = True)
        else:
            inputs = cfg.tokenizer.encode_plus(
                text, 
                return_tensors=None, 
                add_special_tokens=True)
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs 

def preprocess(texts):
    texts = (
        texts
        .str.replace(r'\r\n', '<newline>', regex=True)
        .str.replace(r'\n', '<newline>', regex=True)
        .str.replace('<newline><newline>', '<newline>', regex=False)
        .values 
    )
    return texts

class TestDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        #if not cfg.pretrained and cfg.version in ['1', 'mean-attention']:
        if not cfg.pretrained:
            print('preprocess')
            self.texts = preprocess(df['full_text'])
        else:
            self.texts = df['full_text'].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = encode_text(self.cfg, self.texts[item])
        return inputs

In [8]:
def load_config(input_path, inference_weight=1):
    # Load CFG class.
    cfg = Config(**json.load(open(os.path.join(input_path, 'CFG.json'), 'r')))
    cfg.path = input_path
    cfg.config_path = os.path.join(cfg.path, 'config.pth')
    # Load tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(os.path.join(cfg.path, 'tokenizer'))
    cfg.tokenizer = tokenizer
    
    cfg.inference_weight = inference_weight
    return cfg

def load_model(cfg, fold, version='1', **model_kwargs):
    # Load torch model.
    if version == '1':
        model = MultiPoolModel(cfg, config_path=cfg.config_path, pretrained=False, **model_kwargs)
    elif version == '2':
        model = Attention4Model(cfg, config_path=cfg.config_path, pretrained=False, **model_kwargs)
    elif version == '21':
        model = WMPoolModel(cfg, config_path=cfg.config_path, pretrained=False, **model_kwargs)
    elif version == 'weighted-attention':
        model = WeightedAttentionModel(cfg, config_path=cfg.config_path, pretrained=False, **model_kwargs)
    elif version == 'custom':
        model = CustomModel(cfg, config_path=cfg.config_path, pretrained=False, **model_kwargs)
    elif version == 'mean-attention':
        model = MeanAttentionModel(cfg, config_path=cfg.config_path, pretrained=False)
    state = torch.load(
        os.path.join(cfg.path, f"{cfg.model.replace('/', '-')}_fold{fold}_best.pth"),
        map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    return model

In [9]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output.last_hidden_state.detach().cpu()
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    )
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
        input_mask_expanded.sum(1), min=1e-9
    )

def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    #tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in test_loader:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

class Inferencer:
    def __init__(self, input_path=None, cfg=None, inference_weight=1):
        if cfg == None:
            self.cfg = load_config(input_path, inference_weight)
        else:
            self.cfg = cfg
    
    def predict(self, test_loader, device, stat_fn=np.mean):
        preds = []
        
        for fold in self.cfg.trn_fold:
            start = time.time()
            print('#'*10, cfg.path, '#'*10)
            
            print(f'Predicting fold {fold}...')
            model = load_model(self.cfg, fold, version=self.cfg.version)
            pred = inference_fn(test_loader, model, device)
            preds.append(pred)
            del model, pred; gc.collect()
            torch.cuda.empty_cache()
            
            end = time.time() - start
            print('#'*10, f'ETA: {end:.2f}s', '#'*10, '\n')
        
        
        self.preds = stat_fn(preds, axis=0) 
        self.preds = np.clip(self.preds, 1, 5)
        return self.preds
    
    def get_oof_result(self, file_type='pkl'):
        return get_result(self.cfg, self.get_oof_df(file_type))
    
    def get_oof_df(self, file_type='pkl'):
        if file_type == 'pkl':
            return pd.read_pickle(os.path.join(cfg.path, 'oof_df.pkl'))
        return pd.read_csv(os.path.join(cfg.path, 'oof_df.csv'))
    
    def get_text_embedding(self, data_loader, device, fold=None): 
        # pretrained=True: not fine-tuned models.
        if not self.cfg.pretrained:
            model = load_model(self.cfg, fold, pool=self.cfg.pool_head)            
        else:
            model = AutoModel.from_pretrained(self.cfg.model)
        model.to(device)
        model.eval()
            
        fold_emb = []
        for inputs in data_loader:
            for k, v in inputs.items():
                inputs[k] = v.to(device)
            if not self.cfg.pretrained:
                with torch.no_grad():
                    emb = model.feature(**inputs)
            else:
                input_ids = inputs['input_ids'].to(device)
                attention_mask = inputs['attention_mask'].to(device)
                token_type_ids = inputs['token_type_ids'].to(device)
                
                with torch.no_grad():
                    try:
                        output = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
                    except:
                        output = model(input_ids=input_ids, attention_mask=attention_mask)
                emb = mean_pooling(output, attention_mask.detach().cpu())
                emb = F.normalize(emb, p=2, dim=1)
                emb = emb.squeeze(0)
            fold_emb.extend(emb.detach().cpu().numpy())
            del emb; gc.collect(); torch.cuda.empty_cache();
            #print(torch.cuda.memory_allocated() /1024/1024)
            
        fold_emb = np.array(fold_emb)
        return fold_emb


In [10]:
cfg = load_config('../input/fb3models/v21/', inference_weight=1)
cfg.name = 'v21'
cfg.version = '21'

test = pd.read_csv(TEST_PATH)
test['tokenize_length'] = [len(cfg.tokenizer(text)['input_ids']) for text in test['full_text'].values]
test = test.sort_values('tokenize_length', ascending=True).reset_index(drop=True)

In [11]:
##################################################
deberta_base = Config(
    model='../input/huggingface-deberta-variants/deberta-base/deberta-base',
    file_name='microsoft_deberta_base_768',
    pretrained=True, inference_weight=1, max_len=640) #
deberta_large = Config(
    model='../input/huggingface-deberta-variants/deberta-large/deberta-large', 
    file_name='microsoft_deberta_large_1024',
    pretrained=True, inference_weight=1, max_len=640) #
deberta_xlarge = Config(
    model='../input/huggingface-deberta-variants/deberta-xlarge/deberta-xlarge', 
    file_name='microsoft_deberta_xlarge_1024',
    pretrained=True, inference_weight=1, max_len=640)
deberta_v2_xlarge = Config(
    model='../input/bert-shopping-mall/deberta-v2-xlarge', 
    file_name='microsoft_deberta_v2_xlarge_1536',
    pretrained=True, inference_weight=1, max_len=640)
deberta_v2_xxlarge = Config(
    model='../input/bert-shopping-mall/deberta-v2-xxlarge', 
    file_name='microsoft_deberta_v2_xxlarge_1536',
    pretrained=True, inference_weight=1, max_len=640)

deberta_v3_base = Config(
    model='../input/bert-shopping-mall/deberta-v3-base',
    file_name='microsoft_deberta_v3_base_768',
    pretrained=True, inference_weight=1, max_len=640) #
deberta_v3_large = Config(
    model='../input/bert-shopping-mall/deberta-v3-large', 
    file_name='microsoft_deberta_v3_large_1024',
    pretrained=True, inference_weight=1, max_len=640) # 

deberta_large_mnli = Config(
    model='../input/huggingface-deberta-variants/deberta-large-mnli/deberta-large-mnli',
    file_name='microsoft_deberta_large_mnli_1024',
    pretrained=True, inference_weight=1, max_len=640) # 

gpt2 = Config(
    model='../input/hugging-face-gpt2/gpt2',
    file_name='gpt2_768',
    pretrained=True, inference_weight=1, max_len=512) #

roberta_base = Config(
    model='../input/transformers/roberta-base', 
    file_name='roberta_base_768',
    pretrained=True, inference_weight=1, max_len=512) #
roberta_large = Config(
    model='../input/transformers/roberta-large',
    file_name='roberta_large_1024',
    pretrained=True, inference_weight=1, max_len=512) # 

xlnet_base = Config(
    model='../input/transformers/xlnet-base-cased',
    file_name='xlnet_base_cased_768',
    pretrained=True, inference_weight=1, max_len=640) #
xlnet_large = Config(
    model='../input/transformers/xlnet-large-cased', 
    file_name='xlnet_large_cased_1024',
    pretrained=True, inference_weight=1, max_len=640) #

bart_base = Config(
    model='../input/transformers/facebook-bart-base',
    file_name='facebook_bart_base_768',
    pretrained=True, inference_weight=1, max_len=640)
bart_large = Config(
    model='../input/transformers/facebook-bart-large',
    file_name='facebook_bart_large_1024',
    pretrained=True, inference_weight=1, max_len=640)
bart_lage_mnli = Config(
    model='../input/facebook-bart-large-mnli',
    file_name='facebook_bart_large_mnli_1024',
    pretrained=True, inference_weight=1, max_len=640)

bert_base_uncased = Config(
    model='../input/transformers/bert-base-uncased/',
    file_name='bert_base_uncased_768',
    pretrained=True, inference_weight=1, max_len=512)
bert_large_uncased = Config(
    model='../input/transformers/bert-large-uncased',
    file_name='bert_large_uncased_1024',
    pretrained=True, inference_weight=1, max_len=512)

muppet_roberta_large = Config(
    model='../input/muppet-roberta-large',
    file_name='facebook_muppet_roberta_large_1024',
    pretrained=True, inference_weight=1, max_len=512)
# muppet_roberta_base = Config(model='facebook/muppet-roberta-base', pretrained=True, inference_weight=1, max_len=512)

funnel_small = Config(
    model='../input/transformers/funnel-transformer-small',
    file_name='funnel_transformer_small_768',
    pretrained=True, inference_weight=1, max_len=640)
funnel_large = Config(
    model='../input/transformers/funnel-transformer-large',
    file_name='funnel_transformer_large_1024',
    pretrained=True, inference_weight=1, max_len=640)

##################################################

target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

# SVR from pretrained models

In [12]:
def get_text_embedding(cfg, dfs):
    cfg.tokenizer = AutoTokenizer.from_pretrained(cfg.model)
    infer_ = Inferencer(cfg=cfg, inference_weight=cfg.inference_weight)
    if 'gpt2' in cfg.model:
        cfg.tokenizer.pad_token = cfg.tokenizer.eos_token
    text_embs = []
    for df in dfs:
        dataset = TestDataset(cfg, df)
        loader = DataLoader(
            dataset,
            batch_size=4,
            shuffle=False)

        # Text embedding for SVM
        test_text_emb = []
        if not cfg.pretrained:
            for fold in infer_.cfg.trn_fold:
                test_text_emb.append(infer_.get_text_embedding(loader, device, fold))
            text_emb = np.mean(text_emb, axis=0)
        else:
            text_emb = infer_.get_text_embedding(loader, device)
        text_embs.append(text_emb)
        del dataset, loader; gc.collect(); torch.cuda.empty_cache();
    del infer_; gc.collect(); torch.cuda.empty_cache();
    return text_embs

In [13]:
pretrained_models_cfg = [
    deberta_large_mnli,
    #gpt2,
    roberta_base,
    roberta_large,
    #xlnet_base, 
    #xlnet_large,
    deberta_base, 
    deberta_large, 
    deberta_xlarge,
    deberta_v2_xlarge, 
    deberta_v2_xxlarge,
    deberta_v3_base, 
    deberta_v3_large,
    
    #bart_base,
    bart_large,
    #bart_lage_mnli,
    #bert_base_uncased,
    bert_large_uncased,
    #muppet_roberta_large,
    funnel_small,
    funnel_large
]
len(pretrained_models_cfg)

14

In [14]:
all_test_text_emb = []
for cfg in tqdm(pretrained_models_cfg):
    test_text_emb = get_text_embedding(cfg, [test])[0]
    all_test_text_emb.append(test_text_emb)
    
    del test_text_emb; gc.collect(); torch.cuda.empty_cache();
    print(f'{cfg.model} loaded.')
    
gc.collect(); torch.cuda.empty_cache();

final_test_text_emb = np.concatenate(all_test_text_emb, axis=1)

  0%|          | 0/14 [00:00<?, ?it/s]

Some weights of the model checkpoint at ../input/huggingface-deberta-variants/deberta-large-mnli/deberta-large-mnli were not used when initializing DebertaModel: ['config', 'pooler.dense.weight', 'classifier.weight', 'pooler.dense.bias', 'classifier.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


../input/huggingface-deberta-variants/deberta-large-mnli/deberta-large-mnli loaded.


Some weights of the model checkpoint at ../input/transformers/roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


../input/transformers/roberta-base loaded.


Some weights of the model checkpoint at ../input/transformers/roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


../input/transformers/roberta-large loaded.


Some weights of the model checkpoint at ../input/huggingface-deberta-variants/deberta-base/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'config', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


../input/huggingface-deberta-variants/deberta-base/deberta-base loaded.


Some weights of the model checkpoint at ../input/huggingface-deberta-variants/deberta-large/deberta-large were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'config', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


../input/huggingface-deberta-variants/deberta-large/deberta-large loaded.


Some weights of the model checkpoint at ../input/huggingface-deberta-variants/deberta-xlarge/deberta-xlarge were not used when initializing DebertaModel: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


../input/huggingface-deberta-variants/deberta-xlarge/deberta-xlarge loaded.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at ../input/bert-shopping-mall/deberta-v2-xlarge were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequen

../input/bert-shopping-mall/deberta-v2-xlarge loaded.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at ../input/bert-shopping-mall/deberta-v2-xxlarge were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSeque

../input/bert-shopping-mall/deberta-v2-xxlarge loaded.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at ../input/bert-shopping-mall/deberta-v3-base were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.dense.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with ano

../input/bert-shopping-mall/deberta-v3-base loaded.


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at ../input/bert-shopping-mall/deberta-v3-large were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.bias', 'mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining mode

../input/bert-shopping-mall/deberta-v3-large loaded.
../input/transformers/facebook-bart-large loaded.


Some weights of the model checkpoint at ../input/transformers/bert-large-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


../input/transformers/bert-large-uncased loaded.




../input/transformers/funnel-transformer-small loaded.
../input/transformers/funnel-transformer-large loaded.


In [15]:
# del all_test_text_emb; gc.collect()
final_test_text_emb.shape

(3, 14336)

In [16]:
import glob
from joblib import dump, load
from cuml.svm import SVR
import cuml

def svr_inference_fn(model_path, te_text_feats):
    model = load(model_path)
    preds = model.predict(te_text_feats)
    return preds

predictions = []
svr_model_paths = glob.glob('../input/fb3-svr-no-train/*.model')
for model_path in tqdm(svr_model_paths):
    #model_path = os.path.join('../input/fb3-svr-train/', model_path)
    preds = svr_inference_fn(model_path, final_test_text_emb)
    predictions.append(preds)
svr_predictions = np.mean(predictions, axis=0)
svr_predictions

  0%|          | 0/15 [00:00<?, ?it/s]

array([[2.6882355, 2.475212 , 2.6849074, 2.2693486, 1.9479848, 2.6718163],
       [3.6548452, 3.4533987, 3.5731485, 3.6607795, 3.4216313, 3.3330445],
       [2.9247746, 2.7894423, 3.095936 , 2.9856098, 2.6844265, 2.609502 ]],
      dtype=float32)

# Fine-tuned models

In [17]:
v114_CFG = Config(
    model="microsoft/deberta-v3-base",
    version='1',
    num_target = 6,
    reinit_last_layer=True,
    reinit_fc=True,
    weight_decay=0.01,
    learning_rate=2e-5,
    layerwise_learning_rate_decay=1.5,
    use_dropout=False,
    prob_dropout=[0.06, 0.08, 0.1, 0.12, 0.14],
    num_dropout=5,
    pool_head='mean-attention',
    seed=42,
    n_fold=4,
    trn_fold=[0,1,2,3],
    path='../input/fb3models/v114/',
    config_path='../input/fb3models/v114/config.pth',
    tokenizer=AutoTokenizer.from_pretrained('../input/fb3models/v114/tokenizer')
)

weightedpool_CFG = Config(
    model='microsoft/deberta-v3-base',
    name='weightedpool',
    version='1',
    num_target=6,
    reinit_last_layer=True,
    reinit_fc=True,
    weight_decay=0.01,
    learning_rate=1.5e-5,
    layerwise_learning_rate_decay=1.5,
    use_dropout=False,
    prob_dropout=[0.06, 0.08, 0.1, 0.12, 0.14],
    num_dropout=5,
    pool_head='weighted',
    seed=42,
    n_fold=4,
    trn_fold=[0,1,2,3],
    train=True,
    path='../input/fb3-train/',
    config_path='../input/fb3-train/config.pth',
    tokenizer=AutoTokenizer.from_pretrained('../input/fb3-train/tokenizer'),
    inference_weight=0.1)

v116_CFG = load_config('../input/fb3-colab-models/v116', inference_weight=0.1)
v116_CFG.path = '../input/fb3models/v116'
v116_CFG.config_path = '../input/fb3models/v116/config.pth'
v116_CFG.version = '1'

In [18]:
v112_CFG = Config(
    num_workers=1,
    batch_size=3,
    max_len=512,
    model="microsoft/deberta-v3-base",
    name='v112',
    version='1',
    num_target = 6,
    reinit_last_layer=True,
    reinit_fc=True,
    weight_decay=0.01,
    learning_rate=2e-5,
    layerwise_learning_rate_decay=1.5,
    use_dropout=False,
    prob_dropout=[0.06, 0.08, 0.1, 0.12, 0.14],
    num_dropout=5,
    pool_head='attention',
    seed=42,
    n_fold=4,
    trn_fold=[0,1,2,3],
    train=True,
    path='../input/fb3models/v112/',
    config_path='../input/fb3models/v112/config.pth',
    inference_weight=1.0,
    tokenizer=AutoTokenizer.from_pretrained('../input/fb3models/v112/tokenizer')
)

#####
v2_CFG = load_config('../input/fb3models/v2/', inference_weight=1.0)
v2_CFG.name = 'v2'
v2_CFG.version = '2'
v2_CFG.trn_fold = [0,1,2,3]

#####
v21_CFG = load_config('../input/fb3models/v21/', inference_weight=1)
v21_CFG.name = 'v21'
v21_CFG.version = '21'

#####
attention_fgm_CFG = load_config('../input/fb3models/20221114-192943-deberta-v3-base/', inference_weight=1.0)
attention_fgm_CFG.name = 'attention_fgm'
attention_fgm_CFG.version = 'custom'
attention_fgm_CFG.config_path = '../input/fb3models/20221114-192943-deberta-v3-base/config/config.json'

weighted_fgm_CFG = Config(
    pretrained=False,
    path='../input/fb3models/20221115-061243-deberta-v3-base',
    config_path='../input/fb3models/20221115-061243-deberta-v3-base/config/config.json',
    tokenizer=AutoTokenizer.from_pretrained('../input/fb3models/20221115-061243-deberta-v3-base/tokenizer'),
    name='weighted_fgm',
    version='custom',
    train = True,
    debug = False,
    offline = False,
    models_path = 'FB3-models',
    epochs = 5,
    save_all_models = False,
    competition = 'FB3',
    apex = True,
    print_freq = 20,
    num_workers = 4,
    model = 'microsoft/deberta-v3-base', #If you want to train on the kaggle platform, v3-base is realistic. v3-large will time out.
    loss_func = 'SmoothL1', # 'SmoothL1', 'RMSE'
    gradient_checkpointing = True,
    scheduler = 'cosine',
    batch_scheduler = True,
    num_cycles = 0.5,
    num_warmup_steps = 0,
    encoder_lr = 2e-5,
    decoder_lr = 2e-5,
    min_lr = 1e-6,
    #Layer-Wise Learning Rate Decay
    llrd = True,
    layerwise_lr = 5e-5,
    layerwise_lr_decay = 0.9,
    layerwise_weight_decay = 0.01,
    layerwise_adam_epsilon = 1e-6,
    layerwise_use_bertadam = False,
    #pooling
    pooling = 'weightedlayer', # mean, max, min, attention, weightedlayer
    layer_start = 11,
    layers=None,
    #init_weight
    init_weight = 'normal', # normal, xavier_uniform, xavier_normal, kaiming_uniform, kaiming_normal, orthogonal
    #re-init
    reinit = True,
    reinit_n = 1,
    #adversarial
    fgm = True,
    awp = False,
    adv_lr = 1,
    adv_eps = 0.2,
    unscale = False,
    eps = 1e-6,
    betas = (0.9, 0.999),
    max_len = 512,
    weight_decay = 0.01,
    gradient_accumulation_steps = 1,
    max_grad_norm = 1000,
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'],
    seed = 42,
    cv_seed = 42,
    n_fold = 4,
    trn_fold = [0,1,2,3],
    batch_size = 8,
    n_targets = 6,
    gpu_id = 0) 

weighted_attention_CFG = load_config('../input/fb3models/weighted_attention_v3', inference_weight=1.0)
weighted_attention_CFG.name = 'weighted_attention'
weighted_attention_CFG.version = 'weighted-attention'

mean_attention_no_fgm_CFG = load_config('../input/fb3models/20221117-183420-deberta-v3-base-mean-attention-with-mask', inference_weight=1.0)
mean_attention_no_fgm_CFG.name = 'mean_attention_no_fgm'
mean_attention_no_fgm_CFG.version = 'mean-attention'

attention_large_fgm_CFG = load_config('../input/fb3models/20221118-164148-deberta-v3-large-attention_fgm')
attention_large_fgm_CFG.name = 'attention_large_fgm'
attention_large_fgm_CFG.version = 'custom'
attention_large_fgm_CFG.config_path = '../input/fb3models/20221118-164148-deberta-v3-large-attention_fgm/config/config.json'

attention_fgm_512_CFG = load_config('../input/fb3models/20221121-143655-deberta-v3-base-attention_fgm_512')
attention_fgm_512_CFG.name = 'attention_fgm_512'
attention_fgm_512_CFG.version = 'custom'
attention_fgm_512_CFG.config_path = '../input/fb3models/20221121-143655-deberta-v3-base-attention_fgm_512/config/config.json'

attention_fgm_768_CFG = load_config('../input/fb3models/20221120-072218-deberta-v3-base-attention_fgm')
attention_fgm_768_CFG.name = 'attention_fgm_768'
attention_fgm_768_CFG.version = 'custom'
attention_fgm_768_CFG.config_path = '../input/fb3models/20221120-072218-deberta-v3-base-attention_fgm/config/config.json'

weighted2last_fgm_512_CFG = load_config('../input/fb3models/20221124-060246-deberta-v3-base-weighted2last_fgm')
weighted2last_fgm_512_CFG.name = 'weighted2last_fgm_512'
weighted2last_fgm_512_CFG.version = 'custom'
weighted2last_fgm_512_CFG.config_path = '../input/fb3models/20221124-060246-deberta-v3-base-weighted2last_fgm/config/config.json'

weightedmean2last_fgm_512_CFG = load_config('../input/fb3models/20221124-160318-deberta-v3-base-weightedmean2last_fgm')
weightedmean2last_fgm_512_CFG.name = 'weightedmean2last_fgm_512'
weightedmean2last_fgm_512_CFG.version = 'custom'
weightedmean2last_fgm_512_CFG.config_path = '../input/fb3models/20221124-160318-deberta-v3-base-weightedmean2last_fgm/config/config.json'

roberta_attention_fgm_CFG = load_config('../input/fb3models/20221121-173739-roberta-base')
roberta_attention_fgm_CFG.name = 'roberta_attention_large_fgm'
roberta_attention_fgm_CFG.version = 'custom'
roberta_attention_fgm_CFG.config_path = '../input/fb3models/20221121-173739-roberta-base/config/config.json'

##########
v112_CFG.pretrained = False
v114_CFG.pretrained = False
v116_CFG.pretrained = False
v21_CFG.pretrained = False
v2_CFG.pretrained = False
attention_fgm_CFG.pretrained = False
weighted_attention_CFG.pretrained = False
weightedpool_CFG.pretrained = False
mean_attention_no_fgm_CFG.pretrained=False
attention_large_fgm_CFG.pretrained=False
attention_fgm_512_CFG.pretrained = False
attention_fgm_768_CFG.pretrained = False
weighted2last_fgm_512_CFG.pretrained = False
weightedmean2last_fgm_512_CFG.pretrained = False
roberta_attention_fgm_CFG.pretrained = False


weighted_fgm_CFG.inference_weight = 1.0
v114_CFG.inference_weight = 1.0 
v2_CFG.inference_weight = 1.0 
v21_CFG.inference_weight = 1.0
attention_fgm_CFG.inference_weight = 1.0
weighted_attention_CFG.inference_weight = 1.0
attention_large_fgm_CFG.inference_weight = 1.0
attention_fgm_512_CFG.inference_weight = 1.0
attention_fgm_768_CFG.inference_weight = 1.0
weighted2last_fgm_512_CFG.inference_weight = 1.0
weightedmean2last_fgm_512_CFG.inference_weight = 1.0
roberta_attention_fgm_CFG.inference_weight = 1.0

In [19]:
fine_tuned_models_cfg = [
    attention_large_fgm_CFG, weighted2last_fgm_512_CFG, weightedmean2last_fgm_512_CFG, attention_fgm_512_CFG, attention_fgm_768_CFG, 
    v21_CFG, v112_CFG, mean_attention_no_fgm_CFG, v2_CFG, weightedpool_CFG]
optimal_weights = sorted([
    0.353356890459364,
    0.1660777385159011,
    0.10247349823321557,
    0.05653710247349824,
    0.06007067137809188,
    0.049469964664310966,
    0.08127208480565372,
    0.06713780918727916,
    0.03886925795053004,
    0.02473498233215548
])

for i,cfg in enumerate(fine_tuned_models_cfg):
    fine_tuned_models_cfg[i].inference_weight = optimal_weights[i]
    print(fine_tuned_models_cfg[i].name, fine_tuned_models_cfg[i].inference_weight)
    
print('Total number of models:', len(fine_tuned_models_cfg))
stacking = False

attention_large_fgm 0.02473498233215548
weighted2last_fgm_512 0.03886925795053004
weightedmean2last_fgm_512 0.049469964664310966
attention_fgm_512 0.05653710247349824
attention_fgm_768 0.06007067137809188
v21 0.06713780918727916
v112 0.08127208480565372
mean_attention_no_fgm 0.10247349823321557
v2 0.1660777385159011
weightedpool 0.353356890459364
Total number of models: 10


In [20]:
fine_tuned_predictions = []
total_weight = 0
for cfg in tqdm(fine_tuned_models_cfg):
    # infer_ = Inferencer(setup['path'], setup['inference_weight'])
    infer_ = Inferencer(cfg=cfg, inference_weight=cfg.inference_weight)
    if cfg.path in [
        attention_large_fgm_CFG.path,
        attention_fgm_CFG.path, attention_fgm_768_CFG.path, roberta_attention_fgm_CFG.path, attention_fgm_512_CFG.path, 
        weighted2last_fgm_512_CFG.path, weightedmean2last_fgm_512_CFG.path, 
    ]:
        file_type = 'csv'
    else:
        file_type = 'pkl'
    
    infer_.get_oof_result(file_type)
    
    if cfg.path == roberta_attention_fgm_CFG.path:
        collate_fn = DataCollatorWithPadding(tokenizer=cfg.tokenizer, max_length=cfg.max_len, padding='max_length')
    elif '512' in cfg.name:
        collate_fn = DataCollatorWithPadding(tokenizer=cfg.tokenizer, padding='max_length', max_length=512)
        print('max_len=512')
    elif '768' in cfg.name:
        collate_fn = DataCollatorWithPadding(tokenizer=cfg.tokenizer, padding='max_length', max_length=768)
        print('max_len=768')
    else:
        collate_fn = DataCollatorWithPadding(tokenizer=cfg.tokenizer, padding='longest')
    test_dataset = TestDataset(cfg, test)
    test_loader = DataLoader(
        test_dataset,
        batch_size=12,
        shuffle=False,
        collate_fn=collate_fn,
        num_workers=2, 
        pin_memory=True, 
        drop_last=False)
    
    if stacking:
        prediction = infer_.predict(test_loader, device)
    else:
        prediction = infer_.predict(test_loader, device) * cfg.inference_weight
    
    fine_tuned_predictions.append(prediction)
    total_weight += cfg.inference_weight
    
    del infer_, test_dataset, test_loader, prediction; gc.collect; torch.cuda.empty_cache();
    
if stacking:
    from glob import glob
    from sklearn.linear_model import Ridge, Lasso, BayesianRidge
    from joblib import dump, load
    if str(device) == 'cpu':
        from sklearn.svm import SVR
    else:
        from cuml.svm import SVR

    final_fine_tuned_predictions = np.stack(fine_tuned_predictions, axis=1).reshape(-1, len(fine_tuned_models_cfg)*6)
    final_fine_tuned_predictions_ = []
    for meta_model_path in sorted(glob('../input/fb3-stacking/*.model')):
        meta_model = load(meta_model_path)
        meta_model_preds = meta_model.predict(final_fine_tuned_predictions)
        final_fine_tuned_predictions_.append(meta_model_preds)
    final_fine_tuned_predictions = np.mean(final_fine_tuned_predictions_, axis=0)
else:
    final_fine_tuned_predictions = np.sum(fine_tuned_predictions, axis=0)/total_weight
    
final_fine_tuned_predictions

  0%|          | 0/10 [00:00<?, ?it/s]

score: 0.452063  scores: [0.48157374223152627, 0.4452090627651757, 0.4154085184288849, 0.4536476489584571, 0.47240178087223106, 0.44413673294015477]
preprocess
########## ../input/fb3models/20221118-164148-deberta-v3-large-attention_fgm ##########
Predicting fold 0...
########## ETA: 27.44s ########## 

########## ../input/fb3models/20221118-164148-deberta-v3-large-attention_fgm ##########
Predicting fold 1...
########## ETA: 26.37s ########## 

########## ../input/fb3models/20221118-164148-deberta-v3-large-attention_fgm ##########
Predicting fold 2...
########## ETA: 27.87s ########## 

########## ../input/fb3models/20221118-164148-deberta-v3-large-attention_fgm ##########
Predicting fold 3...
########## ETA: 26.70s ########## 

score: 0.453033  scores: [0.48389889660631685, 0.44678944048963176, 0.41465519997691047, 0.45360604538329913, 0.4717536101123449, 0.4474973036716195]
max_len=512
preprocess
########## ../input/fb3models/20221124-060246-deberta-v3-base-weighted2last_fgm #######

array([[2.6278195, 2.4231508, 2.6908426, 2.359161 , 2.1601028, 2.5797715],
       [3.481747 , 3.3418107, 3.51337  , 3.4832504, 3.3819103, 3.305533 ],
       [2.9401727, 2.8120873, 3.152896 , 3.0040162, 2.7296119, 2.702186 ]],
      dtype=float32)

In [21]:
total_weight

1.0000000000000002

# Stackings

In [22]:
# from glob import glob
# from sklearn.linear_model import Ridge, Lasso, BayesianRidge
# from joblib import dump, load
# if str(device) == 'cpu':
#     from sklearn.svm import SVR
# else:
#     from cuml.svm import SVR

## bad models

In [23]:
# use_weights = True

# stacking_models_cfg = [
#     mean_attention_no_fgm_CFG,
#     v2_CFG, 
#     attention_large_fgm_CFG,
#     v116_CFG, 
#     weightedpool_CFG, 
#     weighted_fgm_CFG,
#     roberta_attention_fgm_CFG,
#     attention_fgm_512_CFG,
# ]
# optimal_weights = sorted([0.10381541,0.02391661,-0.01459959,0.00525827,0.03253774,0.66132155,-0.1136627,0.30141271], reverse=True)

# for i, ftm_cfg in enumerate(stacking_models_cfg):
#     if use_weights:
#         stacking_models_cfg[i].inference_weight = optimal_weights[i]
#     else:
#         stacking_models_cfg[i].inference_weight = 1.0
    
# weak_stack_predictions = []
# for cfg in tqdm(stacking_models_cfg):
#     infer_ = Inferencer(cfg=cfg, inference_weight=cfg.inference_weight)
#     if cfg.path in [attention_fgm_CFG.path, attention_fgm_768_CFG.path, roberta_attention_fgm_CFG.path, attention_fgm_512_CFG.path, attention_large_fgm_CFG.path]:
#         file_type = 'csv'
#     else:
#         file_type = 'pkl'
#     infer_.get_oof_result(file_type)
    
#     test_dataset = TestDataset(cfg, test)
#     test_loader = DataLoader(
#         test_dataset,
#         batch_size=12,
#         shuffle=False,
#         collate_fn=DataCollatorWithPadding(tokenizer=cfg.tokenizer, padding='longest'),
#         num_workers=2, 
#         pin_memory=True, 
#         drop_last=False)
    
#     prediction = infer_.predict(test_loader, device)
#     weak_stack_predictions.append(prediction)
    
#     del infer_, test_dataset, test_loader, prediction; gc.collect; torch.cuda.empty_cache();

# if use_weights:
#     ensemble_predictions = np.zeros((len(test), 6))
#     for i, cfg in enumerate(stacking_models_cfg):
#         ensemble_predictions += weak_stack_predictions[i] * cfg.inference_weight
#     weak_stack_predictions.append(ensemble_predictions)
    
# final_weak_stack_predictions = np.concatenate(weak_stack_predictions, axis=1)
# final_weak_stack_predictions_ = []
# for meta_model_path in sorted(glob('../input/fb3-stacking/*.model')):
#     if '_weak_' in meta_model_path:
#         meta_model = load(meta_model_path)
#         meta_model_preds = meta_model.predict(final_weak_stack_predictions)
#         final_weak_stack_predictions_.append(meta_model_preds)
# final_weak_stack_predictions = np.mean(final_weak_stack_predictions_, axis=0)
    
# final_weak_stack_predictions

In [24]:
# final_weak_stack_predictions

## good models

In [25]:
# use_weights = True

In [26]:
# optimal_weights = [0.29754274, 0.44465927, 0.25779799, 1]
# stacking_models_cfg = [v112_CFG, v21_CFG, weighted_attention_CFG, attention_fgm_CFG]

# if use_weights:
#     for i, ftm_cfg in enumerate(stacking_models_cfg):
#         stacking_models_cfg[i].inference_weight = optimal_weights[i]
    
# strong_stack_predictions = []
# for cfg in tqdm(stacking_models_cfg):
#     if not use_weights:
#         cfg.inference_weight = 1.0
#     infer_ = Inferencer(cfg=cfg, inference_weight=cfg.inference_weight)
#     if cfg.path == attention_fgm_CFG.path:
#         file_type = 'csv'
#     else:
#         file_type = 'pkl'
#     infer_.get_oof_result(file_type)
    
#     test_dataset = TestDataset(cfg, test)
#     test_loader = DataLoader(
#         test_dataset,
#         batch_size=12,
#         shuffle=False,
#         collate_fn=DataCollatorWithPadding(tokenizer=cfg.tokenizer, padding='longest'),
#         num_workers=2, 
#         pin_memory=True, 
#         drop_last=False)
    
#     prediction = infer_.predict(test_loader, device)
#     strong_stack_predictions.append(prediction)
    
#     del infer_, test_dataset, test_loader, prediction; gc.collect; torch.cuda.empty_cache();

# if use_weights:
#     ensemble_predictions = np.zeros((len(test), 6))
#     for i, cfg in enumerate(stacking_models_cfg):
#         ensemble_predictions += strong_stack_predictions[i] * cfg.inference_weight
#     strong_stack_predictions.append(ensemble_predictions)
    
# # final_strong_stack_predictions = np.stack(strong_stack_predictions, axis=1).reshape(-1, len(stacking_models_cfg)*6)
# final_strong_stack_predictions = np.concatenate(strong_stack_predictions, axis=1)
# final_strong_stack_predictions_ = []
# for meta_model_path in sorted(glob('../input/fb3-stacking/*.model')):
#     if '_strong_' in meta_model_path:
#         meta_model = load(meta_model_path)
#         meta_model_preds = meta_model.predict(final_strong_stack_predictions)
#         final_strong_stack_predictions_.append(meta_model_preds)
# final_strong_stack_predictions = np.mean(final_strong_stack_predictions_, axis=0)

# final_strong_stack_predictions

# Combine predictions

In [27]:
final_predictions = (0.345*svr_predictions + 0.655*final_fine_tuned_predictions)/1
final_predictions = np.clip(final_predictions, 1, 5)
test[target_cols] = final_predictions

# Postprocess

In [28]:
# !mkdir -p /tmp/pip/cache/
# !cp ../input/hdbscan0828whl/hdbscan-0.8.28-cp37-cp37m-linux_x86_64.whl /tmp/pip/cache/
# !ls /tmp/pip/cache/
# !pip install --no-index --find-links /tmp/pip/cache/ hdbscan

# sys.path.append('../input/sentence-transformers/sentence-transformers-master/')
# sys.path.append('../input/bertopic/BERTopic/')
# from bertopic import BERTopic

In [29]:
# stopwords_ = stopwords.words('english') + ["n't",  "'s", "'ve"]
# test_docs = []
# for fl in tqdm(test['full_text']):
#     word_tokens = word_tokenize(fl)
#     txt = " ".join([w for w in word_tokens if not w.lower() in stopwords_])
#     test_docs.append(txt)
    
# bertopic_model = BERTopic.load('../input/fb3-bertopic/bertopic_train.model')
# topics, probs = bertopic_model.transform(test_docs)
# test['topic_idx'] = topics

In [30]:
# train_topics = pd.read_csv('../input/fb3-bertopic/train_topics.csv')
# target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
# df_ = train_topics.set_index('topic_name')[target_cols].stack() .reset_index()
# df_.columns = ['topic', 'score', 'value']
# topic_stats = df_.groupby(['topic', 'score'])['value'].agg(['min', 'mean', 'median', 'max']).reset_index()
# topic_stats['topic_idx'] = topic_stats['topic'].str.split('_', expand=True)[0].astype(int)

In [31]:
# for target in target_cols:
#     for topic in sorted(test['topic_idx'].unique()):
#         if topic == -1:
#             continue
#         topic_max = topic_stats[(topic_stats['topic_idx'] == topic) & (topic_stats['score'] == target)]['max'].values[0]
#         topic_min = topic_stats[(topic_stats['topic_idx'] == topic) & (topic_stats['score'] == target)]['min'].values[0]
#         print(f'topic={topic}    target={target}    min,max={topic_min,topic_max}')
#         test.loc[test['topic_idx'] == topic, target] = test[target].apply(lambda x: np.clip(x, topic_min, topic_max))
#     print('='*50)

# submit

In [32]:
submission = pd.read_csv(SUBMISSION_PATH)
submission = submission.drop(columns=target_cols).merge(test[['text_id'] + target_cols], on='text_id', how='left')
display(submission.head())
submission[['text_id'] + target_cols].to_csv('submission.csv', index=False)

Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,2.93486,2.804275,3.133245,2.997666,2.714023,2.67021
1,000BAD50D026,2.648663,2.441112,2.688795,2.328176,2.086922,2.611527
2,00367BB2546B,3.541466,3.380309,3.533994,3.544498,3.395614,3.315024
