In [1]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math 
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")
from IPython. display import clear_output

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

#os.system('pip install iterative-stratification==0.1.7')
#from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
sys.path.append('../input/iterativestratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

os.system('pip uninstall -y transformers')
os.system('pip uninstall -y tokenizers')
os.system('python -m pip install --no-index --find-links=../input/fb3-my-pip-wheels transformers')
os.system('python -m pip install --no-index --find-links=../input/fb3-my-pip-wheels tokenizers')
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers import DataCollatorWithPadding
%env TOKENIZERS_PARALLELISM=false

clear_output()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
print('device:', device)

tokenizers.__version__: 0.12.1
transformers.__version__: 4.20.1
device: cuda


In [2]:
BASE_PATH = '/kaggle/input/feedback-prize-english-language-learning'
SUBMISSION_PATH = os.path.join(BASE_PATH, 'sample_submission.csv')
TRAIN_PATH = os.path.join(BASE_PATH, 'train.csv')
TEST_PATH = os.path.join(BASE_PATH, 'test.csv')

In [3]:
class FB3Dataset(Dataset):
    def __init__(self, cfg, data):
        self.cfg = cfg
        self.xs = preprocess(data['full_text'])
        self.ys = data[cfg.target_cols].values 
        
    def __len__(self):
        return len(self.xs)
    
    def __getitem__(self, idx):
        x = encode_text(self.cfg, self.xs[idx])
        y = torch.tensor(self.ys[idx], dtype=torch.float)
        return x, y
    
def collate(inputs):
    # Trimming input.
    mask_len = int(inputs['attention_mask'].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    #print(type(inputs), inputs['attention_mask'].size())
    return inputs

##################################################################################

class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
    
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class WeightedLayerPooling(nn.Module):
    def __init__(self, num_hidden_layers, layer_start: int = 9, layer_weights = None):
        super(WeightedLayerPooling, self).__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        self.layer_weights = layer_weights if layer_weights is not None \
            else nn.Parameter(
                torch.tensor([1] * (num_hidden_layers+1 - layer_start), dtype=torch.float))
        
    def forward(self, all_hidden_states):
        all_layer_embedding = all_hidden_states[self.layer_start:, :, :, :]
        weight_factor = self.layer_weights.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).expand(all_layer_embedding.size())
        weighted_average = (weight_factor*all_layer_embedding).sum(dim=0) / self.layer_weights.sum()
        return weighted_average

class AttentionHead(nn.Module):
    def __init__(self, in_features, hidden_dim):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)
        self.out_features = hidden_dim

    def forward(self, features, *args):
        att = torch.tanh(self.W(features))
        score = self.V(att)
        attention_weights = torch.softmax(score, dim=1)
        context_vector = attention_weights * features
        context_vector = torch.sum(context_vector, dim=1)

        return context_vector

class AttentionPooling(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.attention = nn.Sequential(
        nn.Linear(in_dim, in_dim),
        nn.LayerNorm(in_dim),
        nn.GELU(),
        nn.Linear(in_dim, 1),
        )

    def forward(self, last_hidden_state, attention_mask):
        w = self.attention(last_hidden_state).float()
        w[attention_mask==0]=float('-inf')
        w = torch.softmax(w,1)
        attention_embeddings = torch.sum(w * last_hidden_state, dim=1)
        return attention_embeddings
    
def GlobalAveragePool1d(x):
    return F.avg_pool1d(x, x.size()[-1]).squeeze(-1)

def GlobalMaxPool1d(x):
    return F.max_pool1d(x, x.size()[-1]).squeeze(-1)

def Conv1dReg(x, in_channels, out_channels, kernel_size, device):
    out = nn.Conv1d(in_channels, out_channels, kernel_size, padding='same', stride=1, device=device)(x)
    out = nn.BatchNorm1d(out_channels, device=device)(out)
    out = F.relu(out)
    return out

class MultiSampleDropout(nn.Module):
    def __init__(self, fc, num_dropout, prob_dropout):
        super(MultiSampleDropout, self).__init__()
        self.dropout = nn.Dropout
        self.num_dropout = num_dropout
        self.prob_dropout = prob_dropout
        self.classifier = fc
    def forward(self, out):
        if not type(self.prob_dropout) in [float, int]:            
            fcs = [self.classifier(self.dropout(p)(out)) for p in self.prob_dropout]
        else:
            fcs = [self.classifier(self.dropout(self.prob_dropout)(out)) for _ in range(self.num_dropout)]
        
        return torch.mean(torch.stack(fcs, dim=0), dim=0)

In [4]:
# ====================================================
# Model class
# ====================================================
class FB3Model(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg 
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            # Turn off dropouts.
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            #LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        
        if pretrained:
            self.deberta_v3 = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.deberta_v3 = AutoModel.from_config(self.config)

        if self.cfg.reinit_last_layer:
            # Re-init last layer of deberta.
            for module in self.deberta_v3.encoder.layer[-1:].modules():
                self._init_weights(module)
        self.deberta_v3.gradient_checkpointing_enable()

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            #nn.init.xavier_uniform_(module.weight.data, gain=1.0)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

class WMPoolModel(FB3Model):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__(cfg, config_path=config_path, pretrained=pretrained)

        # Poolings.
        self.mean_head = MeanPooling()
        self.wpool_head = WeightedLayerPooling(self.config.num_hidden_layers, layer_start=12)

        self.fc_out = nn.Linear(self.config.hidden_size, cfg.num_target)
        self._init_weights(self.fc_out)
        
        self.layer_norm = nn.LayerNorm(self.config.hidden_size)
        self.qa_output = torch.nn.Linear(self.config.hidden_size, 2)
        self.attention_head = AttentionHead(self.config.hidden_size*4, self.config.hidden_size)
        
    def forward(self, x):
        pt_out = self.deberta_v3(**x)
        all_hidden_states = torch.stack(pt_out.hidden_states)
        # Weighted pooling of last n layers.
        logits = self.wpool_head(all_hidden_states)[:, 0] # Bx768
        y_hat = self.fc_out(logits)
        return y_hat

class MultiPoolModel(FB3Model):
    def __init__(self, cfg, config_path=None, pretrained=False, pool='mean'):
        super().__init__(cfg, config_path=config_path, pretrained=pretrained)
        
        # Define model layers.
        self.pool_name = cfg.pool_head
        self.fc = nn.Linear(self.config.hidden_size, self.cfg.num_target)
        if cfg.pool_head in ['mean', 'attention', 'weighted']:
            self.pool = self._pool_layer(cfg.pool_head)
        elif '-' in cfg.pool_head:
            pools = cfg.pool_head.split('-')
            self.pool = nn.ModuleList([])
            for pool_ in pools:
                self.pool.append(self._pool_layer(pool_))
            self.fc = nn.Linear(self.config.hidden_size * len(self.pool), self.cfg.num_target)
        self._init_weights(self.fc)
        
        # Multi-sample dropout.
        self.multi_dropout = MultiSampleDropout(self.fc, self.cfg.num_dropout, self.cfg.prob_dropout)
    
    def _pool_layer(self, pool_name):
        assert pool_name in ['mean', 'attention', 'weighted']
        if pool_name == 'mean':
            pool = MeanPooling()
        elif pool_name == 'attention':
            pool = AttentionHead(self.config.hidden_size, self.config.hidden_size)
        elif pool_name == 'weighted':
            pool = WeightedLayerPooling(
                self.config.num_hidden_layers, 
                layer_start=9,
                layer_weights=None)
        return pool
    
    def _pool_feature(self, pool, pool_name, pt_outputs, attention_mask):
        assert pool_name in ['mean', 'attention', 'weighted']
        last_hidden_state = pt_outputs.last_hidden_state #batch_size x max_len x hidden_size
        all_hidden_states = torch.stack(pt_outputs.hidden_states) #num_layer x batch_size x max_len x hidden_size
        
        if pool_name == 'mean':
            pool_feature = pool(last_hidden_state, attention_mask)
        elif pool_name == 'attention':
            pool_feature = pool(last_hidden_state)
        elif pool_name == 'weighted':
            # Take the CLS token only.
            pool_feature = pool(all_hidden_states)[:, 0]
        return pool_feature

    def feature(self, x):
        pt_outputs = self.deberta_v3(**x)
        
        # Pooling feat.
        if type(self.pool) == nn.ModuleList:
            pool_features = []
            pool_names = self.pool_name.split('-')
            
            for pool_name, pool in zip(pool_names, self.pool):
                pool_features.append(self._pool_feature(pool, pool_name, pt_outputs, x['attention_mask']))
            pool_features = torch.cat(pool_features, dim=1)
        else:
            pool_features = self._pool_feature(self.pool, self.pool_name, pt_outputs, x['attention_mask'])
        return pool_features
    
    def forward(self, x):
        feature = self.feature(x)
        if self.cfg.use_dropout and self.training:
            y_hat = self.multi_dropout(feature)
        else:
            y_hat = self.fc(feature)
        return y_hat


class Attention4Model(FB3Model):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__(cfg, config_path=config_path, pretrained=pretrained)
        
        self.head = AttentionHead(self.config.hidden_size*4, self.config.hidden_size)
        self.fc_out = nn.Linear(self.config.hidden_size*4*2, self.cfg.num_target)
        self._init_weights(self.fc_out)

    def forward(self, x):
        pt_out = self.deberta_v3(**x)
        
        all_hidden_states = torch.stack(pt_out.hidden_states)
        cat_over_last_layers = torch.cat(
            (all_hidden_states[-1], all_hidden_states[-2], all_hidden_states[-3], all_hidden_states[-4]),
            -1)
        # [CLS] embedding.
        cls_pooling = cat_over_last_layers[:, 0]   
        # Concat of 4 last layers.
        head_logits = self.head(cat_over_last_layers)

        if self.cfg.use_dropout and self.training:
            y_hat = self.multi_dropout(torch.cat([head_logits, cls_pooling], -1))
        else:
            y_hat = self.fc_out(torch.cat([head_logits, cls_pooling], -1))

        return y_hat

######################################

class FB3Model1(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg 
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            # Turn off dropouts.
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            #LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)

        if self.cfg.reinit_last_layer:
            # Re-init last layer of deberta.
            for module in self.model.encoder.layer[-1:].modules():
                self._init_weights(module)
        self.model.gradient_checkpointing_enable()

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            
class WeightedAttentionModel(FB3Model1):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__(cfg, config_path=config_path, pretrained=pretrained)

        self.weighted_pool = WeightedLayerPooling(
            self.config.num_hidden_layers, layer_start=9, layer_weights=None)
        self.att_pool = AttentionPooling(self.config.hidden_size)

        self.fc_out = nn.Linear(self.config.hidden_size*2, cfg.num_target)
        self._init_weights(self.fc_out)

    def forward(self, x):
        pt_out = self.model(**x)
        hidden_states = pt_out.hidden_states
        last_hidden_state = pt_out.last_hidden_state

        x1 = self.weighted_pool(torch.stack(hidden_states))[:, 0]
        x2 = self.att_pool(last_hidden_state, x['attention_mask'])

        y_hat = self.fc_out(torch.cat([x1, x2], dim=1))
        return y_hat
    
class MeanAttentionModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg 
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            # Turn off dropouts.
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            #LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        
        if pretrained:
            self.deberta_v3 = AutoModel.from_pretrained(cfg.model, config=self.config)
            # Expand embedding dim for new tokens.
            self.deberta_v3.resize_token_embeddings(len(cfg.tokenizer))
        else:
            self.deberta_v3 = AutoModel.from_config(self.config)
            
        self.deberta_v3.gradient_checkpointing_enable()
        
        # Define model layers.
        self.fc = nn.Linear(self.config.hidden_size, 6)

        if cfg.pool == 'mean':
            self.pool = MeanPooling()
        elif cfg.pool == 'attention':
            self.pool = AttentionHead(self.config.hidden_size, self.config.hidden_size)
        elif cfg.pool == 'mean-attention':
            self.pool = nn.ModuleList([
                MeanPooling(),
                AttentionHead(self.config.hidden_size, self.config.hidden_size)
            ])
            self.fc = nn.Linear(self.config.hidden_size * len(self.pool), 6)
        elif cfg.pool == 'mean-attention-with-mask':
            self.pool = nn.ModuleList([
                MeanPooling(),
                AttentionPooling(self.config.hidden_size)
            ])
            self.fc = nn.Linear(self.config.hidden_size * len(self.pool), 6)
        # Re-init weights.
        self._init_weights(self.fc)
        
        # Multi-sample dropout.
        self.multi_dropout = MultiSampleDropout(self.fc, cfg.num_dropout, cfg.prob_dropout)
        
    def global_avg_pool(x):
        return torch.mean(x.view(x.size(0), x.size(1), -1), dim=-1)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
    
    def feature(self, x):
        pt_outputs = self.deberta_v3(**x)
        last_hidden_states = pt_outputs[0] # N x max_len x 768
        # Pooling feat.
        if type(self.pool) == nn.ModuleList:
            pool_feature = [pool(last_hidden_states, x['attention_mask']) for pool in self.pool]
            pool_feature = torch.cat(pool_feature, dim=1)
        else:
            pool_feature = self.pool(last_hidden_states, x['attention_mask']) # N x 768
        return pool_feature
    
    def forward(self, x, y=None, loss_fn=None):
        feature = self.feature(x)
        # if self.training:
        #    out = self.multi_dropout(feature)
        # else:
        #    out = self.fc(feature)
        out = self.fc(feature)
        return out
    
##########
class WeightedLayerPooling_(nn.Module):
    def __init__(self, num_hidden_layers, layer_start: int = 4, layers = None, layer_weights = None):
        super(WeightedLayerPooling_, self).__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        
        if layers:
            self.layer_weights = layer_weights if layer_weights is not None \
                else nn.Parameter(
                    torch.tensor([1] * len(layers), dtype=torch.float)
                )
            self.layers = layers
        else:
            self.layer_weights = layer_weights if layer_weights is not None \
                else nn.Parameter(
                   torch.tensor([1] * (num_hidden_layers+1 - layer_start), dtype=torch.float)
                )
            self.layers = list(range(layer_start, num_hidden_layers+1))
            

    def forward(self, ft_all_layers):
        all_layer_embedding = torch.stack(ft_all_layers)
        all_layer_embedding = all_layer_embedding[self.layers, :, :, :]
        
        weight_factor = self.layer_weights.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).expand(all_layer_embedding.size())
        weighted_average = (weight_factor*all_layer_embedding).sum(dim=0) / self.layer_weights.sum()

        return weighted_average
    
class CustomModel(nn.Module):

    def __init__(self, CFG, config_path = None, pretrained = False):
        super().__init__()
        self.CFG = CFG
        self.config = AutoConfig.from_pretrained(config_path, output_hidden_states=True)
        self.model = AutoModel.from_config(self.config)
        self.pretrained = pretrained
                        
        fc_hidden_size = self.config.hidden_size
        if CFG.pooling == 'mean':
            self.pool = MeanPooling()
        elif CFG.pooling == 'max':
            self.pool = MaxPooling()
        elif CFG.pooling == 'min':
            self.pool = MinPooling()
        elif CFG.pooling == 'attention':
            self.pool = AttentionPooling(self.config.hidden_size)
        elif CFG.pooling == 'weightedlayer':
            self.pool = WeightedLayerPooling_(self.config.num_hidden_layers, layer_start = CFG.layer_start, layer_weights = None)
        elif CFG.pooling == 'weightedlayer-mean':
            self.pool = WeightedLayerPooling_(self.config.num_hidden_layers, layer_start = CFG.layer_start, layer_weights = None)
            self.mean_pool = MeanPooling()
        elif self.CFG.pooling == 'attention4':
            self.pool = AttentionHead(fc_hidden_size*4, 512)
            fc_hidden_size = fc_hidden_size*8
        self.fc = nn.Linear(fc_hidden_size, 6)
                        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        if self.CFG.pooling == 'attention4':
            all_layer_embeddings = torch.stack(outputs.hidden_states)
            cat_over_last_layers = torch.cat((all_layer_embeddings[-1], all_layer_embeddings[-2], all_layer_embeddings[-3], all_layer_embeddings[-4]), -1)
            cls_pooling = cat_over_last_layers[:, 0]
            head_logits = self.pool(cat_over_last_layers)
            feature = torch.cat([head_logits, cls_pooling], -1)
        elif self.CFG.pooling == 'weightedlayer':
            all_layer_embeddings = outputs[1]
            feature = self.pool(all_layer_embeddings)[:, 0]
        elif self.CFG.pooling == 'weightedlayer-mean':
            all_layer_embeddings = outputs[1]
            feature = self.pool(all_layer_embeddings)
            feature = self.mean_pool(feature, inputs['attention_mask'])
        else:
            last_hidden_states = outputs[0]
            feature = self.pool(last_hidden_states, inputs['attention_mask'])    
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output

In [5]:
class Config(dict):
    __getattr__ = dict.__getitem__
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__
    
    def init(self, kwargs):
        super().init(kwargs)

        for k, v in kwargs.items():
            setattr(self, k, v)

    def set(self, key, val):
        self[key] = val
        setattr(self, key, val)

In [6]:
loggers = {}
def get_logger(filename='inference'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.propagate = False
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

logger = get_logger()

In [7]:
def seed_everything(seed=42):
    '''
    Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.
    '''
    random.seed(seed)
    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        # When running on the CuDNN backend, two further options must be set
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
seed_everything(seed=42)

def mc_rmse(y_true, y_pred):
    scores = []
    ncols = y_true.shape[1]
    
    for n in range(ncols):
        yn_true = y_true[:, n]
        yn_pred = y_pred[:, n]
        rmse_ = mean_squared_error(yn_true, yn_pred, squared=False)
        scores.append(rmse_)
    score = np.mean(scores) 
    return score, scores

target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
pred_cols = [f'pred_{col}' for col in target_cols]
def get_result(cfg, oof_df):
    labels = oof_df[target_cols].values
    preds = oof_df[[f"pred_{c}" for c in target_cols]].values
    score, scores = mc_rmse(labels, preds)
    print(f'score: {score:<.6f}  scores: {scores}')

In [8]:
# ====================================================
# Dataset
# ====================================================
def encode_text(cfg, text):
    if cfg.pretrained:
        inputs = cfg.tokenizer(
            text,
            None,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=cfg.max_len,
            return_token_type_ids=True,
            return_tensors='pt'
        )
        inputs = {k:v.squeeze(0) for k,v in inputs.items()}
    else:
        if 'roberta' in cfg.model:
            inputs = cfg.tokenizer.encode_plus(
                text,
                return_tensors = None,
                add_special_tokens = True,
                max_length = cfg.max_len,
                pad_to_max_length = True,
                truncation = True)
        elif '512' in cfg.name or '768' in cfg.name:
            inputs = cfg.tokenizer.encode_plus(
                text,
                return_tensors = None,
                add_special_tokens = True,
                max_length = cfg.max_len,
                truncation = True)
        else:
            inputs = cfg.tokenizer.encode_plus(
                text, 
                return_tensors=None, 
                add_special_tokens=True)
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs 

def preprocess(texts):
    texts = (
        texts
        .str.replace(r'\r\n', '<newline>', regex=True)
        .str.replace(r'\n', '<newline>', regex=True)
        .str.replace('<newline><newline>', '<newline>', regex=False)
        .values 
    )
    return texts

class TestDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        if cfg.pretrained:
            self.texts = df['full_text'].values
        else:
            self.texts = preprocess(df['full_text'])

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = encode_text(self.cfg, self.texts[item])
        return inputs

In [9]:
def load_config(input_path, inference_weight=1):
    # Load CFG class.
    cfg = Config(**json.load(open(os.path.join(input_path, 'CFG.json'), 'r')))
    cfg.path = input_path
    cfg.config_path = os.path.join(cfg.path, 'config.pth')
    # Load tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(os.path.join(cfg.path, 'tokenizer'))
    cfg.tokenizer = tokenizer
    
    cfg.inference_weight = inference_weight
    return cfg

def load_model(cfg, fold, version='1', **model_kwargs):
    # Load torch model.
    if version == '1':
        model = MultiPoolModel(cfg, config_path=cfg.config_path, pretrained=False, **model_kwargs)
    elif version == '2':
        model = Attention4Model(cfg, config_path=cfg.config_path, pretrained=False, **model_kwargs)
    elif version == '21':
        model = WMPoolModel(cfg, config_path=cfg.config_path, pretrained=False, **model_kwargs)
    elif version == 'weighted-attention':
        model = WeightedAttentionModel(cfg, config_path=cfg.config_path, pretrained=False, **model_kwargs)
    elif version == 'custom':
        model = CustomModel(cfg, config_path=cfg.config_path, pretrained=False, **model_kwargs)
    elif version == 'mean-attention':
        model = MeanAttentionModel(cfg, config_path=cfg.config_path, pretrained=False)
    state = torch.load(
        os.path.join(cfg.path, f"{cfg.model.replace('/', '-')}_fold{fold}_best.pth"),
        map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    return model

In [10]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output.last_hidden_state.detach().cpu()
    input_mask_expanded = (
        attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    )
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(
        input_mask_expanded.sum(1), min=1e-9
    )

def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    #tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in test_loader:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

class Inferencer:
    def __init__(self, input_path=None, cfg=None, inference_weight=1):
        if cfg == None:
            self.cfg = load_config(input_path, inference_weight)
        else:
            self.cfg = cfg
    
    def predict(self, test_loader, device, stat_fn=np.mean):
        preds = []
        
        for fold in self.cfg.trn_fold:
            start = time.time()
            print('#'*10, cfg.path, '#'*10)
            
            print(f'Predicting fold {fold}...')
            model = load_model(self.cfg, fold, version=self.cfg.version)
            pred = inference_fn(test_loader, model, device)
            preds.append(pred)
            del model, pred; gc.collect()
            torch.cuda.empty_cache()
            
            end = time.time() - start
            print('#'*10, f'ETA: {end:.2f}s', '#'*10, '\n')
        
        
        self.preds = stat_fn(preds, axis=0) 
        self.preds = np.clip(self.preds, 1, 5)
        return self.preds
    
    def get_oof_result(self, file_type='pkl'):
        return get_result(self.cfg, self.get_oof_df(file_type))
    
    def get_oof_df(self, file_type='pkl'):
        if file_type == 'pkl':
            return pd.read_pickle(os.path.join(cfg.path, 'oof_df.pkl'))
        return pd.read_csv(os.path.join(cfg.path, 'oof_df.csv'))
    
    def get_text_embedding(self, data_loader, device, fold=None): 
        # pretrained=True: not fine-tuned models.
        if not self.cfg.pretrained:
            model = load_model(self.cfg, fold, pool=self.cfg.pool_head)            
        else:
            model = AutoModel.from_pretrained(self.cfg.model)
        model.to(device)
        model.eval()
            
        fold_emb = []
        for inputs in data_loader:
            for k, v in inputs.items():
                inputs[k] = v.to(device)
            if not self.cfg.pretrained:
                with torch.no_grad():
                    emb = model.feature(**inputs)
            else:
                input_ids = inputs['input_ids'].to(device)
                attention_mask = inputs['attention_mask'].to(device)
                token_type_ids = inputs['token_type_ids'].to(device)
                
                with torch.no_grad():
                    try:
                        output = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
                    except:
                        output = model(input_ids=input_ids, attention_mask=attention_mask)
                emb = mean_pooling(output, attention_mask.detach().cpu())
                emb = F.normalize(emb, p=2, dim=1)
                emb = emb.squeeze(0)
            fold_emb.extend(emb.detach().cpu().numpy())
            del emb; gc.collect(); torch.cuda.empty_cache();
            #print(torch.cuda.memory_allocated() /1024/1024)
            
        fold_emb = np.array(fold_emb)
        return fold_emb


In [11]:
def get_text_embedding(cfg, dfs):
    cfg.tokenizer = AutoTokenizer.from_pretrained(cfg.model)
    infer_ = Inferencer(cfg=cfg, inference_weight=cfg.inference_weight)
    if 'gpt2' in cfg.model:
        cfg.tokenizer.pad_token = cfg.tokenizer.eos_token
    text_embs = []
    for df in dfs:
        dataset = TestDataset(cfg, df)
        loader = DataLoader(
            dataset,
            batch_size=4,
            shuffle=False)

        # Text embedding for SVM
        test_text_emb = []
        if not cfg.pretrained:
            for fold in infer_.cfg.trn_fold:
                test_text_emb.append(infer_.get_text_embedding(loader, device, fold))
            text_emb = np.mean(text_emb, axis=0)
        else:
            text_emb = infer_.get_text_embedding(loader, device)
        text_embs.append(text_emb)
        del dataset, loader; gc.collect(); torch.cuda.empty_cache();
    del infer_; gc.collect(); torch.cuda.empty_cache();
    return text_embs

##################################################
deberta_base = Config(
    model='../input/huggingface-deberta-variants/deberta-base/deberta-base',
    file_name='microsoft_deberta_base_768',
    pretrained=True, inference_weight=1, max_len=640) #
deberta_large = Config(
    model='../input/huggingface-deberta-variants/deberta-large/deberta-large', 
    file_name='microsoft_deberta_large_1024',
    pretrained=True, inference_weight=1, max_len=640) #
deberta_xlarge = Config(
    model='../input/huggingface-deberta-variants/deberta-xlarge/deberta-xlarge', 
    file_name='microsoft_deberta_xlarge_1024',
    pretrained=True, inference_weight=1, max_len=640)
deberta_v2_xlarge = Config(
    model='../input/bert-shopping-mall/deberta-v2-xlarge', 
    file_name='microsoft_deberta_v2_xlarge_1536',
    pretrained=True, inference_weight=1, max_len=640)
deberta_v2_xxlarge = Config(
    model='../input/bert-shopping-mall/deberta-v2-xxlarge', 
    file_name='microsoft_deberta_v2_xxlarge_1536',
    pretrained=True, inference_weight=1, max_len=640)

deberta_v3_base = Config(
    model='../input/bert-shopping-mall/deberta-v3-base',
    file_name='microsoft_deberta_v3_base_768',
    pretrained=True, inference_weight=1, max_len=640) #
deberta_v3_large = Config(
    model='../input/bert-shopping-mall/deberta-v3-large', 
    file_name='microsoft_deberta_v3_large_1024',
    pretrained=True, inference_weight=1, max_len=640) # 

deberta_large_mnli = Config(
    model='../input/huggingface-deberta-variants/deberta-large-mnli/deberta-large-mnli',
    file_name='microsoft_deberta_large_mnli_1024',
    pretrained=True, inference_weight=1, max_len=640) # 

gpt2 = Config(
    model='../input/hugging-face-gpt2/gpt2',
    file_name='gpt2_768',
    pretrained=True, inference_weight=1, max_len=512) #

roberta_base = Config(
    model='../input/transformers/roberta-base', 
    file_name='roberta_base_768',
    pretrained=True, inference_weight=1, max_len=512) #
roberta_large = Config(
    model='../input/transformers/roberta-large',
    file_name='roberta_large_1024',
    pretrained=True, inference_weight=1, max_len=512) # 

xlnet_base = Config(
    model='../input/transformers/xlnet-base-cased',
    file_name='xlnet_base_cased_768',
    pretrained=True, inference_weight=1, max_len=640) #
xlnet_large = Config(
    model='../input/transformers/xlnet-large-cased', 
    file_name='xlnet_large_cased_1024',
    pretrained=True, inference_weight=1, max_len=640) #

bart_base = Config(
    model='../input/transformers/facebook-bart-base',
    file_name='facebook_bart_base_768',
    pretrained=True, inference_weight=1, max_len=640)
bart_large = Config(
    model='../input/transformers/facebook-bart-large',
    file_name='facebook_bart_large_1024',
    pretrained=True, inference_weight=1, max_len=640)
bart_lage_mnli = Config(
    model='../input/facebook-bart-large-mnli',
    file_name='facebook_bart_large_mnli_1024',
    pretrained=True, inference_weight=1, max_len=640)

bert_base_uncased = Config(
    model='../input/transformers/bert-base-uncased/',
    file_name='bert_base_uncased_768',
    pretrained=True, inference_weight=1, max_len=512)
bert_large_uncased = Config(
    model='../input/transformers/bert-large-uncased',
    file_name='bert_large_uncased_1024',
    pretrained=True, inference_weight=1, max_len=512)

muppet_roberta_large = Config(
    model='../input/muppet-roberta-large',
    file_name='facebook_muppet_roberta_large_1024',
    pretrained=True, inference_weight=1, max_len=512)
# muppet_roberta_base = Config(model='facebook/muppet-roberta-base', pretrained=True, inference_weight=1, max_len=512)

funnel_small = Config(
    model='../input/transformers/funnel-transformer-small',
    file_name='funnel_transformer_small_768',
    pretrained=True, inference_weight=1, max_len=640)
funnel_large = Config(
    model='../input/transformers/funnel-transformer-large',
    file_name='funnel_transformer_large_1024',
    pretrained=True, inference_weight=1, max_len=640)


In [43]:
def build_preds(fine_tuned_models_cfg):
    pretrained_oof_results = {
        0: [],
        1: [],
        2: [],
        3: []}

    total_weight = 0

    for cfg in tqdm(fine_tuned_models_cfg):
        # infer_ = Inferencer(setup['path'], setup['inference_weight'])
        infer_ = Inferencer(cfg=cfg, inference_weight=cfg.inference_weight)
        oof = test.copy()
        for fold in range(CFG.n_folds):
            if 'oof_results' in cfg and fold in cfg['oof_results']:
                predictions = cfg['oof_results'][fold]
            else:
                try:
                    predictions = torch.load(
                        os.path.join(cfg.path, f"{cfg.model.replace('/', '-')}_fold{fold}_best.pth"), 
                        map_location = torch.device('cpu'))['predictions']
                except KeyError:
                    predictions = torch.load(
                        os.path.join(cfg.path, f"{cfg.model.replace('/', '-')}_fold{fold}_best.pth"), 
                        map_location = torch.device('cpu'))['valid_preds']
                if 'oof_results' not in cfg:
                    cfg['oof_results'] = {}
                cfg['oof_results'][fold] = predictions
            oof.loc[oof['fold'] == fold, [f'pred_{target_col}' for target_col in target_cols]] = predictions
            predictions = predictions * cfg.inference_weight
            pretrained_oof_results[fold].append(predictions)

            total_weight += cfg.inference_weight
            cfg['oof'] = oof.copy()
    oof_preds = [cfg['oof'][[f'pred_{target_col}' for target_col in target_cols]].values for cfg in fine_tuned_models_cfg]
    return oof_preds

In [44]:
from scipy import optimize
def find_optimal_weights(oof_preds, labels, bounds=(0.0, 1.0), method='SLSQP'):
    weights = [1] * len(oof_preds)
    def loss(weights):
        return mc_rmse(labels, np.clip(np.average(oof_preds, weights=weights, axis=0), 1, 5))[0]

    opt_weights = optimize.minimize(
        loss,
        [1/len(oof_preds)] * len(oof_preds),
        constraints=({'type': 'eq','fun': lambda w: 1-sum(w)}),
        method= method, #'Nelder-Mead',
        bounds=[bounds] * len(oof_preds),
        options = {'ftol':1e-10},
    )['x']

    opt_weights = np.array(opt_weights) / sum(opt_weights)
    #print('\n')
    #print("score:", loss(opt_weights))
    #print(', '.join([str(i) for i in opt_weights]))
    return opt_weights, loss(opt_weights)

In [13]:
CFG = Config(
    n_folds=4,
    random_state=42
)

# Dataset

In [14]:
test = pd.read_csv(TRAIN_PATH)
mskfold = MultilabelStratifiedKFold(n_splits = CFG.n_folds, shuffle = True, random_state = CFG.random_state)
for n, (train_index, val_index) in enumerate(mskfold.split(test, test[target_cols])):
    test.loc[val_index, 'fold'] = int(n)

# Fine-tuned models

In [15]:
v114_CFG = Config(
    model="microsoft/deberta-v3-base",
    version='1',
    num_target = 6,
    reinit_last_layer=True,
    reinit_fc=True,
    weight_decay=0.01,
    learning_rate=2e-5,
    layerwise_learning_rate_decay=1.5,
    use_dropout=False,
    prob_dropout=[0.06, 0.08, 0.1, 0.12, 0.14],
    num_dropout=5,
    pool_head='mean-attention',
    seed=42,
    n_fold=4,
    trn_fold=[0,1,2,3],
    path='../input/fb3models/v114/',
    config_path='../input/fb3models/v114/config.pth',
    tokenizer=AutoTokenizer.from_pretrained('../input/fb3models/v114/tokenizer')
)

weightedpool_CFG = Config(
    model='microsoft/deberta-v3-base',
    name='weightedpool',
    version='1',
    num_target=6,
    reinit_last_layer=True,
    reinit_fc=True,
    weight_decay=0.01,
    learning_rate=1.5e-5,
    layerwise_learning_rate_decay=1.5,
    use_dropout=False,
    prob_dropout=[0.06, 0.08, 0.1, 0.12, 0.14],
    num_dropout=5,
    pool_head='weighted',
    seed=42,
    n_fold=4,
    trn_fold=[0,1,2,3],
    train=True,
    path='../input/fb3-train/',
    config_path='../input/fb3-train/config.pth',
    tokenizer=AutoTokenizer.from_pretrained('../input/fb3-train/tokenizer'),
    inference_weight=1.0)

v116_CFG = load_config('../input/fb3-colab-models/v116', inference_weight=1.0)
v116_CFG.path = '../input/fb3models/v116'
v116_CFG.config_path = '../input/fb3models/v116/config.pth'
v116_CFG.version = '1'
v116_CFG.name = 'v116'

v112_CFG = Config(
    num_workers=1,
    batch_size=3,
    max_len=512,
    model="microsoft/deberta-v3-base",
    name='v112',
    version='1',
    num_target = 6,
    reinit_last_layer=True,
    reinit_fc=True,
    weight_decay=0.01,
    learning_rate=2e-5,
    layerwise_learning_rate_decay=1.5,
    use_dropout=False,
    prob_dropout=[0.06, 0.08, 0.1, 0.12, 0.14],
    num_dropout=5,
    pool_head='attention',
    seed=42,
    n_fold=4,
    trn_fold=[0,1,2,3],
    train=True,
    path='../input/fb3models/v112/',
    config_path='../input/fb3models/v112/config.pth',
    inference_weight=1.0,
    tokenizer=AutoTokenizer.from_pretrained('../input/fb3models/v112/tokenizer')
)

#####
v2_CFG = load_config('../input/fb3models/v2/', inference_weight=1.0)
v2_CFG.name = 'v2'
v2_CFG.version = '2'
v2_CFG.trn_fold = [0,1,2,3]

#####
v21_CFG = load_config('../input/fb3models/v21/', inference_weight=1)
v21_CFG.name = 'v21'
v21_CFG.version = '21'

#####
attention_fgm_CFG = load_config('../input/fb3models/20221114-192943-deberta-v3-base/', inference_weight=1.0)
attention_fgm_CFG.name = 'attention_fgm'
attention_fgm_CFG.version = 'custom'
attention_fgm_CFG.config_path = '../input/fb3models/20221114-192943-deberta-v3-base/config/config.json'

weighted_fgm_CFG = Config(
    pretrained=False,
    path='../input/fb3models/20221115-061243-deberta-v3-base',
    config_path='../input/fb3models/20221115-061243-deberta-v3-base/config/config.json',
    tokenizer=AutoTokenizer.from_pretrained('../input/fb3models/20221115-061243-deberta-v3-base/tokenizer'),
    name='weighted_fgm',
    version='custom',
    train = True,
    debug = False,
    offline = False,
    models_path = 'FB3-models',
    epochs = 5,
    save_all_models = False,
    competition = 'FB3',
    apex = True,
    print_freq = 20,
    num_workers = 4,
    model = 'microsoft/deberta-v3-base', #If you want to train on the kaggle platform, v3-base is realistic. v3-large will time out.
    loss_func = 'SmoothL1', # 'SmoothL1', 'RMSE'
    gradient_checkpointing = True,
    scheduler = 'cosine',
    batch_scheduler = True,
    num_cycles = 0.5,
    num_warmup_steps = 0,
    encoder_lr = 2e-5,
    decoder_lr = 2e-5,
    min_lr = 1e-6,
    #Layer-Wise Learning Rate Decay
    llrd = True,
    layerwise_lr = 5e-5,
    layerwise_lr_decay = 0.9,
    layerwise_weight_decay = 0.01,
    layerwise_adam_epsilon = 1e-6,
    layerwise_use_bertadam = False,
    #pooling
    pooling = 'weightedlayer', # mean, max, min, attention, weightedlayer
    layer_start = 11,
    layers=None,
    #init_weight
    init_weight = 'normal', # normal, xavier_uniform, xavier_normal, kaiming_uniform, kaiming_normal, orthogonal
    #re-init
    reinit = True,
    reinit_n = 1,
    #adversarial
    fgm = True,
    awp = False,
    adv_lr = 1,
    adv_eps = 0.2,
    unscale = False,
    eps = 1e-6,
    betas = (0.9, 0.999),
    max_len = 512,
    weight_decay = 0.01,
    gradient_accumulation_steps = 1,
    max_grad_norm = 1000,
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'],
    seed = 42,
    cv_seed = 42,
    n_fold = 4,
    trn_fold = [0,1,2,3],
    batch_size = 8,
    n_targets = 6,
    gpu_id = 0) 

weighted_attention_CFG = load_config('../input/fb3models/weighted_attention_v3', inference_weight=1.0)
weighted_attention_CFG.name = 'weighted_attention'
weighted_attention_CFG.version = 'weighted-attention'

mean_attention_no_fgm_CFG = load_config('../input/fb3models/20221117-183420-deberta-v3-base-mean-attention-with-mask', inference_weight=1.0)
mean_attention_no_fgm_CFG.name = 'mean_attention_no_fgm'
mean_attention_no_fgm_CFG.version = 'mean-attention'

attention_large_fgm_CFG = load_config('../input/fb3models/20221118-164148-deberta-v3-large-attention_fgm')
attention_large_fgm_CFG.name = 'attention_large_fgm'
attention_large_fgm_CFG.version = 'custom'
attention_large_fgm_CFG.config_path = '../input/fb3models/20221118-164148-deberta-v3-large-attention_fgm/config/config.json'

attention_fgm_512_CFG = load_config('../input/fb3models/20221121-143655-deberta-v3-base-attention_fgm_512')
attention_fgm_512_CFG.name = 'attention_fgm_512'
attention_fgm_512_CFG.version = 'custom'
attention_fgm_512_CFG.config_path = '../input/fb3models/20221121-143655-deberta-v3-base-attention_fgm_512/config/config.json'

attention_fgm_768_CFG = load_config('../input/fb3models/20221120-072218-deberta-v3-base-attention_fgm')
attention_fgm_768_CFG.name = 'attention_fgm_768'
attention_fgm_768_CFG.version = 'custom'
attention_fgm_768_CFG.config_path = '../input/fb3models/20221120-072218-deberta-v3-base-attention_fgm/config/config.json'

weighted2last_fgm_512_CFG = load_config('../input/fb3models/20221124-060246-deberta-v3-base-weighted2last_fgm')
weighted2last_fgm_512_CFG.name = 'weighted2last_fgm_512'
weighted2last_fgm_512_CFG.version = 'custom'
weighted2last_fgm_512_CFG.config_path = '../input/fb3models/20221124-060246-deberta-v3-base-weighted2last_fgm/config/config.json'

weightedmean2last_fgm_512_CFG = load_config('../input/fb3models/20221124-160318-deberta-v3-base-weightedmean2last_fgm')
weightedmean2last_fgm_512_CFG.name = 'weightedmean2last_fgm_512'
weightedmean2last_fgm_512_CFG.version = 'custom'
weightedmean2last_fgm_512_CFG.config_path = '../input/fb3models/20221124-160318-deberta-v3-base-weightedmean2last_fgm/config/config.json'

roberta_attention_fgm_CFG = load_config('../input/fb3models/20221121-173739-roberta-base')
roberta_attention_fgm_CFG.name = 'roberta_attention_large_fgm'
roberta_attention_fgm_CFG.version = 'custom'
roberta_attention_fgm_CFG.config_path = '../input/fb3models/20221121-173739-roberta-base/config/config.json'

##########
v112_CFG.pretrained = False
v114_CFG.pretrained = False
v116_CFG.pretrained = False
v21_CFG.pretrained = False
v2_CFG.pretrained = False
attention_fgm_CFG.pretrained = False
weighted_attention_CFG.pretrained = False
weightedpool_CFG.pretrained = False
mean_attention_no_fgm_CFG.pretrained=False
attention_large_fgm_CFG.pretrained=False
attention_fgm_512_CFG.pretrained = False
attention_fgm_768_CFG.pretrained = False
weighted2last_fgm_512_CFG.pretrained = False
weightedmean2last_fgm_512_CFG.pretrained = False
roberta_attention_fgm_CFG.pretrained = False

weighted_fgm_CFG.inference_weight = 1.0
v114_CFG.inference_weight = 1.0 
v116_CFG.inference_weight = 1.0
v2_CFG.inference_weight = 1.0 
v21_CFG.inference_weight = 1.0
attention_fgm_CFG.inference_weight = 1.0
weighted_attention_CFG.inference_weight = 1.0
attention_large_fgm_CFG.inference_weight = 1.0
attention_fgm_512_CFG.inference_weight = 1.0
attention_fgm_768_CFG.inference_weight = 1.0
weighted2last_fgm_512_CFG.inference_weight = 1.0
weightedmean2last_fgm_512_CFG.inference_weight = 1.0
roberta_attention_fgm_CFG.inference_weight = 1.0

In [16]:
def get_ensemble_score(fine_tuned_models_cfg):
    pretrained_oof_results = {
        0: [],
        1: [],
        2: [],
        3: []}

    total_weight = 0
    
    for cfg in tqdm(fine_tuned_models_cfg):
        # infer_ = Inferencer(setup['path'], setup['inference_weight'])
        infer_ = Inferencer(cfg=cfg, inference_weight=cfg.inference_weight)
        oof = test.copy()
        for fold in range(CFG.n_folds):
            if 'oof_results' in cfg and fold in cfg['oof_results']:
                predictions = cfg['oof_results'][fold]
            else:
                try:
                    predictions = torch.load(
                        os.path.join(cfg.path, f"{cfg.model.replace('/', '-')}_fold{fold}_best.pth"), 
                        map_location = torch.device('cpu'))['predictions']
                except KeyError:
                    predictions = torch.load(
                        os.path.join(cfg.path, f"{cfg.model.replace('/', '-')}_fold{fold}_best.pth"), 
                        map_location = torch.device('cpu'))['valid_preds']
                if 'oof_results' not in cfg:
                    cfg['oof_results'] = {}
                cfg['oof_results'][fold] = predictions
            oof.loc[oof['fold'] == fold, [f'pred_{target_col}' for target_col in target_cols]] = predictions
            predictions = predictions * cfg.inference_weight
            pretrained_oof_results[fold].append(predictions)
            
        total_weight += cfg.inference_weight

        cfg['oof'] = oof.copy()
    
    weighted_mean_oof = test.copy()
    for fold in range(CFG.n_folds):
        weighted_mean_oof.loc[
            weighted_mean_oof['fold']==fold, 
            [f"pred_{c}" for c in target_cols]
        ] = np.array(pretrained_oof_results[fold]).sum(axis=0)/total_weight
        get_result(CFG, weighted_mean_oof[weighted_mean_oof['fold']==fold])
    print('\n')
    get_result(CFG, weighted_mean_oof)
    oof_preds = [cfg['oof'][[f'pred_{target_col}' for target_col in target_cols]].values for cfg in fine_tuned_models_cfg]
    
    return oof_preds, weighted_mean_oof

## all models

In [17]:
fine_tuned_models_cfg = [
    weighted2last_fgm_512_CFG, weightedmean2last_fgm_512_CFG, attention_fgm_512_CFG, attention_fgm_768_CFG, attention_fgm_CFG,
    
    v21_CFG, v112_CFG, mean_attention_no_fgm_CFG, weighted_attention_CFG, attention_large_fgm_CFG, roberta_attention_fgm_CFG,
    
    #v116_CFG, v2_CFG, weightedpool_CFG, weighted_fgm_CFG
]

In [18]:
oof_preds, weighted_mean_oof = get_ensemble_score(fine_tuned_models_cfg)

  0%|          | 0/11 [00:00<?, ?it/s]

score: 0.442586  scores: [0.4720707712448348, 0.4355797749846355, 0.40576698432082664, 0.44899382866400633, 0.4593160512276641, 0.43378811829965147]
score: 0.450717  scores: [0.4841311235160003, 0.44270950380457397, 0.4140075444874199, 0.4489088277434105, 0.4679909010945405, 0.44655664110850923]
score: 0.454329  scores: [0.47638724159007645, 0.4469377533509186, 0.41335473912549847, 0.4621074164389505, 0.4770209752763032, 0.4501638648142493]
score: 0.439753  scores: [0.47382198477708376, 0.43959541318438017, 0.4047618580088698, 0.4320182282758933, 0.45902760084537103, 0.4292924064407926]


score: 0.446901  scores: [0.4766231371417863, 0.4412248819958981, 0.4094934656205698, 0.4481341588050277, 0.46589700260883876, 0.44003367127242504]


In [19]:
find_optimal_weights(oof_preds, test[target_cols].values, bounds=(-1.0, 1.0))

(array([ 0.03053147,  0.11300867,  0.13574549,  0.15840433, -0.23651732,
         0.09266294,  0.16754533,  0.16681941, -0.00694779,  0.32735722,
         0.05139025]),
 0.44550548104196364)

In [20]:
find_optimal_weights(oof_preds, test[target_cols].values, bounds=(0.01, 1.0))

(array([0.0409746 , 0.11473658, 0.02306713, 0.06851962, 0.01      ,
        0.06476458, 0.15137027, 0.15345884, 0.01      , 0.30815366,
        0.05495472]),
 0.4457266903344244)

## stacking (optinal)

In [21]:
from glob import glob
from sklearn.linear_model import Ridge, Lasso, BayesianRidge
from joblib import dump, load
if str(device) == 'cpu':
    from sklearn.svm import SVR
else:
    from cuml.svm import SVR

### strong models

In [22]:
strong_models = [
    weighted2last_fgm_512_CFG, weightedmean2last_fgm_512_CFG, attention_fgm_512_CFG, attention_fgm_768_CFG, attention_fgm_CFG
]
oof_preds, weighted_mean_oof = get_ensemble_score(strong_models)

optimal_weights, optimal_score = find_optimal_weights(oof_preds, test[target_cols].values, bounds=(-1.0, 1.0))
print('\n')
print('Optimal score:', optimal_score)
print('Optimal weights:', optimal_weights)

  0%|          | 0/5 [00:00<?, ?it/s]

score: 0.446933  scores: [0.4753812173163872, 0.43871284543114714, 0.40946724912529026, 0.45404290019775057, 0.4645873941594094, 0.43940815673455463]
score: 0.453408  scores: [0.48725456086384644, 0.44524752907446424, 0.4149177826972703, 0.4517977913330938, 0.4700491976532688, 0.45117936972871886]
score: 0.456213  scores: [0.4781926486156212, 0.4493594682500296, 0.4162138916638546, 0.4625873626903548, 0.47868859488293003, 0.4522335181234556]
score: 0.442238  scores: [0.47657228960055403, 0.44098628673290685, 0.4086124353200152, 0.4336169815683645, 0.46216288146169143, 0.4314777676885187]


score: 0.449748  scores: [0.4793709089272753, 0.44359487646221163, 0.41231544939967507, 0.45063451956939254, 0.46891467924967745, 0.4436563105215204]


Optimal score: 0.44956837283536816
Optimal weights: [0.23303419 0.28660832 0.12286597 0.26682334 0.09066818]


### weak models

In [23]:
weak_models = [
    v21_CFG, v112_CFG, mean_attention_no_fgm_CFG, weighted_attention_CFG, attention_large_fgm_CFG, roberta_attention_fgm_CFG
]
oof_preds, weighted_mean_oof = get_ensemble_score(weak_models)

optimal_weights, optimal_score = find_optimal_weights(oof_preds, test[target_cols].values, bounds=(-1.0, 1.0))
print('\n')
print('Optimal score:', optimal_score)
print('Optimal weights:', optimal_weights)

  0%|          | 0/6 [00:00<?, ?it/s]

score: 0.441358  scores: [0.4723432817516506, 0.43475533526581256, 0.40438267956673907, 0.4466873529750857, 0.4579991563759526, 0.4319815080751596]
score: 0.450743  scores: [0.4847768527770989, 0.44236350308832534, 0.41502939442167147, 0.44874839009192147, 0.46852998482643593, 0.4450126024812419]
score: 0.454960  scores: [0.4778479759679727, 0.4467714726012902, 0.4129731049101703, 0.46378947095062967, 0.47811288763594223, 0.4502621289769793]
score: 0.440245  scores: [0.47500970833239853, 0.44059740556898846, 0.40356991996169567, 0.433061497825408, 0.45910059517169005, 0.43013179660209655]


score: 0.446886  scores: [0.4775150622743201, 0.4411426611784146, 0.40901868872780806, 0.4482039557681362, 0.466006022477172, 0.4394281900204986]


Optimal score: 0.44609322341329655
Optimal weights: [0.13595877 0.20074556 0.19400446 0.0293235  0.35083864 0.08912907]


## ew models

In [24]:
eww_models = [v116_CFG, v2_CFG, weightedpool_CFG, weighted_fgm_CFG]

_, eww_oof = get_ensemble_score(eww_models)

optimal_weights, optimal_score = find_optimal_weights(oof_preds, test[target_cols].values, bounds=(-1.0, 1.0))
print('\n')
print('Optimal score:', optimal_score)
print('Optimal weights:', optimal_weights)

  0%|          | 0/4 [00:00<?, ?it/s]

score: 0.446501  scores: [0.47471472794293434, 0.44000923123030955, 0.40905796462951055, 0.4526002092572088, 0.46565641016142073, 0.43697033835690124]
score: 0.453990  scores: [0.4852156761732504, 0.4456777427146603, 0.4154352683337913, 0.45350188889397264, 0.4736715588387051, 0.4504360085755037]
score: 0.457225  scores: [0.48330686353579266, 0.4489836500837208, 0.4168993660186848, 0.46182656052028287, 0.47762095142810335, 0.45471306383244536]
score: 0.444855  scores: [0.4777196906444851, 0.4397781698779775, 0.41114604184823744, 0.43849825411669596, 0.4654621202481103, 0.4365287082374503]


score: 0.450680  scores: [0.48025645669970124, 0.4436288052924128, 0.41314618791695784, 0.45168399378503193, 0.4706310818703639, 0.4447335546336968]


Optimal score: 0.44609322341329655
Optimal weights: [0.13595877 0.20074556 0.19400446 0.0293235  0.35083864 0.08912907]


In [25]:
eww_oof_preds = eww_oof[pred_cols].values

## forward selection

In [26]:
 # START ENSEMBLE USING MODEL WITH LARGEST CV
#   Repeat until CV does not increase by 0.0003+ :
#     # TRY ADDING EVERY MODEL ONE AT A TIME AND REMEMBER 
#     # HOW MUCH EACH INCREASES THE ENSEMBLE CV SCORE
#     for k in range( len(models) ):
#         for w in [0.01, 0.02, ..., 0.98, 0.99]:
#             # TRY ADDING MODEL k WITH WEIGHT w TO ENSEMBLE
#             trial = w * model[k,] + (1-w) * ensemble
#             auc_trial = roc_auc_score(true, trial)
#     # ADD ONE NEW MODEL TO ENSEMBLE THAT INCREASED CV THE MOST
#     # CHECK NEW CV SCORE. IF IT INCREASED REPEAT LOOP

In [27]:
labels = test[target_cols].values

In [28]:
def get_optimal_score(features, labels):
    return find_optimal_weights(build_preds(features), labels)[1], features

def mc_rmse_(y_true, y_pred):
    scores = []
    ncols = y_true.shape[1]
    
    for n in range(ncols):
        yn_true = y_true[:, n]
        yn_pred = y_pred[:, n]
        rmse_ = mean_squared_error(yn_true, yn_pred, squared=False)
        scores.append(rmse_)
    score = np.mean(scores) 
    return score

def median_avg(predictions,beta=0.5):
    '''
    predictions should be of a vector shape n_models
    beta: if beta is 0.5, then the middle 50% will be averaged
    '''
    sorted_predictions=np.sort(predictions)
    n_model=len(sorted_predictions)
    mid_point=n_model//2+1
    n_avg=int(n_model*beta)
    debug = True
    if debug:
        print('sorted_list',sorted_predictions.shape)
        print('after_cut',sorted_predictions[mid_point-n_avg//2-1:mid_point+n_avg//2].shape)
    to_avg=sorted_predictions[mid_point-n_avg//2-1:mid_point+n_avg//2]
    return np.mean(to_avg)

In [29]:
# all_models = [
#     v21_CFG, v112_CFG, mean_attention_no_fgm_CFG, weighted_attention_CFG,
#     weighted2last_fgm_512_CFG, weightedmean2last_fgm_512_CFG, attention_fgm_512_CFG, attention_fgm_768_CFG, roberta_attention_fgm_CFG
# ]
all_models = [
    weighted2last_fgm_512_CFG, weightedmean2last_fgm_512_CFG, attention_fgm_512_CFG, attention_fgm_768_CFG, attention_fgm_CFG,
    
    v21_CFG, v112_CFG, mean_attention_no_fgm_CFG, weighted_attention_CFG, roberta_attention_fgm_CFG,
    
    v2_CFG, weightedpool_CFG,
    #v116_CFG, v2_CFG, weightedpool_CFG, weighted_fgm_CFG
]
PATIENCE = 10
TOL = -0.003

In [45]:
weights_range = np.arange(0.01, 1.01, 0.01)
scores = [mc_rmse_(labels, w * build_preds([attention_large_fgm_CFG])[0]) for w in weights_range]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [46]:
all_preds = build_preds(all_models)
all_preds_df = pd.DataFrame()

for pred in all_preds:
    pred_df = pd.DataFrame(pred, columns=pred_cols)
    all_preds_df = all_preds_df.append(pred_df)

print('Median score:', mc_rmse_(labels, all_preds_df.reset_index().groupby('index').median().values))
print('Mean score:', mc_rmse_(labels, all_preds_df.reset_index().groupby('index').mean().values))

  0%|          | 0/12 [00:00<?, ?it/s]

Median score: 0.4479615501235969
Mean score: 0.44736027232053655


In [47]:
weights = [1]
selected_models = [attention_large_fgm_CFG]
selected_preds =  np.mean([weights[i] * build_preds([m]) for i,m in enumerate(selected_models)], axis=0)
selected_scores = [mc_rmse_(labels, selected_pred) for selected_pred in selected_preds]
ens_best_score = np.min(selected_scores)

old_score = ens_best_score
for i,model in enumerate(all_models):
    best_score = np.inf 
    best_w = None
    ct = 0
    for w in np.arange(0.01, 1, 0.01):
        trial = w * build_preds([model])[0] + (1-w) * np.average(build_preds(selected_models), axis=0, weights=weights)
        score = mc_rmse_(labels, trial)
        
        if score < best_score:
            best_score = score
            best_w = w
        else:
            ct += 1
        if ct > PATIENCE: break
    if best_score < ens_best_score:
        ens_best_score = best_score 
        ens_best_w = best_w
        ens_best_m = model

    inc = old_score-best_score
    if inc <= TOL:
        print(f'Decrease by {inc:.4f}. Stopping.')
        break

    # DISPLAY RESULTS
    print(); #print(kk,mx,mx_k,mx_w,'%.5f'%inc)
    print('Ensemble score = %.4f after adding model %s with weight %.3f. Increase of %.4f'%(best_score, model.name, best_w, inc))
    print()
    
    if ens_best_score == best_score:
        old_score = ens_best_score
        selected_models.append(ens_best_m)
        weights.append(ens_best_w)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]


Ensemble score = 0.4477 after adding model weighted2last_fgm_512 with weight 0.470. Increase of 0.0044



  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]


Ensemble score = 0.4472 after adding model weightedmean2last_fgm_512 with weight 0.290. Increase of 0.0005



  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]


Ensemble score = 0.4471 after adding model attention_fgm_512 with weight 0.160. Increase of 0.0001



  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]


Ensemble score = 0.4468 after adding model attention_fgm_768 with weight 0.170. Increase of 0.0003



  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]


Ensemble score = 0.4469 after adding model attention_fgm with weight 0.010. Increase of -0.0001



  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]


Ensemble score = 0.4466 after adding model v21 with weight 0.140. Increase of 0.0002



  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]


Ensemble score = 0.4461 after adding model v112 with weight 0.230. Increase of 0.0006



  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]


Ensemble score = 0.4459 after adding model mean_attention_no_fgm with weight 0.190. Increase of 0.0002



  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]


Ensemble score = 0.4461 after adding model weighted_attention with weight 0.010. Increase of -0.0002



  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]


Ensemble score = 0.4460 after adding model roberta_attention_large_fgm with weight 0.070. Increase of -0.0001



  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]


Ensemble score = 0.4459 after adding model v2 with weight 0.110. Increase of 0.0000



  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]


Ensemble score = 0.4458 after adding model weightedpool with weight 0.070. Increase of 0.0000



In [48]:
ens_best_score

0.44584549142087876

In [49]:
print('Total models:', len(selected_models))
print('Selected models:', [selected_model.name for selected_model in selected_models])
print('Optimal weights:', weights)

Total models: 10
Selected models: ['attention_large_fgm', 'weighted2last_fgm_512', 'weightedmean2last_fgm_512', 'attention_fgm_512', 'attention_fgm_768', 'v21', 'v112', 'mean_attention_no_fgm', 'v2', 'weightedpool']
Optimal weights: [1, 0.47000000000000003, 0.29000000000000004, 0.16, 0.17, 0.14, 0.23, 0.19, 0.11, 0.06999999999999999]


In [50]:
weights = [w/sum(weights) for w in weights]

In [51]:
weights

[0.353356890459364,
 0.1660777385159011,
 0.10247349823321557,
 0.05653710247349824,
 0.06007067137809188,
 0.049469964664310966,
 0.08127208480565372,
 0.06713780918727916,
 0.03886925795053004,
 0.02473498233215548]

In [52]:
opt_fine_tuned_models = selected_models
opt_fine_tuned_weights = weights

for i,m in enumerate(opt_fine_tuned_models):
    m.inference_weight = opt_fine_tuned_weights[i]
    
_, fine_tuned_oof = get_ensemble_score(opt_fine_tuned_models)

  0%|          | 0/10 [00:00<?, ?it/s]

score: 0.441634  scores: [0.4708088008965855, 0.43557707950892754, 0.40570654844137066, 0.4470577364807026, 0.45828980767092026, 0.4323633351609973]
score: 0.449143  scores: [0.4811032367301944, 0.43957839434782076, 0.4144354121560431, 0.44892059911831245, 0.46513693521184996, 0.445684491347229]
score: 0.453617  scores: [0.47536046744193905, 0.4470521856306097, 0.4118435024534858, 0.4624788094030018, 0.47635847088231065, 0.44860831099062715]
score: 0.438954  scores: [0.4735643022618435, 0.43917074599876654, 0.40330047239598266, 0.43096512797882347, 0.457883147075871, 0.4288374526945873]


score: 0.445893  scores: [0.475222638218856, 0.44036458337173895, 0.4088447595466631, 0.44749486252320375, 0.46447703339811036, 0.43895261923000717]


In [53]:
fine_tuned_oof_preds = fine_tuned_oof[[f'pred_{col}' for col in target_cols]].values

# SVR

In [54]:
sys.path.append('../input/iterativestratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
svr_folds = 15

train = pd.read_csv(TRAIN_PATH)
skf = MultilabelStratifiedKFold(n_splits=svr_folds, shuffle=True, random_state=42)
for i,(train_index, val_index) in enumerate(skf.split(train,train[target_cols])):
    train.loc[val_index,'fold'] = i

In [55]:
from glob import glob 

def get_text_embedding(cfg, dfs):
    cfg.tokenizer = AutoTokenizer.from_pretrained(cfg.model)
    infer_ = Inferencer(cfg=cfg, inference_weight=cfg.inference_weight)
    if cfg.model == 'gpt2':
        cfg.tokenizer.pad_token = cfg.tokenizer.eos_token
    text_embs = []
    for df in dfs:
        dataset = TestDataset(cfg, df)
        loader = DataLoader(
            dataset,
            batch_size=4,
            shuffle=False)

        # Text embedding for SVM
        test_text_emb = []
        if not cfg.pretrained:
            for fold in infer_.cfg.trn_fold:
                test_text_emb.append(infer_.get_text_embedding(loader, device, fold))
            text_emb = np.mean(text_emb, axis=0)
        else:
            text_emb = infer_.get_text_embedding(loader, device)
        text_embs.append(text_emb)
        del dataset, loader; gc.collect(); torch.cuda.empty_cache();
    del infer_; gc.collect(); torch.cuda.empty_cache();
    return text_embs


def get_learner_score(models_cfg, folds=4, manual_features=None, save=False, verbose=False):
    for i, model_cfg in enumerate(models_cfg):
        model_name = model_cfg.model.split('/')[-1].replace('-', '_')
        models_cfg[i].model_name = model_name
        model_file = f'../input/fb3embeddings/train_text_emb_{model_cfg.file_name}.npy'
        if 'embedding' in model_cfg:
            continue
        with open(model_file, 'rb') as f:
            models_cfg[i].embedding = np.load(f)   
    embeddings = np.concatenate(
        [model_cfg.embedding for model_cfg in models_cfg],
        axis=1)
    if type(manual_features) != type(None):
        embeddings = np.concatenate(
            [embeddings, manual_features],
            axis=1)
    #print(embeddings.shape)
    def svr_inference_fn(model_path, te_text_feats):
        model = load(model_path)
        preds = model.predict(te_text_feats)
        return preds

    oof = train.copy()
    for fold in range(folds):
        val_idx = oof[oof['fold'] == fold].index
        preds = svr_inference_fn(f'../input/fb3-train-svr/svr_{fold}.model', embeddings[val_idx, :])
        oof.loc[oof['fold']==fold, [f'pred_{col}' for col in target_cols]] = preds
    labels = oof[target_cols].values
    oof_preds = oof[[f'pred_{col}' for col in target_cols]].values
    print(mc_rmse(labels, oof_preds))
    return oof

In [59]:
pretrained_models_cfg = [
    deberta_large_mnli,
    #gpt2,
    roberta_base,
    roberta_large,
    #xlnet_base, 
    #xlnet_large,
    deberta_base, 
    deberta_large, 
    deberta_xlarge,
    deberta_v2_xlarge, 
    deberta_v2_xxlarge,
    deberta_v3_base, 
    deberta_v3_large,
    
    #bart_base,
    bart_large,
    #bart_lage_mnli,
    #bert_base_uncased,
    bert_large_uncased,
    #muppet_roberta_large,
    funnel_small,
    funnel_large
]
len(pretrained_models_cfg)

14

In [60]:
import glob
from joblib import dump, load
from cuml.svm import SVR
import cuml

svr_oof = get_learner_score(pretrained_models_cfg, folds=15, manual_features=None, save=False, verbose=False)
svr_preds = svr_oof[[f'pred_{col}' for col in target_cols]].values

(0.4497834390128143, [0.4790045080354355, 0.445047470578311, 0.40857737854453746, 0.45267682596283126, 0.4698106269184335, 0.4435838240373371])


# all together

In [70]:
final_weights, _ = find_optimal_weights([fine_tuned_oof_preds, svr_preds], labels)

In [71]:
final_weights

array([0.65498877, 0.34501123])

In [72]:
mc_rmse(labels, np.average([fine_tuned_oof_preds, svr_preds], axis=0, weights=final_weights))

(0.4443898364196513,
 [0.47357347953631784,
  0.43932546148809903,
  0.40603249313580864,
  0.4465081264094765,
  0.463396342743972,
  0.437503115204234])