In [1]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math 
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")
from IPython. display import clear_output

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

#os.system('pip install iterative-stratification==0.1.7')
#from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
sys.path.append('../input/iterativestratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

os.system('pip uninstall -y transformers')
os.system('pip uninstall -y tokenizers')
os.system('python -m pip install --no-index --find-links=../input/fb3-my-pip-wheels transformers')
os.system('python -m pip install --no-index --find-links=../input/fb3-my-pip-wheels tokenizers')
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers import DataCollatorWithPadding
%env TOKENIZERS_PARALLELISM=false

from sklearn.ensemble import StackingRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import RidgeCV

clear_output()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
print('device:', device)

tokenizers.__version__: 0.12.1
transformers.__version__: 4.20.1
device: cpu


In [2]:
from lightgbm import LGBMRegressor
from sklearn.linear_model import Ridge, Lasso, BayesianRidge, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from joblib import dump, load
if str(device) == 'cpu':
    from sklearn.svm import SVR
else:
    from cuml.svm import SVR
from sklearn.pipeline import FeatureUnion
import seaborn as sns
import matplotlib.pyplot as plt


In [3]:
# Root folder of the competition data; the three CSV paths below are derived from it.
BASE_PATH = '/kaggle/input/feedback-prize-english-language-learning'
SUBMISSION_PATH, TRAIN_PATH, TEST_PATH = (
    os.path.join(BASE_PATH, csv_name)
    for csv_name in ('sample_submission.csv', 'train.csv', 'test.csv')
)

In [4]:
class FB3Dataset(Dataset):
    """Dataset yielding ``(encoded_text, targets)`` pairs.

    The texts are ``data['full_text']`` passed through ``preprocess``; the
    targets are the ``cfg.target_cols`` columns of ``data``. Tokenisation is
    deferred to item access and delegated to ``encode_text``.
    """

    def __init__(self, cfg, data):
        self.cfg = cfg
        self.xs = preprocess(data['full_text'])
        target_frame = data[cfg.target_cols]
        self.ys = target_frame.values

    def __len__(self):
        return len(self.xs)

    def __getitem__(self, idx):
        encoded = encode_text(self.cfg, self.xs[idx])
        targets = torch.tensor(self.ys[idx], dtype=torch.float)
        return encoded, targets
    
def collate(inputs):
    """Trim every entry of a tokenised batch to the longest unpadded sequence.

    The length is the largest per-row sum of ``inputs['attention_mask']``.
    Each value is sliced along its second axis in place and the same mapping
    is returned.
    """
    longest = int(inputs['attention_mask'].sum(axis=1).max())
    for key in list(inputs.keys()):
        inputs[key] = inputs[key][:, :longest]
    return inputs

##################################################################################


In [5]:
class MeanPooling(nn.Module):
    """Average token embeddings over axis 1, counting only unmasked positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask over the embedding axis so it can weight each token.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        token_sum = (last_hidden_state * mask).sum(1)
        # Clamp the count so a fully masked row does not divide by zero.
        token_count = mask.sum(1).clamp(min=1e-9)
        return token_sum / token_count

class AttentionHead(nn.Module):
    """Additive attention over axis 1 of ``features``.

    Scores come from ``V(tanh(W(features)))``, are soft-maxed along axis 1 and
    used to form a weighted sum of ``features``. Extra positional arguments
    to ``forward`` are accepted and ignored.
    """

    def __init__(self, in_features, hidden_dim):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)
        self.out_features = hidden_dim

    def forward(self, features, *args):
        scores = self.V(torch.tanh(self.W(features)))
        weights = scores.softmax(dim=1)
        return (weights * features).sum(dim=1)
    
class MaxPooling(nn.Module):
    """Element-wise max of token embeddings over axis 1, ignoring masked positions.

    Args:
        pad_value: value written into masked positions before taking the max.
            ``None`` (default) uses the most negative finite value of the
            input dtype, so padding can never win the max.

    Two defects of the previous implementation are fixed here:
      * it filled padding with ``-1e-9`` (effectively zero), so any feature
        whose real values were all negative was pooled to the padding value;
      * it wrote into ``last_hidden_state`` through an alias, silently
        modifying the caller's tensor.
    NOTE(review): a checkpoint trained against the old fill value saw slightly
    different pooled features; pass ``pad_value=-1e-9`` to reproduce that.
    """

    def __init__(self, pad_value=None):
        super().__init__()
        self.pad_value = pad_value

    def forward(self, last_hidden_state, attention_mask):
        padding = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()) == 0
        if self.pad_value is None:
            fill = torch.finfo(last_hidden_state.dtype).min
        else:
            fill = self.pad_value
        # masked_fill is out-of-place: the caller's hidden states are left intact.
        masked_states = last_hidden_state.masked_fill(padding, fill)
        return torch.max(masked_states, 1)[0]
    
class LSTMPooling(nn.Module):
    """Feed first-position states from two backbone layers through an LSTM.

    Args:
        num_layers: stored as ``num_hidden_layers``; used as the middle
            (sequence) axis when reshaping in ``forward``.
        hidden_size: LSTM input size and last axis of the reshaped input.
        hiddendim_lstm: LSTM hidden size, i.e. the last axis of the output.
    """
    def __init__(self, num_layers, hidden_size, hiddendim_lstm):
        super(LSTMPooling, self).__init__()
        self.num_hidden_layers = num_layers
        self.hidden_size = hidden_size
        self.hiddendim_lstm = hiddendim_lstm
        self.lstm = nn.LSTM(self.hidden_size, self.hiddendim_lstm, batch_first=True)
        self.dropout = nn.Dropout(0.1)

    def forward(self, all_hidden_states):
        """Return the dropout-regularised mean of the LSTM outputs over axis 1."""
        # only use first and last
        # NOTE(review): `.squeeze()` removes every size-1 axis, so a batch of
        # one loses its batch axis here — confirm callers never pass one.
        hidden_states = torch.stack([
            all_hidden_states[layer_i][:, 0].squeeze()
            for layer_i in (-1, 0)],
            dim=-1)
        # NOTE(review): the two layers are stacked on the LAST axis and then
        # viewed as (-1, num_hidden_layers, hidden_size); this only lines up
        # one layer per step if the layout happens to match — looks
        # unintended, verify before using this pooling.
        hidden_states = hidden_states.view(-1, self.num_hidden_layers, self.hidden_size)
        out, _ = self.lstm(hidden_states, None)
        #out = self.dropout(out[:, -1, :])
        out = self.dropout(out.mean(dim=1))
        return out

class WeightedLayerPooling(nn.Module):
    """Weighted average of stacked hidden states from ``layer_start`` onward.

    Args:
        num_hidden_layers: layer count used to size the default weights
            (``num_hidden_layers + 1 - layer_start`` entries).
        layer_start: first index along axis 0 that takes part in the average.
        layer_weights: optional pre-built weights; when ``None`` a trainable
            all-ones parameter is created.
    """

    def __init__(self, num_hidden_layers, layer_start: int = 9, layer_weights = None):
        super().__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        if layer_weights is None:
            n_pooled = num_hidden_layers + 1 - layer_start
            layer_weights = nn.Parameter(torch.tensor([1] * n_pooled, dtype=torch.float))
        self.layer_weights = layer_weights

    def forward(self, all_hidden_states):
        # `all_hidden_states` is indexed with four axes; axis 0 is the layer axis.
        pooled_layers = all_hidden_states[self.layer_start:, :, :, :]
        per_layer = self.layer_weights.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
        per_layer = per_layer.expand(pooled_layers.size())
        weighted_sum = (per_layer * pooled_layers).sum(dim=0)
        return weighted_sum / self.layer_weights.sum()

class AttentionPooling(nn.Module):
    """Mask-aware attention pooling over axis 1 of the last hidden state.

    A small MLP (Linear -> LayerNorm -> GELU -> Linear) scores every position;
    positions whose ``attention_mask`` is 0 are set to ``-inf`` before the
    softmax so they receive zero weight.
    """

    def __init__(self, in_dim):
        super().__init__()
        scorer_layers = [
            nn.Linear(in_dim, in_dim),
            nn.LayerNorm(in_dim),
            nn.GELU(),
            nn.Linear(in_dim, 1),
        ]
        self.attention = nn.Sequential(*scorer_layers)

    def forward(self, last_hidden_state, attention_mask):
        scores = self.attention(last_hidden_state).float()
        padded = attention_mask == 0
        scores[padded] = float('-inf')
        weights = torch.softmax(scores, 1)
        return torch.sum(weights * last_hidden_state, dim=1)

class MultiSampleDropout(nn.Module):
    """Average a classifier's output over several dropout samples of its input.

    Args:
        fc: the classifier module applied after each dropout sample.
        num_dropout: number of samples when ``prob_dropout`` is a single number.
        prob_dropout: one dropout probability, or an iterable of probabilities
            (one sample per entry; ``num_dropout`` is then ignored).

    The previous implementation built a fresh ``nn.Dropout`` on every call;
    such modules are always in training mode, so dropout stayed active even
    after ``.eval()``. Dropout now follows this module's own ``training``
    flag, which makes evaluation deterministic.
    """

    def __init__(self, fc, num_dropout, prob_dropout):
        super().__init__()
        # Kept for backward compatibility with code that reads this attribute.
        self.dropout = nn.Dropout
        self.num_dropout = num_dropout
        self.prob_dropout = prob_dropout
        self.classifier = fc

    def forward(self, out):
        if isinstance(self.prob_dropout, (float, int)):
            probs = [self.prob_dropout] * self.num_dropout
        else:
            probs = list(self.prob_dropout)
        samples = [
            self.classifier(F.dropout(out, p=p, training=self.training))
            for p in probs
        ]
        return torch.mean(torch.stack(samples, dim=0), dim=0)

# ====================================================
# Model classes
# ====================================================
class FB3Model(nn.Module):
    """Base class that builds the transformer backbone as ``self.deberta_v3``.

    Note: this cell defines ``FB3Model`` twice. Subclasses declared before the
    second definition inherit from this one (backbone attribute
    ``deberta_v3``); the later definition stores the backbone as ``model``.

    Args:
        cfg: run configuration; ``cfg.model`` names the backbone.
        config_path: optional path to a config object saved with
            ``torch.save``; when ``None`` the config is fetched with
            ``AutoConfig`` and its dropout settings are zeroed.
        pretrained: load backbone weights with ``from_pretrained`` instead of
            building an uninitialised model from the config.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is not None:
            # torch.load unpickles the file: only use configs you produced yourself.
            self.config = torch.load(config_path)
        else:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            # Turn off dropouts.
            for dropout_field in (
                'hidden_dropout',
                'hidden_dropout_prob',
                'attention_dropout',
                'attention_probs_dropout_prob',
            ):
                setattr(self.config, dropout_field, 0.)

        if not pretrained:
            self.deberta_v3 = AutoModel.from_config(self.config)
        else:
            self.deberta_v3 = AutoModel.from_pretrained(cfg.model, config=self.config)

    def _init_weights(self, module):
        """Re-initialise a Linear / Embedding / LayerNorm module in place.

        Any other module type is left untouched.
        """
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            return
        if not isinstance(module, (nn.Linear, nn.Embedding)):
            return
        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if isinstance(module, nn.Linear):
            if module.bias is not None:
                module.bias.data.zero_()
        elif module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()

class WMPoolModel(FB3Model):
    """Backbone + weighted layer pooling (from layer 12 on) + linear head.

    ``mean_head``, ``layer_norm``, ``qa_output`` and ``attention_head`` are
    constructed but not used by ``feature``/``forward`` — presumably kept so
    the parameter names match existing checkpoints (TODO confirm).
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__(cfg, config_path=config_path, pretrained=pretrained)
        width = self.config.hidden_size

        # Poolings.
        self.mean_head = MeanPooling()
        self.wpool_head = WeightedLayerPooling(self.config.num_hidden_layers, layer_start=12)

        # Regression head.
        self.fc_out = nn.Linear(width, cfg.num_target)
        self._init_weights(self.fc_out)

        self.layer_norm = nn.LayerNorm(width)
        self.qa_output = torch.nn.Linear(width, 2)
        self.attention_head = AttentionHead(width * 4, width)

    def feature(self, x):
        """Return the position-0 vector of the weighted layer average."""
        backbone_out = self.deberta_v3(**x)
        stacked_states = torch.stack(backbone_out.hidden_states)
        return self.wpool_head(stacked_states)[:, 0]

    def forward(self, x):
        return self.fc_out(self.feature(x))

class MultiPoolModel(FB3Model):
    """Backbone with one or several pooling heads chosen by ``cfg.pool_head``.

    ``cfg.pool_head`` is either a single name ('mean', 'attention',
    'weighted') or several of them joined by '-'; in the latter case the
    pooled vectors are concatenated before the linear head. The ``pool``
    constructor argument is accepted but never read.
    """

    def __init__(self, cfg, config_path=None, pretrained=False, pool='mean'):
        super().__init__(cfg, config_path=config_path, pretrained=pretrained)

        # Define model layers.
        head_spec = cfg.pool_head
        self.pool_name = head_spec
        self.fc = nn.Linear(self.config.hidden_size, self.cfg.num_target)
        if head_spec in ['mean', 'attention', 'weighted']:
            self.pool = self._pool_layer(head_spec)
        elif '-' in head_spec:
            self.pool = nn.ModuleList([])
            for part in head_spec.split('-'):
                self.pool.append(self._pool_layer(part))
            # One hidden_size-wide vector per pooling head is concatenated.
            self.fc = nn.Linear(self.config.hidden_size * len(self.pool), self.cfg.num_target)
        self._init_weights(self.fc)

        # Multi-sample dropout.
        self.multi_dropout = MultiSampleDropout(self.fc, self.cfg.num_dropout, self.cfg.prob_dropout)

    def _pool_layer(self, pool_name):
        """Build the pooling module for a single pooling name."""
        assert pool_name in ['mean', 'attention', 'weighted']
        if pool_name == 'weighted':
            return WeightedLayerPooling(
                self.config.num_hidden_layers,
                layer_start=9,
                layer_weights=None)
        if pool_name == 'attention':
            return AttentionHead(self.config.hidden_size, self.config.hidden_size)
        return MeanPooling()

    def _pool_feature(self, pool, pool_name, pt_outputs, attention_mask):
        """Apply one pooling module to the backbone outputs."""
        assert pool_name in ['mean', 'attention', 'weighted']
        last_hidden_state = pt_outputs.last_hidden_state
        all_hidden_states = torch.stack(pt_outputs.hidden_states)

        if pool_name == 'weighted':
            # Take the first position only.
            return pool(all_hidden_states)[:, 0]
        if pool_name == 'attention':
            return pool(last_hidden_state)
        return pool(last_hidden_state, attention_mask)

    def feature(self, x):
        """Return the pooled (and, for multi-head specs, concatenated) features."""
        pt_outputs = self.deberta_v3(**x)
        mask = x['attention_mask']

        if type(self.pool) != nn.ModuleList:
            return self._pool_feature(self.pool, self.pool_name, pt_outputs, mask)

        pooled = [
            self._pool_feature(head, head_name, pt_outputs, mask)
            for head_name, head in zip(self.pool_name.split('-'), self.pool)
        ]
        return torch.cat(pooled, dim=1)

    def forward(self, x):
        feature = self.feature(x)
        # Multi-sample dropout is a training-time regulariser only.
        if self.cfg.use_dropout and self.training:
            return self.multi_dropout(feature)
        return self.fc(feature)


class Attention4Model(FB3Model):
    """Attention head over the concatenation of the last four hidden layers.

    The feature is the attention-pooled vector concatenated with the
    position-0 vector of the same four-layer concatenation, hence the
    ``hidden_size * 4 * 2`` input width of ``fc_out``.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__(cfg, config_path=config_path, pretrained=pretrained)

        self.head = AttentionHead(self.config.hidden_size*4, self.config.hidden_size)
        self.fc_out = nn.Linear(self.config.hidden_size*4*2, self.cfg.num_target)
        self._init_weights(self.fc_out)

    def forward(self, x):
        pt_out = self.deberta_v3(**x)

        all_hidden_states = torch.stack(pt_out.hidden_states)
        cat_over_last_layers = torch.cat(
            (all_hidden_states[-1], all_hidden_states[-2], all_hidden_states[-3], all_hidden_states[-4]),
            -1)
        # Position-0 embedding.
        cls_pooling = cat_over_last_layers[:, 0]
        # Attention over the concat of the 4 last layers.
        head_logits = self.head(cat_over_last_layers)
        feature = torch.cat([head_logits, cls_pooling], -1)

        if self.cfg.use_dropout and self.training:
            # The original referenced `self.multi_dropout`, which this class
            # never defined, so this branch raised AttributeError. The wrapper
            # is built on the fly (it owns no parameters of its own) rather
            # than in __init__, so no extra `multi_dropout.*` keys appear in
            # the state_dict and existing checkpoints keep loading.
            multi_dropout = MultiSampleDropout(
                self.fc_out, self.cfg.num_dropout, self.cfg.prob_dropout)
            y_hat = multi_dropout(feature)
        else:
            y_hat = self.fc_out(feature)

        return y_hat
    
class AttentionModel(FB3Model):
    """Backbone + learned attention over the last hidden state + linear head.

    NOTE(review): the softmax runs over every position of the sequence;
    ``x['attention_mask']`` is not consulted when pooling, so padded positions
    also receive weight — confirm this matches how the checkpoints were trained.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__(cfg, config_path=config_path, pretrained=pretrained)

        # Poolings.
        self.att = nn.Sequential(
            nn.Linear(self.config.hidden_size, self.cfg.ap_hidden_size),
            nn.Tanh(),
            nn.Linear(self.cfg.ap_hidden_size, 1),
            nn.Softmax(dim=1),
        )
        # `_init_weights` only handles Linear/Embedding/LayerNorm instances, so
        # passing the Sequential itself (as before) re-initialised nothing.
        # `apply` visits every sub-module, which is what was intended.
        self.att.apply(self._init_weights)

        self.fc_out = nn.Linear(self.config.hidden_size, cfg.num_target)
        self._init_weights(self.fc_out)

    def forward(self, x):
        pt_out = self.deberta_v3(**x)
        last_hidden_states = pt_out.last_hidden_state
        # Attention weights are normalised along axis 1 by the Softmax layer.
        att_weights = self.att(last_hidden_states)
        logits = torch.sum(att_weights * last_hidden_states, dim=1)

        y_hat = self.fc_out(logits)
        return y_hat
    
class MeanModel(FB3Model):
    """Backbone + masked mean pooling of the last hidden state + linear head."""

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__(cfg, config_path=config_path, pretrained=pretrained)

        # Pooling.
        self.mean_head = MeanPooling()

        # Regression head, re-initialised with the backbone's initializer range.
        self.fc_out = nn.Linear(self.config.hidden_size, cfg.num_target)
        self._init_weights(self.fc_out)

    def forward(self, x):
        backbone_out = self.deberta_v3(**x)
        pooled = self.mean_head(backbone_out.last_hidden_state, x['attention_mask'])
        return self.fc_out(pooled)

######################################

class FB3Model(nn.Module):
    """Base class that builds the transformer backbone as ``self.model``.

    Note: this is the second ``FB3Model`` in the cell and replaces the earlier
    name from here on; the earlier definition stores its backbone as
    ``deberta_v3`` and remains the base of the classes declared before this
    point.

    Args:
        cfg: run configuration; ``cfg.model`` names the backbone.
        config_path: optional path to a config object saved with
            ``torch.save``; ``None`` fetches the config via ``AutoConfig``
            and zeroes its dropout settings.
        pretrained: load backbone weights with ``from_pretrained`` rather
            than building from the config alone.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            backbone_config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            # Turn off dropouts.
            backbone_config.hidden_dropout = 0.
            backbone_config.hidden_dropout_prob = 0.
            backbone_config.attention_dropout = 0.
            backbone_config.attention_probs_dropout_prob = 0.
        else:
            # torch.load unpickles the file: only use configs you produced yourself.
            backbone_config = torch.load(config_path)
        self.config = backbone_config

        self.model = (
            AutoModel.from_pretrained(cfg.model, config=self.config)
            if pretrained
            else AutoModel.from_config(self.config)
        )

    def _init_weights(self, module):
        """Re-initialise a Linear / Embedding / LayerNorm module in place.

        Modules of any other type are ignored.
        """
        is_linear = isinstance(module, nn.Linear)
        is_embedding = isinstance(module, nn.Embedding)
        if is_linear or is_embedding:
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if is_linear:
                if module.bias is not None:
                    module.bias.data.zero_()
            elif module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

class WeightedAttentionModel(FB3Model):
    """Concatenate weighted-layer pooling with mask-aware attention pooling.

    The feature is ``[weighted-layer vector at position 0, attention-pooled
    last hidden state]``, hence the ``hidden_size * 2`` input of ``fc_out``.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__(cfg, config_path=config_path, pretrained=pretrained)
        width = self.config.hidden_size

        self.weighted_pool = WeightedLayerPooling(
            self.config.num_hidden_layers, layer_start=9, layer_weights=None)
        self.att_pool = AttentionPooling(width)

        self.fc_out = nn.Linear(width*2, cfg.num_target)
        self._init_weights(self.fc_out)

    def feature(self, x):
        """Return the concatenated pooled features for a tokenised batch."""
        backbone_out = self.model(**x)
        layer_stack = torch.stack(backbone_out.hidden_states)
        weighted_vec = self.weighted_pool(layer_stack)[:, 0]
        attended_vec = self.att_pool(backbone_out.last_hidden_state, x['attention_mask'])
        return torch.cat([weighted_vec, attended_vec], dim=1)

    def forward(self, x):
        return self.fc_out(self.feature(x))
    
########################

class WeightedLayerPooling_(nn.Module):
    """Weighted average over a chosen set of hidden-state layers.

    Args:
        num_hidden_layers: layer count; with ``layer_start`` it defines the
            default layer set ``layer_start .. num_hidden_layers``.
        layer_start: first layer of the default set.
        layers: explicit layer indices; when truthy it overrides the default set.
        layer_weights: optional pre-built weights; ``None`` creates a
            trainable all-ones parameter with one entry per selected layer.

    ``forward`` takes the per-layer hidden states as a sequence of tensors
    and stacks them itself.
    """

    def __init__(self, num_hidden_layers, layer_start: int = 4, layers = None, layer_weights = None):
        super().__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers

        explicit = bool(layers)
        if layer_weights is None:
            n_selected = len(layers) if explicit else (num_hidden_layers+1 - layer_start)
            layer_weights = nn.Parameter(torch.tensor([1] * n_selected, dtype=torch.float))
        self.layer_weights = layer_weights
        self.layers = layers if explicit else list(range(layer_start, num_hidden_layers+1))

    def forward(self, ft_all_layers):
        selected = torch.stack(ft_all_layers)[self.layers, :, :, :]
        per_layer = self.layer_weights.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)
        per_layer = per_layer.expand(selected.size())
        return (per_layer * selected).sum(dim=0) / self.layer_weights.sum()
    
class MeanAttentionModel(nn.Module):
    """Backbone with mean and/or attention pooling selected by ``cfg.pool``.

    Supported ``cfg.pool`` values: 'mean', 'attention', 'mean-attention',
    'mean-attention-with-mask'. The two combined modes concatenate both
    pooled vectors. The head always has 6 outputs.

    Args:
        cfg: run configuration (``model``, ``pool``, ``num_dropout``,
            ``prob_dropout``; ``tokenizer`` is read when ``pretrained``).
        config_path: optional path to a config object saved with
            ``torch.save``; ``None`` fetches it via ``AutoConfig`` and zeroes
            the dropout settings.
        pretrained: load backbone weights with ``from_pretrained`` and resize
            the token embeddings to ``len(cfg.tokenizer)``.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            # Turn off dropouts.
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
        else:
            # torch.load unpickles the file: only use configs you produced yourself.
            self.config = torch.load(config_path)

        if pretrained:
            self.deberta_v3 = AutoModel.from_pretrained(cfg.model, config=self.config)
            # Expand embedding dim for new tokens.
            self.deberta_v3.resize_token_embeddings(len(cfg.tokenizer))
        else:
            self.deberta_v3 = AutoModel.from_config(self.config)

        self.deberta_v3.gradient_checkpointing_enable()

        # Define model layers.
        self.fc = nn.Linear(self.config.hidden_size, 6)

        if cfg.pool == 'mean':
            self.pool = MeanPooling()
        elif cfg.pool == 'attention':
            self.pool = AttentionHead(self.config.hidden_size, self.config.hidden_size)
        elif cfg.pool == 'mean-attention':
            self.pool = nn.ModuleList([
                MeanPooling(),
                AttentionHead(self.config.hidden_size, self.config.hidden_size)
            ])
            self.fc = nn.Linear(self.config.hidden_size * len(self.pool), 6)
        elif cfg.pool == 'mean-attention-with-mask':
            self.pool = nn.ModuleList([
                MeanPooling(),
                AttentionPooling(self.config.hidden_size)
            ])
            self.fc = nn.Linear(self.config.hidden_size * len(self.pool), 6)
        # Re-init weights.
        self._init_weights(self.fc)

        # Multi-sample dropout (registered, but `forward` currently bypasses it).
        self.multi_dropout = MultiSampleDropout(self.fc, cfg.num_dropout, cfg.prob_dropout)

    @staticmethod
    def global_avg_pool(x):
        """Average ``x`` over everything after its first two axes.

        Declared as a staticmethod: the original definition had no ``self``
        parameter, so calling it on an instance raised TypeError. Calling it
        through the class keeps working exactly as before.
        """
        return torch.mean(x.view(x.size(0), x.size(1), -1), dim=-1)

    def _init_weights(self, module):
        """Re-initialise a Linear / Embedding / LayerNorm module in place."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, x):
        """Return the pooled feature vector(s) for a tokenised batch."""
        pt_outputs = self.deberta_v3(**x)
        last_hidden_states = pt_outputs[0]
        # Every pooling head is called with (hidden states, attention mask);
        # AttentionHead accepts and ignores the mask.
        if type(self.pool) == nn.ModuleList:
            pool_feature = [pool(last_hidden_states, x['attention_mask']) for pool in self.pool]
            pool_feature = torch.cat(pool_feature, dim=1)
        else:
            pool_feature = self.pool(last_hidden_states, x['attention_mask'])
        return pool_feature

    def forward(self, x, y=None, loss_fn=None):
        """Predict the 6 targets; ``y`` and ``loss_fn`` are accepted but unused."""
        feature = self.feature(x)
        out = self.fc(feature)
        return out
    
class CustomModel(nn.Module):
    """Backbone built from a config, with a pooling layer chosen by ``cfg.pooling``.

    Supported ``cfg.pooling`` values: 'mean', 'max', 'min', 'attention',
    'weightedlayer', 'attention4'. The head always has 6 outputs; for
    'attention4' its input is ``hidden_size * 8`` (attention-pooled four-layer
    concat plus the position-0 vector of the same concat).

    Args:
        cfg: run configuration (``pooling``; ``layer_start`` for 'weightedlayer').
        config_path: passed to ``AutoConfig.from_pretrained``.
        pretrained: only stored on the instance; the backbone is always built
            from the config without loading weights.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        # nn.Module must be initialised before attributes are assigned on the
        # instance; the original set `self.cfg` first, which only works as
        # long as the value is not a Module/Parameter.
        super().__init__()
        self.cfg = cfg

        self.config = AutoConfig.from_pretrained(config_path, output_hidden_states=True)
        self.model = AutoModel.from_config(self.config)
        self.pretrained = pretrained

        if cfg.pooling == 'mean':
            self.pool = MeanPooling()
        elif cfg.pooling == 'max':
            self.pool = MaxPooling()
        elif cfg.pooling == 'min':
            # NOTE(review): MinPooling is not defined in the visible part of
            # this notebook — confirm it exists before using 'min'.
            self.pool = MinPooling()
        elif cfg.pooling == 'attention':
            self.pool = AttentionPooling(self.config.hidden_size)
        elif cfg.pooling == 'weightedlayer':
            self.pool = WeightedLayerPooling_(self.config.num_hidden_layers, layer_start = cfg.layer_start, layer_weights = None)
        elif self.cfg.pooling == 'attention4':
            self.pool = AttentionHead(self.config.hidden_size*4, 512)

        if self.cfg.pooling == 'attention4':
            self.fc = nn.Linear(self.config.hidden_size*8, 6)
        else:
            self.fc = nn.Linear(self.config.hidden_size, 6)

    def feature(self, inputs):
        """Return the pooled feature vector for a tokenised batch."""
        outputs = self.model(**inputs)
        if self.cfg.pooling == 'attention4':
            all_layer_embeddings = torch.stack(outputs.hidden_states)
            cat_over_last_layers = torch.cat((all_layer_embeddings[-1], all_layer_embeddings[-2], all_layer_embeddings[-3], all_layer_embeddings[-4]), -1)
            cls_pooling = cat_over_last_layers[:, 0]
            head_logits = self.pool(cat_over_last_layers)
            feature = torch.cat([head_logits, cls_pooling], -1)
        elif self.cfg.pooling != 'weightedlayer':
            last_hidden_states = outputs[0]
            feature = self.pool(last_hidden_states, inputs['attention_mask'])
        else:
            # outputs[1] is handed to the layer pooling, which stacks it itself;
            # presumably this is the tuple of per-layer hidden states (TODO confirm).
            all_layer_embeddings = outputs[1]
            feature = self.pool(all_layer_embeddings)[:, 0]

        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output

In [6]:
class Config(dict):
    """Dictionary whose keys can also be read, written and deleted as attributes.

    Missing keys raise ``AttributeError`` on attribute access (previously
    ``KeyError``), so ``hasattr``, ``getattr(cfg, name, default)`` and
    ``copy.deepcopy`` behave as they do for ordinary objects. Item access
    (``cfg['key']``) still raises ``KeyError``.
    """

    __setattr__ = dict.__setitem__

    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name) from None

    def __delattr__(self, name):
        try:
            del self[name]
        except KeyError:
            raise AttributeError(name) from None

    def init(self, kwargs):
        """Copy every entry of the mapping ``kwargs`` into this config.

        The previous body called ``super().init``, which ``dict`` does not
        have, so it always failed.
        """
        self.update(kwargs)

    def set(self, key, val):
        """Store ``val`` under ``key`` (attribute and item access share storage)."""
        self[key] = val

In [7]:
# Cache of loggers handed out by get_logger, keyed by the `filename` argument.
loggers = {}
def get_logger(filename='inference'):
    """Return the notebook logger, writing to the console and ``<filename>.log``.

    Every call configures the same underlying logger (``getLogger(__name__)``).
    Handlers are attached only if an equivalent one is not already present,
    so calling this function again — or re-running the cell — no longer makes
    every message appear multiple times.

    Args:
        filename: log file path without the ``.log`` suffix.

    Returns:
        The configured ``logging.Logger``.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.propagate = False
    logger.setLevel(INFO)

    # FileHandler subclasses StreamHandler, hence the exact type check.
    if not any(type(h) is StreamHandler for h in logger.handlers):
        console_handler = StreamHandler()
        console_handler.setFormatter(Formatter("%(message)s"))
        logger.addHandler(console_handler)

    # FileHandler stores the absolute path of its target in `baseFilename`.
    log_path = os.path.abspath(f"{filename}.log")
    already_logging_to_file = any(
        isinstance(h, FileHandler) and h.baseFilename == log_path
        for h in logger.handlers
    )
    if not already_logging_to_file:
        file_handler = FileHandler(filename=f"{filename}.log")
        file_handler.setFormatter(Formatter("%(message)s"))
        logger.addHandler(file_handler)

    loggers[filename] = logger
    return logger

logger = get_logger()

def seed_everything(seed=42):
    '''
    Seed Python, NumPy and PyTorch (plus CUDA when available) with one value
    so that repeated runs of the notebook produce the same random numbers.
    '''
    # Set a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
seed_everything(seed=42)

def mc_rmse(y_true, y_pred):
    """Mean column-wise root-mean-squared error.

    The RMSE is computed directly with NumPy, so the result no longer depends
    on the ``squared=`` keyword of scikit-learn's ``mean_squared_error``,
    which newer scikit-learn releases no longer accept.

    Args:
        y_true: 2-D array-like of targets, one column per target.
        y_pred: 2-D array-like of predictions with the same shape.

    Returns:
        ``(score, scores)``: the mean of the per-column RMSEs and the list of
        per-column RMSEs.

    Raises:
        ValueError: if the inputs are not 2-D, differ in shape, have no rows,
            or contain NaN/inf values.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_true.ndim != 2 or y_true.shape != y_pred.shape:
        raise ValueError(
            f'y_true and y_pred must be 2-D with identical shapes, '
            f'got {y_true.shape} and {y_pred.shape}')
    if y_true.shape[0] == 0:
        raise ValueError('y_true and y_pred must contain at least one row')
    if not (np.isfinite(y_true).all() and np.isfinite(y_pred).all()):
        raise ValueError('y_true and y_pred must not contain NaN or infinite values')

    # One RMSE per target column, computed in a single vectorised pass.
    scores = list(np.sqrt(np.mean((y_true - y_pred) ** 2, axis=0)))
    score = np.mean(scores)
    return score, scores

def get_result(cfg, oof_df, verbose=1):
    """Score out-of-fold predictions with ``mc_rmse``.

    Targets are read from the ``cfg.target_cols`` columns of ``oof_df`` and
    predictions from the matching ``pred_<col>`` columns. When ``verbose`` is
    1 the overall and per-column scores are printed. Returns the overall score.
    """
    pred_cols = [f"pred_{c}" for c in cfg.target_cols]
    score, scores = mc_rmse(oof_df[cfg.target_cols].values, oof_df[pred_cols].values)
    if verbose == 1:
        print(f'score: {score:<.8f}  scores: {scores}')
    return score

In [8]:
# ====================================================
# Dataset
# ====================================================
def encode_text(cfg, text):
    """Tokenise one text with ``cfg.tokenizer`` and return its tensors.

    * ``cfg.pretrained`` truthy: the text is padded/truncated to
      ``cfg.max_len``, tensors are requested from the tokenizer, and the
      leading axis of each returned tensor is squeezed away; a plain dict is
      returned.
    * otherwise: ``encode_plus`` is called without padding or truncation and
      every value of its result is replaced in place by a ``torch.long``
      tensor; that same mapping is returned.
    """
    if not cfg.pretrained:
        encoded = cfg.tokenizer.encode_plus(
            text,
            return_tensors=None,
            add_special_tokens=True,
        )
        for name in list(encoded.keys()):
            encoded[name] = torch.tensor(encoded[name], dtype=torch.long)
        return encoded

    batch = cfg.tokenizer(
        text,
        None,
        add_special_tokens=True,
        padding='max_length',
        truncation=True,
        max_length=cfg.max_len,
        return_token_type_ids=True,
        return_tensors='pt'
    )
    return {name: tensor.squeeze(0) for name, tensor in batch.items()}

def preprocess(texts):
    """Replace line breaks in a string Series with the ``<newline>`` marker.

    ``\\r\\n`` and ``\\n`` each become ``<newline>``; afterwards one pass
    collapses adjacent ``<newline><newline>`` pairs into a single marker.
    Returns the underlying values array (``Series.values``).
    """
    steps = (
        (r'\r\n', '<newline>', True),
        (r'\n', '<newline>', True),
        ('<newline><newline>', '<newline>', False),
    )
    for pattern, replacement, as_regex in steps:
        texts = texts.str.replace(pattern, replacement, regex=as_regex)
    return texts.values

class TestDataset(Dataset):
    """Inference dataset yielding tokenised ``full_text`` entries of ``df``.

    Texts are run through ``preprocess`` only when ``cfg.pretrained`` is falsy
    and ``cfg.version`` is '1' or 'mean-attention'; otherwise the raw column
    values are used.
    """

    def __init__(self, cfg, df):
        self.cfg = cfg
        wants_preprocess = (not cfg.pretrained) and cfg.version in ['1', 'mean-attention']
        if wants_preprocess:
            print('preprocess')
            self.texts = preprocess(df['full_text'])
        else:
            self.texts = df['full_text'].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        return encode_text(self.cfg, self.texts[item])

In [9]:
def load_config(input_path, inference_weight=1):
    """Load a model's saved run configuration and tokenizer from ``input_path``.

    Reads ``CFG.json`` into a ``Config`` and then attaches:
    ``path`` (the folder itself), ``config_path`` (``<path>/config.pth``),
    ``tokenizer`` (loaded from ``<path>/tokenizer``) and ``inference_weight``.

    Args:
        input_path: folder containing ``CFG.json`` and a ``tokenizer`` sub-folder.
        inference_weight: value stored on the config as ``inference_weight``.

    Returns:
        The populated ``Config``.
    """
    # Use a context manager so the JSON file handle is closed deterministically
    # (the original `json.load(open(...))` left it to the garbage collector).
    with open(os.path.join(input_path, 'CFG.json'), 'r') as cfg_file:
        cfg = Config(**json.load(cfg_file))
    cfg.path = input_path
    cfg.config_path = os.path.join(cfg.path, 'config.pth')
    # Load tokenizer.
    cfg.tokenizer = AutoTokenizer.from_pretrained(os.path.join(cfg.path, 'tokenizer'))

    cfg.inference_weight = inference_weight
    return cfg

def load_model(cfg, fold, version='1', **model_kwargs):
    """Build the model class selected by ``version`` and load one fold's weights.

    Args:
        cfg: config providing ``config_path``, ``path`` and ``model``.
        fold: fold number used in the checkpoint file name.
        version: one of '1', '2', '21', 'weighted-attention', 'custom',
            'mean-attention'.
        **model_kwargs: forwarded to the model constructor (not forwarded for
            'mean-attention').

    Returns:
        The model with the checkpoint's ``'model'`` state dict loaded (on CPU).

    Raises:
        ValueError: if ``version`` is not a known model version. Previously an
            unknown version left ``model`` unbound and failed later with an
            unrelated error.
    """
    known_versions = ('1', '2', '21', 'weighted-attention', 'custom', 'mean-attention')
    if version not in known_versions:
        raise ValueError(
            f'Unknown model version {version!r}; expected one of {known_versions}')

    # Build the torch model.
    if version == '1':
        model = MultiPoolModel(cfg, config_path=cfg.config_path, pretrained=False, **model_kwargs)
    elif version == '2':
        model = Attention4Model(cfg, config_path=cfg.config_path, pretrained=False, **model_kwargs)
    elif version == '21':
        model = WMPoolModel(cfg, config_path=cfg.config_path, pretrained=False, **model_kwargs)
    elif version == 'weighted-attention':
        model = WeightedAttentionModel(cfg, config_path=cfg.config_path, pretrained=False, **model_kwargs)
    elif version == 'custom':
        model = CustomModel(cfg, config_path=cfg.config_path, pretrained=False, **model_kwargs)
    else:  # 'mean-attention'
        model = MeanAttentionModel(cfg, config_path=cfg.config_path, pretrained=False)

    # Checkpoint file name: '/' in the model name is replaced by '-'.
    # torch.load unpickles the file: only load checkpoints you trust.
    checkpoint_path = os.path.join(cfg.path, f"{cfg.model.replace('/', '-')}_fold{fold}_best.pth")
    state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    model.load_state_dict(state['model'])
    return model

In [10]:
def mean_pooling(model_output, attention_mask):
    """Masked mean of ``model_output.last_hidden_state`` over axis 1.

    The hidden state is detached and moved to the CPU first; the mask is
    broadcast over the embedding axis and the token count is clamped to avoid
    division by zero.
    """
    embeddings = model_output.last_hidden_state.detach().cpu()
    mask = attention_mask.unsqueeze(-1).expand(embeddings.size()).float()
    summed = torch.sum(embeddings * mask, 1)
    counts = torch.clamp(mask.sum(1), min=1e-9)
    return summed / counts

def inference_fn(test_loader, model, device):
    """Run ``model`` over every batch of ``test_loader`` and stack the outputs.

    The model is switched to eval mode and moved to ``device``. Each batch is a
    mapping whose values are moved to ``device`` in place; the forward pass runs
    under ``torch.no_grad()``. Returns the per-batch outputs concatenated into a
    single numpy array.
    """
    model.eval()
    model.to(device)
    batch_outputs = []
    for batch in test_loader:
        for key in list(batch.keys()):
            batch[key] = batch[key].to(device)
        with torch.no_grad():
            batch_pred = model(batch)
        batch_outputs.append(batch_pred.to('cpu').numpy())
    return np.concatenate(batch_outputs)

class Inferencer:
    """Wraps one model configuration for fold-wise prediction, OOF loading and embedding extraction."""

    def __init__(self, input_path=None, cfg=None, inference_weight=1):
        """Use ``cfg`` when given, otherwise build one with ``load_config``.

        ``input_path`` and ``inference_weight`` are only used when ``cfg`` is None.
        """
        # Identity check: `cfg == None` would invoke a custom __eq__ on the config object.
        if cfg is None:
            self.cfg = load_config(input_path, inference_weight)
        else:
            self.cfg = cfg

    def predict(self, test_loader, device, stat_fn=np.mean):
        """Predict with every fold in ``cfg.trn_fold`` and aggregate the results.

        Args:
            test_loader: Iterable of input batches passed to ``inference_fn``.
            device: Device the fold models are moved to.
            stat_fn: Aggregator called as ``stat_fn(preds, axis=0)`` over the
                list of per-fold prediction arrays.

        Returns:
            The aggregated predictions clipped to the range [1, 5]; also stored
            on ``self.preds``.
        """
        preds = []
        start = time.time()
        print('#'*10, self.cfg.path, '#'*10)
        for fold in self.cfg.trn_fold:
            print(f'Predicting fold {fold}...')
            model = load_model(self.cfg, fold, version=self.cfg.version)
            pred = inference_fn(test_loader, model, device)
            preds.append(pred)
            # Free the fold model before loading the next one.
            del model, pred; gc.collect()
            torch.cuda.empty_cache()
        elapsed = time.time() - start
        print('#'*10, f'ETA: {elapsed:.2f}s', '#'*10, '\n')

        self.preds = stat_fn(preds, axis=0)
        self.preds = np.clip(self.preds, 1, 5)
        return self.preds

    def get_oof_result(self, file_type='pkl', verbose=1):
        """Return ``get_result`` evaluated on this model's out-of-fold frame."""
        return get_result(self.cfg, self.get_oof_df(file_type), verbose)

    def get_oof_df(self, file_type='pkl'):
        """Load ``oof_df.pkl`` (when ``file_type == 'pkl'``) or ``oof_df.csv`` from ``cfg.path``."""
        if file_type == 'pkl':
            return pd.read_pickle(os.path.join(self.cfg.path, 'oof_df.pkl'))
        return pd.read_csv(os.path.join(self.cfg.path, 'oof_df.csv'))

    def _load_embedding_model(self, fold):
        """Return the fine-tuned fold model, or the raw ``AutoModel`` when ``cfg.pretrained`` is set."""
        # pretrained=True: not fine-tuned models.
        if not self.cfg.pretrained:
            return load_model(self.cfg, fold, version=self.cfg.version)
        return AutoModel.from_pretrained(self.cfg.model)

    def get_text_embedding(self, data_loader, device, fold=None):
        """Compute one embedding row per sample in ``data_loader``.

        For fine-tuned models (``cfg.pretrained`` falsy) the embedding is
        ``model.feature(inputs)``. For raw pretrained models it is the
        L2-normalised ``mean_pooling`` of the model output.

        Args:
            data_loader: Iterable of mappings of tensors; must contain
                ``input_ids`` and ``attention_mask`` for the pretrained branch.
                ``token_type_ids`` is optional.
            device: Device used for the forward pass.
            fold: Fold checkpoint to load for fine-tuned models.

        Returns:
            ``np.ndarray`` built from the list of per-sample embeddings.
        """
        model = self._load_embedding_model(fold)
        model.to(device)
        model.eval()

        fold_emb = []
        for inputs in data_loader:
            for k, v in inputs.items():
                inputs[k] = v.to(device)

            with torch.no_grad():
                if not self.cfg.pretrained:
                    emb = model.feature(inputs)
                else:
                    input_ids = inputs['input_ids']
                    attention_mask = inputs['attention_mask']
                    # Some tokenizers do not emit token_type_ids; indexing it
                    # unconditionally raised KeyError before the fallback could run.
                    token_type_ids = inputs['token_type_ids'] if 'token_type_ids' in inputs else None
                    if token_type_ids is None:
                        output = model(input_ids=input_ids, attention_mask=attention_mask)
                    else:
                        try:
                            output = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
                        except Exception:
                            # Best-effort fallback for models that reject token_type_ids.
                            # `Exception` (not a bare except) so KeyboardInterrupt still propagates.
                            output = model(input_ids=input_ids, attention_mask=attention_mask)
                    emb = mean_pooling(output, attention_mask.detach().cpu())
                    emb = F.normalize(emb, p=2, dim=1)
                    # No squeeze(0) here: with a batch of one it collapsed the row
                    # dimension and extend() then appended scalars instead of one vector.

            fold_emb.extend(emb.detach().cpu().numpy())
            del emb; gc.collect(); torch.cuda.empty_cache()

        fold_emb = np.array(fold_emb)
        return fold_emb

In [11]:
from scipy import optimize
def find_optimal_weights(oof_preds, labels, bounds=(0.0, 1.0), method='SLSQP'):
    """Search for blend weights that minimise ``mc_rmse`` of the weighted average.

    The optimisation starts from equal weights, constrains the weights to sum
    to one and bounds each of them by ``bounds``. The blended prediction is
    clipped to [1, 5] before scoring.

    Args:
        oof_preds: Sequence of prediction arrays, one per model, averaged along axis 0.
        labels: Ground-truth array passed to ``mc_rmse``.
        bounds: ``(low, high)`` bound applied to every weight.
        method: ``scipy.optimize.minimize`` method name.

    Returns:
        ``(weights, score)`` where ``weights`` is re-normalised to sum to one and
        ``score`` is the loss evaluated at those weights.

    Raises:
        ValueError: If ``oof_preds`` is empty (previously an incidental
            ``ZeroDivisionError`` from ``1 / len(oof_preds)``).
    """
    n_models = len(oof_preds)
    if n_models == 0:
        raise ValueError('oof_preds must contain at least one prediction array')

    def loss(weights):
        blended = np.average(oof_preds, weights=weights, axis=0)
        return mc_rmse(labels, np.clip(blended, 1, 5))[0]

    opt_weights = optimize.minimize(
        loss,
        [1 / n_models] * n_models,
        constraints=({'type': 'eq', 'fun': lambda w: 1 - sum(w)}),
        method=method,  # 'Nelder-Mead',
        bounds=[bounds] * n_models,
        options={'ftol': 1e-10},
    )['x']

    # Re-normalise in case the optimiser satisfied the sum constraint only approximately.
    opt_weights = np.array(opt_weights) / sum(opt_weights)
    return opt_weights, loss(opt_weights)

In [12]:
# Keyword arguments prepared for a LightGBM regressor (only referenced from a
# commented-out learner in this part of the notebook).
# Options tried but currently disabled: num_leaves=124, cat_l2=10, max_depth=5,
# min_gain_to_split=0.0001, early_stopping_rounds=200.
lgbm_params = {
    'n_estimators': 3000,
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'rmse',
    'subsample': 0.7,
    'subsample_freq': 1,
    'min_data_in_leaf': 40,
    'feature_fraction_bynode': np.sqrt(0.9),
    'feature_fraction': np.sqrt(0.9),
    'learning_rate': 0.001,
    'max_bin': 255,
    'boost_from_average': True,
    'nthread': 8,
    'lambda_l1': 2,
    'lambda_l2': 20,
    'verbose': -1,
}

---

# load configs

In [13]:
# Global configuration used by the stacking cells below: 15 folds, seed 42 and
# the six target columns.
CFG = Config(
    n_fold=15,
    seed=42,
    target_cols=['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
)

# Load the train / test / sample-submission CSVs (the *_PATH constants are defined
# outside this section).
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)
submission = pd.read_csv(SUBMISSION_PATH)

# Multilabel-stratified split over the target columns, reproducible through CFG.seed.
mskfold = MultilabelStratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)

# Tag every training row with the index of the split in which it is a validation row.
# NOTE(review): if 'fold' does not already exist in the CSV, this row-wise assignment
# presumably creates it as a float column — confirm if an integer dtype is required.
for n, (train_idx, val_idx) in enumerate(mskfold.split(train, train[CFG.target_cols])):
    train.loc[val_idx, 'fold'] = int(n)

In [14]:
# v114: deberta-v3-base config built by hand (4 folds, 'mean-attention' pool head).
# Note that no `name` is passed here, unlike most configs below.
v114_CFG = Config(
    model="microsoft/deberta-v3-base",
    version='1',
    num_target = 6,
    reinit_last_layer=True,
    reinit_fc=True,
    weight_decay=0.01,
    learning_rate=2e-5,
    layerwise_learning_rate_decay=1.5,
    use_dropout=False,
    prob_dropout=[0.06, 0.08, 0.1, 0.12, 0.14],
    num_dropout=5,
    pool_head='mean-attention',
    seed=42,
    n_fold=4,
    trn_fold=[0,1,2,3],
    path='../input/fb3models/v114/',
    config_path='../input/fb3models/v114/config.pth',
    tokenizer=AutoTokenizer.from_pretrained('../input/fb3models/v114/tokenizer')
)

# weightedpool: deberta-v3-base config built by hand (4 folds, 'weighted' pool head),
# with checkpoints and tokenizer read from ../input/fb3-train/.
weightedpool_CFG = Config(
    model='microsoft/deberta-v3-base',
    name='weightedpool',
    version='1',
    num_target=6,
    reinit_last_layer=True,
    reinit_fc=True,
    weight_decay=0.01,
    learning_rate=1.5e-5,
    layerwise_learning_rate_decay=1.5,
    use_dropout=False,
    prob_dropout=[0.06, 0.08, 0.1, 0.12, 0.14],
    num_dropout=5,
    pool_head='weighted',
    seed=42,
    n_fold=4,
    trn_fold=[0,1,2,3],
    train=True,
    path='../input/fb3-train/',
    config_path='../input/fb3-train/config.pth',
    tokenizer=AutoTokenizer.from_pretrained('../input/fb3-train/tokenizer'),
    inference_weight=1.0)

# v116: CFG.json and tokenizer are loaded from the fb3-colab-models directory, then
# path/config_path are re-pointed at ../input/fb3models/v116 (so checkpoints and OOF
# files are resolved there).
# NOTE(review): two different input directories are involved — confirm both exist.
v116_CFG = load_config('../input/fb3-colab-models/v116', inference_weight=1.0)
v116_CFG.path = '../input/fb3models/v116'
v116_CFG.config_path = '../input/fb3models/v116/config.pth'
v116_CFG.version = '1'
v116_CFG.name = 'v116'

# v112: deberta-v3-base config built by hand (4 folds, 'attention' pool head,
# max_len 512, batch size 3).
v112_CFG = Config(
    num_workers=1,
    batch_size=3,
    max_len=512,
    model="microsoft/deberta-v3-base",
    name='v112',
    version='1',
    num_target = 6,
    reinit_last_layer=True,
    reinit_fc=True,
    weight_decay=0.01,
    learning_rate=2e-5,
    layerwise_learning_rate_decay=1.5,
    use_dropout=False,
    prob_dropout=[0.06, 0.08, 0.1, 0.12, 0.14],
    num_dropout=5,
    pool_head='attention',
    seed=42,
    n_fold=4,
    trn_fold=[0,1,2,3],
    train=True,
    path='../input/fb3models/v112/',
    config_path='../input/fb3models/v112/config.pth',
    inference_weight=1.0,
    tokenizer=AutoTokenizer.from_pretrained('../input/fb3models/v112/tokenizer')
)

# Configs restored from the CFG.json stored next to the checkpoints; only the name,
# the `version` key understood by load_model and a few paths are overridden afterwards.

##### v2: Attention4Model variant (version '2'), folds forced to [0, 1, 2, 3].
v2_CFG = load_config('../input/fb3models/v2/', inference_weight=1.0)
v2_CFG.name = 'v2'
v2_CFG.version = '2'
v2_CFG.trn_fold = [0,1,2,3]

##### v21: WMPoolModel variant (version '21').
v21_CFG = load_config('../input/fb3models/v21/', inference_weight=1)
v21_CFG.name = 'v21'
v21_CFG.version = '21'

##### attention_fgm: CustomModel variant; its model config is a config.json under config/
##### rather than the default config.pth set by load_config.
attention_fgm_CFG = load_config('../input/fb3models/20221114-192943-deberta-v3-base/', inference_weight=1.0)
attention_fgm_CFG.name = 'attention_fgm'
attention_fgm_CFG.version = 'custom'
attention_fgm_CFG.config_path = '../input/fb3models/20221114-192943-deberta-v3-base/config/config.json'

# weighted_fgm: CustomModel config written out by hand instead of being restored with
# load_config. The first six entries are the inference-time settings (paths, tokenizer,
# name, version); the remaining entries look like the training hyper-parameters of the
# run — presumably kept for reference, confirm which of them are read at inference time.
weighted_fgm_CFG = Config(
    pretrained=False,
    path='../input/fb3models/20221115-061243-deberta-v3-base',
    config_path='../input/fb3models/20221115-061243-deberta-v3-base/config/config.json',
    tokenizer=AutoTokenizer.from_pretrained('../input/fb3models/20221115-061243-deberta-v3-base/tokenizer'),
    name='weighted_fgm',
    version='custom',
    train = True,
    debug = False,
    offline = False,
    models_path = 'FB3-models',
    epochs = 5,
    save_all_models = False,
    competition = 'FB3',
    apex = True,
    print_freq = 20,
    num_workers = 4,
    model = 'microsoft/deberta-v3-base', #If you want to train on the kaggle platform, v3-base is realistic. v3-large will time out.
    loss_func = 'SmoothL1', # 'SmoothL1', 'RMSE'
    gradient_checkpointing = True,
    scheduler = 'cosine',
    batch_scheduler = True,
    num_cycles = 0.5,
    num_warmup_steps = 0,
    encoder_lr = 2e-5,
    decoder_lr = 2e-5,
    min_lr = 1e-6,
    #Layer-Wise Learning Rate Decay
    llrd = True,
    layerwise_lr = 5e-5,
    layerwise_lr_decay = 0.9,
    layerwise_weight_decay = 0.01,
    layerwise_adam_epsilon = 1e-6,
    layerwise_use_bertadam = False,
    #pooling
    pooling = 'weightedlayer', # mean, max, min, attention, weightedlayer
    layer_start = 11,
    layers=None,
    #init_weight
    init_weight = 'normal', # normal, xavier_uniform, xavier_normal, kaiming_uniform, kaiming_normal, orthogonal
    #re-init
    reinit = True,
    reinit_n = 1,
    #adversarial
    fgm = True,
    awp = False,
    adv_lr = 1,
    adv_eps = 0.2,
    unscale = False,
    eps = 1e-6,
    betas = (0.9, 0.999),
    max_len = 512,
    weight_decay = 0.01,
    gradient_accumulation_steps = 1,
    max_grad_norm = 1000,
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions'],
    seed = 42,
    cv_seed = 42,
    n_fold = 4,
    trn_fold = [0,1,2,3],
    batch_size = 8,
    n_targets = 6,
    gpu_id = 0) 

# Remaining configs, all restored with load_config and then given a `name` (used as the
# column prefix of their OOF predictions) and the `version` key understood by load_model.
# Every 'custom' config also points config_path at config/config.json inside its folder.

# WeightedAttentionModel variant.
weighted_attention_CFG = load_config('../input/fb3models/weighted_attention_v3', inference_weight=1.0)
weighted_attention_CFG.name = 'weighted_attention'
weighted_attention_CFG.version = 'weighted-attention'

# MeanAttentionModel variant.
mean_attention_no_fgm_CFG = load_config('../input/fb3models/20221117-183420-deberta-v3-base-mean-attention-with-mask', inference_weight=1.0)
mean_attention_no_fgm_CFG.name = 'mean_attention_no_fgm'
mean_attention_no_fgm_CFG.version = 'mean-attention'

# CustomModel variants (inference_weight falls back to load_config's default of 1).
attention_large_fgm_CFG = load_config('../input/fb3models/20221118-164148-deberta-v3-large-attention_fgm')
attention_large_fgm_CFG.name = 'attention_large_fgm'
attention_large_fgm_CFG.version = 'custom'
attention_large_fgm_CFG.config_path = '../input/fb3models/20221118-164148-deberta-v3-large-attention_fgm/config/config.json'

attention_fgm_512_CFG = load_config('../input/fb3models/20221121-143655-deberta-v3-base-attention_fgm_512')
attention_fgm_512_CFG.name = 'attention_fgm_512'
attention_fgm_512_CFG.version = 'custom'
attention_fgm_512_CFG.config_path = '../input/fb3models/20221121-143655-deberta-v3-base-attention_fgm_512/config/config.json'

attention_fgm_768_CFG = load_config('../input/fb3models/20221120-072218-deberta-v3-base-attention_fgm')
attention_fgm_768_CFG.name = 'attention_fgm_768'
attention_fgm_768_CFG.version = 'custom'
attention_fgm_768_CFG.config_path = '../input/fb3models/20221120-072218-deberta-v3-base-attention_fgm/config/config.json'

weighted2last_fgm_512_CFG = load_config('../input/fb3models/20221124-060246-deberta-v3-base-weighted2last_fgm')
weighted2last_fgm_512_CFG.name = 'weighted2last_fgm_512'
weighted2last_fgm_512_CFG.version = 'custom'
weighted2last_fgm_512_CFG.config_path = '../input/fb3models/20221124-060246-deberta-v3-base-weighted2last_fgm/config/config.json'

weightedmean2last_fgm_512_CFG = load_config('../input/fb3models/20221124-160318-deberta-v3-base-weightedmean2last_fgm')
weightedmean2last_fgm_512_CFG.name = 'weightedmean2last_fgm_512'
weightedmean2last_fgm_512_CFG.version = 'custom'
weightedmean2last_fgm_512_CFG.config_path = '../input/fb3models/20221124-160318-deberta-v3-base-weightedmean2last_fgm/config/config.json'

# NOTE(review): the directory is a roberta-base run but the name says "large" — confirm
# the intended name before relying on the column prefix.
roberta_attention_fgm_CFG = load_config('../input/fb3models/20221121-173739-roberta-base')
roberta_attention_fgm_CFG.name = 'roberta_attention_large_fgm'
roberta_attention_fgm_CFG.version = 'custom'
roberta_attention_fgm_CFG.config_path = '../input/fb3models/20221121-173739-roberta-base/config/config.json'

##########
# Mark the configs as fine-tuned checkpoints (pretrained=False makes
# Inferencer.get_text_embedding load them through load_model).
# weighted_fgm_CFG is not listed here because it already passes pretrained=False
# to its constructor.
v112_CFG.pretrained = False
v114_CFG.pretrained = False
v116_CFG.pretrained = False
v21_CFG.pretrained = False
v2_CFG.pretrained = False
attention_fgm_CFG.pretrained = False
weighted_attention_CFG.pretrained = False
weightedpool_CFG.pretrained = False
mean_attention_no_fgm_CFG.pretrained=False
attention_large_fgm_CFG.pretrained=False
attention_fgm_512_CFG.pretrained = False
attention_fgm_768_CFG.pretrained = False
weighted2last_fgm_512_CFG.pretrained = False
weightedmean2last_fgm_512_CFG.pretrained = False
roberta_attention_fgm_CFG.pretrained = False

# Reset every ensemble weight to 1.0 (plain averaging in get_ensemble_oof).
# v112_CFG, weightedpool_CFG and mean_attention_no_fgm_CFG are not listed; they
# received inference_weight=1.0 when they were created above.
weighted_fgm_CFG.inference_weight = 1.0
v114_CFG.inference_weight = 1.0 
v116_CFG.inference_weight = 1.0
v2_CFG.inference_weight = 1.0 
v21_CFG.inference_weight = 1.0
attention_fgm_CFG.inference_weight = 1.0
weighted_attention_CFG.inference_weight = 1.0
attention_large_fgm_CFG.inference_weight = 1.0
attention_fgm_512_CFG.inference_weight = 1.0
attention_fgm_768_CFG.inference_weight = 1.0
weighted2last_fgm_512_CFG.inference_weight = 1.0
weightedmean2last_fgm_512_CFG.inference_weight = 1.0
roberta_attention_fgm_CFG.inference_weight = 1.0

In [15]:
# Two hand-picked model groups with one blend weight per model (5 and 6 entries
# respectively; each weight list sums to ~1.0).
# NOTE(review): neither list is consumed in this part of the notebook — confirm where
# the weights are applied and that their order still matches the model order.
strong_models = [
    weighted2last_fgm_512_CFG, weightedmean2last_fgm_512_CFG, attention_fgm_512_CFG, attention_fgm_768_CFG, attention_fgm_CFG,
]
strong_weights = [0.23303419, 0.28660832, 0.12286597, 0.26682334, 0.09066818]

weak_models = [
    v21_CFG, v112_CFG, mean_attention_no_fgm_CFG, weighted_attention_CFG, attention_large_fgm_CFG, roberta_attention_fgm_CFG,
]
weak_weights = [0.13595877, 0.20074556, 0.19400446, 0.0293235,0.35083864,0.08912907]

In [16]:
# for i, ftm_cfg in enumerate(fine_tuned_models_cfg):
#     print(ftm_cfg.path, opt_weights[i])
#     fine_tuned_models_cfg[i].inference_weight = opt_weights[i]

In [37]:
def train_stacking(oof_train, learners, save=False):
    """Cross-validate a level-2 stacker on the out-of-fold prediction table.

    For every fold in ``range(CFG.n_fold)`` the learners are fitted on the rows of
    the other folds and their validation predictions are averaged. The per-fold
    score is printed, and the stacked out-of-fold predictions of all folds are
    finally passed to ``get_result``.

    Args:
        oof_train: DataFrame with a ``fold`` column, the ``CFG.target_cols`` columns
            and the stacking features. Features are taken as every column from
            position 8 onwards (assumes the first eight columns are identifiers,
            targets and ``fold`` — TODO confirm against ``get_ensemble_oof``).
        learners: Iterable of unfitted estimators.
        save: When true, each fitted estimator is written with ``dump`` to
            ``<estimator class name, lower-cased>_fold<fold>.model``.

    Returns:
        None. Results are reported through ``print`` and ``get_result``.
    """

    def models_fit_predict(models, X_train, y_train, X_val, y_val, fold, save=False):
        """Fit every model on the training split and return the mean validation prediction."""
        preds = []
        for model in models:
            model_name = type(model).__name__.lower()
            # Everything except LinearRegression is wrapped so that one estimator
            # is fitted per target column.
            if not isinstance(model, LinearRegression):
                model = MultiOutputRegressor(model)
            model.fit(X_train, y_train)
            if save:
                # NOTE(review): two learners of the same class overwrite each other's file.
                dump(model, f'{model_name}_fold{fold}.model')
            preds.append(model.predict(X_val))
        return np.mean(preds, axis=0)

    feature_cols = oof_train.columns[8:]
    pred_col_names = [f'pred_{c}' for c in CFG.target_cols]
    # Collect per-fold frames and concatenate once instead of growing a DataFrame
    # inside the loop.
    fold_frames = []

    for fold in range(CFG.n_fold):
        print(f'\nFold {fold+1}/{CFG.n_fold}')

        is_val = oof_train['fold'] == fold
        train_part = oof_train[~is_val]
        val_part = oof_train[is_val]

        X_train = train_part[feature_cols].values
        y_train = train_part[CFG.target_cols].values
        X_val = val_part[feature_cols].values
        y_val = val_part[CFG.target_cols].values

        pred_val = models_fit_predict(
            learners,
            X_train, y_train, X_val, y_val,
            fold, save=save,
        )

        val_fold = val_part.reset_index(drop=True)
        val_fold[pred_col_names] = pred_val
        fold_frames.append(val_fold)

        oof_score, _ = mc_rmse(y_val, pred_val)
        print(f'Score: {oof_score}')
        print('#'*50)

    oof_oof_train = pd.concat(fold_frames) if fold_frames else pd.DataFrame()
    get_result(CFG, oof_oof_train)

# Default level-2 learners passed to train_stacking; later cells rebind `learners`.
# NOTE(review): `normalize=` was removed from scikit-learn's linear models in 1.2 —
# confirm the pinned scikit-learn version before re-running.
learners = [
    Ridge(alpha=48.0, random_state=CFG.seed), 
    #BayesianRidge(),
    #Lasso(alpha=1.0, random_state=CFG.seed),
    LinearRegression(normalize=True, positive=True),
    #SVR(kernel='linear', gamma='auto'),
    #LGBMRegressor(**lgbm_params)
]

In [18]:
#######################################
def get_ensemble_oof(cfgs, return_oof_train=False, verbose=True):
    """Blend the out-of-fold predictions of several models and score the blend.

    Each model's OOF frame is loaded from its ``cfg.path`` (CSV for the runs listed
    in ``csv_oof_paths`` below, pickle otherwise). The ``pred_*`` columns are scaled
    by ``cfg.inference_weight``, summed per ``text_id`` and divided by the total
    weight. In parallel, a wide table is built by left-joining every model's
    unweighted predictions onto ``train`` with ``<cfg.name>_pred_<target>`` columns.

    Side effect: ``cfg.target_cols`` is overwritten on every config in ``cfgs``.

    Args:
        cfgs: Non-empty sequence of model configs.
        return_oof_train: When true, print the blend score and return the wide
            per-model prediction table instead of the score.
        verbose: When true, print each model's path and individual result.

    Returns:
        The wide DataFrame when ``return_oof_train`` is true, otherwise the tuple
        ``(get_result(CFG, blend, verbose=0), cfgs)``.

    Raises:
        ValueError: If ``cfgs`` is empty.
    """
    if len(cfgs) == 0:
        raise ValueError('cfgs must contain at least one model config')

    target_names = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    pred_cols = [f'pred_{col}' for col in target_names]
    # Runs whose OOF predictions were saved as CSV rather than pickle.
    csv_oof_paths = [
        attention_fgm_CFG.path, attention_fgm_768_CFG.path, attention_fgm_512_CFG.path,
        attention_large_fgm_CFG.path, roberta_attention_fgm_CFG.path,
        weightedmean2last_fgm_512_CFG.path, weighted2last_fgm_512_CFG.path,
    ]

    # Collected in a list and concatenated once: DataFrame.append was removed in
    # pandas 2.0 and repeated appends copy the whole frame every time.
    weighted_frames = []
    oof_train = None
    total_weight = 0
    for cfg in cfgs:
        cfg.target_cols = list(target_names)

        infer_ = Inferencer(cfg=cfg, inference_weight=cfg.inference_weight)
        file_type = 'csv' if cfg.path in csv_oof_paths else 'pkl'
        oof_df = infer_.get_oof_df(file_type)
        total_weight += infer_.cfg.inference_weight

        if verbose:
            print(cfg.path)
            get_result(cfg, oof_df)
            print('\n')

        weighted_df = oof_df.copy()
        weighted_df[pred_cols] = weighted_df[pred_cols] * infer_.cfg.inference_weight
        weighted_frames.append(weighted_df)

        # Unweighted predictions, prefixed with the model name, for the wide table.
        model_preds = oof_df[['text_id'] + pred_cols].copy()
        model_preds.columns = ['text_id'] + [cfg.name + '_' + col for col in pred_cols]

        if oof_train is None:
            oof_train = (
                train
                .merge(model_preds, on=['text_id'], how='left')
                .drop(columns=['full_text']))
        else:
            oof_train = oof_train.merge(model_preds, on=['text_id'], how='left')
        del infer_; gc.collect()

    oof_dfs = pd.concat(weighted_frames)
    oof_dfs_mean = oof_dfs.groupby('text_id')[pred_cols].sum() / total_weight
    oof_dfs_mean = oof_dfs_mean.join(train.set_index('text_id'))

    if return_oof_train:
        get_result(CFG, oof_dfs_mean, verbose=1)
        return oof_train
    return get_result(CFG, oof_dfs_mean, verbose=0), cfgs

In [19]:
# The six target names, used below to build the '<model>_pred_<target>' column names.
target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

## all models

In [20]:
# Full candidate pool for the ensemble (11 configs).
all_models = [
    attention_large_fgm_CFG, weighted2last_fgm_512_CFG, weightedmean2last_fgm_512_CFG, attention_fgm_512_CFG, attention_fgm_768_CFG, 
    v21_CFG, v112_CFG, mean_attention_no_fgm_CFG, v2_CFG, weightedpool_CFG,
    attention_fgm_CFG
]
# NOTE(review): only 10 weights for the 11 models above, and the list is not used in
# this cell — confirm which model list these weights (sum ~1.0) were fitted for.
optimal_weights = [
    0.353356890459364,
    0.1660777385159011,
    0.10247349823321557,
    0.05653710247349824,
    0.06007067137809188,
    0.049469964664310966,
    0.08127208480565372,
    0.06713780918727916,
    0.03886925795053004,
    0.02473498233215548
]

# Wide table with one '<model>_pred_<target>' column per model and target
# (return_oof_train=True); also prints each model's individual OOF score.
all_models_oof_train = get_ensemble_oof(all_models, True)

../input/fb3models/20221118-164148-deberta-v3-large-attention_fgm
score: 0.45206291  scores: [0.48157374223152627, 0.4452090627651757, 0.4154085184288849, 0.4536476489584571, 0.47240178087223106, 0.44413673294015477]


../input/fb3models/20221124-060246-deberta-v3-base-weighted2last_fgm
score: 0.45303342  scores: [0.48389889660631685, 0.44678944048963176, 0.41465519997691047, 0.45360604538329913, 0.4717536101123449, 0.4474973036716195]


../input/fb3models/20221124-160318-deberta-v3-base-weightedmean2last_fgm
score: 0.45276573  scores: [0.48130576936482516, 0.4463892153474189, 0.4154102670831803, 0.4533288332620337, 0.471398800994229, 0.4487615040883342]


../input/fb3models/20221121-143655-deberta-v3-base-attention_fgm_512
score: 0.45415400  scores: [0.48439478507550315, 0.4476619277192037, 0.41633279856096794, 0.4540781761924119, 0.4753848249763217, 0.4470715017077387]


../input/fb3models/20221120-072218-deberta-v3-base-attention_fgm
score: 0.45289948  scores: [0.4811456435689267, 0

### find correlation between models

In [21]:
from sklearn.cluster import KMeans, DBSCAN

In [22]:
# Build one row vector per model: the OOF predictions of all targets laid end to end
# (each target column is reshaped to one row, the rows are stacked and then flattened
# into a single row). The rows are concatenated into an (n_models, n_targets * n_rows)
# matrix that is clustered in the next cell.
all_model_preds = []
for model in all_models:
    model_preds = []
    model_name = model.name
    print(model_name)
    for target in target_cols:
        model_target_pred = all_models_oof_train[f'{model_name}_pred_{target}'].values.reshape(1,-1)
        model_preds.extend(model_target_pred)
    model_preds = np.array(model_preds).reshape(1, -1)
    all_model_preds.append(model_preds)
all_model_preds = np.concatenate(all_model_preds)

attention_large_fgm
weighted2last_fgm_512
weightedmean2last_fgm_512
attention_fgm_512
attention_fgm_768
v21
v112
mean_attention_no_fgm
v2
weightedpool
attention_fgm


In [23]:
# Cluster the models by the L1 distance between their flattened OOF prediction vectors.
# eps=2048 is on the scale of those summed absolute differences — presumably tuned by
# hand, confirm. In scikit-learn's DBSCAN a label of -1 marks a sample as noise, i.e.
# a model that joined no cluster.
dbs = DBSCAN(eps=2048, min_samples=2, metric='l1')
dbs_clusters = dbs.fit_predict(all_model_preds)
dbs_clusters

array([-1,  0,  0,  0,  0, -1, -1, -1, -1, -1,  0])

In [24]:
# Same 11-model pool as defined above; repeated here so the distance analysis below
# can be re-run on its own.
all_models = [
    attention_large_fgm_CFG, weighted2last_fgm_512_CFG, weightedmean2last_fgm_512_CFG, attention_fgm_512_CFG, attention_fgm_768_CFG, 
    v21_CFG, v112_CFG, mean_attention_no_fgm_CFG, v2_CFG, weightedpool_CFG,
    attention_fgm_CFG
]

In [25]:
from sklearn.metrics.pairwise import cosine_similarity, manhattan_distances

# Pairwise dissimilarity between models: for every ordered pair, the Manhattan (L1)
# distance between their OOF predictions is computed per target and averaged over the
# targets. Despite the historical name `sim_mat`, LARGER values mean the two models
# disagree MORE. The log messages previously said "Cosine similarity" although
# manhattan_distances is what is computed; they now name the actual metric.
sim_mat = {}
for model1 in all_models:
    model1_name = model1.name
    sim_mat[model1_name] = {}
    for model2 in all_models:
        sims = []
        model2_name = model2.name
        if model2_name == model1_name: continue
        for target in target_cols:
            model1_target_pred = all_models_oof_train[f'{model1_name}_pred_{target}'].values.reshape(1,-1)
            model2_target_pred = all_models_oof_train[f'{model2_name}_pred_{target}'].values.reshape(1,-1)
            # 1x1 array holding the L1 distance between the two prediction vectors.
            sim = manhattan_distances(model1_target_pred, model2_target_pred)
            print(f'Manhattan distance between {model1_name} and {model2_name} on {target}: {sim}')
            sims.append(sim)
        model1_model2_sim = np.mean(sims)
        sim_mat[model1_name][model2_name] = model1_model2_sim
        print(f'Mean Manhattan distance between {model1_name} and {model2_name}: {model1_model2_sim:.5f}')
        print()

Cosine similarity between attention_large_fgm and weighted2last_fgm_512 on cohesion: [[447.49531734]]
Cosine similarity between attention_large_fgm and weighted2last_fgm_512 on syntax: [[373.54864633]]
Cosine similarity between attention_large_fgm and weighted2last_fgm_512 on vocabulary: [[376.58544242]]
Cosine similarity between attention_large_fgm and weighted2last_fgm_512 on phraseology: [[389.67531335]]
Cosine similarity between attention_large_fgm and weighted2last_fgm_512 on grammar: [[443.73869109]]
Cosine similarity between attention_large_fgm and weighted2last_fgm_512 on conventions: [[424.68960422]]
Mean cosine similarity between attention_large_fgm and weighted2last_fgm_512: 409.28884

Cosine similarity between attention_large_fgm and weightedmean2last_fgm_512 on cohesion: [[431.09817967]]
Cosine similarity between attention_large_fgm and weightedmean2last_fgm_512 on syntax: [[381.97617685]]
Cosine similarity between attention_large_fgm and weightedmean2last_fgm_512 on vocab

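1. Mỗi bước xét lấy 1 model nếu như mean khoảng cách cao hơn mean khoảng cách trước đó (English: at each step, consider adding one model if its mean distance is higher than the previous mean distance.)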

In [26]:
# Greedy chain selection over the pairwise distance table `sim_mat`.
# Starting from every model in turn, repeatedly add the not-yet-selected model with the
# largest distance to the CURRENT model, and stop once the best available distance is
# no greater than `threshold`. One (last accepted distance, selected models) tuple is
# stored per starting model.
result = []

for model1 in sim_mat.keys():
    #model1 = 'attention_large_fgm'
    models = [model1]
    # NOTE(review): `dists` is never appended to, so the "mean" computed below is just
    # the single candidate distance. Confirm whether accumulating the accepted distances
    # was intended; changing it would alter the groups hard-coded in later cells.
    dists = []
    global_best = 0
    threshold = 300

    while True:
        cur_best = 0
        cur_models = models
        cur_best_model = None
        # Pick the unselected candidate with the largest expected distance.
        for model2, model2_dist in sim_mat[model1].items():
            if model2 in models:
                continue
            expect_mean_dist = np.mean([model2_dist, *dists])
            if cur_best < expect_mean_dist:
                cur_best = expect_mean_dist
                cur_models = [*models, model2]
                cur_best_model = model2
        # Stop when no candidate exceeds the threshold; this also covers the case where
        # every model is already selected (cur_best stays 0).
        if cur_best <= threshold:
            result.append((global_best, models))
            print()
            print(f'Mean distance decreased by {threshold}. Stopping.')
            print(f'Final mean distance: {global_best}. Models: {models}')
            print()
            break
        print(f'Adding {cur_best_model} to stack. Mean distance: {cur_best:.4f}')
        global_best = cur_best
        models = cur_models
        # The next round measures distances from the model that was just added;
        # rebinding the loop variable does not affect the outer iteration.
        model1 = cur_best_model

Adding weightedpool to stack. Mean distance: 540.8507
Adding attention_fgm_512 to stack. Mean distance: 514.1399
Adding v2 to stack. Mean distance: 435.1400
Adding mean_attention_no_fgm to stack. Mean distance: 424.3514
Adding v21 to stack. Mean distance: 409.4303
Adding v112 to stack. Mean distance: 418.5441
Adding weighted2last_fgm_512 to stack. Mean distance: 372.3051
Adding attention_fgm to stack. Mean distance: 310.0085
Adding weightedmean2last_fgm_512 to stack. Mean distance: 311.1296
Adding attention_fgm_768 to stack. Mean distance: 300.1116

Mean distance decreased by 300. Stopping.
Final mean distance: 300.1116051774349. Models: ['attention_large_fgm', 'weightedpool', 'attention_fgm_512', 'v2', 'mean_attention_no_fgm', 'v21', 'v112', 'weighted2last_fgm_512', 'attention_fgm', 'weightedmean2last_fgm_512', 'attention_fgm_768']

Adding weightedpool to stack. Mean distance: 497.0153
Adding attention_large_fgm to stack. Mean distance: 540.8507
Adding v21 to stack. Mean distance: 451

In [27]:
# Score the equal-weight blend of every subset found above, ordered by the subset's
# final distance. Names are mapped back to config objects through `cfg.name`.
for s, m in sorted(result, key=lambda tup: tup[0]):
    cfgs = [cfg for cfg in all_models if cfg.name in m]
    # return_oof_train=False, verbose=False -> (score, cfgs); keep only the score.
    score = get_ensemble_oof(cfgs, False, False)[0]
    print(f'Distance: {s} with {len(m)} models {m} with score {score:.5f}')

Distance: 300.1116051774349 with 11 models ['attention_large_fgm', 'weightedpool', 'attention_fgm_512', 'v2', 'mean_attention_no_fgm', 'v21', 'v112', 'weighted2last_fgm_512', 'attention_fgm', 'weightedmean2last_fgm_512', 'attention_fgm_768'] with score 0.44664
Distance: 300.1116051774349 with 11 models ['v112', 'weightedpool', 'attention_large_fgm', 'v21', 'v2', 'attention_fgm_512', 'mean_attention_no_fgm', 'weighted2last_fgm_512', 'attention_fgm', 'weightedmean2last_fgm_512', 'attention_fgm_768'] with score 0.44664
Distance: 300.1116051774349 with 11 models ['mean_attention_no_fgm', 'weightedpool', 'attention_large_fgm', 'v21', 'v2', 'attention_fgm_512', 'v112', 'weighted2last_fgm_512', 'attention_fgm', 'weightedmean2last_fgm_512', 'attention_fgm_768'] with score 0.44664
Distance: 300.1116051774349 with 11 models ['v2', 'weightedpool', 'attention_large_fgm', 'v21', 'v112', 'attention_fgm_512', 'mean_attention_no_fgm', 'weighted2last_fgm_512', 'attention_fgm', 'weightedmean2last_fgm_51

In [28]:
# Blend score of the two-model subset only (the first element of the returned tuple).
get_ensemble_oof([weightedmean2last_fgm_512_CFG, attention_fgm_768_CFG], False, False)[0]

0.4501521204120645

In [29]:
# Distance: 435.14002636145824 with 6 models ['mean_attention_no_fgm', 'weightedpool', 'attention_large_fgm', 'v21', 'v2', 'attention_fgm_512'] with score 0.44673
# Distance: 372.3051254451275 with 3 models ['attention_fgm', 'v112', 'weighted2last_fgm_512'] with score 0.44876
# Distance: 300.1116051774349 with 2 models [weightedmean2last_fgm_512, attention_fgm_768] with score 0.45015

In [30]:
# Three disjoint model groups (6, 3 and 2 configs), matching the subsets recorded in
# the comment cell above; each group is stacked separately below.
group1 = [mean_attention_no_fgm_CFG, weightedpool_CFG, attention_large_fgm_CFG, v21_CFG, v2_CFG, attention_fgm_512_CFG]
group2 = [attention_fgm_CFG, v112_CFG, weighted2last_fgm_512_CFG]
group3 = [weightedmean2last_fgm_512_CFG, attention_fgm_768_CFG]

In [31]:
# Stack group 1: build its wide OOF table (this also prints the plain-blend score),
# then cross-validate a single Ridge stacker on it. Rebinds the global `learners`.
# NOTE(review): `normalize=` was removed from Ridge in scikit-learn 1.2 — confirm the
# pinned version before re-running.
group1_oof_train = get_ensemble_oof(group1, True, False)
learners = [
    Ridge(alpha=0.0125, tol=1e-4, fit_intercept=True,normalize=True,random_state=CFG.seed),
]
train_stacking(group1_oof_train, learners, save=False)

score: 0.44672897  scores: [0.4767729692779155, 0.4410762828440261, 0.4094831656155451, 0.4479551640355943, 0.46539906358105076, 0.4396871892407088]

Fold 1/15
Score: 0.45870317724899023
##################################################

Fold 2/15
Score: 0.44234899305917225
##################################################

Fold 3/15
Score: 0.44105602887892426
##################################################

Fold 4/15
Score: 0.45117547480859604
##################################################

Fold 5/15
Score: 0.4406279962028159
##################################################

Fold 6/15
Score: 0.4527824419978515
##################################################

Fold 7/15
Score: 0.4361227610530469
##################################################

Fold 8/15
Score: 0.42857617867629977
##################################################

Fold 9/15
Score: 0.4556767472695966
##################################################

Fold 10/15
Score: 0.4548460597965906
################

In [32]:
# Stack group 2 with the same Ridge settings as group 1 (no explicit `tol` here).
# Rebinds the global `learners`.
# NOTE(review): `normalize=` was removed from Ridge in scikit-learn 1.2 — confirm the
# pinned version before re-running.
group2_oof_train = get_ensemble_oof(group2, True, False)
learners = [
    Ridge(alpha=0.0125, fit_intercept=True,normalize=True,random_state=CFG.seed),
]
train_stacking(group2_oof_train, learners, save=False)

score: 0.44876347  scores: [0.47856353179776495, 0.4431374902723514, 0.41045984537779256, 0.4497649613139139, 0.46755120114985516, 0.44310376124802897]

Fold 1/15
Score: 0.45855608189920255
##################################################

Fold 2/15
Score: 0.44299971585478737
##################################################

Fold 3/15
Score: 0.4443066022529081
##################################################

Fold 4/15
Score: 0.4589972675350678
##################################################

Fold 5/15
Score: 0.4464481348270668
##################################################

Fold 6/15
Score: 0.45771956344486564
##################################################

Fold 7/15
Score: 0.43957911463537075
##################################################

Fold 8/15
Score: 0.4314743963579654
##################################################

Fold 9/15
Score: 0.4540069431269629
##################################################

Fold 10/15
Score: 0.4587212889097298
##############

In [33]:
# Stack group 3 with the same Ridge settings as group 2. Rebinds the global `learners`.
# NOTE(review): `normalize=` was removed from Ridge in scikit-learn 1.2 — confirm the
# pinned version before re-running.
group3_oof_train = get_ensemble_oof(group3, True, False)
learners = [
    Ridge(alpha=0.0125, fit_intercept=True,normalize=True,random_state=CFG.seed),
]
train_stacking(group3_oof_train, learners, save=False)

score: 0.45015212  scores: [0.4783895750702563, 0.4441981438590136, 0.4132218948555447, 0.45136487125318286, 0.4692148522457962, 0.4445233851885932]

Fold 1/15
Score: 0.4636697304428999
##################################################

Fold 2/15
Score: 0.44333697577960907
##################################################

Fold 3/15
Score: 0.44585905406795673
##################################################

Fold 4/15
Score: 0.46042443017276163
##################################################

Fold 5/15
Score: 0.4449849373703035
##################################################

Fold 6/15
Score: 0.45775639148066166
##################################################

Fold 7/15
Score: 0.44134489833282525
##################################################

Fold 8/15
Score: 0.433098556562234
##################################################

Fold 9/15
Score: 0.4530484851798466
##################################################

Fold 10/15
Score: 0.4609800425472033
#################

In [41]:
# Sanity check on the ensemble size.
# NOTE(review): this cell's execution count (41) is later than that of the cell below
# (40), so the displayed value presumably reflects the 10-model list defined there —
# re-run in order to confirm.
len(all_models)

10

In [40]:
# Final stacking run: the 10-model pool (attention_fgm_CFG is dropped compared with the
# earlier 11-model list) is stacked with Ridge and the fitted per-fold stackers are
# saved to disk (save=True).
all_models = [
    attention_large_fgm_CFG, weighted2last_fgm_512_CFG, weightedmean2last_fgm_512_CFG, attention_fgm_512_CFG, attention_fgm_768_CFG, 
    v21_CFG, v112_CFG, mean_attention_no_fgm_CFG, v2_CFG, weightedpool_CFG           
]
# Disabled: hand-tuned blend weights, one per model in the list above.
# optimal_weights = [
#     0.353356890459364,
#     0.1660777385159011,
#     0.10247349823321557,
#     0.05653710247349824,
#     0.06007067137809188,
#     0.049469964664310966,
#     0.08127208480565372,
#     0.06713780918727916,
#     0.03886925795053004,
#     0.02473498233215548
# ]
all_oof_train = get_ensemble_oof(all_models, True, False)
# Only read by the disabled block below, so it currently has no effect.
use_weighted_mean_prediction = True

# Disabled: add the weighted-mean prediction per target as extra stacking features.
# if use_weighted_mean_prediction:
#     for i, ftm_cfg in enumerate(all_models):
#         print(ftm_cfg.path, optimal_weights[i])
#         all_models[i].inference_weight = optimal_weights[i]

#     for target in tqdm(['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']):
#         for cfg in all_models:
#             if f'pred_{target}' in all_oof_train.columns:
#                 all_oof_train[f'pred_{target}'] += all_oof_train[cfg.name + '_' + f'pred_{target}'] * cfg.inference_weight
#             else:
#                 all_oof_train[f'pred_{target}'] = all_oof_train[cfg.name + '_' + f'pred_{target}'] * cfg.inference_weight
# NOTE(review): `normalize=` was removed from Ridge in scikit-learn 1.2 — confirm the
# pinned version before re-running.
learners = [
    Ridge(alpha=0.0125, fit_intercept=True,normalize=True,random_state=CFG.seed),
]
train_stacking(all_oof_train, learners, save=True)

score: 0.44640312  scores: [0.47580667377642244, 0.44090222277463526, 0.40901897711244045, 0.4477116762081165, 0.4649767232192434, 0.44000242876507945]

Fold 1/15
Score: 0.45826494901098097
##################################################

Fold 2/15
Score: 0.4412887931729279
##################################################

Fold 3/15
Score: 0.4408872393412681
##################################################

Fold 4/15
Score: 0.4530449403370144
##################################################

Fold 5/15
Score: 0.4417738393814558
##################################################

Fold 6/15
Score: 0.4534210967506897
##################################################

Fold 7/15
Score: 0.43637034407650366
##################################################

Fold 8/15
Score: 0.42836569611584413
##################################################

Fold 9/15
Score: 0.45469522925896283
##################################################

Fold 10/15
Score: 0.4560703214451878
##############

## subset selection

In [130]:
#######################################
def get_ensemble_oof(cfgs):
    """Score the weighted-average blend of several models' OOF predictions.

    For every config in ``cfgs`` the out-of-fold frame is loaded through
    ``Inferencer.get_oof_df``, its ``pred_*`` columns are multiplied by the
    model's ``inference_weight``, and the weighted predictions are summed per
    ``text_id`` and divided by the total weight.  The blended predictions are
    joined with ``train`` (indexed by ``text_id``) and passed to ``get_result``.

    Args:
        cfgs: iterable of model config objects.  Each must expose ``path``,
            ``inference_weight`` and accept a ``target_cols`` attribute
            (which this function overwrites with the six target names).

    Returns:
        Tuple ``(result, cfgs)`` where ``result`` is whatever
        ``get_result(CFG, blended_oof, verbose=0)`` returns and ``cfgs`` is the
        argument, passed back unchanged so callers can pair scores with the
        model set that produced them.

    Raises:
        ValueError: if ``cfgs`` yields no configs.

    Notes:
        * The previous implementation also merged every model's predictions
          into a wide ``oof_train`` frame that was never returned or read; that
          dead work has been removed.
        * The frame returned by ``get_oof_df`` is no longer modified in place,
          so repeated calls cannot compound the weights should ``get_oof_df``
          ever hand back a shared object (not verifiable from this cell).
    """
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    pred_cols = [f'pred_{col}' for col in target_cols]

    # Models whose OOF predictions were stored as CSV; everything else is a pickle.
    csv_oof_paths = [
        attention_fgm_CFG.path,
        attention_fgm_768_CFG.path,
        attention_fgm_512_CFG.path,
        attention_large_fgm_CFG.path,
        roberta_attention_fgm_CFG.path,
        weightedmean2last_fgm_512_CFG.path,
        weighted2last_fgm_512_CFG.path,
    ]

    weighted_oofs = []
    total_weight = 0
    for cfg in cfgs:
        cfg.target_cols = list(target_cols)

        infer_ = Inferencer(cfg=cfg, inference_weight=cfg.inference_weight)
        file_type = 'csv' if cfg.path in csv_oof_paths else 'pkl'
        oof_df = infer_.get_oof_df(file_type)

        weight = infer_.cfg.inference_weight
        total_weight += weight

        # Work on a copy so the loaded frame itself is left untouched.
        weighted_oof = oof_df.copy()
        weighted_oof[pred_cols] = weighted_oof[pred_cols] * weight
        weighted_oofs.append(weighted_oof)

        del infer_; gc.collect()

    if not weighted_oofs:
        raise ValueError('get_ensemble_oof requires at least one model config')

    # One concat instead of DataFrame.append in the loop (removed in pandas 2.0,
    # and quadratic in the number of models).
    oof_dfs = pd.concat(weighted_oofs)

    # Weighted mean per essay: sum of weighted predictions / sum of weights.
    oof_dfs_mean = oof_dfs.groupby('text_id')[pred_cols].sum() / total_weight
    oof_dfs_mean = oof_dfs_mean.join(train.set_index('text_id'))

    return get_result(CFG, oof_dfs_mean, verbose=0), cfgs

In [131]:
def greedy_forward_selection(first_model, candidates, score_fn):
    """Greedy forward selection of an ensemble, seeded with ``first_model``.

    Starting from ``[first_model]``, each round scores every not-yet-selected
    candidate added to the current set and keeps the one with the lowest score.
    The search stops as soon as the best candidate would make the score
    strictly worse, or when no candidates remain.

    Unlike the previous inline loop, the candidate that *worsens* the score is
    not added to the returned set, and the returned score is the score of the
    returned set (the old code reported the worse score, or 0 when nothing
    could be added).

    Args:
        first_model: object with a ``name`` attribute used as the seed.
        candidates: sequence of objects with a ``name`` attribute; entries
            whose name is already selected are skipped.
        score_fn: callable taking a list of models and returning a score
            where lower is better.

    Returns:
        Tuple ``(selected_models, best_score)``.
    """
    selected = [first_model]
    best_score = score_fn(selected)

    while True:
        chosen_names = {model.name for model in selected}
        remaining = [model for model in candidates if model.name not in chosen_names]
        if not remaining:
            break

        trial_scores = [score_fn(selected + [model]) for model in remaining]
        best_idx = int(np.argmin(trial_scores))
        # Same tie rule as before: an equal score still extends the set.
        if best_score < trial_scores[best_idx]:
            break

        selected.append(remaining[best_idx])
        best_score = trial_scores[best_idx]

    return selected, best_score


# Flip to True to run the (slow) greedy subset search over all candidate models.
model_selection = False

if model_selection:
    fine_tuned_models_cfg = [
        attention_fgm_CFG, 
        v21_CFG, 
        v112_CFG, 
        mean_attention_no_fgm_CFG, 
        weighted_attention_CFG, 
        attention_large_fgm_CFG, 
        attention_fgm_768_CFG, 
        attention_fgm_512_CFG,
        weightedmean2last_fgm_512_CFG,
        weighted2last_fgm_512_CFG,

        v2_CFG, 
        v116_CFG, 
        weightedpool_CFG, 
        weighted_fgm_CFG
    ]

    # Print the models that were trained on fewer than 4 folds.
    for c in fine_tuned_models_cfg:
        if len(c.trn_fold) < 4:
            print(c.name)

    # Run one greedy search per possible seed model and log the resulting set.
    for i, first_model in enumerate(tqdm(fine_tuned_models_cfg)):
        features, best_score = greedy_forward_selection(
            first_model,
            fine_tuned_models_cfg,
            lambda models: get_ensemble_oof(models)[0],
        )
        gc.collect(); torch.cuda.empty_cache()

        logger.info(f'Interation {i+1}:')
        logger.info(f'model_set={[c.name for c in features]} \nbest_score={best_score}')
        logger.info('#'*50)
        logger.info('\n')

  0%|          | 0/14 [00:00<?, ?it/s]

Interation 1:
model_set=['attention_fgm', 'attention_large_fgm', 'v112', 'weightedmean2last_fgm_512', 'mean_attention_no_fgm', 'v2', 'weighted2last_fgm_512'] 
best_score=0.4463336717953639
##################################################


Interation 2:
model_set=['v21', 'attention_large_fgm', 'v112', 'mean_attention_no_fgm', 'weightedmean2last_fgm_512', 'v2', 'attention_fgm_512', 'v116'] 
best_score=0.4462124748982587
##################################################


Interation 3:
model_set=['v112', 'attention_large_fgm', 'weightedmean2last_fgm_512', 'mean_attention_no_fgm', 'v2', 'attention_fgm_768', 'v21'] 
best_score=0.44612802576036886
##################################################


Interation 4:
model_set=['mean_attention_no_fgm', 'attention_large_fgm', 'weightedmean2last_fgm_512', 'v112', 'v2', 'attention_fgm_768', 'v21'] 
best_score=0.44612802576036886
##################################################


Interation 5:
model_set=['weighted_attention', 'attention_large_

In [126]:
# model_set=['attention_fgm_512', 'attention_large_fgm', 'v112', 'mean_attention_no_fgm', 'v2', 'weightedmean2last_fgm_512', 'v21'] 
# best_score=0.44609399605554056



2

## train

In [57]:
# features_df = pd.read_csv('../input/fb3-feature-engineering/train_fe.csv')
# feature_cols = [col for col in features_df.columns if col not in ['full_text', 'text_id'] + CFG.target_cols]

# features = features_df.set_index('text_id')[feature_cols]

# pred_cols = [col for col in oof_train.columns if col not in ['text_id', 'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions', 'fold']]
# oof_train[pred_cols] = (oof_train[pred_cols] - oof_train[pred_cols].mean().values) / oof_train[pred_cols].std().values
# oof_train = oof_train.set_index('text_id').join(features)

# for col in oof_train.columns[8:]:
#     oof_train[col] = (oof_train[col] - oof_train[col].mean())/(oof_train[col].std())
#     #print(oof_train[col].mean(), oof_train[col].std())

In [31]:
# Per-fold stacking scores; the fold loop below appends one entry per fold.
oof_scores = []
# Accumulates each fold's validation rows together with their stacked
# predictions; the fold loop below concatenates onto it.
oof_oof_train = pd.DataFrame()
def models_fit_predict(models, X_train, y_train, X_val, y_val, fold, save=False):
    """Fit every learner on the training split and average their validation predictions.

    Args:
        models: iterable of unfitted estimators.  An estimator whose exact type
            is ``LinearRegression`` is fitted as-is; any other estimator is
            wrapped in ``MultiOutputRegressor`` first.
        X_train, y_train: training features and targets passed to ``fit``.
        X_val: validation features passed to ``predict``.
        y_val: accepted for signature symmetry; not used here.
        fold: fold index, only used in the saved file name.
        save: when True, each fitted model is written with ``dump`` to
            ``{model_name}_weak_fold{fold}.model``.

    Returns:
        Element-wise mean (``np.mean(..., axis=0)``) of the learners' predictions.
    """
    fold_predictions = []
    for estimator in models:
        # File name is derived from the unwrapped estimator's class name.
        estimator_name = type(estimator).__name__.lower()

        if type(estimator) == LinearRegression:
            fitted = estimator
        else:
            fitted = MultiOutputRegressor(estimator)
        fitted.fit(X_train, y_train)

        if save:
            dump(fitted, f'{estimator_name}_weak_fold{fold}.model')

        fold_predictions.append(fitted.predict(X_val))

    return np.mean(fold_predictions, axis=0)

# Everything from column position 8 onward is fed to the stacker.
# NOTE(review): the offset 8 is positional — presumably it skips text_id, the
# six targets and `fold`; confirm against the layout of `oof_train`.
stack_feature_cols = oof_train.columns[8:]
stack_pred_cols = [f'pred_{c}' for c in CFG.target_cols]

for fold in range(CFG.n_fold):
    print(f'\nFold {fold+1}/{CFG.n_fold}')

    # Split the OOF table into this fold's validation rows and the rest.
    is_val_row = oof_train['fold'] == fold
    fit_rows = oof_train[~is_val_row]
    val_rows = oof_train[is_val_row]

    X_train = fit_rows[stack_feature_cols].values
    y_train = fit_rows[CFG.target_cols].values
    X_val = val_rows[stack_feature_cols].values
    y_val = val_rows[CFG.target_cols].values

    # Second-level learners; their predictions are averaged by models_fit_predict.
    # NOTE(review): `normalize=` was removed from LinearRegression in
    # scikit-learn 1.2 — confirm the pinned version before re-running.
    fold_learners = [
        Ridge(alpha=48.0, random_state=CFG.seed),
        LinearRegression(normalize=True, positive=True),
    ]
    pred_val = models_fit_predict(
        fold_learners,
        X_train, y_train, X_val, y_val,
        fold, save=False,
    )

    # Attach the stacked predictions to the validation rows and accumulate them.
    val_fold = val_rows.reset_index(drop=True)
    val_fold[stack_pred_cols] = pred_val
    oof_oof_train = pd.concat([oof_oof_train, val_fold])

    oof_score, _ = mc_rmse(y_val, pred_val)
    oof_scores.append(oof_score)
    print(f'Score: {oof_score}')
    print('#'*50)

# Overall score of the stacked OOF predictions (last expression -> cell output).
get_result(CFG, oof_oof_train)


Fold 1/4
Score: 0.44758374702096937
##################################################

Fold 2/4
Score: 0.45311470919605995
##################################################

Fold 3/4
Score: 0.45800629488427996
##################################################

Fold 4/4
Score: 0.4432123703793988
##################################################
score: 0.45053400  scores: [0.4787650786478345, 0.44443982088474465, 0.4130861316193527, 0.4520671715257306, 0.47022854514651, 0.4446172264203405]


0.4505339957074188

In [None]:
from datetime import datetime

# Record today's date (YYYYMMDD) and the mean per-fold stacking score in info.txt.
run_stamp = datetime.today().strftime("%Y%m%d")
mean_oof_score = np.mean(oof_scores)
with open('info.txt', 'w') as info_file:
    info_file.write(f'{run_stamp}-{mean_oof_score}')

In [None]:
# for f in os.listdir('./'):
#     if '.model' in f:
#         os.remove(f'./{f}')

# embedding stacking

In [None]:
# from sklearn.linear_model import BayesianRidge, LinearRegression
# from sklearn.multioutput import MultiOutputRegressor

In [None]:
# fine_tuned_models_cfg = [v112_CFG, v21_CFG, weighted_attention_CFG, mean_attention_no_fgm_CFG, attention_fgm_CFG, attention_fgm_768_CFG]

In [None]:
# def get_embedding(df, cfg, fold):
#     inferencer = Inferencer(cfg=cfg, inference_weight=cfg.inference_weight)
#     train_dataset = TestDataset(cfg, df)
#     train_loader = DataLoader(
#             train_dataset,
#             batch_size=12,
#             shuffle=False,
#             collate_fn=DataCollatorWithPadding(tokenizer=cfg.tokenizer, padding='longest'),
#             num_workers=2, 
#             pin_memory=True, 
#             drop_last=False)
#     embedding = inferencer.get_text_embedding(train_loader, device, fold=fold)
#     return embedding

# oof_df = train.copy()
# for fold in range(4):
#     print(f'fold={fold}')
#     train_folds = train[train['fold'] != fold]
#     train_labels = train_folds[fine_tuned_models_cfg[-1].target_cols].values 
#     train_idx = train_folds.index
    
#     valid_folds = train[train['fold'] == fold]
#     valid_labels = valid_folds[fine_tuned_models_cfg[-1].target_cols].values 
#     valid_idx = valid_folds.index
    
#     fine_tuned_embeddings = []
#     for cfg in tqdm(fine_tuned_models_cfg):
#         embedding = get_embedding(train, cfg, fold)        
#         #print(embedding.std(), embedding.mean())
#         fine_tuned_embeddings.append(embedding)
#         print(cfg.name, f'embedding_{cfg.name}_{embedding.shape[1]}.npy')
        
#         with open(f'./embedding_{cfg.name}_{embedding.shape[1]}.npy', 'wb') as f:
#             np.save(f, embedding)
        
#         del embedding; gc.collect(); torch.cuda.empty_cache()
        
#     fine_tuned_embeddings = np.concatenate(fine_tuned_embeddings, axis=1)
#     train_embeddings = fine_tuned_embeddings[train_idx, :]
#     valid_embeddings = fine_tuned_embeddings[val_idx, :]
    
#     model_ridge = MultiOutputRegressor(BayesianRidge())
#     model_ridge.fit(train_embeddings, train_labels)
#     ridge_val_preds = model_ridge.predict(valid_embeddings)
#     dump(model_ridge, f'ridge_embed_stacking_{fold}.model')
#     score, _ = mc_rmse(valid_labels, ridge_val_preds)
    
#     oof_df.loc[val_idx, ['pred_' + target_col for target_col in fine_tuned_models_cfg[-1].target_cols]] = ridge_val_preds    
#     print(f'fold:{fold} - score:{score}')

# get_result(cfg, oof_df)

# oof train - groups

In [None]:
# Candidate model groups for group-wise OOF stacking.
# NOTE(review): these are bare names (v21, v112, ...) whereas the config objects
# used in the model-selection cell carry a `_CFG` suffix (v21_CFG, v112_CFG, ...).
# Unless the bare names are defined elsewhere in the notebook this line raises
# NameError — confirm before running (the cell has no execution count).
group1 =  [v21, v112, mean_attention_no_fgm, v2, weightedpool]
group2 = []
# Candidate grouping noted for later:
# [weighted2last_fgm_512, weightedmean2last_fgm_512] - [attention_fgm_512, attention_fgm_768] - [attention_large_fgm, v21, v112, mean_attention_no_fgm, v2, weightedpool]