# Introduction

* **TL;DR**: I ensemble a fine-tuned deberta-v3-base model (CV: 0.4571, LB: 0.44) with a cuml.SVR trained on embeddings extracted from 12 pretrained transformer models. This shows that we can easily achieve 0.43 without much training. 
* Dataset of embeddings can be found here: https://www.kaggle.com/datasets/quangphm/fb3embeddings
* Reference: https://www.kaggle.com/code/cdeotte/rapids-svr-cv-0-450-lb-0-44x

In [None]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math 
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")
from IPython. display import clear_output

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

#os.system('pip install iterative-stratification==0.1.7')
#from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
sys.path.append('../input/iterativestratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

os.system('pip uninstall -y transformers')
os.system('pip uninstall -y tokenizers')
os.system('python -m pip install --no-index --find-links=../input/fb3-my-pip-wheels transformers')
os.system('python -m pip install --no-index --find-links=../input/fb3-my-pip-wheels tokenizers')
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers import DataCollatorWithPadding
%env TOKENIZERS_PARALLELISM=false

# Clear whatever this cell has written to the notebook output so far.
clear_output()
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Report the library versions that were actually imported after the reinstall above.
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
print('device:', device)

In [None]:
# Root directory of the competition data; every CSV path below is built from it.
BASE_PATH = '/kaggle/input/feedback-prize-english-language-learning'
SUBMISSION_PATH = os.path.join(BASE_PATH, 'sample_submission.csv')
TRAIN_PATH = os.path.join(BASE_PATH, 'train.csv')
TEST_PATH = os.path.join(BASE_PATH, 'test.csv')

In [None]:
class Config(dict):
    """Dictionary whose items can also be read, written and deleted as attributes.

    ``cfg.key`` and ``cfg['key']`` always refer to the same stored value. The
    regular ``dict`` constructor is used, so ``Config(a=1)``,
    ``Config({'a': 1})`` and ``Config(**mapping)`` all work.

    Missing keys raise ``AttributeError`` on attribute access (item access
    still raises ``KeyError``). This is what ``hasattr``, ``getattr`` with a
    default and ``copy.deepcopy`` rely on; forwarding the raw ``KeyError``
    makes all three fail.
    """

    def __getattr__(self, name):
        # Only reached when normal attribute lookup fails, i.e. for dict keys.
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name) from None

    # Attribute assignment stores the value as a dict item.
    __setattr__ = dict.__setitem__

    def __delattr__(self, name):
        try:
            del self[name]
        except KeyError:
            raise AttributeError(name) from None

    def init(self, kwargs):
        """Bulk-update the config from the mapping ``kwargs``."""
        self.update(kwargs)

    def set(self, key, val):
        """Store ``val`` under ``key`` (same as ``cfg[key] = val``)."""
        self[key] = val
        
def get_logger(filename='inference'):
    """Return the module logger, writing bare messages to the console and a file.

    Args:
        filename: Path of the log file without the ``.log`` suffix.

    Returns:
        The ``logging.Logger`` registered under this module's ``__name__``,
        carrying exactly one stream handler and one file handler.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.propagate = False
    logger.setLevel(INFO)

    # getLogger hands back the same object for the same name, so handlers from
    # an earlier call (e.g. a re-run notebook cell) would otherwise pile up and
    # every message would be emitted several times.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()

    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

In [None]:
def seed_everything(seed=42):
    '''
    Seed every random number generator the notebook uses (Python, NumPy,
    torch CPU and CUDA) and pin PYTHONHASHSEED, so repeated runs start from
    the same state.
    '''
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Ask cuDNN for deterministic kernels and disable its auto-tuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
seed_everything(seed=42)

def mc_rmse(y_true, y_pred):
    """Mean column-wise root mean squared error.

    The RMSE is computed with NumPy directly rather than through
    ``mean_squared_error(..., squared=False)``, whose ``squared`` argument is
    deprecated in recent scikit-learn releases.

    Args:
        y_true: 2-D array-like of targets, one column per target.
        y_pred: 2-D array-like of predictions with the same shape.

    Returns:
        ``(score, scores)`` where ``scores`` is the list of per-column RMSE
        values and ``score`` is their mean.

    Raises:
        ValueError: If the inputs are not 2-D or their shapes differ.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_true.ndim != 2 or y_true.shape != y_pred.shape:
        # Fail loudly instead of letting NumPy broadcast mismatched inputs.
        raise ValueError(
            f"expected two 2-D arrays of equal shape, got {y_true.shape} and {y_pred.shape}")

    scores = np.sqrt(np.mean((y_true - y_pred) ** 2, axis=0)).tolist()
    score = np.mean(scores)
    return score, scores

def get_result(cfg, oof_df):
    """Print the MCRMSE of an out-of-fold frame.

    Targets are read from the ``cfg.target_cols`` columns of ``oof_df`` and
    predictions from the matching ``pred_<target>`` columns.
    """
    pred_cols = [f"pred_{c}" for c in cfg.target_cols]
    score, scores = mc_rmse(oof_df[cfg.target_cols].values, oof_df[pred_cols].values)
    print(f'score: {score:<.6f}  scores: {scores}')

In [None]:
# ====================================================
# Dataset
# ====================================================
def encode_text(cfg, text):
    """Tokenize one text with ``cfg.tokenizer`` and return its tensors.

    When ``cfg.pretrained`` is false the text is encoded without padding or
    truncation and every field is converted to a ``torch.long`` tensor.
    Otherwise the text is padded/truncated to ``cfg.max_len`` and the batch
    dimension added by ``return_tensors='pt'`` is squeezed away.
    """
    if not cfg.pretrained:
        # No max_length / truncation is requested on this path.
        encoded = cfg.tokenizer.encode_plus(
            text,
            return_tensors=None,
            add_special_tokens=True,
        )
        for key, value in list(encoded.items()):
            encoded[key] = torch.tensor(value, dtype=torch.long)
        return encoded

    encoded = cfg.tokenizer(
        text,
        None,
        add_special_tokens=True,
        padding='max_length',
        truncation=True,
        max_length=cfg.max_len,
        return_token_type_ids=True,
        return_tensors='pt'
    )
    return {key: value.squeeze(0) for key, value in encoded.items()}

def preprocess(texts):
    """Replace line breaks in a pandas string Series with ``<newline>`` markers.

    Both ``\\r\\n`` and ``\\n`` become ``<newline>``; afterwards each
    non-overlapping pair of adjacent markers is merged into one, in a single
    pass. Returns the underlying array of the processed Series.
    """
    marked = texts.str.replace(r'\r?\n', '<newline>', regex=True)
    merged = marked.str.replace('<newline><newline>', '<newline>', regex=False)
    return merged.values

class TestDataset(Dataset):
    """Dataset that yields tokenized ``full_text`` entries of a DataFrame.

    With ``cfg.pretrained`` set the raw texts are used; otherwise they are
    first passed through ``preprocess``.
    """

    def __init__(self, cfg, df):
        self.cfg = cfg
        raw_texts = df['full_text']
        self.texts = raw_texts.values if cfg.pretrained else preprocess(raw_texts)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        return encode_text(self.cfg, self.texts[item])

In [None]:
class AttentionHead(nn.Module):
    """Additive attention pooling over dimension 1 of the input.

    A score is produced for every position via ``V(tanh(W(x)))``, the scores
    are soft-maxed along dimension 1 and used to form a weighted sum of the
    input features.
    """

    def __init__(self, in_features, hidden_dim):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim
        self.out_features = hidden_dim
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, features, *args):
        scores = self.V(torch.tanh(self.W(features)))
        weights = torch.softmax(scores, dim=1)
        return (weights * features).sum(dim=1)
    
class WeightedLayerPooling(nn.Module):
    """Weighted average of stacked hidden states from ``layer_start`` onwards.

    ``layer_weights`` holds one weight per pooled layer. When it is not
    supplied, a learnable parameter of ones with
    ``num_hidden_layers + 1 - layer_start`` entries is created.
    """

    def __init__(self, num_hidden_layers, layer_start: int = 9, layer_weights = None):
        super().__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        if layer_weights is None:
            pooled_layers = num_hidden_layers + 1 - layer_start
            layer_weights = nn.Parameter(torch.ones(pooled_layers, dtype=torch.float))
        self.layer_weights = layer_weights

    def forward(self, all_hidden_states):
        # The input is indexed with four axes; the first one is the layer axis.
        selected = all_hidden_states[self.layer_start:, :, :, :]
        weights = self.layer_weights[:, None, None, None].expand_as(selected)
        return (weights * selected).sum(dim=0) / self.layer_weights.sum()
    
class FB3Model(nn.Module):
    """Base model that owns the transformer backbone (``self.deberta_v3``).

    The backbone configuration is loaded from ``config_path`` when one is
    given; otherwise it is created from ``cfg.model`` with hidden states
    exposed and every dropout setting zeroed. The backbone is built with
    pretrained weights only when ``pretrained`` is true.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg

        if config_path is not None:
            self.config = torch.load(config_path)
        else:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            # Turn off dropouts.
            for dropout_attr in (
                'hidden_dropout',
                'hidden_dropout_prob',
                'attention_dropout',
                'attention_probs_dropout_prob',
            ):
                setattr(self.config, dropout_attr, 0.)

        if pretrained:
            backbone = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            backbone = AutoModel.from_config(self.config)
        self.deberta_v3 = backbone

        if self.cfg.reinit_last_layer:
            # Re-initialise every sub-module of the final encoder layer.
            for submodule in self.deberta_v3.encoder.layer[-1:].modules():
                self._init_weights(submodule)
        self.deberta_v3.gradient_checkpointing_enable()

    def _init_weights(self, module):
        """Reset ``module`` if it is a Linear, Embedding or LayerNorm layer."""
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

class WMPoolModel(FB3Model):
    """FB3Model with weighted layer pooling and a linear regression head.

    The prediction is ``fc_out`` applied to position 0 of the weighted
    average of the hidden states from layer 12 onwards.
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__(cfg, config_path=config_path, pretrained=pretrained)
        hidden_size = self.config.hidden_size

        # Pooling over the stacked hidden states.
        self.wpool_head = WeightedLayerPooling(self.config.num_hidden_layers, layer_start=12)

        self.fc_out = nn.Linear(hidden_size, cfg.num_target)
        self._init_weights(self.fc_out)

        # These modules are not used in forward(); presumably they are kept so
        # the parameter names match the saved checkpoints -- TODO confirm.
        self.layer_norm = nn.LayerNorm(hidden_size)
        self.qa_output = torch.nn.Linear(hidden_size, 2)
        self.attention_head = AttentionHead(hidden_size*4, hidden_size)

    def forward(self, x):
        backbone_out = self.deberta_v3(**x)
        stacked_states = torch.stack(backbone_out.hidden_states)
        # Weighted pooling of the selected layers, then keep position 0.
        pooled = self.wpool_head(stacked_states)[:, 0]
        return self.fc_out(pooled)
    


In [None]:
def load_config(input_path, inference_weight=1):
    """Build the ``Config`` of a saved model directory.

    Reads ``CFG.json`` from ``input_path``, records the directory and the
    location of ``config.pth`` on the config, and attaches the tokenizer
    stored in the ``tokenizer`` sub-directory.

    Args:
        input_path: Directory holding ``CFG.json`` and ``tokenizer``.
        inference_weight: Value stored as ``cfg.inference_weight``.

    Returns:
        The populated ``Config``.
    """
    # Use a context manager so the JSON file handle is closed right away.
    with open(os.path.join(input_path, 'CFG.json'), 'r') as f:
        cfg = Config(**json.load(f))
    cfg.path = input_path
    cfg.config_path = os.path.join(cfg.path, 'config.pth')
    # Load tokenizer.
    cfg.tokenizer = AutoTokenizer.from_pretrained(os.path.join(cfg.path, 'tokenizer'))

    cfg.inference_weight = inference_weight
    return cfg


def load_model(cfg, fold, **model_kwargs):
    """Create a ``WMPoolModel`` and load the best checkpoint of ``fold``.

    The checkpoint is ``<cfg.model with '/' replaced by '-'>_fold<fold>_best.pth``
    inside ``cfg.path``; it is mapped to the CPU and its ``'model'`` entry is
    loaded as the state dict.
    """
    model = WMPoolModel(cfg, config_path=cfg.config_path, pretrained=False, **model_kwargs)
    checkpoint_name = f"{cfg.model.replace('/', '-')}_fold{fold}_best.pth"
    checkpoint_path = os.path.join(cfg.path, checkpoint_name)
    checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    model.load_state_dict(checkpoint['model'])
    return model

In [None]:
def mean_pooling(model_output, attention_mask):
    """Average ``model_output.last_hidden_state`` over the unmasked positions.

    The hidden states are detached and moved to the CPU first. The divisor is
    clamped to at least 1e-9 so a fully masked row does not divide by zero.
    """
    hidden = model_output.last_hidden_state.detach().cpu()
    mask = attention_mask.unsqueeze(-1).expand(hidden.size()).float()
    masked_sum = (hidden * mask).sum(1)
    token_count = mask.sum(1).clamp(min=1e-9)
    return masked_sum / token_count

def inference_fn(test_loader, model, device):
    """Run ``model`` over every batch of ``test_loader`` without gradients.

    The model is switched to eval mode and moved to ``device``; each batch
    (a mapping of tensors) is moved to ``device`` in place and passed to the
    model as a single argument. Returns the concatenated outputs as a NumPy
    array.
    """
    model.eval()
    model.to(device)
    batch_outputs = []
    with torch.no_grad():
        for batch in test_loader:
            batch.update({name: tensor.to(device) for name, tensor in batch.items()})
            batch_outputs.append(model(batch).to('cpu').numpy())
    return np.concatenate(batch_outputs)

class Inferencer:
    """Runs fold-wise prediction or embedding extraction for one model config.

    Args:
        input_path: Saved-model directory passed to ``load_config``; only used
            when ``cfg`` is not given.
        cfg: Ready-made config. When provided it is used as is.
        inference_weight: Forwarded to ``load_config`` when ``cfg`` is None.
    """

    def __init__(self, input_path=None, cfg=None, inference_weight=1):
        if cfg is None:
            self.cfg = load_config(input_path, inference_weight)
        else:
            self.cfg = cfg

    def predict(self, test_loader, device, stat_fn=np.mean):
        """Predict with every fold in ``cfg.trn_fold`` and combine the results.

        Args:
            test_loader: Iterable of batches accepted by ``inference_fn``.
            device: Device the fold models are run on.
            stat_fn: Reducer called as ``stat_fn(preds, axis=0)`` on the list
                of per-fold predictions.

        Returns:
            The combined predictions clipped to ``[1, 5]``; also stored on
            ``self.preds``.
        """
        preds = []
        start = time.time()
        # Use this instance's config rather than whatever global `cfg` exists.
        print('#'*10, self.cfg.path, '#'*10)
        for fold in self.cfg.trn_fold:
            print(f'Predicting fold {fold}...')
            model = load_model(self.cfg, fold)
            pred = inference_fn(test_loader, model, device)
            preds.append(pred)
            # Release the fold model before the next one is loaded.
            del model, pred; gc.collect()
            torch.cuda.empty_cache()
        end = time.time() - start
        print('#'*10, f'ETA: {end:.2f}s', '#'*10, '\n')

        self.preds = stat_fn(preds, axis=0)
        self.preds = np.clip(self.preds, 1, 5)
        return self.preds

    def get_oof_result(self):
        """Report the out-of-fold score from ``oof_df.pkl`` in ``cfg.path``."""
        oof_df = pd.read_pickle(os.path.join(self.cfg.path, 'oof_df.pkl'))
        return get_result(self.cfg, oof_df)

    def get_text_embedding(self, data_loader, device, fold=None):
        """Return one embedding row per sample of ``data_loader``.

        With ``cfg.pretrained`` set (a model that was not fine-tuned) the
        embedding is the L2-normalised, attention-masked mean of the last
        hidden state of ``AutoModel.from_pretrained(cfg.model)``. Batches must
        then provide ``input_ids``, ``attention_mask`` and ``token_type_ids``.

        Args:
            data_loader: Iterable of batches (mappings of tensors).
            device: Device to run the model on.
            fold: Fold whose checkpoint is loaded on the fine-tuned path.

        Returns:
            ``np.ndarray`` built from the per-sample embeddings.
        """
        # pretrained=True: not fine-tuned models.
        if not self.cfg.pretrained:
            # NOTE(review): the WMPoolModel visible in this file accepts no
            # `pool` argument and defines no `feature` method, so this path
            # looks unusable as written -- confirm against the training code.
            model = load_model(self.cfg, fold, pool=self.cfg.pool_head)
        else:
            model = AutoModel.from_pretrained(self.cfg.model)
        model.to(device)
        model.eval()

        fold_emb = []
        for inputs in data_loader:
            inputs = {k: v.to(device) for k, v in inputs.items()}
            if not self.cfg.pretrained:
                with torch.no_grad():
                    emb = model.feature(**inputs)
            else:
                attention_mask = inputs['attention_mask']
                with torch.no_grad():
                    output = model(
                        input_ids=inputs['input_ids'],
                        attention_mask=attention_mask,
                        token_type_ids=inputs['token_type_ids'])
                emb = mean_pooling(output, attention_mask.detach().cpu())
                # Keep the batch axis: squeezing it turns a single-sample batch
                # into a 1-D vector, and extend() would then append its scalars
                # instead of one embedding row.
                emb = F.normalize(emb, p=2, dim=1)
            fold_emb.extend(emb.detach().cpu().numpy())
            # Free memory after every batch.
            del emb; gc.collect(); torch.cuda.empty_cache()

        return np.array(fold_emb)

# svr + pretrained embeddings
* More than 20 transformers are listed below, but only 12 of them are used for this submission.
* Using the method `get_text_embedding` from `Inferencer` you can create your own embeddings.

In [None]:
##################################################
def _pretrained_cfg(model, file_name, max_len=640):
    """Config of an off-the-shelf (not fine-tuned) embedding model."""
    return Config(
        model=model,
        file_name=file_name,
        pretrained=True, inference_weight=1, max_len=max_len)


# DeBERTa family.
deberta_base = _pretrained_cfg(
    '../input/huggingface-deberta-variants/deberta-base/deberta-base',
    'microsoft_deberta_base_768')
deberta_large = _pretrained_cfg(
    '../input/huggingface-deberta-variants/deberta-large/deberta-large',
    'microsoft_deberta_large_1024')
deberta_xlarge = _pretrained_cfg(
    '../input/huggingface-deberta-variants/deberta-xlarge/deberta-xlarge',
    'microsoft_deberta_xlarge_1024')
deberta_v2_xlarge = _pretrained_cfg(
    '../input/bert-shopping-mall/deberta-v2-xlarge',
    'microsoft_deberta_v2_xlarge_1536')
deberta_v2_xxlarge = _pretrained_cfg(
    '../input/bert-shopping-mall/deberta-v2-xxlarge',
    'microsoft_deberta_v2_xxlarge_1536')
deberta_v3_base = _pretrained_cfg(
    '../input/bert-shopping-mall/deberta-v3-base',
    'microsoft_deberta_v3_base_768')
deberta_v3_large = _pretrained_cfg(
    '../input/bert-shopping-mall/deberta-v3-large',
    'microsoft_deberta_v3_large_1024')
deberta_large_mnli = _pretrained_cfg(
    '../input/huggingface-deberta-variants/deberta-large-mnli/deberta-large-mnli',
    'microsoft_deberta_large_mnli_1024')

# GPT-2.
gpt2 = _pretrained_cfg('../input/hugging-face-gpt2/gpt2', 'gpt2_768', max_len=512)

# RoBERTa.
roberta_base = _pretrained_cfg(
    '../input/transformers/roberta-base', 'roberta_base_768', max_len=512)
roberta_large = _pretrained_cfg(
    '../input/transformers/roberta-large', 'roberta_large_1024', max_len=512)

# XLNet.
xlnet_base = _pretrained_cfg(
    '../input/transformers/xlnet-base-cased', 'xlnet_base_cased_768')
xlnet_large = _pretrained_cfg(
    '../input/transformers/xlnet-large-cased', 'xlnet_large_cased_1024')

# BART.
bart_base = _pretrained_cfg(
    '../input/transformers/facebook-bart-base', 'facebook_bart_base_768')
bart_large = _pretrained_cfg(
    '../input/transformers/facebook-bart-large', 'facebook_bart_large_1024')
bart_lage_mnli = _pretrained_cfg(
    '../input/facebook-bart-large-mnli', 'facebook_bart_large_mnli_1024')

# BERT.
bert_base_uncased = _pretrained_cfg(
    '../input/transformers/bert-base-uncased/', 'bert_base_uncased_768', max_len=512)
bert_large_uncased = _pretrained_cfg(
    '../input/transformers/bert-large-uncased', 'bert_large_uncased_1024', max_len=512)

# Muppet RoBERTa.
muppet_roberta_large = _pretrained_cfg(
    '../input/muppet-roberta-large', 'facebook_muppet_roberta_large_1024', max_len=512)

# Funnel Transformer.
funnel_small = _pretrained_cfg(
    '../input/transformers/funnel-transformer-small', 'funnel_transformer_small_768')
funnel_large = _pretrained_cfg(
    '../input/transformers/funnel-transformer-large', 'funnel_transformer_large_1024')

##################################################

# Names of the six target columns.
target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

## load embeddings

In [None]:
from sklearn.metrics import make_scorer
from joblib import dump, load
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import StackingRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import RidgeCV, Ridge, Lasso
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor

if str(device) == 'cpu':
    from sklearn.svm import SVR
else:
    from cuml.svm import SVR
    import cuml
device

In [None]:
# Load the competition train and test tables.
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

In [None]:
import sys
sys.path.append('../input/iterativestratification')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
# Number of cross-validation folds used for the SVR.
svr_folds = 15

# Stratify the folds on all target columns at once, then tag every training row
# with the number of the fold in which it is held out.
skf = MultilabelStratifiedKFold(n_splits=svr_folds, shuffle=True, random_state=42)
for i,(train_index, val_index) in enumerate(skf.split(train,train[target_cols])):
    # NOTE(review): val_index is used as .loc labels, which presumably relies on
    # `train` still having the default integer index from read_csv.
    train.loc[val_index,'fold'] = i

In [None]:
from glob import glob 

def get_text_embedding(cfg, dfs):
    """Compute text embeddings of every DataFrame in ``dfs`` with one model.

    A tokenizer is loaded from ``cfg.model`` and stored on ``cfg`` (the
    config is modified in place). Each frame is wrapped in a ``TestDataset``
    and embedded in batches of 4 on the global ``device``.

    Args:
        cfg: Model config; ``cfg.pretrained`` selects between a single
            off-the-shelf model and the average over ``cfg.trn_fold`` models.
        dfs: Iterable of DataFrames with a ``full_text`` column.

    Returns:
        List with one embedding array per input frame, in input order.
    """
    cfg.tokenizer = AutoTokenizer.from_pretrained(cfg.model)
    # Padding needs a pad token. Tokenizers that ship without one (such as
    # GPT-2's) reuse the end-of-sequence token. Checking the tokenizer itself
    # also covers models referenced by a local path, which a comparison of
    # cfg.model against the literal 'gpt2' never matches.
    if cfg.tokenizer.pad_token is None:
        cfg.tokenizer.pad_token = cfg.tokenizer.eos_token
    infer_ = Inferencer(cfg=cfg, inference_weight=cfg.inference_weight)

    text_embs = []
    for df in dfs:
        dataset = TestDataset(cfg, df)
        loader = DataLoader(
            dataset,
            batch_size=4,
            shuffle=False)

        if not cfg.pretrained:
            # Average the embeddings produced by the individual fold models.
            fold_embs = [
                infer_.get_text_embedding(loader, device, fold)
                for fold in infer_.cfg.trn_fold
            ]
            text_emb = np.mean(fold_embs, axis=0)
        else:
            text_emb = infer_.get_text_embedding(loader, device)
        text_embs.append(text_emb)
        del dataset, loader; gc.collect(); torch.cuda.empty_cache()
    del infer_; gc.collect(); torch.cuda.empty_cache()
    return text_embs

def learner_cv(features, learner, folds=15, save=False, verbose=False):
    """Cross-validate ``learner`` on ``features`` using the folds in ``train``.

    For every fold number below ``folds`` a ``MultiOutputRegressor`` wrapping
    ``learner`` is fitted on the rows of the global ``train`` frame outside
    that fold and scored with ``mc_rmse`` on the rows inside it. Rows of
    ``features`` are selected by the index values of ``train``.

    Args:
        features: 2-D array of features.
        learner: Base estimator passed to ``MultiOutputRegressor``.
        folds: Number of fold ids to iterate over.
        save: When true, dump each fitted regressor to ``svr_<fold>.model``.
        verbose: When true, print the score of every fold.

    Returns:
        Mean of the per-fold scores.
    """
    fold_scores = []
    for fold in range(folds):
        held_out = train['fold'] == fold
        fit_rows = train[~held_out]
        eval_rows = train[held_out]

        fit_feats = features[fit_rows.index.tolist(), :]
        eval_feats = features[eval_rows.index.tolist(), :]

        regressor = MultiOutputRegressor(learner)
        regressor.fit(fit_feats, fit_rows[target_cols].values)
        eval_preds = regressor.predict(eval_feats)

        fold_score, _ = mc_rmse(eval_rows[target_cols].values, eval_preds)
        fold_scores.append(fold_score)

        if verbose:
            print('#'*25)
            print('### Fold', fold+1)
            print(f"Score: {fold_score}")
        if save:
            dump(regressor, f'svr_{fold}.model')

    return np.mean(fold_scores)

def get_learner_score(models_cfg, learner, folds=5, save=False, verbose=False):
    """Score ``learner`` on the concatenated saved embeddings of ``models_cfg``.

    Every config gets a ``model_name`` (last path component of ``model`` with
    dashes replaced by underscores). A config that has no ``embedding`` entry
    yet gets one loaded from
    ``../input/fb3embeddings/train_text_emb_<file_name>.npy``. The embeddings
    are concatenated along axis 1 and handed to ``learner_cv``.

    Returns:
        ``(score, models_cfg)`` with the configs updated in place.
    """
    for model_cfg in models_cfg:
        model_cfg.model_name = model_cfg.model.split('/')[-1].replace('-', '_')
        if 'embedding' not in model_cfg:
            model_file = f'../input/fb3embeddings/train_text_emb_{model_cfg.file_name}.npy'
            with open(model_file, 'rb') as f:
                model_cfg.embedding = np.load(f)

    stacked = np.concatenate([m.embedding for m in models_cfg], axis=1)
    svr_score = learner_cv(stacked, learner, folds=folds, save=save, verbose=verbose)
    print('\n')
    print(f'model_set={[m.model_name for m in models_cfg]};   score={svr_score}')
    return svr_score, models_cfg

## models selection
* Simple hill climbing algorithm to find the (somewhat) best set of models to train SVR on.

In [None]:
# model_selection = False

In [None]:
# if model_selection:
#     for i, first_model in enumerate(tqdm(pretrained_models_cfg[1:])):
#         features = [first_model]
#         prev_score,_ = get_learner_score(features)
#         cur_score = 0
        
#         while True:
#             models = [feat.model for feat in features]
#             if len(models) == len(pretrained_models_cfg):
#                 break
                
#             scores_and_cfgs = [get_learner_score(features + [feat], folds=15, save=False) for feat in pretrained_models_cfg if feat.model not in models]
#             scores = [s for s,c in scores_and_cfgs]
#             cur_features = [c for s,c in scores_and_cfgs]
            
#             cur_score = np.min(scores)
#             cur_best_feature = cur_features[np.argmin(scores)][-1]
#             features.append(cur_best_feature)
            
#             if prev_score < cur_score:
#                 break
#             prev_score = cur_score

#             del scores_and_cfgs, scores, cur_best_feature, cur_features; gc.collect(); torch.cuda.empty_cache();
        
#         LOGGER.info(f'Interation {i+1}:')
#         LOGGER.info(f'model_set={[c.model_name for c in features]} \nbest_score={cur_score}')
#         LOGGER.info('#'*50)
#         LOGGER.info('\n')

## train

In [None]:
# Models whose embeddings are concatenated (in this order) as SVR features.
# The same list is used for training and for the test embeddings.
pretrained_models_cfg = [
    deberta_large_mnli,
    #gpt2,
    roberta_base,
    roberta_large,
    xlnet_base, 
    xlnet_large,
    deberta_base, 
    deberta_large, 
    deberta_xlarge,
    deberta_v2_xlarge, 
    deberta_v2_xxlarge,
    deberta_v3_base, 
    deberta_v3_large,
]

In [None]:
# learner = Ridge(alpha=2.0)
# Base regressor; get_learner_score passes it on to learner_cv.
learner = SVR(C=2.0)
# save=True writes one `svr_<fold>.model` file per fold into the working directory.
svr_score, models_cfg = get_learner_score(pretrained_models_cfg, learner, folds=svr_folds, save=True, verbose=True)

## predict

In [None]:
# Embed the test set with every selected model, one model at a time.
all_test_text_emb = []
for cfg in tqdm(pretrained_models_cfg):
    # get_text_embedding returns one array per input frame; only `test` is passed.
    test_text_emb = get_text_embedding(cfg, [test])[0]
    all_test_text_emb.append(test_text_emb)
    
    # Release memory before the next model is loaded.
    del test_text_emb; gc.collect(); torch.cuda.empty_cache();
    print(f'{cfg.model} loaded.')
    
gc.collect(); torch.cuda.empty_cache();

# Concatenate the per-model embeddings along the feature axis.
final_test_text_emb = np.concatenate(all_test_text_emb, axis=1)

In [None]:
import glob
def svr_inference_fn(model_path, te_text_feats):
    """Load the model stored at ``model_path`` and predict on ``te_text_feats``."""
    return load(model_path).predict(te_text_feats)

predictions = []
# Sorted so the fold models are always visited in the same order.
svr_model_paths = sorted(glob.glob('./*.model'))
if not svr_model_paths:
    # Averaging an empty list would silently yield NaN predictions.
    raise FileNotFoundError("no '*.model' files found in the working directory")
for model_path in tqdm(svr_model_paths):
    #model_path = os.path.join('../input/fb3-svr-train/', model_path)
    preds = svr_inference_fn(model_path, final_test_text_emb)
    predictions.append(preds)
# Average the predictions of all fold models.
svr_predictions = np.mean(predictions, axis=0)
svr_predictions


---

# fine-tuned models

In [None]:
# Configuration of the fine-tuned model stored under version directory v21.
v21_CFG = load_config('../input/fb3models/v21/', inference_weight=1)
v21_CFG.version = '21'

# load_config has already attached a tokenizer loaded from this same directory;
# it is loaded again here and also bound to the module-level name `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained(os.path.join(v21_CFG.path, 'tokenizer'))
v21_CFG.tokenizer = tokenizer
# pretrained=False selects the fine-tuned branches of encode_text / TestDataset.
v21_CFG.pretrained = False

In [None]:
# Read the test set again before running the fine-tuned models.
test = pd.read_csv(TEST_PATH)

In [None]:
# Fine-tuned models to blend; each contributes with its `inference_weight`.
fine_tuned_models_cfg = [v21_CFG]

fine_tuned_predictions = []
total_weight = 0
for cfg in tqdm(fine_tuned_models_cfg):
    # infer_ = Inferencer(setup['path'], setup['inference_weight'])
    infer_ = Inferencer(cfg=cfg, inference_weight=cfg.inference_weight)

    test_dataset = TestDataset(cfg, test)
    # Pad each batch only up to its longest sequence.
    test_loader = DataLoader(
        test_dataset,
        batch_size=3,
        shuffle=False,
        collate_fn=DataCollatorWithPadding(tokenizer=cfg.tokenizer, padding='longest'),
        num_workers=1,
        pin_memory=True,
        drop_last=False)
    prediction = infer_.predict(test_loader, device) * cfg.inference_weight

    fine_tuned_predictions.append(prediction)
    total_weight += cfg.inference_weight

    # gc.collect has to be called; referencing the bare name does nothing.
    del infer_, test_dataset, test_loader, prediction
    gc.collect()
    torch.cuda.empty_cache()

# Weighted average of the per-model predictions.
final_fine_tuned_predictions = np.sum(fine_tuned_predictions, axis=0)/total_weight
final_fine_tuned_predictions

# combine & submit

In [None]:
# Simply average two predictions.
final_predictions = (svr_predictions + final_fine_tuned_predictions)/2
final_predictions = np.clip(final_predictions, 1, 5)
test[target_cols] = final_predictions

In [None]:
# Start from the sample submission so the rows follow its text_id order.
submission = pd.read_csv(SUBMISSION_PATH)
# Replace the placeholder target columns with the predictions, matched on text_id.
submission = submission.drop(columns=target_cols).merge(test[['text_id'] + target_cols], on='text_id', how='left')
# NOTE(review): `display` is not imported in this file; presumably the notebook built-in.
display(submission.head())
submission[['text_id'] + target_cols].to_csv('submission.csv', index=False)