# Changelog
## Changes

## Bugs & Issues

## Fixed bugs

## Holding

In [None]:
# Report the GPU(s) and driver visible to this runtime.
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [None]:
# Environment setup. The pinned installs below are commented out and kept only
# for reference; the single active command installs/upgrades the Kaggle CLI.
# !pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu113
# !pip install -q torchmetrics
# !pip install -q transformers==4.20.1
# !pip install -q sentencepiece
!pip install -q kaggle --upgrade

In [None]:
import os

# Only run the download/unzip cell when the data directory is not there yet.
prepare_data = not os.path.exists('/content/data')

In [None]:
# One-time Colab setup: upload Kaggle credentials, download and unzip the
# competition data, then mount Google Drive. Skipped when `prepare_data` is False.
if prepare_data:    
    from google.colab import files, drive

    # Opens the Colab upload widget; the loop below expects kaggle.json to be among the uploads.
    uploaded = files.upload()

    for fn in uploaded.keys():
        print('User uploaded file "{name}" with length {length} bytes'.format(
            name=fn, length=len(uploaded[fn])))
    
    # Then move kaggle.json into the folder where the API expects to find it.
    !mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

    # NOTE(review): `mkdir data` is relative to the current working directory while the
    # unzip below uses the absolute /content/data path — presumably cwd is /content; confirm.
    !mkdir data && cd data && kaggle competitions download -c feedback-prize-english-language-learning
    !unzip /content/data/feedback-prize-english-language-learning.zip -d /content/data/

    # Optional download of pretrained checkpoints (currently disabled).
    # !mkdir data/pretrained && cd data/pretrained && kaggle datasets download -d sunpnwt12/fb3-pretrained-s42
    # !unzip /content/data/pretrained/fb3-pretrained-s42.zip -d /content/data/pretrained/

    drive.mount('/content/drive')

Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 65 bytes
Downloading feedback-prize-english-language-learning.zip to /content/data
100% 2.80M/2.80M [00:00<00:00, 5.30MB/s]
100% 2.80M/2.80M [00:00<00:00, 4.51MB/s]
Archive:  /content/data/feedback-prize-english-language-learning.zip
  inflating: /content/data/sample_submission.csv  
  inflating: /content/data/test.csv  
  inflating: /content/data/train.csv  
Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import sys
import time
import datetime
import glob
import pytz
import gc; gc.enable()
import random
import warnings
import yaml
import shutil
import types
from pathlib import Path
from tqdm.notebook import trange, tqdm
from tabulate import tabulate
warnings.filterwarnings("ignore")
print(f'python version: {sys.version}') 

os.system('pip install -q iterative-stratification==0.1.7')
import iterstrat
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
print(f'iterstart version: {iterstrat.__version__}')

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler
from torch.cuda.amp import autocast, GradScaler

from torch.optim.swa_utils import AveragedModel, SWALR

print(f'torch version: {torch.__version__}')

from torchmetrics.functional import mean_squared_error

# os.system('pip install --root-user-action=ignore --force-reinstall transformers==4.22.1')
import transformers

from transformers import AutoTokenizer, AutoModel, AutoConfig, DataCollatorWithPadding
print(f'transfromers version: {transformers.__version__}')

os.environ["TOKENIZERS_PARALLELISM"] = "true"

python version: 3.7.12 | packaged by conda-forge | (default, Oct 26 2021, 06:08:53) 
[GCC 9.4.0]




iterstart version: 0.1.6
torch version: 1.11.0
transfromers version: 4.20.1


# Config

In [None]:
class BASICCONF:
    """Notebook-wide constants: seed, data location, targets and fold count."""

    # Seed handed to seed_everything().
    seed = 42

    # Directory holding train.csv, test.csv and sample_submission.csv.
    data_path = '/kaggle/input/feedback-prize-english-language-learning'

    # Regression targets, one output per column.
    target_cols = [
        'cohesion',
        'syntax',
        'vocabulary',
        'phraseology',
        'grammar',
        'conventions',
    ]
    num_labels = len(target_cols)
    num_folds = 5

    dropout_ratio = 0.0

# Seeding

In [None]:
#https://gist.github.com/ihoromi4/b681a9088f348942b01711f251e5f964
def seed_everything(seed: int):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run, which
    # undermines the deterministic flag above, so it has to stay off.
    torch.backends.cudnn.benchmark = False
    
# Seed all RNGs once, up front, with the notebook-wide seed.
seed_everything(BASICCONF.seed)

# Dataset

In [None]:
def get_model_path_dict(source, fold_num_list):
    """Build the paths of all artefacts stored in one experiment directory.

    Args:
        source: Experiment directory (string or path-like).
        fold_num_list: Fold numbers whose ``best-epoch-fold{n}.pt`` checkpoints
            should be listed.

    Returns:
        dict with keys ``yaml``, ``config``, ``tokenizer``, ``models`` (a list
        of ``[fold_num, checkpoint_path]`` pairs), ``tables`` and ``log``.

    Raises:
        FileNotFoundError: if ``source`` contains no ``*_config.pt`` file.
    """
    # Sort so the chosen config does not depend on filesystem listing order.
    config_candidates = sorted(Path(source).glob('*_config.pt'))
    if not config_candidates:
        raise FileNotFoundError(f'No "*_config.pt" file found in "{source}"')

    model_path_dict = {
        'yaml': f'{source}/config.yml',
        'config': config_candidates[0].as_posix(),
        'tokenizer': f'{source}/tokenizers/',
        'models': [[f_n, f'{source}/best-epoch-fold{f_n}.pt'] for f_n in fold_num_list],
        'tables': {
            'train_result': f'{source}/train_result.csv',
            'best_result': f'{source}/best_result.csv',
            'cv_result': f'{source}/cv_result.csv',
        },
        'log': f'{source}/log.txt'
    }
    return model_path_dict

In [None]:
# Locations of the competition CSV files under the configured data directory.
TRAIN_PATH = f'{BASICCONF.data_path}/train.csv'
TEST_PATH = f'{BASICCONF.data_path}/test.csv'
SAMP_SUB = f'{BASICCONF.data_path}/sample_submission.csv'

# Read the three tables in one pass, in the same order as the paths above.
train_df, test_df, samp_sup = map(pd.read_csv, (TRAIN_PATH, TEST_PATH, SAMP_SUB))

In [None]:
class FB3Dataset(Dataset):
    """Dataset that tokenizes the ``full_text`` column of a frame on demand.

    Each item is a dict mapping every key returned by the tokenizer to a
    ``torch.long`` tensor; padding is left to the collate function.
    """

    def __init__(self, df, tokenizer):
        # Re-index from zero so positional dataset indices match `.loc` labels.
        self.full_texts = df['full_text'].reset_index(drop=True)
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.full_texts)

    def __getitem__(self, idx):
        return self._get_token(idx)

    def _get_token(self, idx):
        """Tokenize the text at ``idx`` and convert each field to a long tensor."""
        text = self.full_texts.loc[idx]
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            truncation=True,
            return_tensors=None,
        )
        tensors = {}
        for key, values in encoding.items():
            tensors[key] = torch.tensor(values, dtype=torch.long)
        return tensors


# Model

In [None]:
# https://www.kaggle.com/code/maunish/clrp-pytorch-roberta-finetune/notebook
class AttentionHead(nn.Module):
    """Additive attention pooling: a learned weighted sum over dimension 1.

    A score is computed for every position as ``V(tanh(W(features)))``, the
    scores are normalised with a softmax over dimension 1, and the input is
    reduced along that dimension using the resulting weights.
    """

    def __init__(self, in_features, hidden_dim):
        super().__init__()
        self.in_features = in_features
        self.middle_features = hidden_dim
        self.out_features = hidden_dim
        self.W = nn.Linear(in_features, hidden_dim)
        self.V = nn.Linear(hidden_dim, 1)

    def forward(self, features):
        scores = self.V(torch.tanh(self.W(features)))
        weights = torch.softmax(scores, dim=1)
        return (weights * features).sum(dim=1)

# https://www.kaggle.com/competitions/feedback-prize-english-language-learning/discussion/361678
class AttentionPooling(nn.Module):
    """Masked attention pooling over dimension 1.

    A small network (Linear -> LayerNorm -> GELU -> Linear) produces one score
    per position; positions where ``mask == 0`` are excluded before the
    softmax, and the input is summed along dimension 1 with those weights.
    """

    def __init__(self, in_dim):
        super().__init__()
        scorer_layers = [
            nn.Linear(in_dim, in_dim),
            nn.LayerNorm(in_dim),
            nn.GELU(),
            nn.Linear(in_dim, 1),
        ]
        self.attention = nn.Sequential(*scorer_layers)

    def forward(self, x, mask):
        # Scores are cast to float32 before masking and the softmax.
        scores = self.attention(x).float()
        # -inf scores receive zero weight after the softmax.
        scores[mask == 0] = float('-inf')
        weights = torch.softmax(scores, 1)
        return torch.sum(weights * x, dim=1)
    
# https://www.kaggle.com/code/rhtsingh/utilizing-transformer-representations-efficiently
class HiddenAttentionPooling(nn.Module):
    """Attention over the position-0 vector of each hidden layer.

    Holds two learned tensors: a query ``q`` of size (1, hidden_size) and a
    projection ``w_h`` of size (hidden_size, hiddendim_fc), both drawn from a
    normal distribution with std 0.02. Dropout (p=0.1) is applied to the result.
    """
    def __init__(self, num_layers, hidden_size, hiddendim_fc):
        super().__init__()
        self.num_hidden_layers = num_layers
        self.hidden_size = hidden_size
        self.hiddendim_fc = hiddendim_fc
        self.dropout = nn.Dropout(0.1)

        # NOTE(review): `.float()` is called on the nn.Parameter itself; presumably a
        # no-op for float32 tensors so the attribute stays a registered Parameter —
        # confirm if the default dtype is ever changed.
        q_t = torch.normal(mean=0.0, std=0.02, size=(1, self.hidden_size))
        self.q = nn.Parameter(q_t).float()
        w_ht = torch.normal(mean=0.0, std=0.02, size=(self.hidden_size, self.hiddendim_fc))
        self.w_h = nn.Parameter(w_ht).float()

    def forward(self, all_hidden_states):
        """Pool layers 1..num_hidden_layers of ``all_hidden_states`` (index 0 is skipped)."""
        # Takes index 0 along dimension 1 of every layer and stacks the results on the
        # LAST dimension, then reinterprets the buffer with `.view`.
        # NOTE(review): stacking on dim=-1 followed by `.view(-1, layers, hidden)` is a
        # reshape, not a transpose, so layer and feature values look interleaved —
        # confirm intent. Saved checkpoints were presumably trained with this exact
        # layout, so changing it would alter their predictions.
        # NOTE(review): `.squeeze()` drops every size-1 dimension, including a batch
        # dimension of 1 — verify this is fine for batch size 1.
        hidden_states = torch.stack([all_hidden_states[layer_i][:, 0].squeeze()
                                     for layer_i in range(1, self.num_hidden_layers+1)], dim=-1)
        hidden_states = hidden_states.view(-1, self.num_hidden_layers, self.hidden_size)
        out = self.attention(hidden_states)
        out = self.dropout(out)
        return out

    def attention(self, h):
        """Score ``h`` against ``q``, softmax over the last dim, then project with ``w_h``."""
        # Scores: q @ h^T, normalised over the last dimension.
        v = torch.matmul(self.q, h.transpose(-2, -1)).squeeze(1)
        v = F.softmax(v, -1)
        # Weighted sum of h, then projection through w_h^T.
        v_temp = torch.matmul(v.unsqueeze(1), h).transpose(-2, -1)
        v = torch.matmul(self.w_h.transpose(1, 0), v_temp).squeeze(2)
        return v

class ConcatPooling(nn.Module):
    """Concatenate the last ``pooling_last`` hidden layers and keep position 0."""

    def __init__(self, pooling_last=4):
        super().__init__()
        self.pooling_last = pooling_last

    def forward(self, all_hidden_states):
        # Collect layers from the end: -1, -2, ..., -pooling_last.
        last_layers = [
            all_hidden_states[-offset]
            for offset in range(1, self.pooling_last + 1)
        ]
        joined = torch.cat(last_layers, dim=-1)
        # Keep only index 0 along dimension 1 (averaging over that dimension
        # was the alternative considered by the original author).
        return joined[:, 0]

# https://www.kaggle.com/competitions/google-quest-challenge/discussion/129840
class WeightedLayerPooling(nn.Module):
    """Softmax-weighted sum of the last ``num_layers`` hidden layers.

    The learnable layer weights start at -3 for every layer except the final
    one (0), so the final layer dominates initially. Dropout (p=0.2) is
    applied to each layer before mixing. ``init_std`` is accepted but unused.
    """

    def __init__(self, num_layers=12, init_std=0.02):
        super().__init__()
        self.num_layers = num_layers
        initial = torch.zeros(self.num_layers, dtype=torch.float32)
        initial[:-1].fill_(-3)
        self.layer_weights = torch.nn.Parameter(initial)
        self.dropout = nn.Dropout(0.2)

    def forward(self, all_hidden_states):
        selected = all_hidden_states[-self.num_layers:]
        dropped = torch.stack([self.dropout(layer) for layer in selected], dim=0)
        # Reshape the per-layer weights so they broadcast over the stacked layers.
        mix = torch.softmax(self.layer_weights, dim=0).view(-1, 1, 1, 1)
        return (mix * dropped).sum(0)
        
class MeanPooling(nn.Module):
    """Mask-aware mean over dimension 1 of the last hidden state."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Broadcast the mask across the feature dimension.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        token_sum = (last_hidden_state * mask).sum(1)
        # Clamp so a fully masked row does not divide by zero.
        token_count = mask.sum(1).clamp(min=1e-9)
        return token_sum / token_count

class CustomModel(nn.Module):
    """Transformer backbone + configurable pooling + linear regression head.

    Args:
        conf: Settings object. Attributes read here: ``gradient_checkpointing``,
            ``pooling_strategy_list``, ``multi_dropout``, ``num_labels``,
            ``reinit_method``, ``use_ln``; depending on the branch also
            ``model_name``, ``reinit_last_layers``, ``concat_pooling_last`` and
            ``multi_dropout_p``.
        fold_num: Index into ``conf.pooling_strategy_list`` selecting the
            pooling strategy of this model.
        config_path: Optional path to a saved backbone config. When given, the
            backbone is built from that config without pretrained weights and
            no layers are re-initialised (weights are expected to come from a
            checkpoint loaded afterwards).

    Raises:
        Exception: if the selected pooling strategy is unknown.
    """

    def __init__(self, conf, fold_num, config_path=None):
        super().__init__()
        if not config_path:
            self.model_conf = AutoConfig.from_pretrained(conf.model_name, output_hidden_states=True)
            self.model_conf = self._set_dropout(self.model_conf)
            self.backbone = AutoModel.from_pretrained(conf.model_name, config=self.model_conf)
        else:
            # NOTE(review): torch.load unpickles arbitrary objects — only point
            # config_path at trusted files.
            self.model_conf = torch.load(config_path)
            self.backbone = AutoModel.from_config(self.model_conf)
        if conf.gradient_checkpointing:
            self.backbone.gradient_checkpointing_enable()

        # Re-initialise the top encoder layers of a freshly loaded backbone.
        # A value of 0 must be skipped explicitly: `layers[-0:]` selects EVERY
        # layer and would wipe all pretrained encoder weights.
        if not config_path and conf.reinit_last_layers:
            for layer in self.backbone.encoder.layer[-conf.reinit_last_layers:]:
                for module in layer.modules():
                    self._init_weights(module)

        self.pooling_strategy = conf.pooling_strategy_list[fold_num]
        if self.pooling_strategy == 'mean_pooling':
            self.pooler = MeanPooling()

        elif self.pooling_strategy == 'concat_pooling':
            self.pooler = ConcatPooling(conf.concat_pooling_last)

        elif self.pooling_strategy == 'attn_pooling':
            self.attn_pooler = AttentionPooling(self.model_conf.hidden_size)
            for attn_module in self.attn_pooler.modules():
                self._init_weights(attn_module)

        elif self.pooling_strategy == 'wlp_attn_pooling':
            self.wlp_pooler = WeightedLayerPooling(self.model_conf.num_hidden_layers, self.model_conf.initializer_range)
            self.attn_pooler = AttentionPooling(self.model_conf.hidden_size)
            for attn_module in self.attn_pooler.modules():
                self._init_weights(attn_module)

        elif self.pooling_strategy == 'concat_h_attn_mean_pooling':
            self.hattn_pooler = HiddenAttentionPooling(self.model_conf.num_hidden_layers, self.model_conf.hidden_size, self.model_conf.hidden_size)
            self.mean_pooler = MeanPooling()

        elif self.pooling_strategy == 'concat_attn_mean_pooling':
            self.attn_pooler = AttentionPooling(self.model_conf.hidden_size)
            for attn_module in self.attn_pooler.modules():
                self._init_weights(attn_module)
            self.mean_pooler = MeanPooling()

        else:
            raise Exception('Invalid pooling strategy')

        # Width of the pooled vector that feeds the regression head.
        if self.pooling_strategy in ['mean_pooling', 'attn_pooling', 'wlp_attn_pooling']:
            hidden_size = self.model_conf.hidden_size
        elif self.pooling_strategy in ['concat_pooling']:
            # Fixed attribute typo (`hidenn_size`) that made this branch crash.
            hidden_size = self.model_conf.hidden_size * conf.concat_pooling_last
        elif self.pooling_strategy in ['concat_h_attn_mean_pooling', 'concat_attn_mean_pooling']:
            hidden_size = self.model_conf.hidden_size * 2
        else:
            raise Exception('Cannot create fc layer.')

        self.multi_dropout = conf.multi_dropout
        if self.multi_dropout:
            self.dropout1 = nn.Dropout(conf.multi_dropout_p[0])
            self.dropout2 = nn.Dropout(conf.multi_dropout_p[1])
            self.dropout3 = nn.Dropout(conf.multi_dropout_p[2])
            self.dropout4 = nn.Dropout(conf.multi_dropout_p[3])
            self.dropout5 = nn.Dropout(conf.multi_dropout_p[4])
        else:
            self.dropout0 = nn.Dropout(0.1)

        self.fc = nn.Linear(hidden_size, conf.num_labels)
        if conf.reinit_method is not None:
            self._init_weights2_([self.fc], conf.reinit_method)
        else:
            self._init_weights(self.fc)

        self.use_ln = conf.use_ln
        if self.use_ln:
            self.ln = nn.LayerNorm(hidden_size)
            self._init_weights(self.ln)

    def _set_dropout(self, conf, ratio=0.):
        """Set every dropout field of a backbone config to ``ratio`` and return it."""
        conf.attention_probs_dropout_prob = ratio
        conf.hidden_dropout = ratio
        conf.hidden_dropout_prob = ratio
        conf.pooler_dropout = ratio
        return conf

    def _init_weights(self, module):
        """Initialise Linear / Embedding / LayerNorm modules using the backbone's initializer_range."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.model_conf.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.model_conf.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def _init_weights2_(self, module_lst, method):
        """Re-initialise every parameter with more than one dimension in ``module_lst``.

        Args:
            module_lst: Iterable of modules to re-initialise in place.
            method: ``'kaiming_normal'``, ``'xavier_normal'`` or ``'orthogonal'``
                (the historical misspelling ``'orthoganol'`` is still accepted
                so existing config files keep working).

        Raises:
            Exception: if ``method`` is not one of the supported names.
        """
        for module in module_lst:
            for param in module.parameters():
                if param.dim() > 1:
                    if method == 'kaiming_normal':
                        nn.init.kaiming_normal_(param)
                    elif method == 'xavier_normal':
                        nn.init.xavier_normal_(param)
                    elif method in ('orthogonal', 'orthoganol'):
                        nn.init.orthogonal_(param)
                    else:
                        raise Exception('The method is invalid')

    def forward(self, inputs):
        """Run the backbone, pool its outputs and apply the regression head.

        Args:
            inputs: dict of tensors passed to the backbone as keyword
                arguments; ``inputs['attention_mask']`` is also used by the
                mask-aware poolers.

        Returns:
            Output of the final linear layer (``conf.num_labels`` values per row).
        """
        backbone_outputs = self.backbone(**inputs)
        if self.pooling_strategy == 'mean_pooling':
            last_hidden_states = backbone_outputs['last_hidden_state']
            pooler_outputs = self.pooler(last_hidden_states, inputs['attention_mask'])

        elif self.pooling_strategy == 'concat_pooling':
            all_hidden_states = torch.stack(backbone_outputs['hidden_states'])
            pooler_outputs = self.pooler(all_hidden_states)

        elif self.pooling_strategy == 'attn_pooling':
            last_hidden_states = backbone_outputs['last_hidden_state']
            pooler_outputs = self.attn_pooler(last_hidden_states, inputs['attention_mask'])

        elif self.pooling_strategy == 'wlp_attn_pooling':
            all_hidden_states = torch.stack(backbone_outputs['hidden_states'])
            wlp_pooler = self.wlp_pooler(all_hidden_states)
            pooler_outputs = self.attn_pooler(wlp_pooler, inputs['attention_mask'])

        elif self.pooling_strategy == 'concat_h_attn_mean_pooling':
            last_hidden_states = backbone_outputs['last_hidden_state']
            all_hidden_states = torch.stack(backbone_outputs['hidden_states'])
            hattn_outputs = self.hattn_pooler(all_hidden_states)
            mean_outputs = self.mean_pooler(last_hidden_states, inputs['attention_mask'])
            pooler_outputs = torch.cat((hattn_outputs, mean_outputs), -1)

        elif self.pooling_strategy == 'concat_attn_mean_pooling':
            last_hidden_states = backbone_outputs['last_hidden_state']
            attn_outputs = self.attn_pooler(last_hidden_states, inputs['attention_mask'])
            mean_outputs = self.mean_pooler(last_hidden_states, inputs['attention_mask'])
            pooler_outputs = torch.cat((attn_outputs, mean_outputs), -1)

        if self.use_ln:
            pooler_outputs = self.ln(pooler_outputs)

        if self.multi_dropout:
            # Multi-sample dropout: average the head over five dropout masks.
            x1 = self.fc(self.dropout1(pooler_outputs))
            x2 = self.fc(self.dropout2(pooler_outputs))
            x3 = self.fc(self.dropout3(pooler_outputs))
            x4 = self.fc(self.dropout4(pooler_outputs))
            x5 = self.fc(self.dropout5(pooler_outputs))

            outputs = (x1 + x2 + x3 + x4 + x5) / 5

        else:
            outputs = self.fc(self.dropout0(pooler_outputs))

        return outputs

# Helper

In [None]:
# https://www.kaggle.com/code/yasufuminakama/fb3-deberta-v3-base-baseline-train/notebook
class Averager:
    """Tracks the latest value and the running (weighted) mean of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the latest value, running sum, sample count and mean."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as observed ``n`` times and refresh the mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

    def get_average(self):
        """Return the running mean."""
        return self.avg

    def get_value(self):
        """Return the most recently recorded value."""
        return self.val

# Helper for Pseudo Label

In [None]:
def inference_fn_pl(model, device, dataloader, tokenizer):
    """Run ``model`` over ``dataloader`` without gradients and stack the outputs.

    Args:
        model: Module called with a dict of tensors per batch.
        device: Device the model and every batch are moved to.
        dataloader: Iterable yielding dicts of tensors.
        tokenizer: Unused; kept for interface compatibility.

    Returns:
        ``torch.stack`` of the per-batch outputs (one entry per batch, so all
        batches must yield outputs of the same shape).
    """
    model.eval()
    model.to(device)

    batch_outputs = []
    for batch in tqdm(dataloader):
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        with torch.no_grad():
            batch_outputs.append(model(batch))

    return torch.stack(batch_outputs)

def inference_cv_pl(conf, model_path_dict, df):
    """Average the predictions of every fold model listed in ``model_path_dict``.

    Args:
        conf: Settings object; ``conf.target_cols`` names the output columns
            and the object is forwarded to ``load_model_pl``.
        model_path_dict: Mapping with ``'config'``, ``'tokenizer'`` and
            ``'models'`` (iterable of ``(fold_num, checkpoint_path)``) entries.
        df: Frame with ``text_id`` and ``full_text`` columns.

    Returns:
        DataFrame with ``text_id`` followed by one column per target holding
        the mean prediction across folds, indexed 0..len(df)-1.
    """
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    # Positional index so the column-wise concat below lines up with the
    # predictions even when `df` carries a non-default index.
    text_id_df = df['text_id'].reset_index(drop=True)

    model_config_path = model_path_dict['config']
    tokenizer = get_tokenizer_pl(model_path_dict['tokenizer'])

    print(f'Using "{model_config_path}"')

    fold_predictions_list = []
    for fold_num, model_path in model_path_dict['models']:
        model = load_model_pl(conf, device, fold_num, model_config_path, model_path)
        test_dataloader = get_dataloader_pl(df, tokenizer, 1)

        fold_predictions = inference_fn_pl(model, device, test_dataloader, tokenizer)
        fold_predictions_list.append(fold_predictions)

        # Release this fold's model before the next one is loaded.
        del model, test_dataloader

    cv_mean = torch.mean(torch.stack(fold_predictions_list), dim=0).cpu().numpy()
    # Explicit reshape instead of squeeze(): squeeze() would also collapse the
    # row dimension when `df` has a single row.
    cv_mean = cv_mean.reshape(-1, len(conf.target_cols))
    cv_mean_df = pd.DataFrame(cv_mean, columns=conf.target_cols)

    cv_df = pd.concat([text_id_df, cv_mean_df], axis=1)

    return cv_df

In [None]:
def get_dataloader_pl(df, tokenizer, batch_num):
    """Build an in-order DataLoader over ``df`` that pads each batch to its longest item.

    Args:
        df: Frame passed to ``FB3Dataset``.
        tokenizer: Tokenizer shared by the dataset and the padding collator.
        batch_num: Batch size.
    """
    dataset = FB3Dataset(df, tokenizer)
    collator = DataCollatorWithPadding(tokenizer=tokenizer, padding='longest')
    loader_options = dict(
        batch_size=batch_num,
        collate_fn=collator,
        num_workers=4,
        shuffle=False,
        pin_memory=True,
        drop_last=False,
    )
    return DataLoader(dataset, **loader_options)

def get_tokenizer_pl(tokenizer_path):
    """Load the tokenizer stored at ``tokenizer_path``."""
    return AutoTokenizer.from_pretrained(tokenizer_path)

def load_model_pl(conf, device, fold_num, config_path, pretrained_model_path):
    """Build a ``CustomModel`` from a saved config and load a fold checkpoint into it.

    The checkpoint file is expected to hold the weights under the
    ``'model_state_dict'`` key; tensors are mapped onto ``device`` while loading.
    """
    model = CustomModel(conf, fold_num, config_path=config_path)
    checkpoint = torch.load(pretrained_model_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    print(f'Loaded "{pretrained_model_path}"')
    return model

def pick_sample(num_data, pl_data_path='/kaggle/input/feedback-prize-2021/train/'):
    """Choose the ``.txt`` essay files to pseudo-label.

    Args:
        num_data: Number of files to draw with ``random.sample``, or the
            string ``'all'`` to return every file.
        pl_data_path: Directory searched for ``*.txt`` files.

    Returns:
        List of file paths.

    Raises:
        Exception: if ``num_data`` is neither an int nor ``'all'``.
        ValueError: (from ``random.sample``) if ``num_data`` exceeds the
            number of files available.
    """
    # glob returns files in arbitrary, filesystem-dependent order; sort so a
    # fixed random seed always selects the same files.
    file_list = sorted(glob.glob(os.path.join(pl_data_path, '*.txt')))
    if isinstance(num_data, int):
        selected_files = random.sample(file_list, num_data)
    elif num_data == 'all':
        selected_files = file_list
    else:
        raise Exception('Invalid num_data input')

    return selected_files

def make_pl_df(model_path_dict, selected_files):
    """Read the selected essays and attach cross-validated predictions to them.

    Args:
        model_path_dict: Mapping produced for one experiment; its ``'yaml'``
            entry is loaded as the model settings and the whole mapping is
            forwarded to ``inference_cv_pl``.
        selected_files: Paths of UTF-8 text files; each file stem becomes the
            ``text_id``.

    Returns:
        DataFrame with ``text_id``, ``full_text`` and the predicted columns.
    """
    with open(model_path_dict['yaml']) as yaml_file:
        conf = types.SimpleNamespace(**yaml.safe_load(yaml_file))

    records = []
    for txt_path in selected_files:
        with open(txt_path, mode='r', encoding='utf-8') as handle:
            records.append([Path(txt_path).stem, handle.read()])

    fb_df = pd.DataFrame(records, columns=['text_id', 'full_text'])
    predictions_df = inference_cv_pl(conf, model_path_dict, fb_df)
    pl_df = fb_df.merge(predictions_df, how='inner', on='text_id')

    # Free cached GPU memory and collect garbage after the inference pass.
    torch.cuda.empty_cache()
    gc.collect()

    return pl_df

def concat_pl(train_df, pl_df):
    """Stack ``pl_df`` under ``train_df``, keeping the first row per ``text_id``."""
    stacked = pd.concat([train_df, pl_df], axis=0)
    deduplicated = stacked.drop_duplicates(subset=['text_id'])
    return deduplicated.reset_index(drop=True)

def combine_result(df_list, method=None):
    """Blend several prediction frames into one.

    Args:
        df_list: Frames that each contain ``text_id`` plus the six target
            columns, with rows in the same order.
        method: ``'mean'`` for a plain average, or a list of weights (one per
            frame) for a weighted average.

    Returns:
        DataFrame with the ``text_id`` of the first frame and the blended
        target columns, indexed from 0.

    Raises:
        Exception: if a weight list does not match ``df_list`` in length.
        ValueError: if ``method`` is neither ``'mean'`` nor a list (including
            the default ``None``), which previously ended in an
            UnboundLocalError.
    """
    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    # Positional index so ids line up with the freshly built prediction frame.
    text_id_df = df_list[0]['text_id'].reset_index(drop=True)
    np_pred_list = [df[target_cols].values for df in df_list]

    if method == 'mean':
        blended = np.mean(np_pred_list, axis=0)
    elif isinstance(method, list):
        if len(df_list) != len(method):
            raise Exception('Weight len or df_list is not equivalent')
        blended = np.average(np_pred_list, axis=0, weights=method)
    else:
        raise ValueError("method must be 'mean' or a list of weights")

    blended_df = pd.DataFrame(blended, columns=target_cols)
    return pd.concat([text_id_df, blended_df], axis=1)

# Making Pseudo Label

In [None]:
# Experiment directories of the ensemble members, one list per training seed.
# Order matters: make_pl_fold pairs each entry with the weight at the same position.
model_path_list_s42 = [
    '/kaggle/input/fb3-deberta-v3-base/exp43s42',
    '/kaggle/input/fb3-deberta-v3-large/exp50s42',
    '/kaggle/input/fb3-bigbird-roberta-base/exp49s42',
    '/kaggle/input/fb3-roberta-large/exp54s42',
    '/kaggle/input/fb3-longformer-large/exp57s42',
]

# The seed-12 ensemble has four members (no bigbird-roberta-base entry).
model_path_list_s12 = [
    '/kaggle/input/fb3-deberta-v3-base/exp43s12',
    '/kaggle/input/fb3-deberta-v3-large/exp50s12',
    '/kaggle/input/fb3-roberta-large/exp54s12',
    '/kaggle/input/fb3-longformer-large/exp57s12',
]

In [None]:
def make_pl_fold(model_path_list, seed, sample_num, weight_list, folds=(0, 1, 2, 3)):
    """Create one pseudo-label CSV per fold from a weighted model ensemble.

    For every fold, each experiment in ``model_path_list`` predicts the same
    sampled essays with its checkpoint for that fold; the predictions are
    combined with ``weight_list`` and written to ``./pl_s{seed}_f{fold}.csv``.

    Args:
        model_path_list: Experiment directories of the ensemble members.
        seed: Seed applied before sampling; also used in the file name.
        sample_num: Forwarded to ``pick_sample`` (an int or ``'all'``).
        weight_list: One blending weight per entry of ``model_path_list``.
        folds: Fold numbers to process.

    Raises:
        Exception: if ``model_path_list`` and ``weight_list`` differ in length.
    """
    if len(model_path_list) != len(weight_list):
        raise Exception('model_path_list and weight_list len is not equivalent')

    target_cols = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    seed_everything(seed)
    selected_samples = pick_sample(sample_num)

    for f in folds:
        print(f'Start creating pl fold {f}')
        pl_f_targets_list = []
        for model_path in model_path_list:
            model_path_dict_path = get_model_path_dict(model_path, [f])
            pl_f_df = make_pl_df(model_path_dict_path, selected_samples)
            pl_f_targets_list.append(pl_f_df[target_cols].values)

        mean_pl_f = np.average(pl_f_targets_list, axis=0, weights=weight_list)
        mean_pl_f_df = pd.DataFrame(mean_pl_f, columns=target_cols)
        # Ids and texts are taken from the last model's frame; every model
        # receives the same `selected_samples` list.
        full_pl_f_df = pd.concat([pl_f_df[['text_id', 'full_text']], mean_pl_f_df], axis=1)
        # `index=False` belongs to to_csv; it used to be passed to Path(),
        # so the row index was written as an extra unnamed column.
        full_pl_f_df.to_csv(Path('./', f'pl_s{seed}_f{f}.csv'), index=False)
        print('==========================')
        print('')

In [None]:
# 42
# [0.5251053345458068, 0.22003221662481717, 0.8542359345530548, 0.42515310855913024, 0.42911461149981833]
# 0.44652450528900384

# [0.6273137771770656, 0.8787619240919684, 0.45825589785978954, 0.48178205912577515]
# 0.44665768571664594

# 12
# [0.7031174591345418, 0.17526885209382478, 0.8931524975803248, 0.5275256661576303, 0.6753998852501416]
# 0.44691928761081545

# [0.8074348842019702, 0.9534285472400551, 0.579247898557129, 0.7839275842173122]
# 0.4469713746715202

In [None]:
# Number of essays to sample for pseudo-labelling (passed to make_pl_fold below).
sample_num = 4000

In [None]:
# Generate the pseudo-label CSVs. Only the seed-12 ensemble call is active; the
# seed-42 call is kept commented out for reference.
# NOTE(review): the saved cell output lists exp*s42 checkpoints, so it appears to
# come from the seed-42 run rather than the active call — re-run to refresh it.
# make_pl_fold(model_path_list_s42, 42, sample_num, [0.5251053345458068, 0.22003221662481717, 0.8542359345530548, 0.42515310855913024, 0.42911461149981833])
make_pl_fold(model_path_list_s12, 12, sample_num, [0.8074348842019702, 0.9534285472400551, 0.579247898557129, 0.7839275842173122])

Start creating pl fold 0
Using "/kaggle/input/fb3-deberta-v3-base/exp43s42/deberta-v3-base_config.pt"
Loaded "/kaggle/input/fb3-deberta-v3-base/exp43s42/best-epoch-fold0.pt"


  0%|          | 0/4000 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Using "/kaggle/input/fb3-deberta-v3-large/exp50s42/deberta-v3-large_config.pt"
Loaded "/kaggle/input/fb3-deberta-v3-large/exp50s42/best-epoch-fold0.pt"


  0%|          | 0/4000 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Using "/kaggle/input/fb3-bigbird-roberta-base/exp49s42/bigbird-roberta-base_config.pt"
Loaded "/kaggle/input/fb3-bigbird-roberta-base/exp49s42/best-epoch-fold0.pt"


  0%|          | 0/4000 [00:00<?, ?it/s]

Attention type 'block_sparse' is not possible if sequence_length: 420 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


Using "/kaggle/input/fb3-roberta-large/exp54s42/roberta-large_config.pt"
Loaded "/kaggle/input/fb3-roberta-large/exp54s42/best-epoch-fold0.pt"


  0%|          | 0/4000 [00:00<?, ?it/s]

Using "/kaggle/input/fb3-longformer-large/exp57s42/longformer-large-4096_config.pt"
Loaded "/kaggle/input/fb3-longformer-large/exp57s42/best-epoch-fold0.pt"


  0%|          | 0/4000 [00:00<?, ?it/s]


Start creating pl fold 1
Using "/kaggle/input/fb3-deberta-v3-base/exp43s42/deberta-v3-base_config.pt"
Loaded "/kaggle/input/fb3-deberta-v3-base/exp43s42/best-epoch-fold1.pt"


  0%|          | 0/4000 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Using "/kaggle/input/fb3-deberta-v3-large/exp50s42/deberta-v3-large_config.pt"
Loaded "/kaggle/input/fb3-deberta-v3-large/exp50s42/best-epoch-fold1.pt"


  0%|          | 0/4000 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Using "/kaggle/input/fb3-bigbird-roberta-base/exp49s42/bigbird-roberta-base_config.pt"
Loaded "/kaggle/input/fb3-bigbird-roberta-base/exp49s42/best-epoch-fold1.pt"


  0%|          | 0/4000 [00:00<?, ?it/s]

Attention type 'block_sparse' is not possible if sequence_length: 420 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


Using "/kaggle/input/fb3-roberta-large/exp54s42/roberta-large_config.pt"
Loaded "/kaggle/input/fb3-roberta-large/exp54s42/best-epoch-fold1.pt"


  0%|          | 0/4000 [00:00<?, ?it/s]

Using "/kaggle/input/fb3-longformer-large/exp57s42/longformer-large-4096_config.pt"
Loaded "/kaggle/input/fb3-longformer-large/exp57s42/best-epoch-fold1.pt"


  0%|          | 0/4000 [00:00<?, ?it/s]


Start creating pl fold 2
Using "/kaggle/input/fb3-deberta-v3-base/exp43s42/deberta-v3-base_config.pt"
Loaded "/kaggle/input/fb3-deberta-v3-base/exp43s42/best-epoch-fold2.pt"


  0%|          | 0/4000 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Using "/kaggle/input/fb3-deberta-v3-large/exp50s42/deberta-v3-large_config.pt"
Loaded "/kaggle/input/fb3-deberta-v3-large/exp50s42/best-epoch-fold2.pt"


  0%|          | 0/4000 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Using "/kaggle/input/fb3-bigbird-roberta-base/exp49s42/bigbird-roberta-base_config.pt"
Loaded "/kaggle/input/fb3-bigbird-roberta-base/exp49s42/best-epoch-fold2.pt"


  0%|          | 0/4000 [00:00<?, ?it/s]

Attention type 'block_sparse' is not possible if sequence_length: 420 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


Using "/kaggle/input/fb3-roberta-large/exp54s42/roberta-large_config.pt"
Loaded "/kaggle/input/fb3-roberta-large/exp54s42/best-epoch-fold2.pt"


  0%|          | 0/4000 [00:00<?, ?it/s]

Using "/kaggle/input/fb3-longformer-large/exp57s42/longformer-large-4096_config.pt"
Loaded "/kaggle/input/fb3-longformer-large/exp57s42/best-epoch-fold2.pt"


  0%|          | 0/4000 [00:00<?, ?it/s]


Start creating pl fold 3
Using "/kaggle/input/fb3-deberta-v3-base/exp43s42/deberta-v3-base_config.pt"
Loaded "/kaggle/input/fb3-deberta-v3-base/exp43s42/best-epoch-fold3.pt"


  0%|          | 0/4000 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Using "/kaggle/input/fb3-deberta-v3-large/exp50s42/deberta-v3-large_config.pt"
Loaded "/kaggle/input/fb3-deberta-v3-large/exp50s42/best-epoch-fold3.pt"


  0%|          | 0/4000 [00:00<?, ?it/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Using "/kaggle/input/fb3-bigbird-roberta-base/exp49s42/bigbird-roberta-base_config.pt"
Loaded "/kaggle/input/fb3-bigbird-roberta-base/exp49s42/best-epoch-fold3.pt"


  0%|          | 0/4000 [00:00<?, ?it/s]

Attention type 'block_sparse' is not possible if sequence_length: 420 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...


Using "/kaggle/input/fb3-roberta-large/exp54s42/roberta-large_config.pt"
Loaded "/kaggle/input/fb3-roberta-large/exp54s42/best-epoch-fold3.pt"


  0%|          | 0/4000 [00:00<?, ?it/s]

Using "/kaggle/input/fb3-longformer-large/exp57s42/longformer-large-4096_config.pt"
Loaded "/kaggle/input/fb3-longformer-large/exp57s42/best-epoch-fold3.pt"


  0%|          | 0/4000 [00:00<?, ?it/s]


