In [1]:
!pip install '/kaggle/input/lal-ae2-wheel/pyspellchecker-0.8.1-py3-none-any.whl'

Processing /kaggle/input/lal-ae2-wheel/pyspellchecker-0.8.1-py3-none-any.whl
Installing collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.1


In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
import random
import yaml
import time
import datetime
import os
import sys
import warnings
from tqdm.notebook import trange, tqdm
from types import SimpleNamespace
from collections import OrderedDict

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader, SequentialSampler, RandomSampler
from torch.cuda.amp import autocast, GradScaler

import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig, DataCollatorWithPadding

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score, confusion_matrix, ConfusionMatrixDisplay, f1_score

import polars as pl
import matplotlib.pyplot as plt
from spellchecker import SpellChecker
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import lightgbm as lgb

# Report interpreter and library versions so a run can be tied to its environment.
print(f'python version: {sys.version}')
print(f'torch version: {torch.__version__}')
print(f'transformers version: {transformers.__version__}')

# Ask the tokenizers library not to use its internal parallelism.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# NOTE: this hides every Python warning for the rest of the session, including
# deprecation notices from the libraries imported above.
warnings.simplefilter('ignore')

2024-05-04 12:46:40.437179: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-04 12:46:40.437311: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-04 12:46:40.585136: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


python version: 3.10.13 | packaged by conda-forge | (main, Dec 23 2023, 15:36:39) [GCC 12.3.0]
torch version: 2.1.2
transfromers version: 4.39.3


In [3]:
#https://gist.github.com/ihoromi4/b681a9088f348942b01711f251e5f964
def seed_everything(seed: int):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Value applied to Python's ``random``, ``PYTHONHASHSEED``, NumPy
            and PyTorch (CPU and all CUDA devices).

    cuDNN is also put in deterministic mode. ``cudnn.benchmark`` is turned
    off because benchmark mode picks convolution algorithms by timing them,
    which can differ between runs and works against ``deterministic = True``.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU (not only the current one); a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [4]:
# Root of the official competition data; only the commented-out loaders below use it.
data_path = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/'
# train_df = pd.read_csv(data_path + "train.csv")
# test_df = pd.read_csv(data_path + "test.csv")
# samp_df = pd.read_csv(data_path + "sample_submission.csv")

# NOTE: despite its name, `test_df` is read from a file called
# train_df_non_overlapped.csv; it is the frame the models are run on below.
test_df = pd.read_csv('/kaggle/input/lal-aes2-create-prompt-data/train_df_non_overlapped.csv')

# Dataset

In [5]:
class AE2Dataset(Dataset):
    """Tokenizes one essay per item, optionally paired with its label and id.

    Args:
        conf: Namespace providing ``train_col``, ``max_len`` and, when labels
            are used, ``target_col``, ``num_labels`` and ``criterion``.
        df: Frame holding the text column (plus ``essay_id`` and the target
            column when labels are used).
        tokenizer: Callable invoked with the text and tokenizer keyword
            arguments; must return a mapping of integer sequences.
        output_tokens_only: If True, items are just the token dict and no
            label/id columns are read from ``df``.
    """

    def __init__(self, conf, df, tokenizer, output_tokens_only=False):
        self.conf = conf
        self.tokenizer = tokenizer
        self.output_tokens_only = output_tokens_only
        self.full_texts = df[conf.train_col].reset_index(drop=True).values

        if output_tokens_only:
            # Inference-only mode: labels and ids are never touched.
            return

        self.essay_ids = df['essay_id'].reset_index(drop=True).values
        targets = df[conf.target_col].reset_index(drop=True).values
        is_regression = conf.num_labels == 1
        if is_regression and conf.criterion == 'bce':
            # BCE regression trains on targets divided by 5.
            targets = targets / 5.0
        self.labels = targets
        # Regression uses float targets, classification uses integer class ids.
        self.label_dtype = torch.float if is_regression else torch.long

    def __len__(self):
        return len(self.full_texts)

    def __getitem__(self, idx):
        encoded = self._get_token(idx)
        if self.output_tokens_only:
            return encoded
        return {
            'tokens': encoded,
            'labels': self._get_label(idx),
            'ids': self.essay_ids[idx],
        }

    def _get_token(self, idx):
        """Tokenize text ``idx`` (padded/truncated to ``conf.max_len``) into long tensors."""
        encoding = self.tokenizer(
            self.full_texts[idx],
            add_special_tokens=True,
            max_length=self.conf.max_len,
            padding="max_length",
            truncation=True,
            return_tensors=None,
        )
        tensors = {}
        for key, values in encoding.items():
            tensors[key] = torch.tensor(values, dtype=torch.long)
        return tensors

    def _get_label(self, idx):
        """Return label ``idx`` as a tensor of the dtype chosen in ``__init__``."""
        return torch.tensor(self.labels[idx], dtype=self.label_dtype)

# Model

In [6]:
class MeanPooling(nn.Module):
    """Averages the last hidden state over the positions kept by the attention mask."""

    def __init__(self):
        super().__init__()

    def forward(self, backbone_outputs, inputs):
        """Return the mask-weighted mean of ``last_hidden_state`` along dim 1.

        Args:
            backbone_outputs: Mapping with a ``'last_hidden_state'`` tensor.
            inputs: Mapping with an ``'attention_mask'`` tensor.
        """
        token_states = backbone_outputs['last_hidden_state']
        mask = inputs['attention_mask'].unsqueeze(-1).expand(token_states.size()).float()
        masked_total = (token_states * mask).sum(dim=1)
        # Clamp so a fully masked row divides by 1e-9 instead of zero.
        token_counts = mask.sum(dim=1).clamp(min=1e-9)
        return masked_total / token_counts
    
class ConcatPooling(nn.Module):
    """Concatenates the first-token vector of the last ``pooling_last`` hidden layers."""

    def __init__(self, pooling_last=4):
        super().__init__()
        self.pooling_last = pooling_last

    def forward(self, backbone_outputs, inputs):
        """Return the position-0 vectors of the final layers, last layer first.

        ``inputs`` is accepted for interface parity with the other poolers
        and is not used.
        """
        layer_stack = torch.stack(backbone_outputs['hidden_states'])
        first_token_per_layer = [
            layer_stack[-offset][:, 0]
            for offset in range(1, self.pooling_last + 1)
        ]
        return torch.cat(first_token_per_layer, dim=-1)
    
# https://www.kaggle.com/competitions/google-quest-challenge/discussion/129840    
class WeightedLayerPooling(nn.Module):
    """Weighted average of hidden layers ``layer_start`` onward, read at position 0.

    Args:
        num_hidden_layers: Number of backbone layers (the stacked hidden
            states are expected to hold one more entry than this).
        layer_start: Index of the first stacked hidden state to include.
        layer_weights: Optional per-layer weights. When omitted, a learnable
            parameter of ones is created, one weight per included layer.
    """

    def __init__(self, num_hidden_layers, layer_start: int = 4, layer_weights = None):
        super().__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        if layer_weights is None:
            layer_weights = nn.Parameter(
                torch.tensor([1] * (num_hidden_layers + 1 - layer_start), dtype=torch.float)
            )
        self.layer_weights = layer_weights

    def forward(self, backbone_outputs, inputs):
        """Return the weight-normalised layer mix at sequence position 0.

        ``inputs`` is unused; it is kept so every pooler shares one signature.
        """
        layer_stack = torch.stack(backbone_outputs['hidden_states'])
        selected = layer_stack[self.layer_start:, :, :, :]

        # Give the weights three trailing singleton axes, then expand them
        # to the shape of the selected layers.
        weights = self.layer_weights
        for _ in range(3):
            weights = weights.unsqueeze(-1)
        weights = weights.expand(selected.size())

        pooled = (weights * selected).sum(dim=0) / self.layer_weights.sum()
        return pooled[:, 0]
    
# https://www.kaggle.com/code/rhtsingh/utilizing-transformer-representations-efficiently
class LSTMPooling(nn.Module):
    """Runs an LSTM over the position-0 vectors of hidden layers 1..num_layers.

    Args:
        num_layers: Number of hidden layers to read (layers 1 through
            ``num_layers`` of the stacked hidden states).
        hidden_size: Input feature size given to the LSTM.
        hiddendim_lstm: Hidden size of the LSTM.
    """
    def __init__(self, num_layers, hidden_size, hiddendim_lstm):
        super().__init__()
        self.num_hidden_layers = num_layers
        self.hidden_size = hidden_size
        self.hiddendim_lstm = hiddendim_lstm
        self.lstm = nn.LSTM(self.hidden_size, self.hiddendim_lstm, batch_first=True)
        self.dropout = nn.Dropout(0.1)
    
    def forward(self, backbone_outputs, inputs):
        """Return the dropout-regularised last LSTM step; ``inputs`` is unused."""
        # modified for cleaner code
        all_hidden_states = torch.stack(backbone_outputs['hidden_states'])
        #
        # The per-layer vectors are stacked on the LAST axis (layer axis last) and
        # then view()-ed to (-1, num_hidden_layers, hidden_size) without a
        # transpose, so the rows handed to the LSTM are a reinterpretation of that
        # buffer rather than one contiguous row per layer.
        # NOTE(review): looks unintended, but any checkpoint trained with this
        # layout depends on it — confirm against training code before changing.
        hidden_states = torch.stack([all_hidden_states[layer_i][:, 0].squeeze()
                                     for layer_i in range(1, self.num_hidden_layers+1)], dim=-1)
        hidden_states = hidden_states.view(-1, self.num_hidden_layers, self.hidden_size)
        out, _ = self.lstm(hidden_states, None)
        # Keep only the final step of the LSTM output sequence.
        out = self.dropout(out[:, -1, :])
        return out

In [7]:
class CustomModel(nn.Module):
    """Transformer backbone + pooling layer + linear head.

    Args:
        conf: Namespace providing ``model_name``, ``multi_dropout``,
            ``gradient_checkpointing``, ``pooling_layer``, ``num_labels`` and
            the option used by the chosen pooler (``ccp_pooling_last``,
            ``wlp_layer_start`` or ``lstm_hidden``).
        conf_path: Optional path to a saved backbone config. When given, the
            backbone is built from that config without pretrained weights
            (they are expected to come from a state dict loaded later);
            otherwise config and weights are fetched from ``conf.model_name``.
    """

    def __init__(self, conf, conf_path=None):
        super().__init__()
        self.conf_path = conf_path
        self.multi_dropout = conf.multi_dropout
        if not self.conf_path:
            self.model_conf = AutoConfig.from_pretrained(conf.model_name, output_hidden_states=True)
            self.model_conf = self._set_dropout(self.model_conf)
            self.backbone = AutoModel.from_pretrained(conf.model_name, config=self.model_conf)
        else:
            # NOTE: torch.load unpickles the file, so only load configs from a
            # trusted source.
            self.model_conf = torch.load(self.conf_path)
            self.backbone = AutoModel.from_config(self.model_conf)

        if conf.gradient_checkpointing:
            self.backbone.gradient_checkpointing_enable()

        self.pooler, hidden_size = self.get_pooling_layer(conf)
        self.fc = nn.Linear(hidden_size, conf.num_labels)
        self._init_weights(self.fc)

        # Multi-sample dropout is only used for classification heads. The list
        # is always defined (possibly empty) so forward() never touches a
        # missing attribute when multi_dropout is set on a regression model.
        use_multi_dropout = bool(self.multi_dropout) and conf.num_labels > 1
        self.dropouts = nn.ModuleList(
            [nn.Dropout(0.5) for _ in range(5)] if use_multi_dropout else []
        )

    def _set_dropout(self, model_conf, ratio=0.):
        """Set the attention/hidden dropout fields of ``model_conf`` to ``ratio``."""
        model_conf.attention_dropout = ratio
        model_conf.attention_probs_dropout_prob = ratio
        model_conf.hidden_dropout = ratio
        model_conf.hidden_dropout_prob = ratio

        return model_conf

    def get_pooling_layer(self, conf):
        """Build the pooler named by ``conf.pooling_layer``.

        Returns:
            ``(pooler, feature_size)`` where ``feature_size`` is the width of
            the pooler output, i.e. the input size of the linear head.

        Raises:
            Exception: If ``conf.pooling_layer`` is not a known name.
        """
        name = conf.pooling_layer
        hidden_size = self.model_conf.hidden_size
        if name == 'mean_pooling':
            return MeanPooling(), hidden_size
        if name == 'concat_pooling':
            return ConcatPooling(conf.ccp_pooling_last), hidden_size * conf.ccp_pooling_last
        if name == 'weighted_layer_pooling':
            return WeightedLayerPooling(self.model_conf.num_hidden_layers, conf.wlp_layer_start), hidden_size
        if name == 'lstm_pooling':
            # The LSTM pooler emits conf.lstm_hidden features, so that (not the
            # backbone width) is what the head must accept.
            pooler = LSTMPooling(self.model_conf.num_hidden_layers, hidden_size, conf.lstm_hidden)
            return pooler, conf.lstm_hidden
        raise Exception('Invalid pooling layer name')

    def _init_weights(self, module):
        """Initialise ``module`` using the backbone's ``initializer_range``."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.model_conf.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.model_conf.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, inputs):
        """Run backbone, pooler and head on a dict of backbone keyword arguments.

        With multi-sample dropout active, the head is applied to several
        independently dropped-out copies of the pooled features and the
        results are averaged.
        """
        backbone_outputs = self.backbone(**inputs)
        pooler_outputs = self.pooler(backbone_outputs, inputs)
        if len(self.dropouts) > 0:
            total = None
            for dropout in self.dropouts:
                sample = self.fc(dropout(pooler_outputs))
                total = sample if total is None else total + sample
            return total / len(self.dropouts)
        return self.fc(pooler_outputs)

# Utils

In [8]:
def preprocess_data(df_):
    """Return a cleaned copy of ``df_``; the input frame is not modified.

    Rows whose ``essay_id`` is in the drop list are removed and the index is
    reset. In ``full_text``, non-breaking spaces become regular spaces,
    newlines become ``|`` and surrounding whitespace is stripped.
    """
    # The repeated id is kept exactly as found; isin() is unaffected by it.
    drop_id_list = ['e9be80d', '6017fea', 'e9be80d']

    frame = df_.copy()
    keep_mask = ~frame['essay_id'].isin(drop_id_list)
    frame = frame[keep_mask].reset_index(drop=True)

    # Disabled experiments, kept for reference:
    #     labels_map = {
    #         'Car-free cities': 0,
    #         '"A Cowboy Who Rode the Waves"': 1,
    #         'Exploring Venus': 2,
    #         'Facial action coding system': 3,
    #         'The Face on Mars': 4,
    #         'Driverless cars': 5,
    #         'Does the electoral college work?': 6
    #     }
    #     df['prompt_name'] = df['prompt_name'].replace(labels_map).astype(int)
    #     df['label'] = df['score'].copy() - 1

    frame['full_text'] = (
        frame['full_text']
        .str.replace('\xa0', ' ')
        .str.replace('\n', '|')
        .str.strip()
    )
    return frame

## LGBM Preprocess and Feature Engineering

In [9]:
def __tokenizer(x):
    """Identity function: returns ``x`` unchanged."""
    return x
def __preprocessor(x):
    """Identity function: returns ``x`` unchanged."""
    return x

# Shared spell checker; count_spelling_errors looks words up in it.
spell = SpellChecker()
def count_spelling_errors(txt):
    """Count the space-separated tokens of ``txt`` whose ``spell[token]`` lookup is 0.

    NOTE(review): splitting on a single space yields empty-string tokens for
    consecutive spaces or empty text; whether those count as errors depends
    on what ``spell['']`` returns — confirm. Left unchanged so the feature
    stays identical to whatever the downstream models were fitted on.
    """
    return sum(1 for word in txt.split(' ') if spell[word] == 0)

def data_preprocessing(x, data_type):
    """Lower-case, pattern-replace and strip the text column ``data_type`` of ``x``.

    Args:
        x: Polars frame containing the column named by ``data_type``.
        data_type: Name of the text column to clean in place.

    NOTE(review): the patterns are written as regular expressions, but polars
    documents ``str.replace_many`` as literal (Aho-Corasick) matching —
    confirm the intended behaviour. Left unchanged so features stay
    consistent with previously fitted vectorizers/models.
    """
    # Raw-string spellings; the runtime values equal the original literals.
    patterns = [r'<.*?>', r'@\w+', r"'\d+", r'\d+', r'http\w+', r'\s+', r'\.+', r'\,+', '\xa0']
    replacements = ['', '', '', '', '', ' ', '.', ',', ' ']
    text = pl.col(data_type)
    return (
        x.with_columns(text.str.to_lowercase())
        .with_columns(text.str.replace_many(patterns, replacements))
        .with_columns(text.str.strip_chars())
    )

def paragraph_preprocess(df):
    """Explode ``paragraph`` into one row per paragraph and add per-paragraph stats.

    Expects ``df`` to already contain a ``paragraph`` column that can be
    exploded. Adds ``p_no_punctuation``, ``p_error_num``, ``p_len``,
    ``p_sentence_count`` (number of '.' matches) and ``p_word_count``
    (number of space characters).
    """
    paragraph = pl.col('paragraph')
    df = data_preprocessing(df.explode('paragraph'), 'paragraph')
    df = df.with_columns(
        paragraph.str.replace_all(r'[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]', '').alias('p_no_punctuation')
    )
    # Spelling errors are counted on the punctuation-free text.
    df = df.with_columns(
        pl.col('p_no_punctuation')
        .map_elements(count_spelling_errors, return_dtype=pl.Int64)
        .alias('p_error_num')
    )
    return df.with_columns(
        paragraph.str.len_chars().alias('p_len'),
        paragraph.str.count_matches(r'\.').alias('p_sentence_count'),
        paragraph.str.count_matches(' ').alias('p_word_count'),
    )

def paragraph_eng(df):
    """Aggregate per-paragraph statistics into one feature row per ``essay_id``.

    Produces, in this column order: counts of paragraphs with
    ``p_len >= l`` for l in 0, 25, ..., 600; counts with ``p_len <= l`` for
    l in (24, 49); then max / mean / min / sum / first / last / kurtosis /
    q1 / q3 of each paragraph feature. Essay order is preserved.
    """
    p_features = ['p_error_num', 'p_len', 'p_sentence_count', 'p_word_count']
    # (suffix, aggregation) pairs; output columns follow this order, stat by stat.
    stat_builders = (
        ('max', lambda col: col.max()),
        ('mean', lambda col: col.mean()),
        ('min', lambda col: col.min()),
        ('sum', lambda col: col.sum()),
        ('first', lambda col: col.first()),
        ('last', lambda col: col.last()),
        ('kurtosis', lambda col: col.kurtosis()),
        ('q1', lambda col: col.quantile(0.25)),
        ('q3', lambda col: col.quantile(0.75)),
    )

    aggs = [
        pl.col('paragraph').filter(pl.col('p_len') >= l).count().alias(f'p_g{l}_count')
        for l in np.arange(0, 625, 25)
    ]
    aggs += [
        pl.col('paragraph').filter(pl.col('p_len') <= l).count().alias(f'p_l{l}_count')
        for l in [24, 49]
    ]
    for suffix, build in stat_builders:
        aggs += [build(pl.col(feat)).alias(f'{feat}_{suffix}') for feat in p_features]

    return df.group_by('essay_id', maintain_order=True).agg(aggs)

def sentence_preprocess(df):
    """Split cleaned ``full_text`` on '.' into one row per sentence.

    Adds ``sentence``, ``s_len`` (character count) and ``s_word_count``
    (number of space characters in the sentence).
    """
    sentence = pl.col('sentence')
    df = data_preprocessing(df, 'full_text')
    df = df.with_columns(pl.col('full_text').str.split('.').alias('sentence')).explode('sentence')
    return df.with_columns(
        sentence.str.len_chars().alias('s_len'),
        sentence.str.count_matches(' ').alias('s_word_count'),
    )

def sentence_eng(df):
    """Aggregate per-sentence statistics into one feature row per ``essay_id``.

    Produces counts of sentences with ``s_len >= l`` for l in 0, 50, ..., 300,
    the count with ``s_len <= 15``, then max / mean / min / sum / first /
    last / kurtosis / q1 / q3 of ``s_len`` and ``s_word_count``.
    """
    s_features = ['s_len', 's_word_count']
    # (suffix, aggregation) pairs; output columns follow this order, stat by stat.
    stat_builders = (
        ('max', lambda col: col.max()),
        ('mean', lambda col: col.mean()),
        ('min', lambda col: col.min()),
        ('sum', lambda col: col.sum()),
        ('first', lambda col: col.first()),
        ('last', lambda col: col.last()),
        ('kurtosis', lambda col: col.kurtosis()),
        ('q1', lambda col: col.quantile(0.25)),
        ('q3', lambda col: col.quantile(0.75)),
    )

    aggs = [
        pl.col('sentence').filter(pl.col('s_len') >= l).count().alias(f's_g{l}_count')
        for l in np.arange(0, 350, 50)
    ]
    aggs += [
        pl.col('sentence').filter(pl.col('s_len') <= l).count().alias(f's_l{l}_count')
        for l in [15]
    ]
    for suffix, build in stat_builders:
        aggs += [build(pl.col(feat)).alias(f'{feat}_{suffix}') for feat in s_features]

    return df.group_by('essay_id', maintain_order=True).agg(aggs)

def word_preprocess(df):
    """Split cleaned ``full_text`` on spaces into one row per non-empty word.

    Adds ``word`` and ``w_len`` (character count); rows with ``w_len == 0``
    are dropped.
    """
    words = (
        data_preprocessing(df, 'full_text')
        .with_columns(pl.col('full_text').str.split(' ').alias('word'))
        .explode('word')
        .with_columns(pl.col('word').str.len_chars().alias('w_len'))
    )
    return words.filter(pl.col('w_len') != 0)

def word_eng(df):
    """Aggregate word-length statistics into one feature row per ``essay_id``.

    Produces counts of words with ``w_len >= i`` for i in 1..20, followed by
    the max, mean, std and the 0.25 / 0.50 / 0.75 quantiles of ``w_len``.
    """
    word_len = pl.col('w_len')

    aggs = [
        pl.col('word').filter(word_len >= i).count().alias(f'w_{i}_count')
        for i in np.arange(1, 21)
    ]
    aggs.append(word_len.max().alias('w_len_max'))
    aggs.append(word_len.mean().alias('w_len_mean'))
    aggs.append(word_len.std().alias('w_len_std'))
    for label, fraction in (('q1', 0.25), ('q2', 0.50), ('q3', 0.75)):
        aggs.append(word_len.quantile(fraction).alias(f'w_len_{label}'))

    return df.group_by('essay_id', maintain_order=True).agg(aggs)

def _vectorizer_feats(df, vec_path, prefix):
    """Transform ``df['full_text']`` with a saved vectorizer into a dense polars frame.

    Columns are named ``{prefix}_0 .. {prefix}_{n-1}`` and ``essay_id`` is
    appended as the last column.
    """
    # NOTE: torch.load unpickles the file (arbitrary code can run), so only
    # load vectorizers from a trusted source.
    vectorizer = torch.load(vec_path)
    dense = vectorizer.transform([text for text in df['full_text']]).toarray()
    feats = pl.DataFrame(dense)
    feats.columns = [f'{prefix}_{i}' for i in range(len(feats.columns))]
    return feats.with_columns(essay_id=df['essay_id'])

def get_tfidf_vec_feats(df, vec_path):
    """Return TF-IDF features (``tfidf_vec_*`` columns plus ``essay_id``).

    Args:
        df: Frame with ``full_text`` and ``essay_id`` columns.
        vec_path: Path of the saved vectorizer, loaded with ``torch.load``.
    """
    return _vectorizer_feats(df, vec_path, 'tfidf_vec')

def get_count_vec_feats(df, vec_path):
    """Return count-vectorizer features (``count_vec_*`` columns plus ``essay_id``).

    Args:
        df: Frame with ``full_text`` and ``essay_id`` columns.
        vec_path: Path of the saved vectorizer, loaded with ``torch.load``.
    """
    return _vectorizer_feats(df, vec_path, 'count_vec')

def quadratic_weighted_kappa(y_true, y_pred):
    """Evaluation metric returning ``('QWK', score, True)``.

    Both inputs are shifted by the module-level offset ``a``; predictions are
    then clipped to [1, 6] and rounded before the quadratic-weighted Cohen's
    kappa is computed.
    """
    shifted_true = y_true + a
    rounded_pred = (y_pred + a).clip(1, 6).round()
    score = cohen_kappa_score(shifted_true, rounded_pred, weights="quadratic")
    return 'QWK', score, True
def qwk_obj(y_true, y_pred):
    """Custom objective returning ``(grad, hess)`` for a smooth QWK-style loss.

    Targets and predictions are shifted by ``a`` and predictions are clipped
    to [1, 6]. With ``f`` = half the squared error and ``g`` = half the sum of
    ``(preds - a)**2 + b``, the gradient is that of ``f / g`` scaled by the
    sample count; the Hessian is a vector of ones.
    """
    targets = y_true + a
    shifted = (y_pred + a).clip(1, 6)
    n_samples = len(targets)

    residual = shifted - targets
    centered = shifted - a
    f = 1/2*np.sum(residual**2)
    g = 1/2*np.sum(centered**2+b)

    grad = (residual/g - f*centered/g**2)*n_samples
    hess = np.ones(n_samples)
    return grad, hess

# Offset `a` and constant `b` used by quadratic_weighted_kappa and qwk_obj.
# Earlier candidate values, kept for reference:
# a = 2.998
# b = 1.092

# (2.948402380539666, 1.0918134361390224)

a = 2.948
b = 1.092

# Infer Model

In [10]:
class InferModels:
    """Registry and runner for fold-wise transformer inference.

    Models are registered by name together with the paths of their artifacts.
    ``predict_cv`` runs every registered fold and stores per-fold predictions
    on the registry entry; ``ensemble`` combines them into one prediction per
    essay.
    """

    def __init__(self):
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.models_dict = OrderedDict()
        self.lgbm_models_dict = OrderedDict()

    def register(self, model_name, path, fold_num_list):
        """Register a transformer model stored under ``path`` for the given folds."""
        model_config_dict = self._load_config_path(path, fold_num_list)
        self.models_dict[model_name] = model_config_dict
        print(f'REGISTERED: {model_name}')

    def register_lgbm(self, model_name, path):
        """Register the artifact paths of a LightGBM model stored under ``path``."""
        model_config_dict = self._load_lgbm_config_path(path)
        self.lgbm_models_dict[model_name] = model_config_dict
        print(f'REGISTERED: {model_name}')

    def predict_cv(self, test_df):
        """Predict ``test_df`` with every fold of every registered transformer model.

        For each model a frame with ``essay_id`` and one ``class_f{i}`` column
        per fold (rounded predictions, ``i`` being the fold's position in the
        registration order) is stored under ``models_dict[name]['cv_df']``.
        """
        test_df = preprocess_data(test_df)
        essay_id_df = test_df['essay_id']

        for model_name, model_config_dict in self.models_dict.items():

            with open(model_config_dict['yaml']) as file:
                conf = SimpleNamespace(**yaml.safe_load(file))

            seed_everything(conf.seed)

            model_config_path = model_config_dict['model_config']
            tokenizer = self._get_tokenizer(model_config_dict['tokenizer'])
            # The loader depends only on conf/tokenizer, so build it once per
            # model rather than once per fold.
            test_dataloader = self._get_test_dataloader(conf, test_df, tokenizer)

            fold_predictions_list = []
            for fold_num, model_path in model_config_dict['models'].items():
                print(f'INFERENCING: {model_name}, CONFIG_EXP: {conf.exp}, FOLD: {fold_num}')

                model = self._load_model(conf, model_config_path, model_path)
                fold_predictions, _ = self._predict(conf, model, test_dataloader)
                fold_predictions_list.append(fold_predictions.reshape(-1))

                # Release this fold's weights before the next checkpoint is loaded.
                del model
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

            fold_preds_array = np.rint(torch.stack(fold_predictions_list).cpu().numpy())
            fold_columns = {f'class_f{f}': preds for f, preds in enumerate(fold_preds_array)}

            cv_df = pd.concat([essay_id_df, pd.DataFrame(fold_columns)], axis=1)
            self.models_dict[model_name]['cv_df'] = cv_df

    def ensemble(self, method='mean'):
        """Combine the per-fold predictions of all transformer models.

        Args:
            method: ``'mean'`` for an unweighted average across models, or a
                list with one weight per transformer model (in registration
                order) for a weighted average.

        Returns:
            DataFrame with ``essay_id`` and an integer ``class`` column.

        Raises:
            Exception: If ``method`` is neither ``'mean'`` nor a weight list
                of matching length.

        NOTE(review): the averaged value is converted with ``astype(int)``,
        which truncates toward zero rather than rounding — confirm this is the
        intended way to turn the average into a class.
        """
        model_pred_df_list = [
            v['cv_df'][['essay_id'] + [f'class_f{i}' for i in range(len(v['models']))]]
            for v in self.models_dict.values()
        ]
        essay_id_df = model_pred_df_list[0]['essay_id']
        # One vector per model: the mean over that model's folds.
        pred_values = [np.mean(df.drop('essay_id', axis=1).values, axis=1) for df in model_pred_df_list]

        if method == 'mean':
            combined = np.mean(pred_values, axis=0)
        elif isinstance(method, list) and len(method) == len(pred_values):
            # Only transformer models contribute predictions here, so the
            # weights must match them one-to-one.
            combined = np.average(pred_values, axis=0, weights=method)
        else:
            raise Exception('method is invalid')

        # Keep the combined value inside [0, 6].
        pred_df = pd.DataFrame(np.clip(combined, 0, 6), columns=['class'])

        result = pd.concat([essay_id_df, pred_df], axis=1)
        result['class'] = result['class'].astype(int)

        return result

    def _predict(self, conf, model, dataloader):
        """Run ``model`` over ``dataloader`` without gradients.

        Returns:
            ``(predictions, raw_predictions)`` as CPU tensors: a 1-D float
            tensor with one processed prediction per sample, and the raw
            model outputs concatenated along the batch axis.
        """
        raw_predictions = []
        predictions = []

        model.eval()
        model.to(self.device)

        for inputs in tqdm(dataloader):

            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            with torch.no_grad():
                raw_outputs = model(inputs)

            outputs = self._process_outputs(conf, raw_outputs)
            raw_predictions.append(raw_outputs.detach().cpu())
            predictions.append(outputs.float().reshape(-1).cpu())

        if not predictions:
            return torch.empty(0), torch.empty(0)

        # cat (not stack) so a smaller final batch is handled when batch_num > 1.
        return torch.cat(predictions), torch.cat(raw_predictions)

    def _process_outputs(self, conf, outputs):
        """Convert raw model outputs into one prediction per sample.

        Regression heads (``num_labels == 1``) drop the trailing singleton
        axis; classification heads return the arg-max class index. With the
        ``'bce'`` criterion the value is passed through a sigmoid and scaled
        by 5, mirroring the label scaling done by the dataset.
        """
        if conf.num_labels == 1:
            outputs = outputs.squeeze(-1)
        else:
            outputs = outputs.softmax(1).argmax(-1)

        if conf.criterion == 'bce':
            outputs = outputs.sigmoid() * 5.0

        return outputs

    def _get_tokenizer(self, tokenizer_path):
        """Load the tokenizer saved at ``tokenizer_path``."""
        return AutoTokenizer.from_pretrained(tokenizer_path)

    def _load_config_path(self, path, fold_num_list):
        """Collect the artifact paths of a transformer experiment directory.

        Raises:
            FileNotFoundError: If no ``*_config.pt`` file exists under ``path``.
        """
        # /kaggle/input/fb-debertav3-roberta-large-seed0/exp54s0/best-epoch-fold3.pt
        config_files = sorted(Path(path).glob('*_config.pt'))
        if not config_files:
            raise FileNotFoundError(f"no '*_config.pt' file found in {path}")
        config_dict = {
            'yaml': f'{path}/config.yaml',
            'model_config': config_files[0].as_posix(),
            'tokenizer': f'{path}/tokenizers/',
            'models': {f: f'{path}/best_score_fold{f}.pt' for f in fold_num_list},
            'oof_df': f'{path}/oof_df.csv'
        }
        return config_dict

    def _load_lgbm_config_path(self, path):
        """Collect the artifact paths of a LightGBM experiment directory."""
        config_dict = {
            'models': f'{path}/models.bin',
            'selected_feats': f'{path}/selected_feats.yaml',
            'vectors': {
                'tf-idf': f'{path}/tfidf_vec.bin',
                'count': f'{path}/count_vec.bin'
            }
        }
        return config_dict

    def _load_model(self, conf, model_config_path, pretrained_model_path):
        """Build a CustomModel from the saved config and load one fold's weights."""
        model = CustomModel(conf, conf_path=model_config_path)
        state_dict = torch.load(pretrained_model_path, map_location=self.device)['model_state_dict']
        model.load_state_dict(state_dict)
        return model

    def _get_test_dataloader(self, conf, df, tokenizer, batch_num=1):
        """Create an ordered, token-only DataLoader over ``df``."""
        test_dataset = AE2Dataset(conf, df, tokenizer, output_tokens_only=True)
        test_dataloader = DataLoader(
            test_dataset,
            batch_size=batch_num,
            num_workers=0,
            collate_fn=DataCollatorWithPadding(tokenizer=tokenizer, padding='longest'),
            shuffle=False,
            pin_memory=True,
            drop_last=False,
        )
        return test_dataloader

    def print_registered_models(self):
        """Print the names of all registered transformer and LightGBM models."""
        for model_name in self.models_dict.keys():
            print(model_name)
        for model_name in self.lgbm_models_dict.keys():
            print(model_name)

# Infer

In [11]:
# When True, the inference cell displays the ensembled predictions and writes them to CSV.
submit = True

In [12]:
# Build the registry and register folds 0-2 of the model stored in the
# lal-aes2-train-prompt-name dataset under the name 'PROMPT_02'.
infermodels = InferModels()
infermodels.register('PROMPT_02', '/kaggle/input/lal-aes2-train-prompt-name', [0, 1, 2])
# infermodels.register('exp022', '/kaggle/input/lal-aes2-exp022', [0, 1, 2, 3])

# infermodels.print_registered_models()

REGISTERED: PROMPT_02


In [13]:
# Run every registered fold over test_df; per-fold predictions are stored on the registry.
infermodels.predict_cv(test_df)

# Average the folds/models into a single integer `class` per essay.
pred_df = infermodels.ensemble(method='mean')
if submit:
    display(pred_df)
    # Output is written next to the notebook.
    pred_df.to_csv('train_df_with_pred_prompt.csv', index=False)

# Disabled LightGBM path (predict_cv_ensemble_lgbm is not defined on InferModels in this notebook).
# lgbm_pred_df = infermodels.predict_cv_ensemble_lgbm(test_df)
# if submit:
#     display(lgbm_pred_df)
#     lgbm_pred_df.to_csv('submission.csv', index=False)

INFERENCING: PROMPT_02, CONFIG_EXP: PROMPT_02, FOLD: 0


  0%|          | 0/4434 [00:00<?, ?it/s]

INFERENCING: PROMPT_02, CONFIG_EXP: PROMPT_02, FOLD: 1


  0%|          | 0/4434 [00:00<?, ?it/s]

INFERENCING: PROMPT_02, CONFIG_EXP: PROMPT_02, FOLD: 2


  0%|          | 0/4434 [00:00<?, ?it/s]

Unnamed: 0,essay_id,class
0,000fe60,4
1,001ab80,5
2,001bdc0,2
3,0033037,3
4,0065bd6,5
...,...,...
4429,ffbd0b4,2
4430,ffcb061,1
4431,ffcb264,3
4432,ffd378d,2
