In [None]:
import os
import warnings
import logging
import shutil
import json
import random
import re
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize, treebank
from spellchecker import SpellChecker
import spacy
import lightgbm as lgb
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda import amp
from torch.optim import Adam, AdamW, SGD
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import KFold, GroupKFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import matplotlib.pyplot as plt
import seaborn as sns
import tokenizers

import transformers
from transformers import (
    AutoModel,
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    get_linear_schedule_with_warmup,
    get_cosine_schedule_with_warmup
)
from datasets import Dataset, load_dataset, load_from_disk, load_metric, disable_progress_bar
from tqdm import tqdm

# Report the library versions this notebook is running against.
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Keep the notebook output quiet: silence warnings, logging below CRITICAL,
# tokenizer fork warnings, TensorFlow C++ logs and `datasets` progress bars.
os.environ.update({
    "TOKENIZERS_PARALLELISM": "false",
    'TF_CPP_MIN_LOG_LEVEL': '3',
})
warnings.simplefilter("ignore")
logging.disable(logging.ERROR)
disable_progress_bar()

# Register `DataFrame.progress_apply` and friends.
tqdm.pandas()

In [None]:
import sys
sys.path.append('/kaggle/input/sentence-transformers')
from sentence_transformers import SentenceTransformer

In [None]:
import gc
import difflib
os.system('python -m pip install --no-index --find-links=/kaggle/input/reability py_readability_metrics')
from readability import Readability

In [None]:
# set random seed
def seed_everything(seed: int):
    """Seed every random number generator used by the notebook.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), exports ``PYTHONHASHSEED`` and puts cuDNN
    into deterministic mode.

    Args:
        seed: Value used for every generator.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every visible GPU, not only the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may select different kernels from run to run, which
    # defeats the deterministic flag above, so it has to stay disabled.
    torch.backends.cudnn.benchmark = False


seed_everything(seed=42)

In [None]:
# Directory that receives the notebook's outputs.
OUTPUT_DIR = './'
# `exist_ok=True` makes the call idempotent and avoids the check-then-create race.
os.makedirs(OUTPUT_DIR, exist_ok=True)

## Data loading

In [None]:
# Root folder of the competition files (note the trailing slash).
DATA_DIR = "/kaggle/input/commonlit-evaluate-student-summaries/"

# Prompt tables, summary tables and the submission template, one CSV each.
prompts_train = pd.read_csv(f"{DATA_DIR}prompts_train.csv")
prompts_test = pd.read_csv(f"{DATA_DIR}prompts_test.csv")
summaries_train = pd.read_csv(f"{DATA_DIR}summaries_train.csv")
summaries_test = pd.read_csv(f"{DATA_DIR}summaries_test.csv")
sample_submission = pd.read_csv(f"{DATA_DIR}sample_submission.csv")

#summaries_train = summaries_train.head(10) # for dev mode

In [None]:
# Attach the prompt columns to every test summary, keeping all summary rows.
test = pd.merge(summaries_test, prompts_test, how="left", on="prompt_id")

## Metrics

In [None]:
def compute_metrics(eval_pred):
    """Compute the RMSE between predictions and labels.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of array-likes.

    Returns:
        dict: ``{"rmse": value}``; the value is the per-column RMSE averaged
        over the output columns (a plain RMSE for single-column targets).
    """
    predictions, labels = eval_pred
    # `squared=False` is deprecated/removed in recent scikit-learn releases,
    # so take the root explicitly on the per-column MSE instead.
    column_mse = mean_squared_error(labels, predictions, multioutput="raw_values")
    rmse = float(np.mean(np.sqrt(column_mse)))
    return {"rmse": rmse}

def compute_mcrmse(eval_pred):
    """
    Mean columnwise root mean squared error (MCRMSE).
    https://www.kaggle.com/competitions/commonlit-evaluate-student-summaries/overview/evaluation

    ``eval_pred`` is a ``(predictions, labels)`` pair of arrays; column 0 is
    reported as content and column 1 as wording.
    """
    predictions, targets = eval_pred

    squared_error = (predictions - targets) ** 2
    per_column_rmse = np.sqrt(np.mean(squared_error, axis=0))

    report = {
        "content_rmse": per_column_rmse[0],
        "wording_rmse": per_column_rmse[1],
    }
    report["mcrmse"] = np.mean(per_column_rmse)
    return report

def compt_score(content_true, content_pred, wording_true, wording_pred):
    """Average of the content RMSE and the wording RMSE."""
    target_pairs = (
        (content_true, content_pred),
        (wording_true, wording_pred),
    )
    rmse_values = [
        mean_squared_error(truth, pred) ** (1 / 2) for truth, pred in target_pairs
    ]
    return (rmse_values[0] + rmse_values[1]) / 2

# Transformer Inference

In [None]:
# Empty container for transformer predictions; filled by later cells.
transformer_preds_dict = dict()

## Config

In [None]:
class CFG:
    """Hyper-parameter namespace for the ``debertav3base`` model."""

    # --- model / tokenisation ---
    model_name = "debertav3base"
    max_length = 512
    hidden_dropout_prob = 0.005
    attention_probs_dropout_prob = 0.005

    # --- optimisation ---
    learning_rate = 1.5e-5
    weight_decay = 0.02
    num_train_epochs = 5
    batch_size = 8
    save_steps = 100

    # --- cross-validation / reproducibility ---
    n_splits = 4
    random_seed = 42

class CFG2:
    """Settings for the exp023 artefacts (separate content / wording folders)."""

    # --- artefact locations ---
    path = [
        "/kaggle/input/exp023-content/exp023_content/",
        "/kaggle/input/exp023-wording/exp023_wording/",
    ]
    config_path = path[0] + 'config.pth'  # config is read from the first folder

    # --- backbone / head ---
    model = "microsoft/deberta-v3-large"
    pooling = 'MeanPooling'
    gradient_checkpointing = False
    hidden_dropout_prob = 0.0
    attention_probs_dropout_prob = 0.0

    # --- data loading ---
    num_workers = 4
    batch_size = 4
    is_max_len = False
    make_feat = False

    # --- targets / folds ---
    target_cols = ['content', 'wording']
    seed = 42
    n_fold = 4
    trn_fold = [0, 1, 2, 3]
    weight = 1


class CFG3:
    """Settings for the exp008 artefacts (deberta-v3-large, mean pooling)."""

    # --- artefact locations ---
    path = '/kaggle/input/exp008/exp008/'
    config_path = path + 'config.pth'

    # --- backbone / head ---
    model = "microsoft/deberta-v3-large"
    pooling = 'MeanPooling'
    gradient_checkpointing = False
    hidden_dropout_prob = 0.0
    attention_probs_dropout_prob = 0.0

    # --- data loading ---
    num_workers = 4
    batch_size = 4
    is_max_len = False

    # --- targets / folds ---
    target_cols = ['content', 'wording']
    seed = 42
    n_fold = 4
    trn_fold = [0, 1, 2, 3]
    weight = 1

class CFG4:
    """Settings for the exp012 artefacts (deberta-v3-large, mean pooling)."""

    # --- artefact locations ---
    path = '/kaggle/input/exp012/exp012/'
    config_path = path + 'config.pth'

    # --- backbone / head ---
    model = "microsoft/deberta-v3-large"
    pooling = 'MeanPooling'
    gradient_checkpointing = False
    hidden_dropout_prob = 0.0
    attention_probs_dropout_prob = 0.0

    # --- data loading ---
    num_workers = 4
    batch_size = 4
    is_max_len = False

    # --- targets / folds ---
    target_cols = ['content', 'wording']
    seed = 42
    n_fold = 4
    trn_fold = [0, 1, 2, 3]
    weight = 1


class CFG5:
    """Settings for the exp015 artefacts (deberta-xlarge, mean pooling)."""

    # --- artefact locations ---
    path = '/kaggle/input/exp015/exp015/'
    config_path = path + 'config.pth'

    # --- backbone / head ---
    model = "microsoft/deberta-xlarge"
    pooling = 'MeanPooling'
    gradient_checkpointing = False
    hidden_dropout_prob = 0.0
    attention_probs_dropout_prob = 0.0

    # --- data loading ---
    num_workers = 4
    batch_size = 4
    is_max_len = False

    # --- targets / folds ---
    target_cols = ['content', 'wording']
    seed = 42
    n_fold = 4
    trn_fold = [0, 1, 2, 3]
    weight = 1

class CFG6:
    """Settings for the exp009 artefacts (deberta-v3-base, mean pooling)."""

    # --- artefact locations ---
    path = '/kaggle/input/exp009/exp009/'
    config_path = path + 'config.pth'

    # --- backbone / head ---
    model = "microsoft/deberta-v3-base"
    pooling = 'MeanPooling'
    gradient_checkpointing = False
    hidden_dropout_prob = 0.0
    attention_probs_dropout_prob = 0.0

    # --- data loading ---
    num_workers = 4
    batch_size = 4
    is_max_len = False

    # --- targets / folds ---
    target_cols = ['content', 'wording']
    seed = 42
    n_fold = 4
    trn_fold = [0, 1, 2, 3]
    weight = 1

class CFG7:
    """Settings for the exp011 artefacts (deberta-v2-xlarge, mean pooling)."""

    # --- artefact locations ---
    path = '/kaggle/input/exp011/exp011/'
    config_path = path + 'config.pth'

    # --- backbone / head ---
    model = "microsoft/deberta-v2-xlarge"
    pooling = 'MeanPooling'
    gradient_checkpointing = False
    hidden_dropout_prob = 0.0
    attention_probs_dropout_prob = 0.0

    # --- data loading ---
    num_workers = 4
    batch_size = 4
    is_max_len = False

    # --- targets / folds ---
    target_cols = ['content', 'wording']
    seed = 42
    n_fold = 4
    trn_fold = [0, 1, 2, 3]
    weight = 1

class CFG8:
    """Settings for the exp038 artefacts (deberta-v3-base, mean pooling)."""

    # --- artefact locations ---
    path = '/kaggle/input/exp038/exp038/'
    config_path = path + 'config.pth'

    # --- backbone / head ---
    model = "microsoft/deberta-v3-base"
    pooling = 'MeanPooling'
    gradient_checkpointing = False
    hidden_dropout_prob = 0.007  # other configs in this notebook use 0.0
    attention_probs_dropout_prob = 0.007

    # --- data loading ---
    num_workers = 4
    batch_size = 64
    is_max_len = False

    # --- targets / folds ---
    target_cols = ['content', 'wording']
    seed = 42
    n_fold = 4
    trn_fold = [0, 1, 2, 3]
    weight = 1

class CFG10:
    """Settings for the exp042 artefacts (separate content / wording folders)."""

    # --- artefact locations ---
    # NOTE(review): the first entry combines an `exp041-content` dataset slug
    # with an `exp042_content` folder -- confirm this is the intended path.
    path = [
        "/kaggle/input/exp041-content/exp042_content/",
        "/kaggle/input/exp042-wording/exp042_wording/",
    ]
    config_path = path[0] + 'config.pth'  # config is read from the first folder

    # --- backbone / head ---
    model = "microsoft/deberta-v3-large"
    pooling = 'MeanPooling'
    gradient_checkpointing = False
    hidden_dropout_prob = 0.0
    attention_probs_dropout_prob = 0.0

    # --- data loading ---
    num_workers = 4
    batch_size = 4
    is_max_len = False
    make_feat = False

    # --- targets / folds ---
    target_cols = ['content', 'wording']
    seed = 42
    n_fold = 4
    trn_fold = [0, 1, 2, 3]
    weight = 1

class CFG11:
    """Settings for the exp046 artefacts (deberta-v3-large, LSTM pooling)."""

    # --- artefact locations ---
    path = '/kaggle/input/exp046/exp046/'
    config_path = path + 'config.pth'

    # --- backbone / head ---
    model = "microsoft/deberta-v3-large"
    pooling = 'LSTMPooling'
    hidden_size = 512
    gradient_checkpointing = False
    hidden_dropout_prob = 0.0
    attention_probs_dropout_prob = 0.0

    # --- data loading ---
    num_workers = 4
    batch_size = 4
    is_max_len = False

    # --- targets / folds ---
    target_cols = ['content', 'wording']
    seed = 42
    n_fold = 4
    trn_fold = [0, 1, 2, 3]
    weight = 1

class CFG12:
    """Settings for the exp047 artefacts (deberta-v3-large, LSTM pooling)."""

    # --- artefact locations ---
    path = '/kaggle/input/exp047/exp047/'
    config_path = path + 'config.pth'

    # --- backbone / head ---
    model = "microsoft/deberta-v3-large"
    pooling = 'LSTMPooling'
    hidden_size = 512
    gradient_checkpointing = False
    hidden_dropout_prob = 0.0
    attention_probs_dropout_prob = 0.0

    # --- data loading ---
    num_workers = 4
    batch_size = 4
    is_max_len = False

    # --- targets / folds ---
    target_cols = ['content', 'wording']
    seed = 42
    n_fold = 4
    trn_fold = [0, 1, 2, 3]
    weight = 1

class CFG13:
    """Settings for the exp048 artefacts (deberta-v3-large, concat pooling)."""

    # --- artefact locations ---
    path = '/kaggle/input/exp048/exp048/'
    config_path = path + 'config.pth'

    # --- backbone / head ---
    model = "microsoft/deberta-v3-large"
    pooling = 'ConcatPooling'
    n_layers = 4
    gradient_checkpointing = False
    hidden_dropout_prob = 0.0
    attention_probs_dropout_prob = 0.0

    # --- data loading ---
    num_workers = 4
    batch_size = 4
    is_max_len = False

    # --- targets / folds ---
    target_cols = ['content', 'wording']
    seed = 42
    n_fold = 4
    trn_fold = [0, 1, 2, 3]
    weight = 1

class CFG14:
    """Settings for the exp049 artefacts (deberta-v3-large, weighted-layer pooling)."""

    # --- artefact locations ---
    path = '/kaggle/input/exp049-2/exp049/'
    config_path = path + 'config.pth'

    # --- backbone / head ---
    model = "microsoft/deberta-v3-large"
    pooling = 'WeightedLayerPooling'
    gradient_checkpointing = False
    hidden_dropout_prob = 0.0
    attention_probs_dropout_prob = 0.0

    # --- data loading ---
    num_workers = 4
    batch_size = 64
    is_max_len = False

    # --- targets / folds ---
    target_cols = ['content', 'wording']
    seed = 42
    n_fold = 4
    trn_fold = [0, 1, 2, 3]
    weight = 1

class CFG15:
    """Settings for the exp050 artefacts (deberta-v3-large, concat pooling)."""

    # --- artefact locations ---
    path = '/kaggle/input/exp050-2/exp050/'
    config_path = path + 'config.pth'

    # --- backbone / head ---
    model = "microsoft/deberta-v3-large"
    pooling = 'ConcatPooling'
    n_layers = 14
    gradient_checkpointing = False
    hidden_dropout_prob = 0.0
    attention_probs_dropout_prob = 0.0

    # --- data loading ---
    num_workers = 4
    batch_size = 64
    is_max_len = False

    # --- targets / folds ---
    target_cols = ['content', 'wording']
    seed = 42
    n_fold = 4
    trn_fold = [0, 1, 2, 3]
    weight = 1


class CFG16:
    """Settings for the exp052 artefacts (wording target only, concat pooling)."""

    # --- artefact locations ---
    path = ["/kaggle/input/exp052-wording/exp052_wording/"]
    config_path = path[0] + 'config.pth'

    # --- backbone / head ---
    model = "microsoft/deberta-v3-large"
    pooling = 'ConcatPooling'
    n_layers = 14
    gradient_checkpointing = False
    hidden_dropout_prob = 0.0
    attention_probs_dropout_prob = 0.0

    # --- data loading ---
    num_workers = 4
    batch_size = 8
    is_max_len = False
    make_feat = False

    # --- targets / folds ---
    target_cols = ['wording']
    seed = 42
    n_fold = 4
    trn_fold = [0, 1, 2, 3]
    weight = 1

class CFG19:
    """Settings for the exp064 artefacts (concat pooling, fixed length 896)."""

    # --- artefact locations ---
    path = '/kaggle/input/exp064/exp064/'
    config_path = path + 'config.pth'

    # --- backbone / head ---
    model = "microsoft/deberta-v3-large"
    pooling = 'ConcatPooling'
    n_layers = 14
    gradient_checkpointing = False
    hidden_dropout_prob = 0.0
    attention_probs_dropout_prob = 0.0

    # --- data loading ---
    num_workers = 4
    batch_size = 1
    is_max_len = True  # pad / truncate every input to `max_len`
    max_len = 896

    # --- targets / folds ---
    target_cols = ['content', 'wording']
    seed = 42
    n_fold = 4
    trn_fold = [0, 1, 2, 3]
    weight = 1

class CFG20:
    """Settings for the exp068 artefacts (concat pooling, fixed length 1392)."""

    # --- artefact locations ---
    path = '/kaggle/input/exp068/exp068/'
    config_path = path + 'config.pth'

    # --- backbone / head ---
    model = "microsoft/deberta-v3-large"
    pooling = 'ConcatPooling'
    n_layers = 10
    gradient_checkpointing = False
    hidden_dropout_prob = 0.0
    attention_probs_dropout_prob = 0.0

    # --- data loading ---
    num_workers = 4
    batch_size = 1
    is_max_len = True  # pad / truncate every input to `max_len`
    max_len = 1392

    # --- targets / folds ---
    target_cols = ['content', 'wording']
    seed = 42
    n_fold = 4
    trn_fold = [0, 1]  # only two of the four folds are listed here
    weight = 1

class CFG21:
    """Settings for the exp085 full-train artefacts (concat pooling, length 1024)."""

    # --- artefact locations ---
    path = '/kaggle/input/exp085-full-train/exp085_full_train/'
    config_path = path + 'config.pth'

    # --- backbone / head ---
    model = "microsoft/deberta-v3-large"
    pooling = 'ConcatPooling'
    n_layers = 10
    gradient_checkpointing = False
    hidden_dropout_prob = 0.0
    attention_probs_dropout_prob = 0.0

    # --- data loading ---
    num_workers = 4
    batch_size = 1
    is_max_len = True  # pad / truncate every input to `max_len`
    max_len = 1024

    # --- targets / folds ---
    target_cols = ['content', 'wording']
    seed = 42
    n_fold = 4
    trn_fold = [0, 1, 2, 3]
    weight = 1

class CFG22:
    """Settings for the exp108 full-train artefacts (concat pooling, length 1800)."""

    # --- artefact locations ---
    path = '/kaggle/input/exp108-full-train/exp108_full_train/'
    config_path = path + 'config.pth'

    # --- backbone / head ---
    model = "microsoft/deberta-v3-large"
    pooling = 'ConcatPooling'
    n_layers = 10
    gradient_checkpointing = False
    hidden_dropout_prob = 0.0
    attention_probs_dropout_prob = 0.0

    # --- data loading ---
    num_workers = 4
    batch_size = 1
    is_max_len = True  # pad / truncate every input to `max_len`
    max_len = 1800

    # --- targets / folds ---
    target_cols = ['content', 'wording']
    seed = 42
    n_fold = 4
    trn_fold = [0, 1, 2, 3]
    weight = 1


# Active configurations; CFG16 and CFG21 were alternatives and are switched off.
CFG_list = [
    CFG14,
    CFG15,
    CFG22,
]

## DeBERTa Regressor

In [None]:
class ContentScoreRegressor:
    """Single-target sequence regressor built on the Hugging Face ``Trainer``.

    The model input is ``prompt_title [SEP] prompt_question [SEP] text`` and the
    regression target is the column named by ``target``.
    """

    def __init__(self,
                model_name: str,
                model_dir: str,
                target: str,
                hidden_dropout_prob: float,
                attention_probs_dropout_prob: float,
                max_length: int,
                ):
        """Load tokenizer and config from ``/kaggle/input/{model_name}``.

        Args:
            model_name: Folder name under ``/kaggle/input`` holding the base model.
            model_dir: Directory the fine-tuned model is saved to / loaded from.
            target: Name of the label column to regress.
            hidden_dropout_prob: Dropout written into the model config.
            attention_probs_dropout_prob: Attention dropout written into the config.
            max_length: Truncation length passed to the tokenizer.
        """
        self.inputs = ["prompt_text", "prompt_title", "prompt_question", "text"]
        self.input_col = "input"

        self.text_cols = [self.input_col]
        self.target = target
        self.target_cols = [target]

        self.model_name = model_name
        self.model_dir = model_dir
        self.max_length = max_length

        self.tokenizer = AutoTokenizer.from_pretrained(f"/kaggle/input/{model_name}")
        self.model_config = AutoConfig.from_pretrained(f"/kaggle/input/{model_name}")

        self.model_config.update({
            "hidden_dropout_prob": hidden_dropout_prob,
            "attention_probs_dropout_prob": attention_probs_dropout_prob,
            "num_labels": 1,
            "problem_type": "regression",
        })

        seed_everything(seed=42)

        # Pads each batch dynamically to its longest sequence.
        self.data_collator = DataCollatorWithPadding(
            tokenizer=self.tokenizer
        )

    def _build_input(self, df: pd.DataFrame) -> pd.Series:
        """Join title, question and summary text with the tokenizer's separator."""
        sep = self.tokenizer.sep_token
        return (
            df["prompt_title"] + sep
            + df["prompt_question"] + sep
            + df["text"]
        )

    def _tokenize(self, text: str):
        """Tokenize one input string, truncated to ``max_length`` and unpadded."""
        return self.tokenizer(text,
                              padding=False,
                              truncation=True,
                              max_length=self.max_length)

    def tokenize_function(self, examples: dict):
        """Tokenize one training example and attach its label.

        Called through ``Dataset.map(..., batched=False)``, so ``examples`` is a
        single row mapping; the label is wrapped in a one-element list.
        """
        labels = [examples[self.target]]
        tokenized = self._tokenize(examples[self.input_col])
        return {
            **tokenized,
            "labels": labels,
        }

    def tokenize_function_test(self, examples: dict):
        """Tokenize one unlabeled example (a single row mapping)."""
        return self._tokenize(examples[self.input_col])

    def train(self,
            fold: int,
            train_df: pd.DataFrame,
            valid_df: pd.DataFrame,
            batch_size: int,
            learning_rate: float,
            weight_decay: float,
            num_train_epochs: float,
            save_steps: int,
        ) -> None:
        """Fine-tune on ``train_df``, evaluate on ``valid_df`` and save the model.

        Checkpoints go to ``{model_dir}/{fold}``; the final model and tokenizer
        are saved to ``model_dir``. The ``input`` column is added to both frames
        in place, as before.
        """
        train_df[self.input_col] = self._build_input(train_df)
        valid_df[self.input_col] = self._build_input(valid_df)

        train_df = train_df[[self.input_col] + self.target_cols]
        valid_df = valid_df[[self.input_col] + self.target_cols]

        model_content = AutoModelForSequenceClassification.from_pretrained(
            f"/kaggle/input/{self.model_name}",
            config=self.model_config
        )

        train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
        val_dataset = Dataset.from_pandas(valid_df, preserve_index=False)

        train_tokenized_datasets = train_dataset.map(self.tokenize_function, batched=False)
        val_tokenized_datasets = val_dataset.map(self.tokenize_function, batched=False)

        # eg. "bert/fold_0/"
        model_fold_dir = os.path.join(self.model_dir, str(fold))

        # NOTE(review): newer transformers releases rename `evaluation_strategy`
        # to `eval_strategy` -- confirm against the installed version.
        training_args = TrainingArguments(
            output_dir=model_fold_dir,
            load_best_model_at_end=True, # select best model
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=8,
            num_train_epochs=num_train_epochs,
            weight_decay=weight_decay,
            report_to='none',
            greater_is_better=False,
            save_strategy="steps",
            evaluation_strategy="steps",
            eval_steps=save_steps,
            save_steps=save_steps,
            metric_for_best_model="rmse",
            save_total_limit=1
        )

        trainer = Trainer(
            model=model_content,
            args=training_args,
            train_dataset=train_tokenized_datasets,
            eval_dataset=val_tokenized_datasets,
            tokenizer=self.tokenizer,
            compute_metrics=compute_metrics,
            data_collator=self.data_collator
        )

        trainer.train()

        model_content.save_pretrained(self.model_dir)
        self.tokenizer.save_pretrained(self.model_dir)


    def predict(self,
                test_df: pd.DataFrame,
                fold: int,
               ):
        """Predict the target score for every row of ``test_df``.

        The model is loaded from ``model_dir``; ``fold`` only selects the
        Trainer output directory. The ``input`` column is added to ``test_df``
        in place, as before.

        Returns:
            The ``predictions`` array returned by ``Trainer.predict``.
        """
        test_df[self.input_col] = self._build_input(test_df)

        test_ = test_df[[self.input_col]]

        test_dataset = Dataset.from_pandas(test_, preserve_index=False)
        test_tokenized_dataset = test_dataset.map(self.tokenize_function_test, batched=False)

        model_content = AutoModelForSequenceClassification.from_pretrained(f"{self.model_dir}")
        model_content.eval()

        # e.g. "bert/fold_0/"
        model_fold_dir = os.path.join(self.model_dir, str(fold))

        test_args = TrainingArguments(
            output_dir=model_fold_dir,
            do_train = False,
            do_predict = True,
            per_device_eval_batch_size = 4,
            dataloader_drop_last = False,
            # Same as in `train`: never start an experiment tracker for inference.
            report_to='none',
        )

        # init trainer
        infer_content = Trainer(
                      model = model_content,
                      tokenizer=self.tokenizer,
                      data_collator=self.data_collator,
                      args = test_args)

        preds = infer_content.predict(test_tokenized_dataset)[0]

        return preds

In [None]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    """Tokenize ``text`` with ``cfg.tokenizer`` and return long tensors.

    Args:
        cfg: Config object exposing ``tokenizer`` and ``is_max_len`` (plus
            ``max_len`` when ``is_max_len`` is true).
        text: The string to encode.

    Returns:
        The tokenizer output with every value converted to a ``torch.long``
        tensor. With ``cfg.is_max_len`` the sequence is truncated and padded
        to exactly ``cfg.max_len`` tokens; otherwise it keeps its own length.
    """
    encode_kwargs = {"return_tensors": None, "add_special_tokens": True}
    if cfg.is_max_len:
        # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`.
        encode_kwargs.update(
            max_length=cfg.max_len,
            padding="max_length",
            truncation=True,
        )
    inputs = cfg.tokenizer.encode_plus(text, **encode_kwargs)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TestDataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset that tokenizes ``df['full_text']`` on access.

    The base class is spelled out in full because the bare name ``Dataset`` is
    rebound by ``from datasets import Dataset`` after the torch import, which
    made this class a subclass of the Hugging Face dataset whose ``__init__``
    was never run.
    """

    def __init__(self, cfg, df):
        """Keep the config and the raw texts of the ``full_text`` column."""
        self.cfg = cfg
        self.texts = df['full_text'].values

    def __len__(self):
        """Number of texts."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the tokenized tensors for the text at position ``item``."""
        inputs = prepare_input(self.cfg, self.texts[item])
        return inputs


#ref:https://github.com/shu421/kagglib/blob/main/nlp/model.py
# ====================================================
# Model
# ====================================================

def freeze(module):
    """
    Switches off gradient tracking for every parameter of ``module``.
    """
    for param in module.parameters():
        param.requires_grad_(False)
# =====================================================
# Pooling
# =====================================================
class MeanPooling(nn.Module):
    """Attention-mask-aware mean over the token axis (dim 1)."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Average the hidden states of the tokens whose mask value is non-zero."""
        # Broadcast the mask over the hidden dimension so padded tokens add zero.
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        token_sum = torch.sum(last_hidden_state * mask, 1)
        # The clamp guards against division by zero for fully masked rows.
        token_count = torch.clamp(mask.sum(1), min=1e-9)
        return token_sum / token_count



class AttentionPooling(nn.Module):
    """
    Learned attention pooling over the token axis.

    Usage:
        self.pool = AttentionPooling(self.config.hidden_size)
    """
    def __init__(self, in_dim, initializer_range=0.02):
        """
        Args:
            in_dim: Size of the hidden dimension being pooled.
            initializer_range: Std of the normal initialisation of the linear
                layers. The original read it from a non-existent ``self.config``;
                0.02 is assumed to match the backbone -- TODO confirm.
        """
        super().__init__()
        self.initializer_range = initializer_range
        self.attention = nn.Sequential(
            nn.Linear(in_dim, in_dim),
            nn.LayerNorm(in_dim),
            nn.GELU(),
            nn.Linear(in_dim, 1),
        )

        # `apply` visits every sub-module; calling `_init_weights` on the
        # Sequential container itself matched no branch and did nothing.
        self.attention.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise one sub-module (Linear / Embedding / LayerNorm)."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, last_hidden_state, attention_mask):
        """Return the attention-weighted sum of the non-masked token states.

        Rows whose mask is entirely zero produce NaN (softmax over all -inf).
        """
        w = self.attention(last_hidden_state).float()
        # Masked tokens get -inf so their softmax weight is exactly zero.
        w = w.masked_fill(attention_mask.unsqueeze(-1) == 0, float("-inf"))
        w = torch.softmax(w, 1)
        attention_embeddings = torch.sum(w * last_hidden_state, dim=1)
        return attention_embeddings



class WeightedLayerPooling(nn.Module):
    """Weighted average of the hidden states from ``layer_start`` onwards.

    ``forward`` returns the averaged state of the token at position 0.
    """

    def __init__(self, num_hidden_layers, layer_start: int = 4, layer_weights=None):
        super().__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers
        if layer_weights is None:
            # One learnable weight per used layer (embedding output included), all 1.
            n_used = num_hidden_layers + 1 - layer_start
            layer_weights = nn.Parameter(torch.tensor([1] * n_used, dtype=torch.float))
        self.layer_weights = layer_weights

    def forward(self, all_hidden_states):
        """``all_hidden_states`` is indexed with four axes; axis 0 is the layer axis."""
        selected = all_hidden_states[self.layer_start:, :, :, :]
        scale = self.layer_weights[:, None, None, None].expand(selected.size())
        averaged = (scale * selected).sum(dim=0) / self.layer_weights.sum()
        return averaged[:, 0]

class LSTMPooling(nn.Module):
    """Runs a recurrent layer over the per-layer position-0 token states.

    The sequence fed to the recurrent layer has one step per transformer layer
    (layers ``1 .. num_layers``); the output of the last step is returned
    after dropout.
    """

    def __init__(self, num_layers, hidden_size, hiddendim_lstm, dropout_rate, is_lstm=True):
        """
        Args:
            num_layers: Number of transformer layers to read (stored as
                ``num_hidden_layers``).
            hidden_size: Input feature size of the recurrent layer.
            hiddendim_lstm: Hidden size of the recurrent layer.
            dropout_rate: Dropout probability applied to the pooled output.
            is_lstm: ``True`` builds a unidirectional ``nn.LSTM``; ``False``
                builds a bidirectional ``nn.GRU``.
        """
        super(LSTMPooling, self).__init__()
        self.num_hidden_layers = num_layers
        self.hidden_size = hidden_size
        self.hiddendim_lstm = hiddendim_lstm

        if is_lstm:
            self.lstm = nn.LSTM(self.hidden_size, self.hiddendim_lstm, batch_first=True)
        else:
            self.lstm = nn.GRU(self.hidden_size, self.hiddendim_lstm, batch_first=True, bidirectional=True)

        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, all_hidden_states):
        """Pool ``all_hidden_states`` (indexable by layer) into one vector per row."""
        # Take the position-0 token of layers 1..num_hidden_layers (index 0 is
        # skipped) and stack them along a new LAST axis.
        hidden_states = torch.stack([all_hidden_states[layer_i][:, 0].squeeze()
                                     for layer_i in range(1, self.num_hidden_layers + 1)], dim=-1)
        # NOTE(review): stacking on dim=-1 and then calling `.view` reinterprets
        # the memory layout rather than transposing it, so the sequence steps may
        # not line up with layers. Checkpoints trained with this code presumably
        # depend on the exact ordering -- do not change without retraining.
        hidden_states = hidden_states.view(-1, self.num_hidden_layers, self.hidden_size)
        out, _ = self.lstm(hidden_states, None)
        # Keep only the output of the final sequence step.
        out = self.dropout(out[:, -1, :])
        return out

class ConcatPooling(nn.Module):
    """Concatenates the position-0 token state of the last ``n_layers`` layers."""

    def __init__(self, n_layers=4):
        super().__init__()
        self.n_layers = n_layers

    def forward(self, all_hidden_states):
        """Return the last ``n_layers`` layers joined on the feature axis, token 0 only."""
        # Last layer first, then the one before it, and so on.
        picked = [all_hidden_states[-depth] for depth in range(1, self.n_layers + 1)]
        return torch.cat(picked, -1)[:, 0]

# GeM
class GeMText(nn.Module):
    """Generalised-mean (GeM) pooling over the token axis with a learnable power."""

    def __init__(self, dim=1, cfg=None, p=3, eps=1e-6):
        """
        Args:
            dim: Axis to pool over.
            cfg: Unused; kept for interface compatibility.
            p: Initial value of the learnable exponent.
            eps: Lower clamp for the inputs and for the token count.
        """
        super(GeMText, self).__init__()
        self.dim = dim
        # `Parameter` was never imported on its own; use the `nn` namespace.
        self.p = nn.Parameter(torch.ones(1) * p)
        self.eps = eps
        self.feat_mult = 1

    def forward(self, x, attention_mask):
        """Pool ``x`` (presumably the last hidden state) under ``attention_mask``."""
        # Cast to float so integer masks mix cleanly with the float arithmetic
        # and the float `clip` bound below.
        attention_mask_expanded = attention_mask.unsqueeze(-1).expand(x.shape).float()
        x = (x.clamp(min=self.eps) * attention_mask_expanded).pow(self.p).sum(self.dim)
        ret = x / attention_mask_expanded.sum(self.dim).clip(min=self.eps)
        ret = ret.pow(1 / self.p)
        return ret

# ===========================================
# custom Model
# ===========================================

class CustomModel(nn.Module):
    """Transformer backbone + pooling layer + linear head with two outputs."""

    def __init__(self, cfg, config_path=None, pretrained=False):
        """
        Args:
            cfg: Config object; reads ``model``, ``pooling``,
                ``gradient_checkpointing`` and, depending on the pooling,
                ``hidden_size`` or ``n_layers``.
            config_path: Path of a saved backbone config. When ``None`` the
                config is fetched for ``cfg.model`` with dropout set to zero.
            pretrained: Load pretrained backbone weights instead of building
                the architecture from the config only.

        Raises:
            ValueError: If ``cfg.pooling`` is not a supported pooling name.
        """
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            # The module-level `LOGGER` is not defined in this notebook.
            logging.getLogger(__name__).info(self.config)
        else:
            # NOTE(security): torch.load unpickles arbitrary objects -- only
            # point this at trusted files. Recent torch versions may also
            # require `weights_only=False` to load a config object.
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        self.pool, head_in_features = self._build_pooling(cfg)
        self.fc = nn.Linear(head_in_features, 2)
        self._init_weights(self.fc)

    def _build_pooling(self, cfg):
        """Return ``(pooling_module, feature_size)`` for ``cfg.pooling``."""
        hidden_size = self.config.hidden_size
        if cfg.pooling == 'MeanPooling':
            return MeanPooling(), hidden_size
        if cfg.pooling == 'LSTMPooling':
            pool = LSTMPooling(self.config.num_hidden_layers,
                               hidden_size,
                               self.cfg.hidden_size,
                               0.1,
                               is_lstm=True)
            return pool, self.cfg.hidden_size
        if cfg.pooling == "GeM":
            return GeMText(), hidden_size
        if cfg.pooling == 'ConcatPooling':
            return ConcatPooling(n_layers=cfg.n_layers), cfg.n_layers * hidden_size
        if cfg.pooling == 'WeightedLayerPooling':
            return WeightedLayerPooling(self.config.num_hidden_layers), hidden_size
        raise ValueError(f"Unsupported pooling: {cfg.pooling!r}")

    def _init_weights(self, module):
        """Initialise one module the same way the backbone does."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Run the backbone and pool its output into one vector per row.

        Raises:
            ValueError: If ``cfg.pooling`` is not a supported pooling name.
        """
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        pooling = self.cfg.pooling
        if pooling in ('MeanPooling', 'GeM'):
            # Token-level poolings need the mask to ignore padding.
            return self.pool(last_hidden_states, inputs['attention_mask'])
        if pooling in ('GRUPooling', 'LSTMPooling', 'ConcatPooling', 'WeightedLayerPooling'):
            # outputs[1] is presumably the tuple of per-layer hidden states
            # (config loaded with output_hidden_states=True) -- TODO confirm.
            all_hidden_states = torch.stack(outputs[1])
            return self.pool(all_hidden_states)
        raise ValueError(f"Unsupported pooling: {pooling!r}")

    def forward(self, inputs):
        """Return the head output for a batch of tokenizer inputs."""
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output

class CustomModel_1(nn.Module):
    """Transformer backbone + pooling layer + linear head with a single output."""

    def __init__(self, cfg, config_path=None, pretrained=False):
        """
        Args:
            cfg: Config object; reads ``model``, ``pooling``,
                ``gradient_checkpointing`` and, depending on the pooling,
                ``hidden_size`` or ``n_layers``.
            config_path: Path of a saved backbone config. When ``None`` the
                config is fetched for ``cfg.model`` with dropout set to zero.
            pretrained: Load pretrained backbone weights instead of building
                the architecture from the config only.

        Raises:
            ValueError: If ``cfg.pooling`` is not a supported pooling name.
        """
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            # The module-level `LOGGER` is not defined in this notebook.
            logging.getLogger(__name__).info(self.config)
        else:
            # NOTE(security): torch.load unpickles arbitrary objects -- only
            # point this at trusted files. Recent torch versions may also
            # require `weights_only=False` to load a config object.
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        self.pool, head_in_features = self._build_pooling(cfg)
        self.fc = nn.Linear(head_in_features, 1)
        self._init_weights(self.fc)

    def _build_pooling(self, cfg):
        """Return ``(pooling_module, feature_size)`` for ``cfg.pooling``."""
        hidden_size = self.config.hidden_size
        if cfg.pooling == 'MeanPooling':
            return MeanPooling(), hidden_size
        if cfg.pooling == 'LSTMPooling':
            pool = LSTMPooling(self.config.num_hidden_layers,
                               hidden_size,
                               self.cfg.hidden_size,
                               0.1,
                               is_lstm=True)
            return pool, self.cfg.hidden_size
        if cfg.pooling == "GeM":
            return GeMText(), hidden_size
        if cfg.pooling == 'ConcatPooling':
            return ConcatPooling(n_layers=cfg.n_layers), cfg.n_layers * hidden_size
        if cfg.pooling == 'WeightedLayerPooling':
            return WeightedLayerPooling(self.config.num_hidden_layers), hidden_size
        raise ValueError(f"Unsupported pooling: {cfg.pooling!r}")

    def _init_weights(self, module):
        """Initialise one module the same way the backbone does."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Run the backbone and pool its output into one vector per row.

        Raises:
            ValueError: If ``cfg.pooling`` is not a supported pooling name.
        """
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        pooling = self.cfg.pooling
        if pooling in ('MeanPooling', 'GeM'):
            # Token-level poolings need the mask to ignore padding.
            return self.pool(last_hidden_states, inputs['attention_mask'])
        if pooling in ('GRUPooling', 'LSTMPooling', 'ConcatPooling', 'WeightedLayerPooling'):
            # outputs[1] is presumably the tuple of per-layer hidden states
            # (config loaded with output_hidden_states=True) -- TODO confirm.
            all_hidden_states = torch.stack(outputs[1])
            return self.pool(all_hidden_states)
        raise ValueError(f"Unsupported pooling: {pooling!r}")

    def forward(self, inputs):
        """Return the head output for a batch of tokenizer inputs."""
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output

# initialize layer
def reinit_bert(model):
    """Re-initialize the last encoder layer of a wrapped transformer in place.

    Every ``nn.Linear``, ``nn.Embedding`` and ``nn.LayerNorm`` found inside
    ``model.model.encoder.layer[-1:]`` is reset; all other layers keep their
    weights.

    Args:
        model: wrapper exposing ``model.model.encoder.layer`` and
            ``model.config.initializer_range``.

    Returns:
        The same ``model`` object that was passed in.

    Usage:
        model = reinit_bert(model)
    """
    def _reset(module):
        # Linear / Embedding weights share the same normal re-draw.
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=model.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    last_layers = model.model.encoder.layer[-1:]
    for encoder_layer in last_layers:
        for sub_module in encoder_layer.modules():
            _reset(sub_module)
    return model

# ====================================================
# inference
# ====================================================
def inference_fn(test_loader, model, device):
    """Run ``model`` over every batch of ``test_loader`` and stack the outputs.

    The model is put in eval mode and moved to ``device``. Each batch mapping
    is moved to ``device`` in place and passed to the model as a single
    argument under ``torch.no_grad()``.

    Returns:
        One NumPy array holding the per-batch outputs concatenated along axis 0.
    """
    model.eval()
    model.to(device)

    batch_outputs = []
    progress = tqdm(test_loader, total=len(test_loader))
    for inputs in progress:
        # Overwrite the values in place so the model gets the loader's own mapping.
        for key in list(inputs.keys()):
            inputs[key] = inputs[key].to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        batch_outputs.append(y_preds.to('cpu').numpy())

    return np.concatenate(batch_outputs)


In [None]:
def takai_inference(df, CFG_):
    """Predict with every checkpoint of one "takai" experiment and average them.

    Args:
        df: test DataFrame. It must hold ``student_id`` plus the columns the
            experiment's input template needs (``text``, ``prompt_question``,
            ``prompt_title`` and/or ``prompt_text``). ``full_text`` and
            ``tokenize_length`` columns are added to it, so pass a copy if the
            caller's frame must stay untouched.
        CFG_: experiment config. Read here: ``path`` (str or list of str),
            ``config_path``, ``model``, ``batch_size``, ``num_workers`` and
            ``trn_fold``. ``CFG_.tokenizer`` is set as a side effect.

    Returns:
        ``(predictions, student_ids)``: the mean prediction over all loaded
        checkpoints, and the student ids in the same (token-length sorted) row
        order as ``predictions``.

    Raises:
        ValueError: if ``CFG_`` matches no input template and ``df`` has no
            ``full_text`` column already.
    """
    # ====================================================
    # tokenizer
    # ====================================================
    tokenizer_root = CFG_.path[0] if isinstance(CFG_.path, list) else CFG_.path
    CFG_.tokenizer = AutoTokenizer.from_pretrained(tokenizer_root + 'tokenizer/')

    # ====================================================
    # Data Loading
    # ====================================================
    submission = pd.read_csv('../input/commonlit-evaluate-student-summaries/sample_submission.csv')

    # Build the model input with the template that belongs to the experiment.
    # (A disabled earlier variant also prefixed the character length of the
    # text: length[SEP]prompt_question[SEP]prompt_title[SEP]text.)
    # The `or` chains are kept on purpose: they short-circuit, so config names
    # to the right of a match are never evaluated.
    if CFG_==CFG2 or CFG_==CFG10:
        df['full_text'] = df['prompt_title'] + '[SEP]' + df['prompt_question'] + ' [SEP] ' + df['text']
    elif CFG_==CFG4 or CFG_==CFG14 or CFG_==CFG15 or CFG_==CFG16:
        df['full_text'] = df['prompt_question'] + ' [SEP] ' + df['text']
    elif CFG_==CFG11 or CFG_==CFG12 or CFG_==CFG13:
        df['full_text'] = df['prompt_question'] + '[SEP]' + df['prompt_title'] + ' [SEP] ' + df['text']
    elif CFG_==CFG22 or CFG_==CFG21:
        df['full_text'] = df['text'] + ' [SEP] ' + df['prompt_text']
    elif 'full_text' not in df.columns:
        # Previously this fell through and failed later with KeyError: 'full_text'.
        raise ValueError(
            f"No input template is defined for config {getattr(CFG_, '__name__', CFG_)!r}"
        )

    print(f"df.shape: {df.shape}")
    display(df.head())
    print(f"submission.shape: {submission.shape}")
    display(submission.head())

    # sort by length to speed up inference
    df['tokenize_length'] = [len(CFG_.tokenizer(text)['input_ids']) for text in df['full_text'].values]
    df = df.sort_values('tokenize_length', ascending=True).reset_index(drop=True)
    display(df.head())
    test_dataset = TestDataset(CFG_, df)
    test_loader = DataLoader(test_dataset,
                             batch_size=CFG_.batch_size,
                             shuffle=False,
                             collate_fn=DataCollatorWithPadding(tokenizer=CFG_.tokenizer, padding='longest'),
                             num_workers=CFG_.num_workers, pin_memory=True, drop_last=False)

    weights_prefix = CFG_.model.replace('/', '-')

    def _predict_with(model_cls, weights_file):
        # Build the architecture from the saved config, load one checkpoint, run it.
        model = model_cls(CFG_, config_path=CFG_.config_path, pretrained=False)
        state = torch.load(weights_file, map_location=torch.device('cpu'))
        # NOTE(review): strict=False silently ignores missing/unexpected keys.
        model.load_state_dict(state['model'], strict=False)
        return inference_fn(test_loader, model, device)

    predictions = []
    if CFG_==CFG22 or CFG_==CFG21:
        # These experiments ship a single model trained on all data.
        predictions.append(_predict_with(CustomModel, CFG_.path + f"{weights_prefix}_full_train.pth"))
        gc.collect()
        torch.cuda.empty_cache()
    else:
        for fold in CFG_.trn_fold:
            if isinstance(CFG_.path, list):
                # One model per path; their outputs are joined column-wise.
                per_path = [
                    _predict_with(CustomModel_1, path + f"{weights_prefix}_fold{fold}_best.pth")
                    for path in CFG_.path
                ]
                prediction = np.concatenate(per_path, axis=1)
            else:
                prediction = _predict_with(CustomModel, CFG_.path + f"{weights_prefix}_fold{fold}_best.pth")

            predictions.append(prediction)
            gc.collect()
            torch.cuda.empty_cache()

    predictions = np.mean(predictions, axis=0)
    torch.cuda.empty_cache()

    student_ids = df["student_id"].values
    return predictions, student_ids

In [None]:
# Run inference once for every config in CFG_list and
# store each result in transformer_preds_dict, keyed by model name.
for idx, CFG_ in enumerate(CFG_list):
    model_path = CFG_.path
    if isinstance(model_path, list):
        # Multi-path experiments are named after their first path.
        model_path = model_path[0]
    model_name = "takai_" + Path(model_path).name
    predictions, student_ids = takai_inference(test.copy(), CFG_)
    # takai_inference returns rows sorted by token length; realign them to the
    # row order of `test` through student_id.
    original_sorted_predictions = pd.DataFrame(predictions, index=student_ids).reindex(test["student_id"]).values
    del predictions
    gc.collect()
    transformer_preds_dict[model_name] = original_sorted_predictions

In [None]:
def set_seed(seed):
    """Seed Python's, NumPy's and PyTorch's RNGs and make cuDNN deterministic.

    Also stores ``seed`` in the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [None]:
class NLPDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding a tokenized "question [SEP] summary" and meta features.

    The base class is spelled out as ``torch.utils.data.Dataset`` because the
    notebook imports a ``Dataset`` name from both ``torch.utils.data`` and
    ``datasets``; the bare name therefore depends on import order.

    Args:
        df: one row per summary with ``student_id``, ``text``, ``prompt_id``
            and every column listed in ``meta_feature_cols``.
        prompt_df: one row per prompt with ``prompt_id``, ``prompt_question``,
            ``prompt_title`` and ``prompt_text``.
        tokenizer: callable used as ``tokenizer(sentence, max_length=...,
            padding=..., truncation=..., return_token_type_ids=...)``.
        meta_feature_cols: column names of ``df`` returned as ``meta_features``.
        is_train: stored on the instance; not used by this class itself.
        max_len: token length every sample is padded / truncated to.
    """

    def __init__(self, df, prompt_df, tokenizer, meta_feature_cols, is_train=True, max_len=128):
        self.student_ids = df["student_id"].tolist()
        self.text = df["text"].tolist()

        # Look up each row's prompt fields through its prompt_id.
        prompt_lookup = prompt_df.set_index("prompt_id")
        row_prompt_ids = df["prompt_id"]
        self.prompt_question = prompt_lookup["prompt_question"].reindex(row_prompt_ids).tolist()
        self.prompt_title = prompt_lookup["prompt_title"].reindex(row_prompt_ids).tolist()
        self.prompt_text = prompt_lookup["prompt_text"].reindex(row_prompt_ids).tolist()

        self.meta_features = df[meta_feature_cols].values
        self.tokenizer = tokenizer

        self.is_train = is_train
        self.max_len = max_len

    def __len__(self):
        return len(self.student_ids)

    def __getitem__(self, ix):
        prompt_question = str(self.prompt_question[ix])
        text = str(self.text[ix])

        # The prompt text is deliberately not part of the model input.
        sentence = prompt_question + " [SEP] " + text
        meta_feature = self.meta_features[ix]

        # NOTE(review): padding="max_length" pads every sample to max_len, so
        # sorting rows by length upstream cannot shorten the batches.
        text_inputs = self.tokenizer(
            sentence,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_token_type_ids=True
        )

        return {
            "input_ids": torch.tensor(text_inputs["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(text_inputs["attention_mask"], dtype=torch.long),
            "token_type_ids": torch.tensor(text_inputs["token_type_ids"], dtype=torch.long),
            "meta_features": torch.tensor(meta_feature, dtype=torch.float),
        }

In [None]:
class NLPModel046(nn.Module):
    """Transformer encoder with a 2-output linear head on the first-token state.

    Args:
        pretrain_path: path or name given to ``AutoConfig`` / ``AutoModel``.
        mlm_model_path: unused. ``predict_run`` passes ``meta_feature_cols``
            positionally into this slot.
    """

    def __init__(self, pretrain_path, mlm_model_path=None):
        super().__init__()
        model_config = transformers.AutoConfig.from_pretrained(pretrain_path)
        # Dropout off; hidden states on, because forward reads them.
        model_config.attention_probs_dropout_prob = 0.0
        model_config.hidden_dropout_prob = 0.0
        model_config.output_hidden_states = True
        self.encoder = transformers.AutoModel.from_pretrained(
            pretrain_path,
            config=model_config,
        )
        # Head width follows the backbone (previously hard-coded to 1024).
        self.fc = nn.Linear(model_config.hidden_size, 2)

    def forward(self, input_ids, attention_mask, token_type_ids, meta_features):
        """Return ``fc`` applied to position 0 of the last entry of encoder output [1].

        ``meta_features`` is accepted for a uniform call signature and ignored.
        """
        # NOTE(review): index [1] is presumably the hidden-states tuple, which
        # holds for backbones without a pooler output - confirm per backbone.
        last_hidden_state = self.encoder(input_ids=input_ids,
                                         attention_mask=attention_mask,
                                         token_type_ids=token_type_ids,
                                         return_dict=False)[1][-1]

        cls_token = last_hidden_state[:, 0, :]
        return self.fc(cls_token)


class NLPModel139(nn.Module):
    """Transformer encoder whose first-token state is joined with meta features.

    Args:
        pretrain_path: path or name given to ``AutoConfig`` / ``AutoModel``.
        meta_feature_cols: names of the meta features; only their count is
            used, to size the linear head.
        mlm_model_path: unused.
    """

    def __init__(self, pretrain_path, meta_feature_cols, mlm_model_path=None):
        super().__init__()
        model_config = transformers.AutoConfig.from_pretrained(pretrain_path)
        # Dropout off; hidden states on, because forward reads them.
        model_config.attention_probs_dropout_prob = 0.0
        model_config.hidden_dropout_prob = 0.0
        model_config.output_hidden_states = True
        self.encoder = transformers.AutoModel.from_pretrained(
            pretrain_path,
            config=model_config,
        )
        # Head width follows the backbone (previously hard-coded to 1024).
        self.fc = nn.Linear(model_config.hidden_size + len(meta_feature_cols), 2)

    def forward(self, input_ids, attention_mask, token_type_ids, meta_features):
        """Return ``fc`` applied to ``[first-token state, meta_features]``."""
        # NOTE(review): index [1] is presumably the hidden-states tuple, which
        # holds for backbones without a pooler output - confirm per backbone.
        last_hidden_state = self.encoder(input_ids=input_ids,
                                         attention_mask=attention_mask,
                                         token_type_ids=token_type_ids,
                                         return_dict=False)[1][-1]

        cls_token = last_hidden_state[:, 0, :]
        feature = torch.cat([cls_token, meta_features], dim=1)
        return self.fc(feature)


class NLPModel068(nn.Module):
    """Transformer encoder with a 2-output linear head on the first-token state.

    Args:
        pretrain_path: path or name given to ``AutoConfig`` / ``AutoModel``.
        mlm_model_path: unused. ``predict_run`` passes ``meta_feature_cols``
            positionally into this slot.
    """

    def __init__(self, pretrain_path, mlm_model_path=None):
        super().__init__()
        model_config = transformers.AutoConfig.from_pretrained(pretrain_path)
        # Dropout off; hidden states on, because forward reads them.
        model_config.attention_probs_dropout_prob = 0.0
        model_config.hidden_dropout_prob = 0.0
        model_config.output_hidden_states = True
        self.encoder = transformers.AutoModel.from_pretrained(
            pretrain_path,
            config=model_config,
        )
        # Head width follows the backbone (previously hard-coded to 1536).
        self.fc = nn.Linear(model_config.hidden_size, 2)

    def forward(self, input_ids, attention_mask, token_type_ids, meta_features):
        """Return ``fc`` applied to position 0 of the last entry of encoder output [1].

        ``meta_features`` is accepted for a uniform call signature and ignored.
        """
        # NOTE(review): index [1] is presumably the hidden-states tuple, which
        # holds for backbones without a pooler output - confirm per backbone.
        last_hidden_state = self.encoder(input_ids=input_ids,
                                         attention_mask=attention_mask,
                                         token_type_ids=token_type_ids,
                                         return_dict=False)[1][-1]

        cls_token = last_hidden_state[:, 0, :]
        return self.fc(cls_token)

In [None]:
class EmbDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding a tokenized "question [SEP] summary" per row.

    The base class is spelled out as ``torch.utils.data.Dataset`` because the
    notebook imports a ``Dataset`` name from both ``torch.utils.data`` and
    ``datasets``; the bare name therefore depends on import order.

    Args:
        df: one row per summary with ``student_id``, ``text`` and ``prompt_id``.
        prompt_df: one row per prompt with ``prompt_id`` and ``prompt_question``.
        tokenizer: callable used as ``tokenizer(sentence, max_length=...,
            padding=..., truncation=..., return_token_type_ids=...)``.
        max_len: token length every sample is padded / truncated to.
    """

    def __init__(self, df, prompt_df, tokenizer, max_len=128):
        self.student_ids = df["student_id"].tolist()
        self.text = df["text"].tolist()
        # Look up each row's question through its prompt_id.
        self.prompt_question = prompt_df.set_index("prompt_id")["prompt_question"].reindex(df["prompt_id"]).tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.student_ids)

    def __getitem__(self, ix):
        prompt_question = str(self.prompt_question[ix])
        text = str(self.text[ix])
        sentence = prompt_question + " [SEP] " + text
        text_inputs = self.tokenizer(
            sentence,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_token_type_ids=True
        )
        return {
            "input_ids": torch.tensor(text_inputs["input_ids"], dtype=torch.long),
            "attention_mask": torch.tensor(text_inputs["attention_mask"], dtype=torch.long),
            "token_type_ids": torch.tensor(text_inputs["token_type_ids"], dtype=torch.long),
        }

class EmbModel(nn.Module):
    """Headless encoder that returns the first-token state as an embedding."""

    def __init__(self, model_path):
        super().__init__()
        # Dropout disabled and hidden states enabled at load time.
        encoder_overrides = dict(
            attention_probs_dropout_prob=0.0,
            hidden_dropout_prob=0.0,
            output_hidden_states=True,
        )
        self.encoder = transformers.AutoModel.from_pretrained(model_path, **encoder_overrides)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return position 0 of the last entry of the encoder's output [1]."""
        encoder_out = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        final_layer = encoder_out[1][-1]
        return final_layer[:, 0, :]

def get_embedding(model, data_loader, device):
    """Collect the model's output for every batch of ``data_loader``.

    Args:
        model: called as ``model(input_ids=..., attention_mask=...,
            token_type_ids=...)``; put into eval mode here.
        data_loader: sized iterable of mappings holding at least the three
            tensor keys above.
        device: device the three input tensors are moved to.

    Returns:
        NumPy array of all batch outputs concatenated along axis 0.
    """
    model.eval()
    embs_list = []
    for batch in tqdm(data_loader, total=len(data_loader)):
        # Only the three tensors the model consumes are moved and forwarded.
        text_inputs = {
            key: batch[key].to(device)
            for key in ("input_ids", "attention_mask", "token_type_ids")
        }
        with torch.no_grad():
            embs = model(**text_inputs)
        embs_list.append(embs.detach().cpu().numpy())
    return np.concatenate(embs_list, axis=0)


def get_transformer_embeddings(df, prompt_df, pretrain_path, model_prefix=""):
    """Embed every row of ``df`` with the encoder stored at ``pretrain_path``.

    Args:
        df: summaries frame, passed to ``EmbDataset``.
        prompt_df: prompts frame, passed to ``EmbDataset``.
        pretrain_path: path given to both ``EmbModel`` and ``AutoTokenizer``.
        model_prefix: unused; kept for call compatibility.

    Returns:
        The array returned by ``get_embedding``, one row per row of ``df``
        in the same order (the loader does not shuffle).
    """
    set_seed(CFG.seed)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    model = EmbModel(pretrain_path)
    model.to(device)
    tokenizer = transformers.AutoTokenizer.from_pretrained(pretrain_path)

    # No GradScaler here: this path only runs forward passes under no_grad.
    dataset = EmbDataset(df, prompt_df, tokenizer, max_len=CFG.max_length)
    data_loader = DataLoader(
        dataset,
        batch_size=32,
        num_workers=2,
        shuffle=False,
    )
    return get_embedding(model, data_loader, device)

In [None]:
def predict_run(test_df, p_test_df, model_path, pretrain_path, meta_feature_cols, model_cls):
    """Predict content / wording scores for ``test_df`` with one checkpoint.

    Args:
        test_df: summaries frame with ``student_id``, ``text``,
            ``prompt_question``, ``prompt_id`` and every column in
            ``meta_feature_cols``. A ``tokenize_length`` column is written
            onto it in place.
        p_test_df: prompts frame, passed to ``NLPDataset``.
        model_path: checkpoint file; its ``"model_state_dict"`` entry is loaded.
        pretrain_path: path given to ``model_cls`` and to ``AutoTokenizer``.
        meta_feature_cols: meta-feature column names, passed to ``NLPDataset``
            and as the second positional argument of ``model_cls``.
        model_cls: model class, instantiated as
            ``model_cls(pretrain_path, meta_feature_cols)``.

    Returns:
        Array with one row per row of ``test_df`` (original order) and two
        columns: content, wording.
    """

    set_seed(CFG.seed)
    device = torch.device(
        "cuda:0" if torch.cuda.is_available() else "cpu"
    )

    ###################################
    # Model and Tokenizer
    ###################################
    model = model_cls(pretrain_path, meta_feature_cols)
    model.to(device)

    # Load onto CPU storage first; values are copied into the model below.
    trained_state_dict = torch.load(
        model_path,
        map_location=lambda storage,loc: storage
    )["model_state_dict"]

    # Copy matching tensors by name. Checkpoint entries the model does not
    # have are skipped silently; model entries missing from the checkpoint
    # keep their current values.
    own_state = model.state_dict()
    for name, param in trained_state_dict.items():
        if name not in own_state:
             continue
        else:
            param = param.data
        own_state[name].copy_(param)

    model.eval()
    tokenizer = transformers.AutoTokenizer.from_pretrained(pretrain_path)

    # token sort for fast inference
    # Only this input pattern exists for now, so it is hard-coded here;
    # make it configurable if that changes.
    # NOTE(review): this writes a `tokenize_length` column onto the caller's test_df.
    # NOTE(review): NLPDataset pads every sample to max_len, so the length sort
    # may not shorten batches - confirm.
    full_text = test_df["prompt_question"] + " [SEP] " + test_df["text"]
    test_df['tokenize_length'] = full_text.map(lambda text: len(tokenizer(text)['input_ids']))
    sorted_test_df = test_df.sort_values('tokenize_length', ascending=True).reset_index(drop=True)


    ###################################
    # Make data
    ###################################

    test_dataset = NLPDataset(
        sorted_test_df, p_test_df, tokenizer, meta_feature_cols, max_len=CFG.max_length, is_train=False
    )

    # data loader
    test_loader = DataLoader(
        test_dataset,
        batch_size=CFG.batch_size,
        num_workers=CFG.num_workers,
        shuffle=False,
        collate_fn=DataCollatorWithPadding(tokenizer=tokenizer, padding='longest')
    )

    ###############################
    # predict per batch
    ###############################
    epoch_data_num = len(test_loader.dataset)  # not used below
    pred_list = []

    for iter_i, input_data in enumerate(test_loader):
        # input
        input_data = {k : v.to(device) for k, v in input_data.items()}
        text_inputs = {
            "input_ids": input_data["input_ids"],
            "attention_mask": input_data["attention_mask"],
            "token_type_ids": input_data["token_type_ids"],
            "meta_features": input_data["meta_features"],
        }

        with torch.no_grad():
            preds = model(**text_inputs)
        pred_list.append(preds.detach().cpu().numpy())

    sorted_test_preds = np.concatenate(pred_list, axis=0)
    sorted_preds_df = pd.DataFrame(sorted_test_preds, columns=["content", "wording"])
    sorted_preds_df["student_id"] = sorted_test_df["student_id"].values

    # Restore the original row order of test_df
    test_preds = sorted_preds_df.set_index("student_id").reindex(test_df["student_id"])[["content", "wording"]].values
    return test_preds

In [None]:
class CFG:
    """Settings read by predict_run and get_transformer_embeddings."""
    seed = 2023  # passed to set_seed()
    num_workers = 4  # DataLoader workers in predict_run
    batch_size = 16  # DataLoader batch size in predict_run
    max_length = 512  # max_len handed to NLPDataset / EmbDataset
    n_folds = 4  # NOTE(review): not referenced in the cells shown here
    use_fp16 = False  # read when building the GradScaler in get_transformer_embeddings

In [None]:
# Registry of the experiments to run, keyed by experiment name. Each entry holds:
#   path_list     - checkpoint files (the prediction loop uses only the first one)
#   pretrain_path - backbone / tokenizer directory
#   model_module  - model class to instantiate
# Commented-out entries are experiments that are currently disabled.
models = {
#     "exp034": {
#         "path_list": [
#             "/kaggle/input/commonlit2-034/fold0last-checkpoint.bin",
#             "/kaggle/input/commonlit2-034/fold1last-checkpoint.bin",
#             "/kaggle/input/commonlit2-034/fold2last-checkpoint.bin",
#             "/kaggle/input/commonlit2-034/fold3last-checkpoint.bin",
#         ],
#         "pretrain_path": "/kaggle/input/transformers-models/deberta-v3-large",
#         "model_module": NLPModel034,
#     },
#     "exp112": {
#         "path_list": [
#             "/kaggle/input/commonlit2-112/all_last-checkpoint.bin",
#         ],
#         "pretrain_path": "/kaggle/input/transformers-models/deberta-v3-large",
#         "model_module": NLPModel046,  # same with 046 model
#         # "ensemble_weight": 0.4
#     },
    "exp139": {
        "path_list": [
            "/kaggle/input/commonlit2-139/all_last-checkpoint.bin",
        ],
        "pretrain_path": "/kaggle/input/transformers-models/deberta-v3-large",
        "model_module": NLPModel139,
        # "ensemble_weight": 0.28
    },
    "exp068": {
        "path_list": [
            "/kaggle/input/commonlit2-068/all_last-checkpoint.bin",
        ],
        "pretrain_path": "/kaggle/input/transformers-models/deberta-v2-xlarge",
        "model_module": NLPModel068,
        # "ensemble_weight": 0.32
    },
}

In [None]:
# Get transformer embeddings and use them as meta features.
emb_pretrain_path = "/kaggle/input/transformers-models/deberta-v3-large"
embeddings = get_transformer_embeddings(test, prompts_test, emb_pretrain_path)
emb_df = pd.DataFrame(embeddings, columns=[f"emb_{i}" for i in range(embeddings.shape[1])])
meta_feature_cols = emb_df.columns.tolist()
# NOTE(review): the column-wise concat aligns on the index; emb_df has a default
# RangeIndex, so this assumes `test` does too - confirm.
test = pd.concat([test, emb_df], axis=1)
del emb_df
gc.collect()
# Center every embedding dimension within its prompt.
for c in tqdm(meta_feature_cols):
    test[c] = test[c] - test.groupby("prompt_id")[c].transform("mean")

# preds
for model_name, model_info in models.items():
    pretrain_path = model_info["pretrain_path"]
    model_path_list = model_info["path_list"]
    model = model_info["model_module"]
    test_preds = np.zeros((len(test), 2))
    for model_ix, model_path in enumerate(model_path_list):
        print(f"Model {model_ix}")
        each_test_preds = predict_run(test, prompts_test, model_path, pretrain_path, meta_feature_cols, model)
        test_preds = each_test_preds
        # Only the first checkpoint in path_list is used.
        break
    # TODO: sort by token length and restore the original order
    # (predict_run already sorts by token length and restores the order itself.)
    transformer_preds_dict[model_name] = test_preds

# The embedding columns were only needed as model inputs; remove them again.
test.drop(meta_feature_cols, axis=1, inplace=True)

In [None]:
# Blend weight for each experiment key in transformer_preds_dict; must sum to 1.
ensemble_weights = {
    # 'exp112': 0.1,
    'exp139': 0.15,
    'exp068': 0.15,
    'takai_exp049': 0.3,
    'takai_exp050': 0.4,
    # 'takai_exp069': 0.5
}

# Compare with a tolerance: an exact == 1 check can fail on float rounding
# alone (for example 0.1 + 0.2 + 0.7 is not exactly 1.0).
assert np.isclose(sum(ensemble_weights.values()), 1.0), (
    f"ensemble weights must sum to 1, got {sum(ensemble_weights.values())}"
)

# Weighted sum of the per-experiment predictions (two columns per row).
transformer_ensemble_preds = np.zeros((len(test), 2))

for exp_name, weight in ensemble_weights.items():
    pred = transformer_preds_dict[exp_name]
    transformer_ensemble_preds += pred * weight

In [None]:
# Store the blended predictions on `test`: column 0 as content, column 1 as wording.
test["ensemble_pred_content"] = transformer_ensemble_preds[:, 0]
test["ensemble_pred_wording"] = transformer_ensemble_preds[:, 1]

In [None]:
# Show which experiment keys currently have predictions stored.
print(transformer_preds_dict.keys())

# Feature Engineering for lgbm

In [None]:
def nakama_fb_predict_feature(df):
    """Add six predicted writing-quality scores to ``df``.

    Loads the fold checkpoints stored under ``CFG.path`` (one per entry of
    ``CFG.trn_fold``), predicts ``CFG.target_cols`` for ``df['text']`` with
    each of them and writes the fold average back into ``df``.

    Args:
        df: DataFrame with a ``text`` column.

    Returns:
        The same ``df`` object, with the ``CFG.target_cols`` columns written
        in place.
    """

    class CFG:
        # Local settings; this class shadows any notebook-level CFG inside the function.
        num_workers=4
        path="../input/fb3-deberta-v3-base-baseline-train/"
        config_path=path+'config.pth'
        model="microsoft/deberta-v3-base"
        gradient_checkpointing=False
        batch_size=24
        target_cols=['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
        seed=42
        n_fold=4
        trn_fold=[0, 1, 2, 3]

    def prepare_input(cfg, text):
        # No max_length / truncation is passed: the text is encoded in full.
        inputs = cfg.tokenizer.encode_plus(
            text,
            return_tensors=None,
            add_special_tokens=True,
        )
        for k, v in inputs.items():
            inputs[k] = torch.tensor(v, dtype=torch.long)
        return inputs

    # The base class is spelled out because the notebook imports a `Dataset`
    # name from both torch.utils.data and datasets.
    class TestDataset(torch.utils.data.Dataset):
        def __init__(self, cfg, df):
            self.cfg = cfg
            self.texts = df['text'].values

        def __len__(self):
            return len(self.texts)

        def __getitem__(self, item):
            return prepare_input(self.cfg, self.texts[item])

    class MeanPooling(nn.Module):
        """Attention-mask weighted mean over the token axis."""

        def __init__(self):
            super(MeanPooling, self).__init__()

        def forward(self, last_hidden_state, attention_mask):
            input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
            sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
            sum_mask = input_mask_expanded.sum(1)
            # Guard against division by zero for fully masked rows.
            sum_mask = torch.clamp(sum_mask, min=1e-9)
            mean_embeddings = sum_embeddings / sum_mask
            return mean_embeddings

    class CustomFBModel(nn.Module):
        """Backbone + mean pooling + linear head with six outputs."""

        def __init__(self, cfg, config_path=None, pretrained=False):
            super().__init__()
            self.cfg = cfg
            if config_path is None:
                self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
                self.config.hidden_dropout = 0.
                self.config.hidden_dropout_prob = 0.
                self.config.attention_dropout = 0.
                self.config.attention_probs_dropout_prob = 0.
            else:
                # NOTE(review): torch.load unpickles the file; only load
                # config files from a trusted source.
                self.config = torch.load(config_path)
            if pretrained:
                self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
            else:
                self.model = AutoModel.from_config(self.config)
            if self.cfg.gradient_checkpointing:
                self.model.gradient_checkpointing_enable()
            self.pool = MeanPooling()
            self.fc = nn.Linear(self.config.hidden_size, 6)
            self._init_weights(self.fc)

        def _init_weights(self, module):
            if isinstance(module, nn.Linear):
                module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
                if module.bias is not None:
                    module.bias.data.zero_()
            elif isinstance(module, nn.Embedding):
                module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
                if module.padding_idx is not None:
                    module.weight.data[module.padding_idx].zero_()
            elif isinstance(module, nn.LayerNorm):
                module.bias.data.zero_()
                module.weight.data.fill_(1.0)

        def feature(self, inputs):
            outputs = self.model(**inputs)
            last_hidden_states = outputs[0]
            feature = self.pool(last_hidden_states, inputs['attention_mask'])
            return feature

        def forward(self, inputs):
            feature = self.feature(inputs)
            output = self.fc(feature)
            return output

    # This def used to be indented inside CustomFBModel, which made it an
    # unintended method; the call below then resolved to the notebook-level
    # inference_fn. It now lives at function level, as the call expects.
    def inference_fn(test_loader, model, device):
        preds = []
        model.eval()
        model.to(device)
        tk0 = tqdm(test_loader, total=len(test_loader))
        for inputs in tk0:
            for k, v in inputs.items():
                inputs[k] = v.to(device)
            with torch.no_grad():
                y_preds = model(inputs)
            preds.append(y_preds.to('cpu').numpy())
        predictions = np.concatenate(preds)
        return predictions

    CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.path+'tokenizer/')

    test_dataset = TestDataset(CFG, df)
    test_loader = DataLoader(test_dataset,
                             batch_size=CFG.batch_size,
                             shuffle=False,
                             collate_fn=DataCollatorWithPadding(tokenizer=CFG.tokenizer, padding='longest'),
                             num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
    predictions = []
    for fold in CFG.trn_fold:
        model = CustomFBModel(CFG, config_path=CFG.config_path, pretrained=False)
        state = torch.load(CFG.path+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
                           map_location=torch.device('cpu'))
        model.load_state_dict(state['model'])
        # `device` is the notebook-level device chosen in the setup cell.
        prediction = inference_fn(test_loader, model, device)
        predictions.append(prediction)
        # Release the fold's model before the next one is loaded.
        del model, state, prediction; gc.collect()
        torch.cuda.empty_cache()
    predictions = np.mean(predictions, axis=0)

    df[CFG.target_cols] = predictions

    return df

In [None]:
# Shared SpellChecker instance, created once and reused by calc_spell_miss.
spell_checker = SpellChecker()

def calc_spell_miss(text):
    """Count the whitespace-separated tokens of ``text`` that ``spell_checker.unknown`` reports."""
    unknown_words = spell_checker.unknown(text.split())
    return len(list(unknown_words))

def ngrams(token, n):
    """Return every run of ``n`` consecutive items of ``token``, space-joined.

    Yields an empty list when ``n`` is 0 or larger than ``len(token)``.
    """
    # shifted[i] starts i items later; zipping them lines up each window.
    shifted = [token[offset:] for offset in range(n)]
    return [" ".join(window) for window in zip(*shifted)]

def ngram_co_occurrence(row, n: int):
    """Count the distinct n-grams shared by the prompt and the summary.

    Args:
        row: mapping with ``prompt_tokens`` and ``summary_tokens`` (token lists).
        n: n-gram size.

    Returns:
        Size of the intersection of the two n-gram sets. Repeated n-grams
        count once; frequencies are not taken into account.
    """
    prompt_grams = set(ngrams(row['prompt_tokens'], n))
    summary_grams = set(ngrams(row['summary_tokens'], n))
    return len(prompt_grams & summary_grams)


def word_overlap_count(row):
    """Count the distinct tokens that occur in both the prompt and the summary."""
    shared_tokens = set(row['prompt_tokens']) & set(row['summary_tokens'])
    return len(shared_tokens)


def quotes_count(row):
    """Count the double-quoted spans of the summary found verbatim in the prompt text.

    Args:
        row: mapping with ``text`` (the summary) and ``prompt_text``.

    Returns:
        Number of quoted spans that are substrings of ``prompt_text``;
        0 when the summary contains no quoted span.
    """
    quoted_spans = re.findall(r'"([^"]*)"', row['text'])
    source_text = row['prompt_text']
    if not quoted_spans:
        return 0
    return sum(1 for span in quoted_spans if span in source_text)


def clean_text(text):
    """Drop every character that is not an ASCII letter, a digit or whitespace."""
    disallowed = re.compile(r'[^a-zA-Z0-9\s]')
    return disallowed.sub('', text)


def jaccard_similarity(text1, text2):
    """Jaccard similarity of the two texts' word sets.

    Both texts are passed through ``clean_text`` and split on whitespace.

    Returns:
        ``|A & B| / |A | B|``, or 0.0 when both word sets are empty (this
        case used to raise ZeroDivisionError).
    """
    words1 = set(clean_text(text1).split())
    words2 = set(clean_text(text2).split())
    union_size = len(words1 | words2)
    if union_size == 0:
        # Neither text has a word left after cleaning.
        return 0.0
    return len(words1 & words2) / union_size

def calc_jaccard_score(df):
    """Compute each summary's Jaccard similarity to its question and to its prompt text.

    Args:
        df: frame with ``text``, ``prompt_question`` and ``prompt_text`` columns.

    Returns:
        ``(question_scores, text_scores)``: two lists in the row order of ``df``.
    """
    prompt_questions = df['prompt_question'].tolist()
    prompt_texts = df['prompt_text'].tolist()
    texts = df['text'].tolist()

    question_jaccard_scores, text_jaccard_scores = [], []
    rows = zip(texts, prompt_questions, prompt_texts)
    for text, prompt_question, prompt_text in tqdm(rows, total=len(df)):
        question_jaccard_scores.append(jaccard_similarity(text, prompt_question))
        text_jaccard_scores.append(jaccard_similarity(text, prompt_text))
    return question_jaccard_scores, text_jaccard_scores


def calc_sentence_embedding(df, prompt_df):
    """Embed prompts and summaries, whole and sentence by sentence.

    Args:
        df: summaries frame with a ``text`` column.
        prompt_df: prompts frame with ``prompt_id``, ``prompt_question`` and
            ``prompt_text`` columns.

    Returns:
        A 6-tuple:
        ``prompt_questions_embeddings`` - dict prompt_id -> question embedding;
        ``prompt_texts_embeddings`` - dict prompt_id -> prompt-text embedding;
        ``prompt_texts_split_embeddings`` - dict prompt_id -> embeddings of the
        prompt text's sentences;
        ``prompt_sentences`` - dict prompt_id -> list of those sentences;
        ``texts_embeddings`` - array with one embedding per row of ``df``;
        ``texts_split_embeddings`` - list with, per row of ``df``, the
        embeddings of that summary's sentences.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    model = SentenceTransformer('/kaggle/input/transformers-models/sentence-transformers_all-mpnet-base-v2')
    model.to(device)

    prompt_ids = prompt_df["prompt_id"].tolist()
    prompt_questions = prompt_df["prompt_question"].tolist()
    prompt_texts = prompt_df["prompt_text"].tolist()

    # ----------------------------------
    # prompt questions / prompt texts (one embedding per prompt)
    # ----------------------------------
    prompt_questions_embeddings = dict(
        zip(prompt_ids, model.encode(prompt_questions, show_progress_bar=False))
    )
    prompt_texts_embeddings = dict(
        zip(prompt_ids, model.encode(prompt_texts, show_progress_bar=False))
    )

    # ----------------------------------
    # prompt texts split version (one embedding per sentence)
    # ----------------------------------
    prompt_texts_split_embeddings = {}
    prompt_sentences = {}
    for prompt_id, prompt_text in zip(prompt_ids, prompt_texts):
        sentences = sent_tokenize(prompt_text)
        prompt_texts_split_embeddings[prompt_id] = model.encode(sentences, show_progress_bar=False)
        prompt_sentences[prompt_id] = sentences

    # ----------------------------------
    # summary texts
    # ----------------------------------
    batch_size = 32
    texts = df['text'].tolist()
    # Step through the real batch starts only. The previous
    # `len(df) // batch_size + 1` produced an empty trailing batch whenever the
    # row count was a multiple of batch_size; encoding an empty batch can give
    # an array that np.concatenate cannot join with the 2-D batches.
    batch_embeddings = [
        model.encode(texts[start:start + batch_size])
        for start in tqdm(range(0, len(texts), batch_size))
    ]
    if batch_embeddings:
        texts_embeddings = np.concatenate(batch_embeddings, axis=0)
    else:
        # No summaries at all: keep a 2-D (0, dim) result.
        texts_embeddings = np.empty((0, model.get_sentence_embedding_dimension()))

    # ----------------------------------
    # summary texts split version
    # ----------------------------------
    texts_split_embeddings = [
        model.encode(sent_tokenize(row_text), show_progress_bar=False)
        for row_text in tqdm(texts)
    ]

    return prompt_questions_embeddings, prompt_texts_embeddings, prompt_texts_split_embeddings, prompt_sentences, texts_embeddings, texts_split_embeddings


# Readability formulas evaluated per summary; the order fixes the order of the output columns.
_READABILITY_METRICS = (
    "flesch_kincaid",
    "flesch",
    "gunning_fog",
    "coleman_liau",
    "dale_chall",
    "ari",
    "linsear_write",
    "spache",
)


def _readability_scores(text):
    """Return {metric: score} for one text: NaN for each metric that fails, {} if the text cannot be parsed."""
    try:
        r = Readability(text)
    except Exception:
        return {}
    scores = {}
    for metric in _READABILITY_METRICS:
        try:
            scores[metric] = getattr(r, metric)().score
        except Exception:
            scores[metric] = np.nan
    return scores


def _cossim_to_mean_vector(matrix):
    """Cosine similarity between every row of `matrix` and the column-wise mean of all rows."""
    mean_vec = pd.DataFrame(matrix).mean(axis=0).values
    scalar = np.linalg.norm(matrix, axis=1) * np.linalg.norm(mean_vec)
    return (matrix * mean_vec).sum(axis=1) / scalar


def _concat_row_features(df, records):
    """Append the columns of `records` (one dict per row of `df`, in row order) to `df`."""
    features = pd.DataFrame(records)
    # Align on df's own index so the concat is row-by-row even when df.index is not 0..n-1.
    features.index = df.index
    return pd.concat([df, features], axis=1)


def create_meta_feature(df, prompt_df):
    """Add token, overlap, embedding, KNN, TF-IDF/BoW and readability features to `df`.

    Args:
        df: one row per summary. This function reads the columns "text",
            "prompt_text", "prompt_id", "ensemble_pred_content" and
            "ensemble_pred_wording"; the helpers it calls (calc_spell_miss,
            ngram_co_occurrence, word_overlap_count, quotes_count,
            calc_jaccard_score, calc_sentence_embedding) may read more.
        prompt_df: prompt table, forwarded to calc_sentence_embedding.

    Returns:
        A DataFrame holding the input columns plus the new feature columns.
        Columns created before the first concat are also written into the
        caller's `df` in place.

    NOTE(review): the produced columns are consumed by already-trained LightGBM
    boosters later in this notebook, so feature definitions are kept exactly as
    they were, including the quirks flagged inline below.
    """
    tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/transformers-models/deberta-v3-large")

    df["summary_word_len"] = df["text"].apply(lambda x: len(x.split()))

    # Encode every text once and derive both its length and its token strings from the same ids.
    summary_ids = df["text"].apply(lambda x: tokenizer.encode(x))
    prompt_text_ids = df["prompt_text"].apply(lambda x: tokenizer.encode(x))
    df["summary_length"] = summary_ids.apply(len)
    df["prompt_length"] = prompt_text_ids.apply(len)
    df["summary_tokens"] = summary_ids.apply(
        lambda ids: tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=True)
    )
    df["prompt_tokens"] = prompt_text_ids.apply(
        lambda ids: tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=True)
    )
    df["spell_err_num"] = df["text"].map(calc_spell_miss)

    df['bigram_overlap_count'] = df.apply(
        ngram_co_occurrence, args=(2,), axis=1
    )
    df['trigram_overlap_count'] = df.apply(
        ngram_co_occurrence, args=(3,), axis=1
    )
    # NOTE(review): both ratios share the (summary_length - 1) denominator; kept as-is for model parity.
    df['bigram_overlap_ratio'] = df['bigram_overlap_count'] / (df['summary_length'] - 1)
    df['trigram_overlap_ratio'] = df['trigram_overlap_count'] / (df['summary_length'] - 1)

    # From here on "summary_length" is the deviation from the mean length within the same prompt.
    df["summary_length"] = df["summary_length"] - df["prompt_id"].map(df.groupby("prompt_id")["summary_length"].mean())

    df['word_overlap_count'] = df.apply(word_overlap_count, axis=1)
    df['quotes_count'] = df.apply(quotes_count, axis=1)

    question_jaccard_scores, text_jaccard_scores = calc_jaccard_score(df)
    df['question_jaccard'] = question_jaccard_scores
    df['text_jaccard'] = text_jaccard_scores

    (
        prompt_questions_embeddings,
        prompt_texts_embeddings,
        prompt_texts_split_embeddings,
        prompt_sentences,
        texts_embeddings,
        texts_split_embeddings
    ) = calc_sentence_embedding(df, prompt_df)

    # ----------------------------------
    # whole-text cossim vs prompt question / prompt text
    # ----------------------------------
    prompt_ids = df['prompt_id'].tolist()
    questions_cossim_feature = []
    texts_cossim_feature = []
    for ix in tqdm(range(len(df))):
        prompt_id = prompt_ids[ix]
        summary_vec = texts_embeddings[ix].reshape(1, -1)
        prompt_questions_cosine_sim = cosine_similarity(summary_vec, prompt_questions_embeddings[prompt_id].reshape(1, -1))[0][0]
        prompt_texts_cosine_sim = cosine_similarity(summary_vec, prompt_texts_embeddings[prompt_id].reshape(1, -1))[0][0]
        questions_cossim_feature.append(prompt_questions_cosine_sim)
        texts_cossim_feature.append(prompt_texts_cosine_sim)

    df['questions_cossim'] = questions_cossim_feature
    df['texts_cossim'] = texts_cossim_feature

    # ----------------------------------
    # relation of each summary embedding to the mean embedding of its prompt
    # ----------------------------------
    emb_cols = [f"emb_{i}" for i in range(texts_embeddings.shape[1])]
    emb_df = pd.DataFrame(texts_embeddings, columns=emb_cols)
    emb_df["prompt_id"] = df["prompt_id"].values
    prompt_mean = emb_df.groupby("prompt_id").mean()
    emb_arr = emb_df[emb_cols].values
    idx2prompt = emb_df["prompt_id"].values
    # This is a plain dot product; it equals a cosine only if the vectors are unit-norm (not checked here).
    mean_cossim = emb_df.index.map(lambda idx: np.sum(emb_arr[idx] * prompt_mean.loc[idx2prompt[idx]]))
    df["sentence_embedding_prompt_mean_cossim"] = mean_cossim
    mean_diff = emb_df.index.map(lambda idx: np.mean(np.square(emb_arr[idx] - prompt_mean.loc[idx2prompt[idx]])))
    df["sentence_embedding_prompt_mean_diff"] = mean_diff

    # ----------------------------------
    # knn: average prediction / distance of the nearest summaries of the same prompt
    # ----------------------------------
    for p_id in df["prompt_id"].unique():
        in_prompt = (df["prompt_id"] == p_id).values
        # Positional row numbers: texts_embeddings is ordered like the rows of df.
        rows = np.flatnonzero(in_prompt)
        prompt_embeddings = texts_embeddings[rows]
        knn = NearestNeighbors(n_neighbors=min(101, len(rows)))
        knn.fit(prompt_embeddings)
        neighbors_dists, neighbors_indices = knn.kneighbors(prompt_embeddings)
        content_arr = df["ensemble_pred_content"].values[rows]
        wording_arr = df["ensemble_pred_wording"].values[rows]
        for thresh in [100]:
            # Column 0 is skipped: presumably the query point itself.
            content_mean = content_arr[neighbors_indices[:, 1:thresh+1]].mean(axis=1)
            wording_mean = wording_arr[neighbors_indices[:, 1:thresh+1]].mean(axis=1)
            df.loc[in_prompt, f"knn{thresh}_pred_content_mean"] = content_mean
            df.loc[in_prompt, f"knn{thresh}_pred_wording_mean"] = wording_mean
            df.loc[in_prompt, f"knn{thresh}_dists_mean"] = neighbors_dists[:, 1:thresh+1].mean(axis=1)

    for thresh in [100]:
        df[f"diff_knn{thresh}_pred_content_mean"] = df["ensemble_pred_content"] - df[f"knn{thresh}_pred_content_mean"]
        df[f"diff_knn{thresh}_pred_wording_mean"] = df["ensemble_pred_wording"] - df[f"knn{thresh}_pred_wording_mean"]

    # ----------------------------------
    # per-prompt TF-IDF / bag-of-words features
    # ----------------------------------
    for p_id in df["prompt_id"].unique():
        in_prompt = (df["prompt_id"] == p_id).values
        prompt_summaries = df.loc[in_prompt, "text"]

        tfidf = TfidfVectorizer(lowercase=False)
        tfidf.fit(prompt_summaries)
        tfidf_arr = tfidf.transform(prompt_summaries).toarray()
        df.loc[in_prompt, "tfidf_mean"] = tfidf_arr.mean(axis=1)
        # NOTE(review): the scalar condition 0 makes np.where return the matrix unchanged, so zeros
        # are NOT masked and this is the plain row mean. Kept as-is for parity with the trained models.
        df.loc[in_prompt, "tfidf_nonzero_mean"] = np.nanmean(np.where(0, np.nan, tfidf_arr), axis=1)
        df.loc[in_prompt, "tfidf_emb_prompt_mean_cossim"] = _cossim_to_mean_vector(tfidf_arr)

        bow = CountVectorizer()
        bow.fit(prompt_summaries)
        bow_arr = bow.transform(prompt_summaries).toarray()
        df.loc[in_prompt, "bow_emb_prompt_mean_cossim"] = _cossim_to_mean_vector(bow_arr)

    # ----------------------------------
    # split text cossim agg features
    # ----------------------------------
    # prompt text split version: whole summary vs. every prompt sentence
    cosine_sim_features = []
    for ix in tqdm(range(len(df))):
        prompt_id = prompt_ids[ix]
        prompt_texts_cosine_sim = cosine_similarity(texts_embeddings[ix].reshape(1, -1), prompt_texts_split_embeddings[prompt_id])
        cosine_sim_features.append({
            "prompt_cossim_std": prompt_texts_cosine_sim[0].std(),
        })
    df = _concat_row_features(df, cosine_sim_features)

    # summary texts split version: whole prompt vs. every summary sentence
    cosine_sim_features = []
    for ix in tqdm(range(len(df))):
        prompt_id = prompt_ids[ix]
        prompt_texts_cosine_sim = cosine_similarity(prompt_texts_embeddings[prompt_id].reshape(1, -1), texts_split_embeddings[ix])
        cosine_sim_features.append({
            "per_text_sentence_prompt_cossim_std": prompt_texts_cosine_sim[0].std(),
        })
    df = _concat_row_features(df, cosine_sim_features)

    # The embeddings are no longer needed; release them before the readability pass.
    del texts_embeddings, prompt_texts_embeddings, prompt_sentences, texts_split_embeddings, prompt_texts_split_embeddings
    gc.collect()

    # ----------------------------------
    # readability scores
    # ----------------------------------
    readability_scores = [_readability_scores(text) for text in tqdm(df["text"].tolist())]
    df = _concat_row_features(df, readability_scores)

    def count_words_within_quotes(text):
        # Extract the spans enclosed in double quotes.
        quoted_texts = re.findall(r'"([^"]+)"', text)
        # NOTE(review): despite the name, this is the length in characters of the longest quoted
        # span (0 when there is none), not a word count. Kept as-is for parity with the trained models.
        return max((len(quoted_text) for quoted_text in quoted_texts), default=0)

    df["mean_num_words"] = df["text"].map(lambda x: np.mean([len(e.split()) for e in x.split('.')]))
    df['refer_cnt'] = df["text"].map(count_words_within_quotes)

    return df

In [None]:
# Rebind `test` to the frame returned by create_meta_feature (input columns plus the engineered features).
# Not idempotent: re-running this cell feeds the already-augmented frame back in.
test = create_meta_feature(test, prompts_test)

In [None]:
# Rebind `test` to the output of nakama_fb_predict_feature (defined outside this section).
# NOTE(review): presumably adds prediction-based feature columns — confirm against its definition.
test = nakama_fb_predict_feature(test)

# LightGBM inference

In [None]:
# One entry per LightGBM ensemble member:
# (model_name, lgbm_dir, weight_original, weight_long_prompt, seed_num, transformer_name)
LGBM_DIRS = [
    ("exp405", "/kaggle/input/commonlit2-405-lgb", 0.24, 0.27, 10, "exp139"),
    ("exp406", "/kaggle/input/commonlit2-406-lgb", 0.11, 0.14, 10, "exp068"),
    ("exp407", "/kaggle/input/commonlit2-407-lgb", 0.16, 0.19, 10, "takai_exp049"),
    ("exp408", "/kaggle/input/commonlit2-408-lgb", 0.12, 0.15, 10, "takai_exp050"),
    ("exp409", "/kaggle/input/commonlit2-409-lgb", 0.20, 0.23, 10, "takai_exp108_full_train"),
]

# Expose every transformer's predictions as "<model>_pred_content" / "<model>_pred_wording" columns.
for model_name, preds in transformer_preds_dict.items():
    test[f"{model_name}_pred_content"] = preds[:, 0]
    test[f"{model_name}_pred_wording"] = preds[:, 1]

# The takai_exp069 columns are filled with the takai_exp108_full_train predictions.
test["takai_exp069_pred_content"] = test["takai_exp108_full_train_pred_content"]
test["takai_exp069_pred_wording"] = test["takai_exp108_full_train_pred_wording"]

# Weighted sums over all LightGBM members, one per weighting scheme.
lgbm_test_preds = np.zeros((len(test), 2))
lgbm_test_preds_for_long_prompt = np.zeros((len(test), 2))
for model_name, lgbm_dir, weight, weight_long_prompt, seed_num, transformer_model_name in LGBM_DIRS:
    model_dir = Path(lgbm_dir)

    # Each LightGBM member is fed the predictions of its own transformer as "ensemble_pred_*".
    # This does not depend on seed or fold, so it is set once per member.
    test["ensemble_pred_content"] = transformer_preds_dict[transformer_model_name][:, 0]
    test["ensemble_pred_wording"] = transformer_preds_dict[transformer_model_name][:, 1]

    # The feature list is stored per fold and shared by all seeds; read each file once.
    use_feats_by_fold = {}
    for fold_ix in range(CFG.n_folds):
        with open(model_dir / f"use_feats_fold{fold_ix}.txt") as f:
            use_feats_by_fold[fold_ix] = f.read().splitlines()

    lgb_preds_list = []
    for seed_ix in tqdm(range(seed_num)):
        for fold_ix in range(CFG.n_folds):
            fold_features = test[use_feats_by_fold[fold_ix]]

            lgb_preds = np.zeros((len(test), 2))
            for target_ix, target_col in enumerate(["content", "wording"]):
                lgb_model = lgb.Booster(model_file=model_dir / f"{target_col}_fold{fold_ix}_{seed_ix}.txt")
                lgb_preds[:, target_ix] = lgb_model.predict(fold_features)

            lgb_preds_list.append(lgb_preds)

    # Average over every seed x fold booster, then add the member's weighted contribution.
    _test_preds = np.mean(lgb_preds_list, axis=0)
    lgbm_test_preds += _test_preds * weight
    lgbm_test_preds_for_long_prompt += _test_preds * weight_long_prompt

## Add DeBERTa (with prompt text) predictions

In [None]:
# Weights of the direct transformer prediction in the two blends (base / long-prompt).
weight_exp069 = 0.17
weight_exp069_for_long_prompt = 0.02

# Each blend (LightGBM member weights + transformer weight) must sum to 1.
# np.isclose is used instead of round(total, 1), which would accept any total in roughly [0.95, 1.05].
total_weight = sum(w for _, _, w, _, _, _ in LGBM_DIRS) + weight_exp069
total_weight_for_long_prompt = sum(w for _, _, _, w, _, _ in LGBM_DIRS) + weight_exp069_for_long_prompt
assert np.isclose(total_weight, 1.0), f"base blend weights sum to {total_weight}, expected 1.0"
assert np.isclose(total_weight_for_long_prompt, 1.0), (
    f"long-prompt blend weights sum to {total_weight_for_long_prompt}, expected 1.0"
)

In [None]:
# Blend the LightGBM sums with the direct transformer prediction, once per weighting scheme.
_direct_transformer_preds = transformer_preds_dict["takai_exp108_full_train"]
test_preds = lgbm_test_preds + _direct_transformer_preds * weight_exp069
test_preds_for_long_prompt = (
    lgbm_test_preds_for_long_prompt
    + _direct_transformer_preds * weight_exp069_for_long_prompt
)

# The raw transformer predictions are not used after this point; drop every reference to them.
del _direct_transformer_preds
del transformer_preds_dict
gc.collect()

In [None]:
tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/transformers-models/deberta-v3-large")

# Token counts of the prompt text and of the summary, stored as columns of `test`.
for source_col, length_col in (
    ("prompt_text", "prompt_text_tokenize_length"),
    ("text", "text_tokenize_length"),
):
    test[length_col] = test[source_col].map(lambda text: len(tokenizer(text)['input_ids']))
test_tokenize_length = test['prompt_text_tokenize_length'] + test['text_tokenize_length']

# Linear ramp on the combined token length: 0 at or below LONG_THRESH1, 1 at or above LONG_THRESH2.
LONG_THRESH1 = 1800
LONG_THRESH2 = 3000
weights_long = (
    (test_tokenize_length - LONG_THRESH1) / (LONG_THRESH2 - LONG_THRESH1)
).clip(lower=0, upper=1)

# Complementary weight for the base blend.
weights_base = 1.0 - weights_long

In [None]:
sub_df = pd.read_csv("/kaggle/input/commonlit-evaluate-student-summaries/sample_submission.csv")

# Per-row interpolation between the base blend and the long-prompt blend.
blended_content = test_preds[:, 0] * weights_base + test_preds_for_long_prompt[:, 0] * weights_long
blended_wording = test_preds[:, 1] * weights_base + test_preds_for_long_prompt[:, 1] * weights_long

# The blended values are pandas Series carrying `test`'s index. Assign them by position so that
# they are not re-aligned on labels against sub_df's index, which would yield NaN wherever the
# two indexes differ.
# NOTE(review): this assumes sample_submission.csv lists rows in the same order as `test` — confirm.
sub_df["content"] = np.asarray(blended_content)
sub_df["wording"] = np.asarray(blended_wording)

sub_df.to_csv("submission.csv", index=False)