## Feature from Feedback Competition

In [None]:
import sys
sys.path.append("../input/omegaconf")

In [None]:
# basics
import os
import sys
import json
from copy import deepcopy
from itertools import chain
from omegaconf import OmegaConf

# Processing
import numpy as np
import pandas as pd

from tqdm.auto import tqdm

# ipython
from IPython.display import display
from IPython.core.debugger import set_trace

In [None]:
%%writefile create_datasets_main.py

import sys
sys.path.append("../input/omegaconf")

import os
import json
import re
import argparse
from copy import deepcopy

import numpy as np
import pandas as pd
from datasets import Dataset, load_from_disk
from tokenizers import AddedToken
from transformers import AutoTokenizer
from omegaconf import OmegaConf

#--------------- Tokenizer ---------------------------------------------#
NEW_TOKENS = [
    "[LF]",
]

def get_tokenizer(cfg):
    """Build the backbone tokenizer and register the extra tokens.

    :param cfg: configuration exposing ``cfg.model.backbone_path``
    :return: tokenizer with every entry of ``NEW_TOKENS`` added
    """
    backbone_dir = cfg.model.backbone_path
    print(f"loading tokenizer from {backbone_dir}")
    tokenizer = AutoTokenizer.from_pretrained(backbone_dir)

    # register the custom tokens without stripping surrounding whitespace
    print("adding new tokens...")
    tokenizer.add_tokens(
        [AddedToken(tok, lstrip=False, rstrip=False) for tok in NEW_TOKENS]
    )

    print(f"tokenizer len: {len(tokenizer)}")
    # sanity check: show how a string containing the new token is split
    sample = "This is a test \n [LF]!"
    print(f"tokenizer test: {tokenizer.tokenize(sample)}")
    return tokenizer

#--------------- Dataset ----------------------------------------------#


class FeedbackDataset:
    """Dataset builder for the feedback prize task.

    Loads the tokenizer described by the config, marks paragraph breaks in the
    essay text with the ``[LF]`` token and tokenizes the result into a
    ``datasets.Dataset``.
    """

    def __init__(self, cfg):
        """
        :param cfg: configuration exposing ``cfg.model.target_names``,
            ``cfg.model.max_length`` and whatever ``get_tokenizer`` reads
        """
        # assign config
        self.cfg = cfg

        # label columns
        self.target_names = cfg.model.target_names

        # load tokenizer
        self.load_tokenizer()

    def load_tokenizer(self):
        """Create the tokenizer and store it on ``self.tokenizer``."""
        self.tokenizer = get_tokenizer(self.cfg)

    def pre_process(self, df):
        """Replace every blank line (``"\\n\\n"``) in ``full_text`` with ``" [LF] "``.

        The input frame is left untouched; a new frame is returned.

        :param df: dataframe with a ``full_text`` column
        :type df: pd.DataFrame
        :return: copy of ``df`` with the rewritten ``full_text`` column
        :rtype: pd.DataFrame
        """
        marked = df["full_text"].str.replace("\n\n", " [LF] ", regex=False)
        return df.assign(full_text=marked)

    def tokenize_function(self, examples):
        """Tokenize a batch of essays, truncating to ``cfg.model.max_length``."""
        tz = self.tokenizer(
            examples["full_text"],
            padding=False,
            truncation=True,
            max_length=self.cfg.model.max_length,
            add_special_tokens=True,
            return_token_type_ids=True,
        )
        return tz

    def compute_input_length(self, examples):
        """Return the token count of every example in the batch."""
        return {"input_length": [len(x) for x in examples["input_ids"]]}

    def get_dataset(self, df, mode='train'):
        """main api for creating the Feedback dataset
        :param df: input annotation dataframe
        :type df: pd.DataFrame
        :param mode: check if required for train or infer, defaults to 'train'
            (currently not used by the implementation)
        :type mode: str, optional
        :return: the created dataset
        :rtype: Dataset
        """
        df = self.pre_process(df)

        print("sample text:")
        print("=="*40)
        # DataFrame.sample() raises on an empty frame, so only print when there is data
        if len(df):
            print(df.sample().full_text.values[0])
        print("=="*40)

        task_dataset = Dataset.from_pandas(df)
        task_dataset = task_dataset.map(self.tokenize_function, batched=True)
        task_dataset = task_dataset.map(self.compute_input_length, batched=True)

        # from_pandas only adds this column for a non-default index; drop it when present
        if "__index_level_0__" in task_dataset.column_names:
            task_dataset = task_dataset.remove_columns(column_names=["__index_level_0__"])
        return task_dataset

def shard_dataframe(df, rank, n_rank=4):
    """Return the rows of ``df`` assigned to worker ``rank``.

    Ranks ``0 .. n_rank - 2`` each receive ``len(df) // n_rank`` consecutive
    rows, rank ``n_rank - 1`` receives all remaining rows, and
    ``rank == n_rank`` selects the whole frame unchanged.

    :param df: full input dataframe
    :param rank: worker index in ``0 .. n_rank``
    :param n_rank: number of shards, defaults to 4
    :return: the selected rows (index reset for real shards)
    :raises ValueError: if ``rank`` is outside ``0 .. n_rank``
    """
    if not 0 <= rank <= n_rank:
        raise ValueError(f"rank must be in [0, {n_rank}], got {rank}")
    if rank == n_rank:
        return df

    part_size = len(df) // n_rank
    start = rank * part_size
    # the last shard absorbs the remainder of the integer division
    stop = len(df) if rank == n_rank - 1 else start + part_size
    return df.iloc[start:stop].reset_index(drop=True)


if __name__ == "__main__":
    ap = argparse.ArgumentParser()
    ap.add_argument('--config_path', type=str, required=True)
    ap.add_argument('--save_path', type=str, required=True)
    ap.add_argument('--rank', type=int, required=True)
    args = ap.parse_args()

    # read configuration
    cfg = OmegaConf.load(args.config_path)
    cfg_dict = OmegaConf.to_container(cfg, resolve=True)
    print("=="*40)
    print(json.dumps(cfg_dict, indent=4))
    print("=="*40)

    # create the dataset
    print("creating the dataset...")
    df = pd.read_csv(cfg.infer_params.input_path)
    # double test set to ensure successfully run in public mode.
    if df.shape[0] < 10:
        df = pd.concat([df, df], axis=0).reset_index(drop=True)

    # keep only the rows that belong to this worker
    df = shard_dataframe(df, args.rank)
    print(df.head())
    dataset_creator = FeedbackDataset(cfg)
    infer_ds = dataset_creator.get_dataset(df, mode="infer")

    # save dataset; "{rank}" in the path template is filled with the worker rank
    infer_ds.save_to_disk(args.save_path.format(rank=args.rank))

    print("done!")
    print("=="*40)

In [None]:
%%writefile fb_model_main.py

import pdb
from copy import deepcopy

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint
from transformers import AutoConfig, AutoModel, BertConfig
from transformers.models.bert.modeling_bert import BertAttention


#-------- Model ------------------------------------------------------------------#


class MeanPooling(nn.Module):
    """Average token embeddings over the sequence axis, ignoring masked positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the mask-weighted mean of ``last_hidden_state`` along dim 1.

        ``attention_mask`` is broadcast over the hidden dimension; the
        denominator is clamped to 1e-9 so fully masked rows do not divide by zero.
        """
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_total = (last_hidden_state * mask).sum(1)
        token_count = mask.sum(1).clamp(min=1e-9)
        return masked_total / token_count


class FeatureExtractor(nn.Module):
    """
    Turn backbone outputs into one feature vector per sample:
        - softmax-weighted average of the last ``num_layers`` hidden states
        - layer norm followed by a multi-head self-attention block
        - masked mean pooling and a final layer norm
    """

    def __init__(self, config):
        """
        :param config: mapping providing ``num_layers`` and ``hidden_size``
        """
        super().__init__()

        self.config = config
        self.num_layers = config["num_layers"]
        self.num_features = config["hidden_size"]

        #------------ weighted-average ---------------------------------------------------#
        # one learnable logit per layer, initialised on an even grid in [-5, 5]
        init_amax = 5
        layer_logits = torch.linspace(-init_amax, init_amax, self.num_layers)
        self.weights = nn.Parameter(layer_logits[:, None, None, None], requires_grad=True)

        #------------ multi-head attention -----------------------------------------------#
        attention_config = BertConfig()
        attention_config.update(
            dict(
                num_attention_heads=4,
                hidden_size=self.num_features,
                attention_probs_dropout_prob=0.0,
                hidden_dropout_prob=0.0,
                is_decoder=False,
            )
        )
        self.mha_layer_norm = nn.LayerNorm(self.num_features, eps=1e-7)
        self.attention = BertAttention(attention_config, position_embedding_type="absolute")

        #------------ mean-pooling ------------------------------------------------------#
        self.pool = MeanPooling()

        #------------ Layer Normalization ------------------------------------------------#
        self.layer_norm = nn.LayerNorm(self.num_features, eps=1e-7)

    def forward(self, backbone_outputs, attention_mask):
        """Return pooled features of shape (bs, num_features).

        :param backbone_outputs: backbone result exposing ``hidden_states``
        :param attention_mask: padding mask indexed as (bs, max_len)
        """
        # mix the top layers with softmax-normalised weights -> (bs, max_len, hidden_size)
        top_layers = torch.stack(backbone_outputs.hidden_states[-self.num_layers:])
        layer_mix = F.softmax(self.weights, dim=0)
        mixed = (layer_mix * top_layers).sum(dim=0)

        # additive mask: 0 where the mask is 1, -10000 where it is 0
        additive_mask = (1.0 - attention_mask[:, None, None, :]) * -10000.0
        attended = self.attention(self.mha_layer_norm(mixed), additive_mask)[0]

        # masked mean over tokens, then normalise
        pooled = self.pool(attended, attention_mask)
        return self.layer_norm(pooled)

# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# Model
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++


class FeedbackModel(nn.Module):
    """
    Shared backbone with one feature extractor and one linear head per target.
    """

    def __init__(self, config):
        """
        :param config: mapping whose ``"model"`` entry provides ``backbone_path``,
            ``target_names``, ``len_tokenizer`` and ``feature_extractor``
        """
        print("initializing the feedback model...")

        super().__init__()
        model_cfg = config["model"]
        self.config = model_cfg
        self.target_names = model_cfg["target_names"]
        self.num_targets = len(self.target_names)

        #----------------------------- Backbone -----------------------------------------#
        # dropout disabled and caching switched off on the backbone config
        backbone_config = AutoConfig.from_pretrained(self.config["backbone_path"])
        backbone_config.update(
            dict(
                hidden_dropout_prob=0.0,
                attention_probs_dropout_prob=0.0,
                use_cache=False,
            )
        )
        self.backbone = AutoModel.from_pretrained(self.config["backbone_path"], config=backbone_config)

        # the tokenizer gained extra tokens, so the embedding table must match its size
        print("resizing model embeddings...")
        print(f"tokenizer length = {model_cfg['len_tokenizer']}")
        self.backbone.resize_token_embeddings(model_cfg["len_tokenizer"])

        #----------------------------- Feature Extractor ---------------------------------#
        hidden_size = self.backbone.config.hidden_size
        num_features = hidden_size
        # the extractor config is completed in place with the backbone width
        model_cfg["feature_extractor"]["hidden_size"] = hidden_size

        extractors = []
        for _ in range(self.num_targets):
            extractors.append(FeatureExtractor(model_cfg["feature_extractor"]))
        self.feature_extractors = nn.ModuleList(extractors)

        #----------------------------- Classifiers -------------------------------------#
        heads = []
        for _ in range(self.num_targets):
            heads.append(nn.Linear(num_features, 1))
        self.classifiers = nn.ModuleList(heads)

    def encode(self, input_ids, attention_mask, token_type_ids):
        """Run the backbone once and return per-target embeddings.

        :return: tensor of shape (batch, num_targets, num_features)
        """
        outputs = self.backbone(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            output_hidden_states=True,
        )
        per_target = []
        for extractor in self.feature_extractors:
            per_target.append(extractor(outputs, attention_mask))
        return torch.stack(per_target, dim=1)

    def forward(
        self,
        input_ids,
        attention_mask,
        token_type_ids,
        labels=None,
        aux_labels=None,
        **kwargs
    ):
        """Return one logit per target, shape (bs, num_targets).

        ``labels``, ``aux_labels`` and extra keyword arguments are accepted but
        not used in this forward pass.
        """
        # (bs, num_targets, num_features)
        features = self.encode(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        # each head scores its own slice of the feature tensor
        head_outputs = []
        for idx, head in enumerate(self.classifiers):
            head_outputs.append(head(features[:, idx]))
        return torch.cat(head_outputs, dim=-1)  # (bs, num_targets)

In [None]:
%%writefile run_inference_main.py
import time

t0 = time.perf_counter()

import sys
sys.path.append("../input/omegaconf")

import argparse
import gc
import json
import sys
from dataclasses import dataclass
from itertools import chain

import numpy as np
import pandas as pd
import torch
import torch.utils.checkpoint
from accelerate import Accelerator
from datasets import load_from_disk
from omegaconf import OmegaConf
from tokenizers import AddedToken
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import AutoTokenizer, DataCollatorWithPadding

#### 
try:
    from fb_model_main import FeedbackModel

except Exception as e:
    print(e)
    raise ImportError

#--------------- Tokenizer ---------------------------------------------#
NEW_TOKENS = [
    "[LF]",
]


def get_tokenizer(cfg):
    """Load the tokenizer found at ``cfg.model.backbone_path`` and extend it with ``NEW_TOKENS``."""
    source_path = cfg.model.backbone_path
    print(f"loading tokenizer from {source_path}")
    tokenizer = AutoTokenizer.from_pretrained(source_path)

    # wrap each custom token so surrounding whitespace is left alone
    print("adding new tokens...")
    extra_tokens = [
        AddedToken(token_text, lstrip=False, rstrip=False)
        for token_text in NEW_TOKENS
    ]
    tokenizer.add_tokens(extra_tokens)

    print(f"tokenizer len: {len(tokenizer)}")

    # print a tokenized probe string as a quick visual check
    probe = "This is a test \n [LF]!"
    pieces = tokenizer.tokenize(probe)
    print(f"tokenizer test: {pieces}")
    return tokenizer

#------ DataLoader --------------------------------------------------------------#

@dataclass
class CustomDataCollatorWithPadding(DataCollatorWithPadding):
    """
    Collator that pads a list of tokenized examples and converts them to tensors.

    Label-like entries become float32 tensors, everything else int64.
    """

    # NOTE(review): these are plain class attributes (no annotations), so they are
    # presumably not dataclass fields of their own - confirm against the base class.
    tokenizer = None
    padding = True
    max_length = None
    pad_to_multiple_of = None
    return_tensors = "pt"

    def __call__(self, features):
        """Pad ``features`` (a list of dicts) and return a dict of tensors."""
        # pull the labels out first so they can be re-attached after padding
        if "labels" in features[0].keys():
            labels = [item["labels"] for item in features]
        else:
            labels = None

        padded = self.tokenizer.pad(
            features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors=None,
        )

        if labels is not None:
            padded["labels"] = labels

        float_keys = ["labels", "aux_labels"]
        batch = {}
        for key, values in padded.items():
            dtype = torch.float32 if key in float_keys else torch.int64
            batch[key] = torch.tensor(values, dtype=dtype)
        return batch


#------ Inference Function ------------------------------------------------------#
def inference_fn(model, infer_dl):
    """Run ``model`` over ``infer_dl`` and return all predictions as one array.

    :param model: module called as ``model(**batch)`` that returns a 2-D tensor
    :param infer_dl: dataloader yielding keyword-argument batches
    :return: float64 array with one row per sample, in dataloader order
        (an empty array when the dataloader yields no batches)
    """
    accelerator = Accelerator()
    model, infer_dl = accelerator.prepare(model, infer_dl)
    model.eval()

    batch_preds = []
    for batch in tqdm(infer_dl, total=len(infer_dl)):
        with torch.no_grad():
            preds = model(**batch)
        # move every batch to host memory right away so device memory is not
        # held until the end; float64 matches the dtype the list round-trip produced
        batch_preds.append(preds.detach().to('cpu').numpy().astype(np.float64))

    all_preds = np.concatenate(batch_preds, axis=0) if batch_preds else np.array([])

    print("preds:")
    # nothing to show when the dataloader was empty
    if len(all_preds):
        print(all_preds[0, :])
    print("=="*40)
    return all_preds


##################################################################################
# Execution
##################################################################################

if __name__ == "__main__":
    # Script entry point: load config + tokenized dataset, run every checkpoint,
    # aggregate the predictions and write them to a CSV.

    #--------------- config -----------------------------------------------------#
    ap = argparse.ArgumentParser()
    ap.add_argument('--config_path', type=str, required=True)
    ap.add_argument('--dataset_path', type=str, required=True)
    ap.add_argument('--checkpoints', nargs='+', type=str, required=True)
    ap.add_argument('--save_path', type=str, required=True)
    ap.add_argument('--num_thread', type=int, required=False, default=1)
    args = ap.parse_args()

    # limit the torch thread count; a non-positive value keeps the torch default
    if args.num_thread > 0:
        torch.set_num_threads(args.num_thread)

    # read configuration
    cfg = OmegaConf.load(args.config_path)
    cfg_dict = OmegaConf.to_container(cfg, resolve=True)
    print("=="*40)
    print(json.dumps(cfg_dict, indent=4))
    print("=="*40)

    #--------------- inputs ----------------------------------------------------#
    # load the dataset
    print("loading the dataset...")
    infer_ds = load_from_disk(args.dataset_path)
    print("done!")

    print("creating the dataloader...")
    tokenizer = get_tokenizer(cfg)
    # the model needs the extended vocabulary size to resize its embeddings
    cfg_dict["model"]["len_tokenizer"] = len(tokenizer)
    data_collector = CustomDataCollatorWithPadding(tokenizer=tokenizer)

    # sort by length so batches need little padding; remember the ids in that order
    infer_ds = infer_ds.sort("input_length")
    SORTED_TEXT_IDS = infer_ds["essay_id"]

    infer_ds.set_format(
        type=None,
        columns=['input_ids', 'attention_mask', 'token_type_ids']
    )

    infer_dl = DataLoader(
        infer_ds,
        batch_size=cfg.infer_params.infer_bs,
        shuffle=False,
        collate_fn=data_collector,
    )
    print("data preparation done...")
    print("=="*40)

    #--------------- inference ----------------------------------------------#
    list_preds = []
    for model_id, checkpoint in enumerate(args.checkpoints):
        print(f"infering from {checkpoint}")

        model = FeedbackModel(cfg_dict)
        # NOTE: torch.load unpickles the file - only load checkpoints from a trusted source
        ckpt = torch.load(checkpoint, map_location=torch.device('cpu'))
        # the score is informational only; tolerate checkpoints saved without it
        print(f"validation score for fold {model_id} = {ckpt.get('lb')}")

        # strict=False tolerates key mismatches, so report them instead of hiding them
        load_result = model.load_state_dict(ckpt['state_dict'], strict=False)
        if load_result.missing_keys or load_result.unexpected_keys:
            print(f"warning: missing keys in checkpoint: {load_result.missing_keys}")
            print(f"warning: unexpected keys in checkpoint: {load_result.unexpected_keys}")

        ckpt_preds = inference_fn(model, infer_dl)
        list_preds.append(ckpt_preds)

    # clean up
    del model, infer_dl
    gc.collect()
    torch.cuda.empty_cache()

    # aggregation across checkpoints
    if cfg.infer_params.agg_fn == "mean":
        model_preds = np.mean(list_preds, axis=0)
    else:
        raise NotImplementedError

    preds_df = pd.DataFrame()
    preds_df["essay_id"] = SORTED_TEXT_IDS

    TARGET_NAMES = cfg.model.target_names
    for i, target_name in enumerate(TARGET_NAMES):
        preds_df[target_name] = model_preds[:, i]

    # print
    print(preds_df.head())

    # save predictions
    print(f"saving predictions to {args.save_path}")
    preds_df.to_csv(args.save_path, index=False)
    # t0 is taken at the very top of the script, so this includes import time
    print(f"done! {time.perf_counter() - t0:.2f}")

In [None]:
import os 
# make sure the folders for tokenized shards and prediction files exist
for out_dir in ("./datasets", "./predictions"):
    os.makedirs(out_dir, exist_ok=True)

In [None]:
%%writefile eff_inference.yaml

model:
    # directory of the pretrained backbone; the tokenizer is loaded from the same path
    backbone_path: /kaggle/input/debertav3xsmall/deberta-v3-xsmall
    feature_extractor:
        # number of final hidden-state layers combined by the learned weighted average
        num_layers: 4
    # token limit used when truncating essays during tokenization
    max_length: 448
    # one feature extractor + linear head is built per entry; also the prediction CSV columns
    target_names:
        - cohesion
        - syntax
        - vocabulary
        - phraseology
        - grammar
        - conventions
    # placeholder; run_inference_main.py overwrites it with len(tokenizer) at runtime
    len_tokenizer: ???
    # NOTE(review): not read by the inference scripts shown here - presumably a training-time setting
    loss_fn: mse

infer_params:
    # CSV with the essays to score (the scripts read its `essay_id` and `full_text` columns)
    input_path: ../input/learning-agency-lab-automated-essay-scoring-2/test.csv
    # DataLoader batch size
    infer_bs: 4
    # how predictions of several checkpoints are combined; only "mean" is implemented
    agg_fn: mean

In [None]:
%%time
# Tokenize shard 0 of the input CSV; the script fills "{rank}" in --save_path itself.
# NOTE(review): IPython substitutes {name} in `!` commands when `name` exists in the
# notebook namespace - this presumably relies on `rank` being undefined here; confirm.
!python create_datasets_main.py \
--config_path eff_inference.yaml \
--save_path ./datasets/task_dataset_{rank} \
--rank=0

In [None]:
%%time
# Tokenize shard 1 of the input CSV; the script fills "{rank}" in --save_path itself.
!python create_datasets_main.py \
--config_path eff_inference.yaml \
--save_path ./datasets/task_dataset_{rank} \
--rank=1

In [None]:
# Tokenize shard 2 of the input CSV; the script fills "{rank}" in --save_path itself.
!python create_datasets_main.py \
--config_path eff_inference.yaml \
--save_path ./datasets/task_dataset_{rank} \
--rank=2

In [None]:
# Tokenize shard 3 (the last shard, which also takes the remainder rows).
!python create_datasets_main.py \
--config_path eff_inference.yaml \
--save_path ./datasets/task_dataset_{rank} \
--rank=3

In [None]:
%%writefile run.sh
# export OMP_SCHEDULE=STATIC
# export OMP_PROC_BIND=CLOSE
# export GOMP_CPU_AFFINITY="N-M"
# export KMP_AFFINITY=granularity=fine,proclist=[0-0],explicit

# Launch one single-threaded inference worker per dataset shard, all in parallel.
pids=""
for rank in 0 1 2 3; do
    python run_inference_main.py \
        --config_path eff_inference.yaml \
        --dataset_path "./datasets/task_dataset_${rank}" \
        --checkpoints \
            /kaggle/input/eff-xsmall-448/eff_full_xsmall_448.pth.tar \
        --save_path "./predictions/eff_preds_${rank}.csv" \
        --num_thread 1 &
    pids="$pids $!"
done

# Wait for each worker individually so a failure is noticed instead of hidden.
status=0
for pid in $pids; do
    wait "$pid" || status=1
done

if [ "$status" -ne 0 ]; then
    echo "warning: at least one inference worker failed" >&2
fi
echo "All done"
exit "$status"

In [None]:
!sh run.sh

In [None]:
# Read the four per-shard prediction files written by the inference workers.
eff_df_0, eff_df_1, eff_df_2, eff_df_3 = (
    pd.read_csv(f"./predictions/eff_preds_{shard}.csv") for shard in range(4)
)

# Stack the shards and restore a deterministic order keyed by essay_id.
eff_df = (
    pd.concat([eff_df_0, eff_df_1, eff_df_2, eff_df_3], axis=0)
    .sort_values(by="essay_id")
    .reset_index(drop=True)
)

In [None]:
# One weight per entry of pred_dfs (a single model here).
MODEL_WEIGHTS = [1.0] #[0.34, 0.33, 0.33]
print(f"sum of weights {np.sum(MODEL_WEIGHTS)}")

pred_dfs = [eff_df]

TARGET_COLS = ["cohesion", "syntax", "vocabulary", "phraseology", "grammar", "conventions"]

# Weighted sum of every model's predictions, column by column.
submission_df = pd.DataFrame()
submission_df["essay_id"] = pred_dfs[0]["essay_id"].values
for model_idx, model_preds in enumerate(pred_dfs):
    for target in TARGET_COLS:
        weighted = MODEL_WEIGHTS[model_idx] * model_preds[target]
        if model_idx == 0:
            submission_df[target] = weighted
        else:
            submission_df[target] += weighted

# NOTE(review): duplicates are dropped only after submission_df was built, so
# submission_df still contains any repeated essay_id rows - confirm this is intended.
eff_df = eff_df.drop_duplicates(subset=['essay_id'], keep='first')
eff_df

## feature engineering for text

In [None]:
import re
import polars as pl
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.ensemble import VotingClassifier,VotingRegressor
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from tqdm.auto import tqdm,trange
from lightgbm import log_evaluation, early_stopping
from catboost import CatBoostClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
import pickle

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
class FeatureEngineering():
    """Paragraph / sentence / word statistics plus TF-IDF features for essays.

    ``process`` builds the training feature table (fitting the TF-IDF
    vectorizer) and ``process_test`` builds the matching table for the test
    essays with the already fitted vectorizer, so ``process`` must run first.
    """

    # Pre-compiled cleaning patterns, written as raw strings so the regex
    # escapes are not interpreted (and warned about) as string escapes.
    _HTML_TAG = re.compile(r'<.*?>')
    # (pattern, replacement) pairs applied in this order by dataPreprocessing
    _CLEANUP_STEPS = (
        (re.compile(r"@\w+"), ''),
        (re.compile(r"'\d+"), ''),
        (re.compile(r"\d+"), ''),
        (re.compile(r"http\w+"), ''),
        (re.compile(r"\s+"), " "),   # collapse any whitespace run into one space
        (re.compile(r"\.+"), "."),   # collapse repeated periods
        (re.compile(r"\,+"), ","),   # collapse repeated commas
    )

    def __init__(self,
                 train_path='/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv',
                 test_path='/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv'):
        """Read both CSVs and set up the TF-IDF vectorizer.

        :param train_path: training CSV (``essay_id``, ``full_text``, ``score``)
        :param test_path: test CSV (``essay_id``, ``full_text``)
        """
        # every essay additionally gets a list column with its blank-line separated paragraphs
        self.columns = [
            (pl.col("full_text").str.split(by="\n\n").alias("paragraph"))
        ]
        self.train_dataset = pl.read_csv(train_path).with_columns(self.columns)
        self.test_dataset = pl.read_csv(test_path).with_columns(self.columns)
        # feature_eng
        self.sentence_fea = ['sentence_len','sentence_word_cnt']
        # feature_eng
        self.paragraph_fea = ['paragraph_len','paragraph_sentence_cnt','paragraph_word_cnt']
        # NOTE(review): tokenizer and preprocessor are identity functions and the
        # documents are raw strings, so the n-grams are presumably built over
        # characters rather than words - kept as is.
        self.vectorizer = TfidfVectorizer(tokenizer=lambda x: x,
                                          preprocessor=lambda x: x,
                                          token_pattern=None,
                                          strip_accents='unicode',
                                          analyzer = 'word',
                                          ngram_range=(2,3),
                                          min_df=0.05,
                                          max_df=0.9,
                                          sublinear_tf=True
        )

    def removeHTML(self,x):
        """Strip anything that looks like an HTML tag (``<...>``) from ``x``."""
        return self._HTML_TAG.sub(r'',x)

    def dataPreprocessing(self,x):
        """Normalise one text: lower-case, drop tags / mentions / digits /
        ``http...`` word runs, collapse whitespace and repeated ``.`` / ``,``,
        then strip."""
        x = x.lower()             # convert all letters to lower case
        x = self.removeHTML(x)
        for pattern, replacement in self._CLEANUP_STEPS:
            x = pattern.sub(replacement, x)
        return x.strip()

    def Paragraph_Preprocess(self,tmp):
        """Explode essays into cleaned paragraphs (>= 25 chars) with length,
        sentence-count and word-count columns."""
        tmp = tmp.explode('paragraph')
        # preprocess
        tmp = tmp.with_columns(pl.col('paragraph').map_elements(self.dataPreprocessing))
        # paragraph_len
        tmp = tmp.with_columns(pl.col('paragraph').map_elements(lambda x:len(x)).alias("paragraph_len"))
        # filter
        tmp = tmp.filter(pl.col('paragraph_len')>=25)
        # paragraph_sentence_count/paragraph_word_count
        tmp = tmp.with_columns(pl.col('paragraph').map_elements(lambda x: len(x.split("."))).alias("paragraph_sentence_cnt"),
                               pl.col('paragraph').map_elements(lambda x: len(x.split(" "))).alias("paragraph_word_cnt")
                              )
        return tmp

    def Paragraph_Eng(self,train_tmp):
        """Aggregate paragraph rows into one feature row per essay (pandas,
        sorted by ``essay_id``)."""
        aggs = [
            # paragraph_len_cnt
            *[pl.col('paragraph').filter(pl.col('paragraph_len')>=i)
            .count().alias(f'paragraph_{i}_cnt') for i in [25,100,200,300,400,500,600,700]],
            # other
            *[pl.col(fea).max().alias(f"{fea}_max") for fea in self.paragraph_fea],
            *[pl.col(fea).mean().alias(f"{fea}_mean") for fea in self.paragraph_fea],
            *[pl.col(fea).min().alias(f"{fea}_min") for fea in self.paragraph_fea],
            *[pl.col(fea).first().alias(f"{fea}_first") for fea in self.paragraph_fea],
            *[pl.col(fea).last().alias(f"{fea}_last") for fea in self.paragraph_fea],
        ]
        df = train_tmp.group_by(["essay_id"], maintain_order=True).agg(aggs).sort("essay_id")
        df = df.to_pandas()
        print("done Paragraph_Eng +",len(df.columns),"features")
        return df

    def Sentence_Preprocess(self,tmp):
        """Explode cleaned essays into ``.``-separated sentences (>= 15 chars)
        with length and word-count columns."""
        tmp = tmp.with_columns(pl.col('full_text').map_elements(self.dataPreprocessing).str.split(by=".").alias("sentence"))
        tmp = tmp.explode('sentence')
        # sentence_len
        tmp = tmp.with_columns(pl.col('sentence').map_elements(lambda x: len(x)).alias("sentence_len"))
        # filter
        tmp = tmp.filter(pl.col('sentence_len')>=15)
        # sentence_word_cnt
        tmp = tmp.with_columns(pl.col('sentence').map_elements(lambda x: len(x.split(' '))).alias("sentence_word_cnt"))

        return tmp

    def Sentence_Eng(self,train_tmp):
        """Aggregate sentence rows into one feature row per essay (pandas,
        sorted by ``essay_id``)."""
        aggs = [
            # sentence_cnt
            *[pl.col('sentence').filter(pl.col('sentence_len') >= i).count().alias(f"sentence_{i}_cnt") for i in [15,50,100,150,200,250,300] ],
            # other
            *[pl.col(fea).max().alias(f"{fea}_max") for fea in self.sentence_fea],
            *[pl.col(fea).mean().alias(f"{fea}_mean") for fea in self.sentence_fea],
            *[pl.col(fea).min().alias(f"{fea}_min") for fea in self.sentence_fea],
            *[pl.col(fea).first().alias(f"{fea}_first") for fea in self.sentence_fea],
            *[pl.col(fea).last().alias(f"{fea}_last") for fea in self.sentence_fea],
            ]
        df = train_tmp.group_by(['essay_id'], maintain_order=True).agg(aggs).sort("essay_id")
        df = df.to_pandas()
        print("done Sentence_Eng +",len(df.columns),"features")
        return df

    # word feature
    def Word_Preprocess(self,tmp):
        """Explode cleaned essays into space-separated words (empty strings
        dropped) with a ``word_len`` column."""
        tmp = tmp.with_columns(pl.col('full_text').map_elements(self.dataPreprocessing).str.split(by=" ").alias("word"))
        tmp = tmp.explode('word')
        # word_len
        tmp = tmp.with_columns(pl.col('word').map_elements(lambda x: len(x)).alias("word_len"))
        # filter
        tmp = tmp.filter(pl.col('word_len')!=0)

        return tmp

    # feature_eng
    def Word_Eng(self,train_tmp):
        """Aggregate word rows into one feature row per essay (pandas, sorted
        by ``essay_id``)."""
        aggs = [
            # word_cnt
            *[pl.col('word').filter(pl.col('word_len') >= i+1)
              .count().alias(f"word_{i+1}_cnt") for i in range(15)],
            # other
            pl.col('word_len').max().alias(f"word_len_max"),
            pl.col('word_len').mean().alias(f"word_len_mean"),
            pl.col('word_len').std().alias(f"word_len_std"),
            pl.col('word_len').quantile(0.25).alias(f"word_len_q1"),
            pl.col('word_len').quantile(0.50).alias(f"word_len_q2"),
            pl.col('word_len').quantile(0.75).alias(f"word_len_q3"),
        ]
        df = train_tmp.group_by(['essay_id'], maintain_order=True).agg(aggs).sort("essay_id")
        df = df.to_pandas()
        print("done Word_Eng +",len(df.columns),"features")
        return df

    def _tfidf_frame(self, matrix, essay_ids):
        """Turn a TF-IDF matrix into a dense frame keyed by its own essay ids."""
        df = pd.DataFrame(matrix.toarray())
        df.columns = [ f'tfid_{i}' for i in range(len(df.columns))]
        # rows of the matrix follow the dataset order, so label them with the
        # dataset's ids (not with the ids of the sorted/filtered feature table)
        df['essay_id'] = essay_ids
        return df

    def process(self):
        """Build the training feature table and fit the TF-IDF vectorizer.

        :return: pandas frame with ``essay_id``, ``score`` and all features
        """
        tmp = self.Paragraph_Preprocess(self.train_dataset)
        train_feats = self.Paragraph_Eng(tmp)
        # attach the target by key: the feature table is sorted by essay_id and
        # may lack essays without a long-enough paragraph, so positional
        # assignment could misalign scores
        scores = pd.DataFrame({
            'essay_id': self.train_dataset['essay_id'].to_list(),
            'score': self.train_dataset['score'].to_list(),
        })
        train_feats = train_feats.merge(scores, on='essay_id', how='left')

        tmp = self.Sentence_Preprocess(self.train_dataset)
        train_feats = train_feats.merge(self.Sentence_Eng(tmp), on='essay_id', how='left')

        tmp = self.Word_Preprocess(self.train_dataset)
        train_feats = train_feats.merge(self.Word_Eng(tmp), on='essay_id', how='left')

        train_tfid = self.vectorizer.fit_transform([i for i in self.train_dataset['full_text']])
        df = self._tfidf_frame(train_tfid, self.train_dataset['essay_id'].to_list())
        # merge
        train_feats = train_feats.merge(df, on='essay_id', how='left')
        print('feature_num: ',len(train_feats.columns)-2)
        return train_feats

    def process_test(self):
        """Build the test feature table using the vectorizer fitted by ``process``.

        :return: pandas frame with ``essay_id`` and all features
        """
        temp = self.Paragraph_Preprocess(self.test_dataset)
        test_feats = self.Paragraph_Eng(temp)

        temp = self.Sentence_Preprocess(self.test_dataset)
        test_feats = test_feats.merge(self.Sentence_Eng(temp), on='essay_id', how='left')

        temp = self.Word_Preprocess(self.test_dataset)
        test_feats = test_feats.merge(self.Word_Eng(temp), on='essay_id', how='left')

        test_tfid = self.vectorizer.transform([i for i in self.test_dataset['full_text']])
        df = self._tfidf_frame(test_tfid, self.test_dataset['essay_id'].to_list())
        # merge
        test_feats = test_feats.merge(df, on='essay_id', how='left')
        print('feature_num: ',len(test_feats.columns)-2)

        return test_feats

In [None]:
class LGBM():
    """Voting ensemble of LightGBM regressors for essay scoring.

    The regressors are trained on ``score - a`` with a custom objective
    (``qwk_obj``). Predictions are shifted back by ``a``, clipped to the
    valid score range and rounded to obtain integer scores.
    """

    # Columns that identify or label a row and must never be fed to the model.
    _NON_FEATURE_COLUMNS = ('essay_id', 'score')
    # Inclusive range of valid essay scores. Kept at class level so instances
    # restored from disk (which skip __init__) can still use them.
    _SCORE_MIN = 1
    _SCORE_MAX = 6

    def __init__(self):
        """Load the competition CSVs and build the (unfitted) ensemble."""
        self.data_train = pl.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/train.csv')
        self.data_test = pl.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv')
        self.num_models = 3
        self.acc_metrics = []
        self.cohen_metrics = []

        # Constants of the custom objective: `a` is the offset subtracted from
        # the targets before training, `b` is the additive term in qwk_obj.
        self.a = 2.948
        self.b = 1.092

        self.lgb_parameters = {
                                 'metrics': 'None',
                                 'objective': self.qwk_obj,
                                 'learning_rate': 0.05,
                                 'max_depth': 5,
                                 'num_leaves': 10, # should be a number smaller than 2^"max_depth"
                                 'colsample_bytree': 0.3,
                                 'min_data_in_leaf': 100,
                                 'reg_alpha': 0.7,
                                 'reg_lambda' : 0.1,
                                 'n_estimators': 700,
                                 'extra_trees' : True,
                                 'verbosity': -100,
#                                  'device' : "gpu"
        }
        # Identical regressors that differ only in their random seed.
        self.model = VotingRegressor(
            estimators = [
                            (f"lgb_{i}",lgb.LGBMRegressor(**self.lgb_parameters, random_state=i+40),)for i in range(self.num_models)
                         ],
                        n_jobs=-1
        )

    @staticmethod
    def _feature_names(df):
        """Return the columns of ``df`` that are model inputs, in order."""
        return [col for col in df.columns if col not in LGBM._NON_FEATURE_COLUMNS]

    def quadratic_weighted_kappa(self,y_true,y_pred):
        """Quadratic weighted kappa between shifted targets and predictions.

        Args:
            y_true: true targets in model space (``score - a``).
            y_pred: raw model outputs in the same space.

        Returns:
            The tuple ``("QWK", kappa, True)``.
        """
        # Rounding removes any float noise left by the `- a` / `+ a` round
        # trip so the targets are seen as discrete class labels.
        y_true = np.rint(np.asarray(y_true) + self.a)
        y_pred = (y_pred + self.a).clip(self._SCORE_MIN, self._SCORE_MAX).round()
        qwk = cohen_kappa_score(y_true,y_pred,weights='quadratic')

        return "QWK",qwk,True

    def qwk_obj(self,y_true,y_pred):
        """Custom objective: gradient and hessian of ``f / g`` scaled by n.

        ``f`` is half the summed squared error between the clipped, shifted
        predictions and the shifted labels; ``g`` is half the sum of
        ``(pred - a)**2 + b``. The hessian is a constant vector of ones.

        Args:
            y_true: targets in model space (``score - a``).
            y_pred: current raw predictions in the same space.

        Returns:
            ``(grad, hess)``, each with one entry per sample.
        """
        labels = y_true + self.a
        preds = (y_pred + self.a).clip(self._SCORE_MIN, self._SCORE_MAX)
        f = 1/2 * np.sum((preds-labels)**2)
        g = 1/2 * np.sum((preds-self.a)**2+self.b)
        df = preds - labels
        dg = preds - self.a
        # Quotient rule applied to f/g, multiplied by the sample count.
        grad = (df/g - f*dg/g**2)*len(labels)
        hess = np.ones(len(labels))

        return grad,hess

    def split_folds(self, df):
        """Return a 5-fold shuffled ``KFold`` split (seed 44) over ``df``."""
        x = df[self._feature_names(df)].values
        y = df['score'].values

        kfold = KFold(n_splits=5, random_state=44, shuffle=True)

        return kfold.split(x, y)

    def fit(self, df,debug=False):
        """Train and evaluate the ensemble fold by fold.

        For every fold the ensemble is refitted on the training part, the
        confusion matrix of the validation part is plotted and its QWK is
        appended to ``self.cohen_metrics``. After the loop ``self.model`` is
        the ensemble fitted on the last processed fold.

        Args:
            df: pandas DataFrame with the feature columns plus ``score``.
            debug: when truthy, only the first fold is run.
        """
        # Previously this name was used without being defined in this scope.
        feature_names = self._feature_names(df)
        score_labels = list(range(self._SCORE_MIN, self._SCORE_MAX + 1))
        tick_labels = [str(label) for label in score_labels]

        for fold_id, (trn_idx, val_idx) in enumerate(self.split_folds(df)):
            if debug and fold_id != 0:
                break

            train_part, val_part = df.iloc[trn_idx], df.iloc[val_idx]
            X_train, X_val = train_part[feature_names], val_part[feature_names]
            Y_train = train_part['score'] - self.a
            Y_val = val_part['score'] - self.a

            print(f'\nFold_{fold_id} Training ================================\n')

            self.model.fit(X_train, Y_train)
            pred_val = self.model.predict(X_val)

            # plot confusion matrix
            y_true = np.rint(Y_val.values + self.a).astype(int)
            y_pred = np.rint(
                (pred_val + self.a).clip(self._SCORE_MIN, self._SCORE_MAX)
            ).astype(int)
            # Fixing `labels` keeps the matrix 6x6 even when a fold lacks a
            # class, so it always lines up with the tick labels.
            cm = confusion_matrix(y_true, y_pred, labels=score_labels)
            sns.heatmap(cm,
                        annot=True,
                        fmt='g',
                        xticklabels=tick_labels,
                        yticklabels=tick_labels)
            # Rows of the matrix are the true labels, columns the predictions.
            plt.ylabel('Actual',fontsize=13)
            plt.xlabel('Prediction',fontsize=13)
            plt.title('Confusion Matrix',fontsize=17)
            plt.show()

            cohen_score = self.quadratic_weighted_kappa(Y_val.values, pred_val)
            self.cohen_metrics.append(cohen_score[1])

        average_cohen = np.mean(self.cohen_metrics)
        print(f'Average Cohen all fold: {average_cohen:.4f}')

    def save_model(self):
        """Placeholder: persisting the model is not implemented here."""
        pass

    def predict(self,df):
        """Return rounded scores in the valid range for the rows of ``df``.

        ``essay_id`` and ``score`` columns, if present, are ignored.
        """
        raw = self.model.predict(df[self._feature_names(df)])
        return (raw + self.a).clip(self._SCORE_MIN, self._SCORE_MAX).round()

    def submit(self,df):
        """Return the test ``essay_id`` column with a predicted ``score``.

        Uses ``predict`` so that ``score`` (if present in ``df``) is excluded
        from the features exactly as it is everywhere else in the class.
        """
        return self.data_test.select('essay_id').with_columns(score = self.predict(df))

In [None]:
# Build the train and test feature tables.
# NOTE(review): keep this call order -- process() appears to be what fits the
# TF-IDF vectorizer that process_test() then reuses via transform().
FE = FeatureEngineering()
train_feature = FE.process()
test_feature = FE.process_test()
# Join the extra per-essay features in `eff_df` (defined outside this cell).
# pd.merge defaults to an inner join, so essays missing from `eff_df` would be
# dropped here -- TODO confirm every test essay_id is present in it.
test_feature = pd.merge(test_feature,eff_df,on='essay_id')
# Displayed as the cell output: (rows, columns) of the final test table.
test_feature.shape

In [None]:
# essay_id is an identifier rather than a model input, so remove it before
# the table is handed to the model; the result is shown as the cell output.
test_feature = test_feature.drop(columns=['essay_id'])
test_feature

## predict

In [None]:
from joblib import load

# Location of the serialized model inside the attached Kaggle dataset.
MODEL_PATH = '/kaggle/input/train-lgbm-extra-feature-from-feedback-comp/saved_models/model.joblib'

# NOTE(review): joblib.load unpickles arbitrary objects -- only load artifacts
# from a trusted source.
model = load(MODEL_PATH)

In [None]:
# Score the test feature table with the loaded model.
# NOTE(review): the result is later cast straight to int for the submission,
# so this presumably already returns final rounded scores -- confirm what kind
# of object the joblib file contains.
pred = model.predict(test_feature)

## submission

In [None]:
# Start from the sample submission so the file keeps its expected layout.
submission = pd.read_csv('/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv')

# Overwrite the score column with the predictions and store it as integers.
# NOTE(review): predictions are assigned by position, which assumes `pred`
# follows the row order of the sample submission -- confirm.
submission = submission.assign(score=pred).astype({'score': int})

# Write without the index column, then preview the first rows.
submission.to_csv("submission.csv", index=False)
display(submission.head())