#### <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:white; font-size:180%; text-align:left;padding:3.0px; background: maroon; border-bottom: 8px solid black" > TABLE OF CONTENTS<br><div>  
* [INTRODUCTION](#2)   
    * [UTILITIES](#2.1)  
    * [FOREWORD](#2.2)
    * [VERSION DETAILS](#2.3)
* [MODEL INFERENCING](#3) 
    * [PREPROCESSING](#3.1)
    * [PUBLIC DEBERTA V3 LARGE](#3.2)
    * [ML REGRESSOR](#3.3)        

In [None]:
%%time

# Importing select libraries:-
from gc import collect;
from warnings import filterwarnings;
filterwarnings('ignore');
from IPython.display import display_html, clear_output;
import logging;
from shutil import copyfile

from copy import deepcopy;
import pandas as pd, polars as pl, numpy as np;
import polars.selectors as cs;

from os import path, walk, getpid
from psutil import Process
import re
from collections import Counter
from itertools import product
import spacy, string, random

from colorama import Fore, Style, init;
from warnings import filterwarnings;
filterwarnings('ignore');
import joblib;
import os;

from tqdm.notebook import tqdm;
import seaborn as sns;
import matplotlib.pyplot as plt;
from matplotlib.colors import ListedColormap as LCM;
%matplotlib inline

from pprint import pprint;
from functools import partial;

import ctypes
libc = ctypes.CDLL("libc.so.6")

print();
collect();
clear_output();

In [None]:
%%time

# Importing model and pipeline specifics:-
from category_encoders import OrdinalEncoder, OneHotEncoder;

# Pipeline specifics:-
from sklearn.preprocessing import (RobustScaler,
                                   MinMaxScaler,
                                   StandardScaler,
                                   FunctionTransformer as FT,
                                   PowerTransformer,
                                  );
from sklearn.impute import SimpleImputer as SI;
from sklearn.model_selection import (RepeatedStratifiedKFold as RSKF,
                                     StratifiedKFold as SKF,
                                     StratifiedGroupKFold as SGKF,
                                     KFold,
                                     RepeatedKFold as RKF,
                                     cross_val_score,
                                     cross_val_predict
                                    );
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, make_pipeline;
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin;
from sklearn.compose import ColumnTransformer;

# ML Model training:-
from sklearn.metrics import cohen_kappa_score, accuracy_score;
from xgboost import DMatrix, XGBRegressor as XGBR;
import lightgbm as lgb
from lightgbm import log_evaluation, early_stopping, LGBMRegressor as LGBMR;
from catboost import CatBoostRegressor as CBR, Pool;

# Ensemble and tuning:-
import optuna;
from optuna import Trial, trial, create_study;
from optuna.pruners import HyperbandPruner;
from optuna.samplers import TPESampler, CmaEsSampler;
# Silence Optuna's own log output. `set_verbosity` is a function and has to be
# CALLED with the level; assigning to it (as was done before) merely replaced the
# function object with an integer, left the verbosity untouched and broke any
# later `optuna.logging.set_verbosity(...)` call.
optuna.logging.set_verbosity(optuna.logging.ERROR);
optuna.logging.disable_default_handler()

clear_output();
print();
collect();

In [None]:
%%time

# Setting rc parameters in seaborn for plots and graphs-
# Reference - https://matplotlib.org/stable/tutorials/introductory/customizing.html:-
# To alter this, refer to matplotlib.rcParams.keys()
# NOTE(review): the dictionary below is handed to sns.set as its first positional
# argument rather than through an explicit `rc=` keyword - confirm that every key
# (face colors, grid, font family) is really applied the way it is intended.

sns.set({"axes.facecolor"       : "#ffffff",
         "figure.facecolor"     : "#ffffff",
         "axes.edgecolor"       : "#000000",
         "grid.color"           : "#ffffff",
         "font.family"          : ['Cambria'],
         "axes.labelcolor"      : "#000000",
         "xtick.color"          : "#000000",
         "ytick.color"          : "#000000",
         "grid.linewidth"       : 0.75,
         "grid.linestyle"       : "--",
         "axes.titlecolor"      : '#0099e6',
         'axes.titlesize'       : 8.5,
         'axes.labelweight'     : "bold",
         'legend.fontsize'      : 7.0,
         'legend.title_fontsize': 7.0,
         'font.size'            : 7.5,
         'xtick.labelsize'      : 7.5,
         'ytick.labelsize'      : 7.5,
        });

# Setting global configuration for polars
# First a chained class-level call (decimals on, column dtypes hidden), then a
# pl.Config object built from keyword options for table formatting and size limits.
# NOTE(review): pl.Config(...) is constructed here without a `with` block and the
# object is discarded - presumably the options stay active globally; confirm
# against the installed polars version.
pl.Config.activate_decimals(True).set_tbl_hide_column_data_types(True)
pl.Config(**dict(tbl_formatting = 'ASCII_FULL_CONDENSED',
                 tbl_hide_column_data_types = True,
                 tbl_hide_dataframe_shape = True,
                 fmt_float = "mixed",
                 tbl_cell_alignment = 'CENTER',
                 tbl_hide_dtype_separator = True,
                 tbl_cols = 100,
                 tbl_rows = 50,
                 fmt_str_lengths = 100,
                )
         )

# Making sklearn pipeline outputs as dataframe:-
# (set_config is imported here, inside the configuration cell, rather than with
# the other sklearn imports above)
from sklearn import set_config;
set_config(transform_output = "pandas");
# Cap how many columns / rows pandas renders when a frame is displayed
pd.set_option('display.max_columns', 50);
pd.set_option('display.max_rows', 50);

# Blank line in the cell output, then a garbage-collection pass
print();
collect();


<a id="2"></a>
#### <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:white; font-size:180%; text-align:left;padding:3.0px; background: maroon; border-bottom: 8px solid black" > INTRODUCTION<br><div>

<a id="2.1"></a>
#### <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:black; font-size:160%; text-align:left;padding:3.0px; background: lightgrey; border-bottom: 8px solid maroon" > UTILITIES<br><div>

In [None]:
%%time

class Utility:
    """
    Small collection of notebook helpers:
    1. PrintColor  - colored console output
    2. ScoreMetric - competition metric on clipped and rounded predictions
    3. CleanMemory - garbage collection followed by a RAM usage report
    """

    def PrintColor(self,text:str, color = Fore.BLUE, style = Style.BRIGHT):
        """
        Prints `text` wrapped in the given colorama style and color codes,
        followed by a reset code so later output is unaffected.
        """
        message = style + color + text + Style.RESET_ALL
        print(message)

    def ScoreMetric(self, ytrue, ypred)-> float:
        """
        Competition metric: quadratic-weighted Cohen kappa.
        Inputs- ytrue, ypred:- ground truth and raw predictions
        Output- float:- kappa between the truth and the predictions after
                        clipping them to [1, 6] and rounding
        """
        clipped = np.clip(ypred, a_min = 1, a_max = 6)
        grades  = np.uint8(np.around(clipped))
        truth   = np.uint8(ytrue)
        return cohen_kappa_score(truth, grades, weights = "quadratic")

    def CleanMemory(self):
        """
        Runs the garbage collector, asks libc to trim the malloc heap and
        RETURNS (does not print) a one-line RAM usage message for this process.
        """
        collect()
        libc.malloc_trim(0)

        # First field of psutil's memory_info() for the current process, in bytes
        used_bytes = Process(getpid()).memory_info()[0]
        memory_use = used_bytes / 2. ** 30
        return f"\nRAM usage = {memory_use :.4} GB"

Utils = Utility()
print();

In [None]:
%%time

class CFG:
    """
    Single place for every tunable value used by the notebook: data locations,
    switches for optional steps, CV / training settings and plot styling.
    This is a general-purpose configuration class, so some entries may not be
    read anywhere in this particular kernel.
    """

    # --- Data preparation -------------------------------------------------
    exp_nb           = 1
    version_nb       = 18
    test_req         = "N"
    test_sample_frac = 0.025
    gpu_switch       = "ON"
    state            = 42
    target           = "score"
    path             = "/kaggle/input/aes2024ancillary"
    op_path          = "/kaggle/working"
    vocab_path       = '/kaggle/input/english-word-hx/words.txt'
    llm_path         = '/kaggle/input/aes2-400-20240419134941'

    dtl_preproc_req  = "Y"
    ftre_plots_req   = 'Y'
    ftre_imp_req     = "Y"

    # --- Model training ---------------------------------------------------
    ML               = "Y"
    nb_models        = 10
    # Fewer folds when only the syntax is being tested
    n_splits         = 15 if test_req != "Y" else 3
    n_repeats        = 1
    nbrnd_erly_stp   = 75
    mdlcv_mthd       = 'RSKF'

    # Constants handed to the custom LGBM objective / metric helpers
    a                = 2.998
    b                = 1.042

    # --- Ensemble ---------------------------------------------------------
    ensemble_req     = "N"
    metric_obj       = 'maximize'
    # Fewer trials when only the syntax is being tested
    ntrials          = 250 if test_req != "Y" else 10

    # --- Global variables for plotting --------------------------------------
    grid_specs  = dict(visible   = True,
                       which     = 'both',
                       linestyle = '--',
                       color     = 'lightgrey',
                       linewidth = 0.75,
                      )
    title_specs = dict(fontsize = 9, fontweight = 'bold', color = '#992600')

print();
Utils.PrintColor(f"--> Configuration done!\n");
collect();

| Parameter         | Description                                             | Possible value choices|
| ---               | ---                                                     | :-:                   |
|  exp_nb           | Experiment Number                                       | integer               |
|  version_nb       | Version Number                                          | integer               |
|  test_req         | Are we testing syntax here?                             | Y/N                   |  
|  test_sample_frac | Sample size for syntax test                             | float(0-1)/ int       |     
|  gpu_switch       | GPU switch                                              | ON/OFF                |
|  state            | Random state for most purposes                          | integer               |
|  target           | Target column names                                     | string                |   
|  path             | Path for input data files                               |                       |
|  op_path          | Path for output data files                              |                       |
|  llm_path         | Path for LLM OOF data files                             |                       |
|  vocab_path       | Path for English vocab files                            |                       |
|  dtl_preproc_req  | Preprocessing required                                  | Y/N                   |   
|  ftre_plots_req   | Feature plots required                                  | Y/N                   |
|  ftre_imp_req     | Feature importance required                             | Y/N                   |   
|  ML               | Machine Learning Models                                 | Y/N                   |
|  n_splits         | Number of CV splits                                     | integer               |
|  n_repeats        | Number of CV repeats                                    | integer               |
|  nbrnd_erly_stp   | Number of early stopping rounds                         | integer               |
|  mdlcv_mthd       | Model CV method name                                    | RKF/ RSKF/ SKF/ KFold |
|  ensemble_req     | Ensemble required                                       | Y/N                   |
|  metric_obj       | Metric objective                                        | maximize/ minimize    |  
|  ntrials          | Number of trials                                        | int                   |  

<a id="2.2"></a>
#### <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:black; font-size:160%; text-align:left;padding:3.0px; background: lightgrey; border-bottom: 8px solid maroon" > FOREWORD<br><div>

This competition aims to grade essays on a 6-point scale (grades 1-6) using a training corpus. We are asked to use the **Quadratic Weighted Kappa** score as the metric. <br>
Scoring rubric is explained in detail [here](https://storage.googleapis.com/kaggle-forum-message-attachments/2733927/20538/Rubric_%20Holistic%20Essay%20Scoring.pdf) as part of the competition overview and evaluation guidelines <br>

### **KERNEL SOURCES** <br>
1. https://www.kaggle.com/code/ye11725/tfidf-lgbm-baseline-with-code-comments <br>
2. https://www.kaggle.com/code/hideyukizushi/aes2-5folddeberta-lgbm-countvectorizer-lb-810 <br>
3. https://www.kaggle.com/datasets/hideyukizushi/aes2-400-20240419134941 <br>
4. https://www.kaggle.com/code/yongsukprasertsuk/0-818-deberta-v3-large-lgbm-baseline <br>

### **MY CONTRIBUTION** <br>
1. I created a script to collate all preprocessing functions and use them in the train and inference kernels separately <br>
2. I have created a class for the vectorizer instead of the function and lambda expression <br>
3. I added a few more models and created an Optuna ensemble, CV score is better this way <br>

### **TRAINING KERNEL** <br>
One could build this pipeline using the kernel [here](https://www.kaggle.com/code/ravi20076/aes2024-baseline-ml-training) <br>
Training kernel is separated from the inference kernel, significantly reducing the inference time. We save the OOF predictions and the train set (excluding the DeBERTa features) for easy reuse as well <br>

<a id="2.3"></a>
#### <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:black; font-size:160%; text-align:left;padding:3.0px; background: lightgrey; border-bottom: 8px solid maroon" > VERSION DETAILS<br><div>

|Experiment <br> Number|Version <br> Number|Details|CV score|LB score|
|-----|-------|----------| :-: | :-:|
|E1   | V1 | * All features from public notebook- 3361 features <br> * 4 LGBM regressors <br> * Optuna ensemble | 0.80812 |0.796 | 
|E1   | V2 | * All features from public notebook- 3237 features <br> * 4 LGBM regressors <br> * Optuna ensemble | 0.80631 |0.796 | 
|E1   | V3 | * All features from public notebook- 3237 features <br> * 4 LGBM regressors <br> * Optuna ensemble <br> * Repeated k fold CV 10x1 | 0.80683 |0.799 | 
|E1   | V4 | * All features from public notebook- 3237 features <br> * 4 LGBM regressors <br> * Optuna ensemble <br> * Repeated k fold CV 5x1 <br> * Slightly altered parameters| 0.80453 | 0.796| 
|E1   | V5 | * All features from public notebook- 3361 features <br> * 10 LGBM regressors with single model and varying states <br> * Optuna ensemble <br> * Repeated k fold CV 5x1| 0.80738 |0.796 | 
|E1 | V6 |* Public notebook features - 8742 features <br> * Retained infinity values <br> * ML model training, 4 models <br> * Optuna ensemble <br> * 5x1 RSKF |0.80684|0.796|
|E1 | V7 |* Mode blending - Versions 1, 5, 6 ||0.796|
|E1 | V8 |Mean blending <br> * E1V6, E1V5, E1V1 <br> * DeBERTa V3 Large <br> * Best public LGBM |0.80684||
|E1 | V9 |* Stacking DeBERTa V3 Large and LGBM <br> * 13781 features <br> * 15x1 RSKF  |0.83627|0.806|
|E1 | V10 |* Stacking DeBERTa V3 Large and LGBM classifier <br> * 21867 + 12 features <br> * 12x1 RSKF |0.83319|0.808|
|E1 | V11 |* Stacking DeBERTa V3 Large (new public version) and LGBM classifier <br> * 21867 + 12 features <br> * 12x1 RSKF |0.83948|0.796|
|E1 | V12 |* Stacking DeBERTa V3 Large (old + new public version) and LGBM classifier <br> * 21867 + 18 features <br> * 12x1 RSKF |0.84148|0.806|
|E1 | V13 |* Stacking DeBERTa V3 Large (old + new public version) <br> * 21867 + 12 features <br> * 12x1 RSKF |0.84118|0.813|
|E1 | V14 |* Stacking DeBERTa V3 Large (old + new public version) <br> * 27056 + 12 features <br> * 12x1 RSKF |0.84082|0.806|
|E1 | V15 |* Stacking DeBERTa V3 Large (new public version) <br> * 27056 + 6 features <br> * 15x1 RSKF |0.83909|0.811|
|E1 | V16 |* Stacking DeBERTa V3 Large (new public version) <br> * 24434 + 6 features <br> * 15x1 RSKF |0.83909|0.811|
|E1 | V18 |* Stacking DeBERTa V3 Large (new public version) <br> * 24434 + 6 features <br> * 15x1 RSKF |0.83846||
|E1 | V19 |* Stacking DeBERTa V3 Large (new public version) <br> * 24468 + 6 features <br> * 15x1 RSKF <br> * Fold level feature selection to 13000 features|0.83798|0.607 <br> Bug |
|E1 | V20 |* Stacking DeBERTa V3 Large (new public version) <br> * 21990 + 6 features <br> * 15x1 RSKF <br> * Fold level feature selection to 13000 features|0.83795| |

<a id="3"></a>
#### <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:white; font-size:180%; text-align:left;padding:3.0px; background: maroon; border-bottom: 8px solid black" > MODEL INFERENCING<br><div>

<a id="3.1"></a>
#### <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:black; font-size:160%; text-align:left;padding:3.0px; background: lightgrey; border-bottom: 8px solid maroon" > PREPROCESSING<br><div>

In [None]:
%%time 

# Storing the output predictions:-
# The sample submission is loaded as the frame that will later receive the scores
sub_fl    = pd.read_csv(f"/kaggle/input/learning-agency-lab-automated-essay-scoring-2/sample_submission.csv")

# Test essays are read with polars; a "paragraph" list column is derived by
# splitting full_text on blank lines ("\n\n")
test = \
pl.read_csv(os.path.join("/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv")).\
with_columns([( pl.col("full_text").str.split(by="\n\n").alias("paragraph"))])

Utils.PrintColor(f"\nTest data sample\n")
display(test.head(1))
print();

# The feature-engineering script is copied from the ancillary dataset (CFG.path)
# into the working directory so that it can be imported as a module
copyfile(src = os.path.join(CFG.path, "fe.py"), dst = "fe.py");

# NOTE(review): Make_Features and ReduceMem used below are not defined in this
# notebook, so they presumably come from this star-import - keep it unless fe.py
# is checked for everything it exports.
import fe
from fe import *

# Build the test feature set with the same helper as the training kernel
Xtest = Make_Features(df     = test,
                      lbl    = "test",
                      target = CFG.target,
                      path   = CFG.path, 
                      vocab_path = CFG.vocab_path
                     )

# Presumably downcasts dtypes to reduce memory (helper from fe.py) - TODO confirm
Xtest = ReduceMem(Xtest)

print()
# CleanMemory returns its RAM-usage message; it is deliberately discarded here
_ = Utils.CleanMemory()

<a id="3.2"></a>
#### <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:black; font-size:160%; text-align:left;padding:3.0px; background: lightgrey; border-bottom: 8px solid maroon" > PUBLIC DEBERTA V3 LARGE<br><div>

In [None]:
%%time 

from transformers import (AutoTokenizer, 
                          AutoModelForSequenceClassification, 
                          Trainer, 
                          TrainingArguments, 
                          DataCollatorWithPadding
                         )
from datasets import Dataset
from glob import glob
from scipy.special import softmax
import torch

MAX_LENGTH      = 1024
TEST_DATA_PATH  = "/kaggle/input/learning-agency-lab-automated-essay-scoring-2/test.csv"

# Feature-name prefix -> glob pattern matching the fold checkpoints of one LLM.
# The pattern is built from CFG.llm_path so the dataset location is configured in
# one place only (the former literal duplicated it with a stray leading "//").
model_paths = {"debertav5_": os.path.join(CFG.llm_path, "*", "*"),}
EVAL_BATCH_SIZE = 1

def tokenize(sample):
    """
    Tokenizes one dataset row's `full_text`, truncated to MAX_LENGTH tokens.
    Uses the notebook-level `tokenizer`, which is assigned inside the loop below
    before this function is ever called.
    """
    return tokenizer(sample['full_text'], max_length=MAX_LENGTH, truncation=True)

clear_output();

Utils.PrintColor(f"\nData size and shape before DeBERTa predictions = {Xtest.shape}")
for prefix_lbl, MODEL_PATH in model_paths.items():

    # glob returns matches in arbitrary order - sort them so the checkpoint order
    # (and the checkpoint supplying the tokenizer) is the same on every run
    models = sorted(glob(MODEL_PATH))
    if not models:
        # Fail with an explicit message instead of an IndexError on models[0]
        raise FileNotFoundError(f"No model checkpoints found for pattern: {MODEL_PATH}")

    tokenizer   = AutoTokenizer.from_pretrained(models[0])
    df_test     = pd.read_csv(TEST_DATA_PATH)
    ds          = Dataset.from_pandas(df_test).map(tokenize).remove_columns(['essay_id', 'full_text'])
    args        = TrainingArguments(".", per_device_eval_batch_size=EVAL_BATCH_SIZE, report_to="none")
    predictions = []

    # One forward pass over the test set per checkpoint; logits -> class probabilities
    for model_dir in models:
        model   = AutoModelForSequenceClassification.from_pretrained(model_dir)
        trainer = Trainer(model=model, args=args, data_collator=DataCollatorWithPadding(tokenizer), tokenizer=tokenizer)    
        preds   = trainer.predict(ds).predictions
        predictions.append(softmax(preds, axis=-1))
        # Release the checkpoint before loading the next one
        del model, trainer
        torch.cuda.empty_cache()
        _ = Utils.CleanMemory();

    # Element-wise mean of the per-checkpoint probability arrays
    predicted_score = np.mean(predictions, axis=0)

    # One column per class probability, named <prefix><class position>, joined on essay_id
    df    = pd.DataFrame(predicted_score, index = df_test["essay_id"]).add_prefix(prefix_lbl)
    Xtest = Xtest.merge(df, how = "left", left_on = "essay_id", right_index = True)
    
    Utils.PrintColor(f"Data size and shape after merge {prefix_lbl} = {Xtest.shape}")
    print()
    display(df.head(5).style.format(precision = 5).set_caption(f"DeBERTA predictions"))
    print()
    _  = Utils.CleanMemory()
       
_  = Utils.CleanMemory()

<a id="3.3"></a>
#### <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:black; font-size:160%; text-align:left;padding:3.0px; background: lightgrey; border-bottom: 8px solid maroon" > ML REGRESSOR<br><div>

In [None]:
%%time

class LGBMSupport:
    """
    Holds the two constants (a, b) and exposes the custom evaluation metric and
    the custom objective that are plugged into LightGBM training.
    """

    def __init__(self, a, b):
        """
        Stores the constants used by the metric and the objective:
        a - offset added back to targets and raw predictions
        b - constant added inside the objective's denominator term
        """
        self.a, self.b = a, b

    def _LGBMetric(self, y_true, y_pred):
        """
        Custom metric used in evaluation and early stopping.
        Both inputs are shifted by `a`; predictions are clipped to [1, 6] and
        rounded, the truth is rounded, and the quadratic-weighted kappa between
        them is returned as the tuple ('QWK', value, True).
        """
        graded_preds = (y_pred + self.a).clip(1, 6).round()
        graded_truth = np.around(y_true + self.a, 0)
        kappa = cohen_kappa_score(graded_truth, graded_preds, weights="quadratic")
        return 'QWK', kappa, True

    def _LGBObj(self, y_true, y_pred):
        """
        Custom objective for the LGBM.
        Returns (grad, hess): the gradient of the ratio f/g described below,
        scaled by the number of samples, and a constant vector of ones as hessian.
        """
        target  = y_true + self.a
        clipped = (y_pred + self.a).clip(1, 6)
        n_obs   = len(target)

        # Per-sample terms: prediction error and prediction offset from `a`
        residual = clipped - target
        centred  = clipped - self.a

        # f: half the summed squared error; g: half the summed (squared offset + b)
        numer = 1/2 * np.sum(residual**2)
        denom = 1/2 * np.sum(centred**2 + self.b)

        # Quotient rule on f/g, multiplied by the sample count
        grad = (residual/denom - numer*centred/denom**2)*n_obs
        hess = np.ones(n_obs)
        return grad, hess

class MyLogger:
    """
    This class helps to suppress logs in lightgbm and Optuna
    Source - https://github.com/microsoft/LightGBM/issues/6014

    info() and warning() discard their message; error() forwards it to a
    standard-library logger whose level is set to ERROR.
    """

    def __init__(self, logging_lbl: str = "MyLogger"):
        """
        Creates the underlying logger right away so that error() is usable on a
        freshly built instance. Before this constructor existed, calling error()
        without a prior init() raised AttributeError (no `logger` attribute).

        logging_lbl - name of the logger to use; init() can still be called
                      afterwards to switch to a different label.
        """
        self.init(logging_lbl)

    def init(self, logging_lbl: str):
        """
        (Re)binds this object to the logger named `logging_lbl` and restricts
        that logger to ERROR level. Kept as a public method for existing callers
        that build the object first and call init() explicitly.
        """
        self.logger = logging.getLogger(logging_lbl)
        self.logger.setLevel(logging.ERROR)

    def info(self, message):
        "Info messages are intentionally dropped"
        pass

    def warning(self, message):
        "Warning messages are intentionally dropped"
        pass

    def error(self, message):
        "Error messages are passed on to the underlying logger"
        self.logger.error(message)

class VotingModelMaker(BaseEstimator, RegressorMixin):
    """
    Blends the outputs of several already-fitted estimators into one prediction.

    estimators - list of fitted models exposing predict (mdlpredict) and/or
                 predict_proba (mdlpredictproba)
    weights    - per-estimator blending weights; an empty list requests a plain mean
    a, b       - constants stored on the instance; `a` is added to every raw
                 prediction in mdlpredict, `b` is only stored here
    """

    def __init__(self, estimators: list, weights: list, a = CFG.a, b = CFG.b):
        super().__init__()
        self.estimators = estimators
        self.weights    = weights
        self.a = a
        self.b = b

    def fit(self, X, y=None):
        "No fitting is done here - the instance is returned unchanged"
        return self

    def _blend(self, X, predict_fn):
        """
        Shared implementation of mdlpredict / mdlpredictproba.
        Collects predict_fn(estimator) for every estimator as one column
        (Est0, Est1, ...) of a frame with len(X) rows and averages the columns
        row-wise: weighted when weights are given, plain mean otherwise.
        """
        y_preds = \
        pd.DataFrame(columns = [f"Est{i}" for i in range(len(self.estimators))],
                     index = range(len(X))
                    )

        for i, estm in enumerate(self.estimators):
            y_preds[f"Est{i}"] = predict_fn(estm)

        # `weights != []` is ambiguous for numpy arrays, so the length is tested
        # instead. None keeps its former route through np.average (unweighted).
        weights = self.weights
        if weights is None or len(weights) > 0:
            return np.average(y_preds, axis=1, weights = weights)
        else:
            # NOTE(review): the inference cell calls `.values` on this result, so
            # np.mean on the frame presumably returns a pandas Series - confirm
            return np.mean(y_preds, axis=1,)

    def mdlpredict(self, X):
        "Row-wise blend of estimator.predict(X) + a across all estimators"
        return self._blend(X, lambda estm: estm.predict(X) + self.a)

    def mdlpredictproba(self, X):
        """
        Row-wise blend of estimator.predict_proba(X) across all estimators.
        The column labels were previously built with range(len(range(estimators))),
        which raised TypeError on every call; that is fixed here.
        NOTE(review): each estimator's output is stored in a single column, so this
        assumes predict_proba yields one value per row - confirm for the models used.
        """
        return self._blend(X, lambda estm: estm.predict_proba(X))

In [None]:
%%time 

# Trained blending model(s) saved by the training kernel.
# NOTE: joblib.load unpickles arbitrary Python objects - only load artefacts that
# come from a trusted source (here: the author's own ancillary dataset).
models    = joblib.load(os.path.join(CFG.path, f"VR_E{CFG.exp_nb}V{CFG.version_nb}"))
drop_cols = ["essay_id", "id", "Source"]

# The feature matrix is the same for every model, so it is built once
Xinfer = Xtest.drop(drop_cols, axis=1, errors = "ignore")

Mdl_Preds = pd.DataFrame(index = Xtest["essay_id"])
for i, mdl in enumerate(models):
    Utils.PrintColor(f"---> Model {i}");
    # np.asarray accepts both return types of mdlpredict (Series or ndarray) and
    # makes the assignment positional
    Mdl_Preds[f"Model{i}"] = np.asarray(mdl.mdlpredict(Xinfer))
    
print()
display(Mdl_Preds.head(10).style.format(precision = 2).set_caption("Model predictions file"));

# Mean across models (identical to the single column when there is one model).
# The values are taken as a plain array on purpose: Mdl_Preds is indexed by
# essay_id while sub_fl has a default integer index, so assigning the Series
# itself would align on the index and fill the target column with NaN.
# The assignment is positional - presumably sample_submission.csv and test.csv
# list the essays in the same order; verify if that ever changes.
blend_preds = Mdl_Preds.mean(axis = 1).to_numpy()
sub_fl[CFG.target] = np.round(np.clip(blend_preds, a_min = 1, a_max = 6),0)

del Mdl_Preds, Xtest, Xinfer, blend_preds, models, drop_cols;
_  = Utils.CleanMemory()

print("\n\n")
display(sub_fl.head(10).style.format(precision = 2).set_caption("Submission file"));

sub_fl.to_csv("submission.csv", index = False);
print();
_  = Utils.CleanMemory()

!ls