#### <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:white; font-size:180%; text-align:left;padding:3.0px; background: maroon; border-bottom: 8px solid black" > TABLE OF CONTENTS<br><div>  
* [INTRODUCTION](#2)   
    * [UTILITIES](#2.1)  
    * [CONFIGURATION](#2.2)
    * [FOREWORD](#2.3)
* [PREPROCESSING](#3)  
* [MODEL TRAINING](#4)
      

In [None]:
%%time

# Installing select libraries:-
from gc import collect;
from warnings import filterwarnings;
filterwarnings('ignore');
from IPython.display import display_html, clear_output;
import logging
import sys

from copy import deepcopy;
import pandas as pd, polars as pl, numpy as np;
import polars.selectors as cs;
import spacy, string, random

from os import path, walk, getpid
from psutil import Process
import re
from collections import Counter
from itertools import product

from colorama import Fore, Style, init;
from warnings import filterwarnings;
filterwarnings('ignore');
import joblib;
import os;

from tqdm.notebook import tqdm;
import seaborn as sns;
import matplotlib.pyplot as plt;
from matplotlib.colors import ListedColormap as LCM;
%matplotlib inline

from pprint import pprint;
from functools import partial;

import ctypes
libc = ctypes.CDLL("libc.so.6")

print();
collect();
clear_output();

In [None]:
%%time

# Importing model and pipeline specifics:-
from category_encoders import OrdinalEncoder, OneHotEncoder;

# Pipeline specifics:-
from sklearn.preprocessing import (RobustScaler,
                                   MinMaxScaler,
                                   StandardScaler,
                                   FunctionTransformer as FT,
                                   PowerTransformer,
                                  );
from sklearn.impute import SimpleImputer as SI;
from sklearn.model_selection import (RepeatedStratifiedKFold as RSKF,
                                     StratifiedKFold as SKF,
                                     StratifiedGroupKFold as SGKF,
                                     KFold,
                                     RepeatedKFold as RKF,
                                     cross_val_score,
                                     cross_val_predict
                                    );
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline, make_pipeline;
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin;
from sklearn.compose import ColumnTransformer;

# ML Model training:-
from sklearn.metrics import (cohen_kappa_score,
                             accuracy_score,
                             roc_auc_score,
                             confusion_matrix,
                             ConfusionMatrixDisplay,
                             f1_score)
from xgboost import DMatrix, XGBRegressor as XGBR;
import xgboost as xgb

clear_output();
print();
collect();

In [None]:
%%time

# Setting rc parameters in seaborn for plots and graphs-
# Reference - https://matplotlib.org/stable/tutorials/introductory/customizing.html:-
# To alter this, refer to matplotlib.rcParams.keys()

# The rc dictionary is handed to sns.set as its first positional argument.
# Grid color and face colors are all white here, so grid lines stay invisible
# unless a plot overrides them (CFG.grid_specs carries a light-grey override).
sns.set({"axes.facecolor"       : "#ffffff",
         "figure.facecolor"     : "#ffffff",
         "axes.edgecolor"       : "#000000",
         "grid.color"           : "#ffffff",
         "font.family"          : ['Cambria'],
         "axes.labelcolor"      : "#000000",
         "xtick.color"          : "#000000",
         "ytick.color"          : "#000000",
         "grid.linewidth"       : 0.75,
         "grid.linestyle"       : "--",
         "axes.titlecolor"      : '#0099e6',
         'axes.titlesize'       : 8.5,
         'axes.labelweight'     : "bold",
         'legend.fontsize'      : 7.0,
         'legend.title_fontsize': 7.0,
         'font.size'            : 7.5,
         'xtick.labelsize'      : 7.5,
         'ytick.labelsize'      : 7.5,
        });

# Setting global configuration for polars
# NOTE(review): the pl.Config(...) object below is built but neither stored nor
# used as a context manager - presumably it is relied upon to apply the options
# globally on construction; confirm against the installed polars version.
pl.Config.activate_decimals(True).set_tbl_hide_column_data_types(True)
pl.Config(**dict(tbl_formatting = 'ASCII_FULL_CONDENSED',
                 tbl_hide_column_data_types = True,
                 tbl_hide_dataframe_shape = True,
                 fmt_float = "mixed",
                 tbl_cell_alignment = 'CENTER',
                 tbl_hide_dtype_separator = True,
                 tbl_cols = 100,
                 tbl_rows = 50,
                 fmt_str_lengths = 100,
                )
         )

# Making sklearn pipeline outputs as dataframe:-
# The import sits in this cell rather than with the other imports at the top.
from sklearn import set_config;
set_config(transform_output = "pandas");
# pandas display limits: at most 50 columns and 50 rows are rendered
pd.set_option('display.max_columns', 50);
pd.set_option('display.max_rows', 50);

# Blank line in the cell output, then a garbage-collection pass
print();
collect();


<a id="2"></a>
# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color: white; font-size:120%; text-align:left;padding:3.0px; background: maroon; border-bottom: 8px solid black" > INTRODUCTION<br><div>

<a id="2.1"></a>
## <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color: black; font-size:120%; text-align:left;padding:3.0px; background: #c2d6d6; border-bottom: 8px solid black" > UTILITIES<br><div>

In [None]:
%%time

class Utility:
    """
    Notebook-wide helper collection. It provides-
    1. Colored console printing
    2. The regression-style competition metric and a classifier metric
    3. A memory clean-up routine that reports the process RAM usage
    """

    def PrintColor(self, text:str, color = Fore.BLUE, style = Style.BRIGHT):
        "Prints `text` wrapped in the given colorama style and color, then resets the styling"
        pieces = [style, color, text, Style.RESET_ALL]
        print("".join(pieces))

    def ScoreMetric(self, ytrue, ypred)-> float:
        """
        Quadratic weighted kappa between the truth and the discretized predictions
        Inputs- ytrue, ypred:- truth values and raw (continuous) predictions
        Output- float:- competition metric

        Predictions are clipped to [1, 6], rounded and cast to uint8 before scoring
        """

        clipped = np.clip(ypred, a_min = 1, a_max = 6)
        grades  = np.uint8(np.around(clipped))
        truth   = np.uint8(ytrue)
        return cohen_kappa_score(truth, grades, weights = "quadratic")

    def ClsfMetric(self, ytrue, ypred)-> float:
        """
        One-vs-rest ROC-AUC for classifier outputs
        Inputs- ytrue, ypred:- truth values and classifier predictions
        Output- float:- classifier metric
        """

        truth = np.uint8(ytrue)
        return roc_auc_score(truth, ypred, multi_class = "ovr")

    def CleanMemory(self):
        """
        Runs the garbage collector, asks libc to trim the malloc arena and
        returns (does not print) a message with the current process RAM usage in GB
        """

        collect()
        libc.malloc_trim(0)
        # First field of psutil's memory_info() for the current process, in bytes
        used_bytes = Process(getpid()).memory_info()[0]
        memory_use = used_bytes / 2. ** 30
        return f"\nRAM usage = {memory_use :.4} GB"

# Shared helper instance used throughout the notebook
Utils = Utility()
print()

<a id="2.2"></a>
## <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color: black; font-size:120%; text-align:left;padding:3.0px; background: #c2d6d6; border-bottom: 8px solid black" > CONFIGURATION<br><div>

In [None]:
%%time

class CFG:
    """
    Central settings holder: data locations, CV layout, training and ensemble
    switches and shared plot styling. This is a general-purpose configuration
    class, so some attributes may not be read in this notebook.

    Syntax check - set test_req = "Y"
    Actual run   - set gpu_switch = "ON" and test_req = "N"
    """

    # Data preparation:-
    exp_nb = 1
    version_nb = 1
    test_req = "N"
    nb_cols_test = 100
    gpu_switch = "ON"

    load_train_data = True

    # Seeds: cv_state drives the fold split, state is the general seed
    cv_state = 0
    state = 42

    # Target column and input / output locations
    target = "score"
    path = "/kaggle/input/learning-agency-lab-automated-essay-scoring-2"
    op_path = "/kaggle/working"
    vocab_path = '/kaggle/input/english-word-hx/words.txt'
    mydatapath = "/kaggle/input/aes2024startertraintest/Datasets"

    # Model Training:-
    ML = "Y"
    n_splits = 15 if test_req != "Y" else 3
    n_repeats = 1
    nbrnd_erly_stp = 75
    mdlcv_mthd = 'RSKF'

    # Constants of the custom objective / metric: `a` is the offset subtracted
    # from the target before training, `b` is the additive term in the denominator
    a = 2.998
    b = 1.042

    # Ensemble:-
    ensemble_req = "N"
    metric_obj = 'maximize'
    ntrials = 250 if test_req != "Y" else 10

    # Global variables for plotting:-
    grid_specs = dict(visible = True,
                      which = 'both',
                      linestyle = '--',
                      color = 'lightgrey',
                      linewidth = 0.75,
                     )
    title_specs = dict(fontsize = 9, fontweight = 'bold', color = '#992600')

# Blank line, colored confirmation message, then a garbage-collection pass
print()
Utils.PrintColor("--> Configuration done!\n")
collect()

<a id="2.3"></a>
## <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color: black; font-size:120%; text-align:left;padding:3.0px; background: #c2d6d6; border-bottom: 8px solid black" > FOREWORD<br><div>

This competition aims to score essays on a 6-point scale (grades 1 to 6) using a training corpus. The evaluation metric is the **Quadratic Weighted Kappa (QWK)** score. <br>
Scoring rubric is explained in detail [here](https://storage.googleapis.com/kaggle-forum-message-attachments/2733927/20538/Rubric_%20Holistic%20Essay%20Scoring.pdf) as part of the competition overview and evaluation guidelines <br>

### **KERNEL OBJECTIVE** <br>
I delve into the data a bit here, use the public notebook to generate features and use them effectively to train ML models. <br>
This is not a tuned process. I aim to showcase XGBoost training with a custom loss function here. Feel free to use this in your pipeline and let me know your thoughts in the comments! <br>

### **USING THE NATIVE XGBOOST SYNTAX** <br>
I like to use the XGBoost **native syntax** for both classification and regression problems. I find it much easier to set up an objective (including custom objectives and evaluation metrics) and to provide labels to evaluation sets this way. Also, the scikit-learn syntax for XGBoost has changed over recent versions, deprecating traditional parameters like eval_metric and callbacks in the fit() method. The native XGBoost syntax is not affected by this problem. A downside of this syntax is that one cannot use it in a scikit-learn pipeline as-is; a custom estimator wrapper is needed to expose the fit/predict methods. <br>

### **BUILDING AND IMPROVING SINGLE MODELS** <br>
I recommend that everyone **not rely too heavily on blends and stacks** at this stage and focus on single models instead. Such a pipeline will help you understand the power of a single model. I hope this is useful!

### **KERNEL SOURCES** <br>
https://www.kaggle.com/code/ye11725/tfidf-lgbm-baseline-with-code-comments <br>
https://www.kaggle.com/datasets/hideyukizushi/aes2-400-20240419134941 <br>
https://www.kaggle.com/code/hideyukizushi/aes2-deberta-lgbm-countvectorizer-lb-814 <br>
https://www.kaggle.com/code/yongsukprasertsuk/0-818-deberta-v3-large-lgbm-baseline <br>



<a id="3"></a>
# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color: white; font-size:120%; text-align:left;padding:3.0px; background: maroon; border-bottom: 8px solid black" > PREPROCESSING<br><div>

In [None]:
%%time

# spaCy pipeline used for lemmatization in count_spelling_errors
nlp = spacy.load("en_core_web_sm")

# Reference vocabulary: one entry per line of the word list, stripped and lower-cased
with open(CFG.vocab_path, 'r') as file:
    english_vocab = {word.strip().lower() for word in file}

def count_spelling_errors(text):
    """
    Count the tokens of `text` whose lower-cased lemma is missing from `english_vocab`.

    Args:
    - text (str): The input text, parsed with the module-level `nlp` pipeline.

    Returns:
    - int: Number of tokens not found in the vocabulary.
    """
    error_count = 0
    for token in nlp(text):
        if token.lemma_.lower() not in english_vocab:
            error_count += 1
    return error_count

def removeHTML(x):
    """
    Delete every `<...>` span from the text (shortest match, within a single line).

    Args:
    - x (str): The input text.

    Returns:
    - str: The text with tag-like spans removed.
    """
    return re.sub(r'<.*?>', r'', x)

def dataPreprocessing(x):
    """
    Normalize a piece of essay text before feature extraction.

    Steps, in order: lower-case, strip HTML-like tags, drop @mentions, drop
    digit runs (with or without a leading apostrophe), drop "http" followed by
    word characters, collapse whitespace, collapse repeated dots and commas,
    and trim the ends.

    All patterns are raw strings so that `\\w`, `\\d` and `\\s` reach the regex
    engine without relying on Python tolerating invalid string escapes.

    Args:
    - x (str): The input text.

    Returns:
    - str: The cleaned text.
    """
    x = x.lower()
    x = removeHTML(x)
    x = re.sub(r"@\w+", '', x)
    x = re.sub(r"'\d+", '', x)
    x = re.sub(r"\d+", '', x)
    # Only "http" plus the word characters right after it is removed, so
    # "https://site" leaves "://site" behind; kept as-is to preserve the features
    x = re.sub(r"http\w+", '', x)
    x = re.sub(r"\s+", " ", x)
    x = re.sub(r"\.+", ".", x)
    x = re.sub(r"\,+", ",", x)
    x = x.strip()
    return x

def remove_punctuation(text):
    """
    Strip every character listed in `string.punctuation` from the text.

    Args:
    - text (str): The input text.

    Returns:
    - str: The text without punctuation characters.
    """
    # Mapping a code point to None makes str.translate delete that character
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)

def Paragraph_Preprocess(tmp):
    """
    Expand the frame to one row per paragraph and attach paragraph-level columns.

    Each step reads the output of the previous one: the paragraph text is
    cleaned first, the punctuation-free copy and its spelling-error count come
    next, and the length / sentence / word counts are taken from the cleaned text.
    """
    paragraph = pl.col('paragraph')

    return (
        tmp
        .explode('paragraph')
        .with_columns(paragraph.map_elements(dataPreprocessing))
        .with_columns(
            paragraph.map_elements(remove_punctuation).alias('paragraph_no_punctuation')
        )
        .with_columns(
            pl.col('paragraph_no_punctuation')
            .map_elements(count_spelling_errors)
            .alias("paragraph_error_num")
        )
        .with_columns(
            paragraph.map_elements(lambda text: len(text)).alias("paragraph_len")
        )
        .with_columns(
            # Sentences are approximated by '.'-separated pieces, words by ' '-separated pieces
            paragraph.map_elements(lambda text: len(text.split('.'))).alias("paragraph_sentence_cnt"),
            paragraph.map_elements(lambda text: len(text.split(' '))).alias("paragraph_word_cnt"),
        )
    )

# Paragraph-level numeric columns that get aggregated per essay;
# the second list additionally includes the spelling-error count
paragraph_fea  = ['paragraph_len','paragraph_sentence_cnt','paragraph_word_cnt']
paragraph_fea2 = ['paragraph_error_num'] + paragraph_fea

def Paragraph_Eng(train_tmp):
    """
    Aggregate paragraph-level rows into one feature row per essay.

    Produces, per essay_id:
    - counts of paragraphs with length >= each lower threshold
    - counts of paragraphs with length <= each upper threshold
    - max / mean / min / sum / first / last / kurtosis / q1 / q3 of every
      column in `paragraph_fea2`

    Input  - train_tmp: polars frame as returned by Paragraph_Preprocess
    Output - pandas DataFrame sorted by essay_id
    """
    # Thresholds actually used for the count features. Both families share the
    # "paragraph_{i}_cnt" naming, so the two lists must stay disjoint.
    min_len_thresholds = [0, 50,75,100,125,150,175,200,250,300,350,400,500,600,700]
    max_len_thresholds = [25,49]

    aggs = [
        *[pl.col('paragraph').filter(pl.col('paragraph_len') >= i).\
          count().\
          alias(f"paragraph_{i}_cnt")
          for i in min_len_thresholds
          ],
        *[pl.col('paragraph').filter(pl.col('paragraph_len') <= i).\
          count().\
          alias(f"paragraph_{i}_cnt") for i in max_len_thresholds
          ],
        *[pl.col(fea).max().alias(f"{fea}_max") for fea in paragraph_fea2],
        *[pl.col(fea).mean().alias(f"{fea}_mean") for fea in paragraph_fea2],
        *[pl.col(fea).min().alias(f"{fea}_min") for fea in paragraph_fea2],
        *[pl.col(fea).sum().alias(f"{fea}_sum") for fea in paragraph_fea2],
        *[pl.col(fea).first().alias(f"{fea}_first") for fea in paragraph_fea2],
        *[pl.col(fea).last().alias(f"{fea}_last") for fea in paragraph_fea2],
        *[pl.col(fea).kurtosis().alias(f"{fea}_kurtosis") for fea in paragraph_fea2],
        *[pl.col(fea).quantile(0.25).alias(f"{fea}_q1") for fea in paragraph_fea2],
        *[pl.col(fea).quantile(0.75).alias(f"{fea}_q3") for fea in paragraph_fea2],
        ]

    df = train_tmp.group_by(['essay_id'], maintain_order=True).agg(aggs).sort("essay_id")
    df = df.to_pandas()

    return df

def Sentence_Preprocess(tmp):
    """
    Expand the frame to one row per sentence and attach sentence-level columns.

    The cleaned full text is split on '.', each piece becomes a row, pieces
    shorter than 15 characters are dropped, and the ' '-separated word count
    is added for the remaining sentences.
    """
    sentence = pl.col('sentence')
    split_text = pl.col('full_text').map_elements(dataPreprocessing).str.split(by=".")

    return (
        tmp
        .with_columns(split_text.alias("sentence"))
        .explode('sentence')
        .with_columns(sentence.map_elements(lambda text: len(text)).alias("sentence_len"))
        .filter(pl.col('sentence_len') >= 15)
        .with_columns(
            sentence.map_elements(lambda text: len(text.split(' '))).alias("sentence_word_cnt")
        )
    )

# Sentence-level numeric columns that get aggregated per essay
sentence_fea = ['sentence_len','sentence_word_cnt']

def Sentence_Eng(train_tmp):
    """
    Aggregate sentence-level rows into one feature row per essay.

    Produces, per essay_id, counts of sentences at or above / at or below a set
    of length thresholds, followed by nine summary statistics of every column
    in `sentence_fea`. Returns a pandas DataFrame sorted by essay_id.
    """
    sentence = pl.col('sentence')
    length   = pl.col('sentence_len')

    # (column suffix, reducer) pairs; the list order fixes the output column order
    reducers = [
        ("max",      lambda col: col.max()),
        ("mean",     lambda col: col.mean()),
        ("min",      lambda col: col.min()),
        ("sum",      lambda col: col.sum()),
        ("first",    lambda col: col.first()),
        ("last",     lambda col: col.last()),
        ("kurtosis", lambda col: col.kurtosis()),
        ("q1",       lambda col: col.quantile(0.25)),
        ("q3",       lambda col: col.quantile(0.75)),
    ]

    aggs = []
    for low in [0,15,50,100,150,200,250,300]:
        aggs.append(sentence.filter(length >= low).count().alias(f"sentence_{low}_cnt"))
    for high in [15,50]:
        aggs.append(sentence.filter(length <= high).count().alias(f"sentence_lt{high}_cnt"))
    for suffix, reducer in reducers:
        for fea in sentence_fea:
            aggs.append(reducer(pl.col(fea)).alias(f"{fea}_{suffix}"))

    grouped = train_tmp.group_by(['essay_id'], maintain_order=True).agg(aggs)
    return grouped.sort("essay_id").to_pandas()

def Word_Preprocess(tmp):
    """
    Expand the frame to one row per word and attach the word length.

    The cleaned full text is split on single spaces, each piece becomes a row,
    and zero-length pieces are removed.
    """
    split_text = pl.col('full_text').map_elements(dataPreprocessing).str.split(by=" ")

    return (
        tmp
        .with_columns(split_text.alias("word"))
        .explode('word')
        .with_columns(pl.col('word').map_elements(lambda token: len(token)).alias("word_len"))
        .filter(pl.col('word_len') != 0)
    )

def Word_Eng(train_tmp):
    """
    Aggregate word-level rows into one feature row per essay.

    Produces, per essay_id, the number of words with length >= n for n = 1..15
    and the max / mean / std / quartiles of the word length. Returns a pandas
    DataFrame sorted by essay_id.
    """
    word   = pl.col('word')
    length = pl.col('word_len')

    aggs = [
        word.filter(length >= min_len).count().alias(f"word_{min_len}_cnt")
        for min_len in range(1, 16)
    ]
    aggs.extend([
        length.max().alias("word_len_max"),
        length.mean().alias("word_len_mean"),
        length.std().alias("word_len_std"),
        length.quantile(0.25).alias("word_len_q1"),
        length.quantile(0.50).alias("word_len_q2"),
        length.quantile(0.75).alias("word_len_q3"),
    ])

    grouped = train_tmp.group_by(['essay_id'], maintain_order=True).agg(aggs)
    return grouped.sort("essay_id").to_pandas()

def _vectorizer_frame(matrix, prefix, essay_ids):
    """Densify a document-term matrix into a DataFrame with `{prefix}_{i}` columns plus an essay_id key."""
    frame             = pd.DataFrame(matrix.toarray())
    frame.columns     = [f'{prefix}_{i}' for i in range(len(frame.columns))]
    frame['essay_id'] = essay_ids
    return frame

def Make_Feature(train, test):
    """
    This function uses all other functions to make the associated features

    Inputs  - train, test: frames holding `essay_id`, `full_text` and the
              `paragraph` list column (the preprocess helpers use polars methods)
    Outputs - (train_feats, test_feats): pandas DataFrames with one row per
              essay and identical feature column names

    Feature groups, in order: paragraph, sentence and word aggregates, TF-IDF
    n-grams (`tfid_{i}`) and count-vectorizer n-grams (`tfid_cnt_{i}`).
    The vectorizers are fitted on train only and applied to test.
    """

    # Vectorizer rows follow the row order of the input frames, whereas the
    # aggregate frames are sorted by essay_id. The ids are therefore taken from
    # the inputs so that the merges below line rows up by key, not by position.
    train_ids = train['essay_id'].to_list()
    test_ids  = test['essay_id'].to_list()

    Utils.PrintColor(f"Preprocessing")
    tmp         = Paragraph_Preprocess(train)
    train_feats = Paragraph_Eng(tmp)

    tmp         = Paragraph_Preprocess(test)
    test_feats  = Paragraph_Eng(tmp)
    Utils.PrintColor(f"1. Paragraph Preprocessing train-test = {train_feats.shape} | {test_feats.shape}",
                     color = Fore.CYAN)

    tmp         = Sentence_Preprocess(train)
    train_feats = train_feats.merge(Sentence_Eng(tmp), on='essay_id', how='left')

    tmp         = Sentence_Preprocess(test)
    test_feats  = test_feats.merge(Sentence_Eng(tmp), on='essay_id', how='left')
    Utils.PrintColor(f"2. Sentence Preprocessing train-test = {train_feats.shape} | {test_feats.shape}",
                     color = Fore.CYAN)

    tmp         = Word_Preprocess(train)
    train_feats = train_feats.merge(Word_Eng(tmp), on='essay_id', how='left')

    tmp         = Word_Preprocess(test)
    test_feats  = test_feats.merge(Word_Eng(tmp), on='essay_id', how='left')
    Utils.PrintColor(f"3. Word Preprocessing train-test = {train_feats.shape} | {test_feats.shape}",
                     color = Fore.CYAN)

    # NOTE(review): tokenizer and preprocessor are identity functions applied to
    # raw strings, so the n-grams are presumably built over characters rather
    # than words - confirm that this is intended.
    vectorizer = \
    TfidfVectorizer(tokenizer=lambda x: x,
                    preprocessor=lambda x: x,
                    token_pattern=None,
                    strip_accents='unicode',
                    analyzer = 'word',
                    ngram_range=(3,6),
                    min_df=0.05,
                    max_df=0.95,
                    sublinear_tf=True,
                    )

    train_tfid  = vectorizer.fit_transform([i for i in train['full_text']])
    train_feats = train_feats.merge(_vectorizer_frame(train_tfid, 'tfid', train_ids),
                                    on='essay_id', how='left')

    test_tfid   = vectorizer.transform([i for i in test['full_text']])
    test_feats  = test_feats.merge(_vectorizer_frame(test_tfid, 'tfid', test_ids),
                                   on='essay_id', how='left')
    Utils.PrintColor(f"4. TFIDF = {train_feats.shape} | {test_feats.shape}", color = Fore.CYAN)

    vectorizer_cnt = \
    CountVectorizer(tokenizer=lambda x: x,
                    preprocessor=lambda x: x,
                    token_pattern=None,
                    strip_accents='unicode',
                    analyzer = 'word',
                    ngram_range=(2,3),
                    min_df=0.10,
                    max_df=0.85,
                    )

    # Train and test share the 'tfid_cnt' prefix: reusing 'tfid' for the test
    # counts would collide with the TF-IDF columns merged above
    train_cnt   = vectorizer_cnt.fit_transform([i for i in train['full_text']])
    train_feats = train_feats.merge(_vectorizer_frame(train_cnt, 'tfid_cnt', train_ids),
                                    on='essay_id', how='left')

    test_cnt    = vectorizer_cnt.transform([i for i in test['full_text']])
    test_feats  = test_feats.merge(_vectorizer_frame(test_cnt, 'tfid_cnt', test_ids),
                                   on='essay_id', how='left')
    Utils.PrintColor(f"5. Count vectorizer = {train_feats.shape} | {test_feats.shape}", color = Fore.CYAN)

    return train_feats, test_feats

In [None]:
%%time

# Two ways to obtain the feature frames:
#   load_train_data == False -> rebuild the features from the raw CSV files
#   load_train_data == True  -> load the precomputed feature frames
# In both cases `ytrain` first holds an (essay_id, target) frame that is merged
# onto Xtrain below, so labels are matched by key instead of by row position.
if CFG.load_train_data == False:
    train = pl.read_csv(os.path.join(CFG.path, "train.csv")).\
    with_columns([( pl.col("full_text").str.split(by="\n\n").alias("paragraph"))])

    test = pl.read_csv(os.path.join(CFG.path, "test.csv")).\
    with_columns([( pl.col("full_text").str.split(by="\n\n").alias("paragraph"))])
    Utils.PrintColor(f"Train test shape = {train.shape} | {test.shape}")

    # The feature builder defined above is named Make_Feature
    Xtrain, Xtest  = Make_Feature(train, test)
    Utils.PrintColor(f"\nTest data preprocessing")
    print("\n")

    Utils.PrintColor(f"Train test shape = {Xtrain.shape} | {Xtest.shape}")

    Xtrain.to_parquet(os.path.join(CFG.op_path, f"Xtrain_E{CFG.exp_nb}.parquet"))
    Xtest.to_parquet(os.path.join(CFG.op_path, f"Xtest_E{CFG.exp_nb}.parquet"))

    # Xtrain is sorted by essay_id while `train` keeps the CSV order
    ytrain = train.select(["essay_id", CFG.target]).to_pandas()

elif CFG.load_train_data == True:
    ytrain = pd.read_csv(os.path.join(CFG.path, "train.csv"),
                         usecols = ["essay_id", CFG.target]
                        )

    # NOTE: joblib.load unpickles the files, which can execute arbitrary code -
    # only point CFG.mydatapath at datasets you trust
    Xtrain = joblib.load(os.path.join(CFG.mydatapath, f"Xtrain.pickle"))
    Xtest  = joblib.load(os.path.join(CFG.mydatapath, f"Xtest.pickle"))

# Attach the labels by essay_id, derive the targets, then drop the label column
Xtrain = Xtrain.merge(ytrain, how = "left", on  = "essay_id")
Xtrain.index = range(len(Xtrain))

# ytrain is the offset regression target, ygrp the integer grade used for stratification
ytrain = Xtrain[CFG.target].astype(np.float32) - CFG.a
ygrp   = Xtrain[CFG.target].astype(np.uint8)
Xtrain = Xtrain.drop(columns = [CFG.target])

# Literal replacements: with regex matching a lone "?" is an invalid pattern,
# so the mode is pinned instead of relying on the pandas-version default
Xtrain.columns = Xtrain.columns.\
str.replace("<", "le", regex = False).\
str.replace(">", "ge", regex = False).\
str.replace("?", "Q", regex = False)
# NOTE(review): this assumes Xtest already has the same columns in the same order as Xtrain
Xtest.columns  = Xtrain.columns
Utils.PrintColor(f"Train test shape = {Xtrain.shape} | {Xtest.shape} | {ytrain.shape} | {ygrp.shape}")

if CFG.test_req == "Y":
    Utils.PrintColor(f"\nSelecting a small set of columns for the syntax check")
    Xtrain = Xtrain.iloc[:, 0: CFG.nb_cols_test]
    Xtest  = Xtest[Xtrain.columns]

    Utils.PrintColor(f"Train test shape after syntax check = {Xtrain.shape} | {Xtest.shape}")

_ = Utils.CleanMemory()

<a id="4"></a>
# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color: white; font-size:120%; text-align:left;padding:3.0px; background: maroon; border-bottom: 8px solid black" > MODEL TRAINING<br><div>

In [None]:
%%time

class XGBSupport:
    """
    This class designs helpers for the subsequent XGBoost training using a
    custom loss and a custom evaluation metric.

    Both helpers work on targets that were shifted by `a` before training
    (label = score - a), so `a` is added back before any score-scale step.

    Parameters
    - a: offset that was subtracted from the target
    - b: additive term inside the denominator of the objective
    """

    def __init__(self, a, b):
        self.a = a
        self.b = b

    def MakeMetricXGB(self, y_pred, dm):
        """
        This method prepares the custom eval metric for XGBoost

        Inputs  - y_pred: raw model outputs on the shifted scale
                  dm: object exposing get_label() with the shifted labels
        Outputs - tuple ('QWK', quadratic weighted kappa)

        Truth and predictions are mapped back to the score scale and cast to
        integers: the truth is rebuilt by a float addition and the kappa score
        compares class labels, so round-off must not leave non-integral values.
        """

        y_true = np.round(dm.get_label() + self.a).astype(np.int64)
        y_pred = np.round(np.clip(y_pred + self.a, a_min = 1, a_max = 6)).astype(np.int64)

        score = cohen_kappa_score(y_true, y_pred, weights = "quadratic")
        return 'QWK', score

    def MakeObjXGB(self, y_pred, dm):
        """
        This method designs the custom objective for XGBoost

        Inputs  - y_pred: raw model outputs on the shifted scale
                  dm: object exposing get_label() with the shifted labels
        Outputs - (grad, hess): per-row gradient and hessian arrays

        The loss is the ratio f / g with
            f = 1/2 * sum((pred - label)^2)
            g = 1/2 * sum((pred - a)^2 + b)
        evaluated on predictions clipped to [1, 6] on the score scale.
        """

        y_true = dm.get_label()
        labels = y_true + self.a
        preds  = y_pred + self.a
        preds  = np.clip(preds, a_min = 1, a_max = 6)

        f      = 1/2*np.sum((preds- labels) ** 2)
        g      = 1/2*np.sum((preds- self.a) **2 + self.b)

        # Quotient rule on f / g, scaled by the number of rows
        df   = preds - labels
        dg   = preds - self.a
        grad = (df / g - f*dg / g**2) * len(labels)
        # A constant hessian of ones is used instead of the exact second derivative
        hess = np.ones(len(labels))

        return grad, hess

# Customizing logging for XGBoost
# Drop whatever handlers are attached to the root logger
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)

logger = logging.getLogger(__name__)

# Re-running this cell must not stack duplicate handlers on the same logger
for handler in logger.handlers[:]:
    logger.removeHandler(handler)
    handler.close()

# The logger itself lets INFO through; each handler then applies its own
# threshold (INFO on stdout, ERROR in the file). With the logger at ERROR the
# INFO records emitted by the training callback would never reach stdout.
logger.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s | %(levelname)s | %(message)s')

stdout_handler = logging.StreamHandler(sys.stdout)
stdout_handler.setLevel(logging.INFO)
stdout_handler.setFormatter(formatter)

file_handler = logging.FileHandler(f'xgb_optimize.log')
file_handler.setLevel(logging.ERROR)
file_handler.setFormatter(formatter)

logger.addHandler(file_handler)
logger.addHandler(stdout_handler)

class XGBLogging(xgb.callback.TrainingCallback):
    """
    Training callback that writes the latest evaluation scores to the
    module-level `logger` every `epoch_log_interval` rounds.
    A non-positive interval turns the logging off.
    """

    def __init__(self, epoch_log_interval=100):
        self.epoch_log_interval = epoch_log_interval

    def after_iteration(self, model, epoch:int, evals_log:xgb.callback.TrainingCallback.EvalsLog):
        interval = self.epoch_log_interval

        if interval > 0 and epoch % interval == 0:
            for data, metric in evals_log.items():
                for metric_name, log in metric.items():
                    latest = log[-1]
                    # The newest entry is either a bare score or a tuple whose first item is the score
                    score = latest[0] if isinstance(latest, tuple) else latest
                    logger.info(f"XGBLogging epoch {epoch} dataset {data} {metric_name} {score}")

        # Returning False never requests a stop from this callback
        return False



In [None]:
%%time

# Cross-validated training with the native XGBoost API, the custom objective and the custom QWK metric
cv     = SKF(n_splits= CFG.n_splits, shuffle= True, random_state= CFG.cv_state)
models = []
Scores = pd.DataFrame(index = range(CFG.n_splits),
                      columns = ["Fscore", "QWK"]
                     )

# NOTE(review): `num_leaves` and `verbose` look like LightGBM-style names and
# `enable_categorical` is normally a DMatrix argument - presumably XGBoost
# reports them as unused; they are left untouched to keep the model identical.
params = \
dict(   learning_rate      = 0.05,
        max_depth          = 5,
        num_leaves         = 10,
        colsample_bytree   = 0.3,
        colsample_bylevel  = 0.35,
        subsample          = 0.55,
        reg_alpha          = 0.7,
        reg_lambda         = 0.1,
        random_state       = 42,
        min_child_weight   = 25,
        verbose            = 0,
        # GPU only when the switch is ON and this is not a syntax-check run
        device             = "cuda" if (CFG.gpu_switch == "ON" and CFG.test_req != "Y") else "cpu",
        enable_categorical = True,
        disable_default_eval_metric = 1
        )

xgbhelper = XGBSupport(a = CFG.a, b = CFG.b)
essay_id  = Xtrain["essay_id"]

# Loop-invariant inputs are prepared once instead of once per fold
Xtrain_ftre = Xtrain.drop(columns = ['essay_id'])
dmtest      = DMatrix(Xtest.drop(columns = ["essay_id"]))

# Per-fold pieces are collected in lists and concatenated once after the loop
oof_parts     = []
ftreimp_parts = []

for i, (train_index, test_index) in enumerate(cv.split(Xtrain, ygrp)):

    Xtr  = Xtrain_ftre.iloc[train_index]
    Xdev = Xtrain_ftre.iloc[test_index]
    ytr  = ytrain.iloc[train_index]
    ydev = ytrain.iloc[test_index]

    dmtrain = DMatrix(Xtr, label = ytr)
    dmdev   = DMatrix(Xdev,  label = ydev)

    predictor = xgb.train(params = params,
                          dtrain = dmtrain,
                          num_boost_round = 3500,
                          evals = [(dmdev, "Dev")],
                          verbose_eval = 0,
                          obj = xgbhelper.MakeObjXGB,
                          custom_metric = xgbhelper.MakeMetricXGB,
                          callbacks = [XGBLogging(epoch_log_interval= 0)],
                          maximize = True,
                          early_stopping_rounds = CFG.nbrnd_erly_stp,
                          )

    predictor.save_model(os.path.join(CFG.op_path,
                                      f"XGB1R_E{CFG.exp_nb}V{CFG.version_nb}_Fold{i}.txt")
                        )
    models.append(predictor)

    ftreimp = \
    pd.DataFrame.from_dict(predictor.get_score(),
                           orient = "index",
                           columns = ["Imp"]
                          )
    ftreimp["Fold_Nb"] = i
    ftreimp.columns = ["Imp", "Fold_Nb"]
    ftreimp_parts.append(ftreimp)

    # NOTE(review): predict is called without an iteration range, so presumably
    # every trained round is used rather than only the best early-stopping
    # iteration - confirm whether predictions should be limited to best_iteration.
    dev_preds = predictor.predict(dmdev)
    # Back to the score scale; the unrounded values are what gets stored as OOF
    dev_preds = dev_preds + CFG.a
    oof_parts.append(pd.DataFrame({"essay_id": essay_id.iloc[test_index].values,
                                   CFG.target: ygrp.iloc[test_index].values,
                                   "Preds"   : dev_preds
                                  }
                                 )
                    )

    dev_preds = np.round(np.clip(dev_preds, a_min = 1, a_max = 6), 0)

    f_dev      = f1_score(ygrp.iloc[test_index], dev_preds, average='weighted')
    kappa_dev  = cohen_kappa_score(ygrp.iloc[test_index], dev_preds, weights = 'quadratic')

    Scores.loc[i] = [f_dev, kappa_dev]
    Utils.PrintColor(f"Fscore = {f_dev:.5f} | QWK = {kappa_dev:.5f} | Fold {i}")

# Assemble the out-of-fold predictions and the per-fold feature importances
if oof_parts:
    OOF_Preds = pd.concat(oof_parts, axis = 0, ignore_index = True)
    FtreImp   = pd.concat(ftreimp_parts, axis = 0)
else:
    OOF_Preds = pd.DataFrame(columns = ["essay_id", CFG.target, "Preds"])
    FtreImp   = pd.DataFrame(columns = ["Imp", "Fold_Nb"])
FtreImp.index.name = "Feature"

joblib.dump(models, os.path.join(CFG.op_path, f"XGB1R_E{CFG.exp_nb}V{CFG.version_nb}"))
OOF_Preds.to_csv(os.path.join(CFG.op_path, f"OOF_Preds_E{CFG.exp_nb}V{CFG.version_nb}"))
_ = Utils.CleanMemory()

In [None]:
%%time 

# One bar chart per fold showing the 50 largest feature importances of that fold.
# squeeze = False keeps `axes` an array even for a single fold; it is then
# flattened so that it can be indexed by fold number.
fig, axes = plt.subplots(CFG.n_splits, 1,
                         figsize = (20, CFG.n_splits * 9),
                         gridspec_kw = {"wspace": 1.8, "hspace" : 0.4},
                         squeeze = False,
                        )
axes = axes.ravel()

for i in range(CFG.n_splits):
    ax = axes[i]
    FtreImp.loc[FtreImp.Fold_Nb == i, "Imp"].\
    sort_values(ascending = False).\
    head(50).plot.bar(ax = ax, color = "tab:blue")
    ax.set_title(f"Fold{i} feature importances", **CFG.title_specs)

plt.tight_layout()
plt.show()