#### <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:white; font-size:180%; text-align:left;padding:3.0px; background: maroon; border-bottom: 8px solid black" > TABLE OF CONTENTS<br><div>  
* [IMPORTS](#1)
* [INTRODUCTION](#2)
    * [UTILITIES](#2.1)
    * [CONFIGURATION](#2.2)    
    * [FOREWORD](#2.3)
    * [VERSION DETAILS](#2.4)
* [PREPROCESSING](#3)
* [MODEL TRAINING](#4)      
* [PLANNED WAY FORWARD](#5)  

#### <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color:white; font-size:180%; text-align:left;padding:3.0px; background: maroon; border-bottom: 8px solid black" > KEY NOTE<br><div> 
    
This kernel was executed on Google Colab with a T4 GPU and the high-RAM setting, so please adapt the file paths and environment-specific settings before reusing the code in a Kaggle environment <br>

**KEY FEATURES** <br>
1. Ability to add more models to the pipeline <br>
2. Train-Inference separated <br>
3. Vectorizer saved as objects <br>
4. Configuration and utilities for easy experimentation 
    

In [None]:
%%time

# Importing select libraries:-
from gc import collect;
from warnings import filterwarnings;
filterwarnings('ignore');
from IPython.display import display_html, clear_output;
import logging;

from copy import deepcopy;
import pandas as pd, polars as pl, numpy as np;
import polars.selectors as cs;

from os import path, walk, getpid
from psutil import Process
import re
from collections import Counter
from itertools import product

from colorama import Fore, Style, init;
from warnings import filterwarnings;
filterwarnings('ignore');
import joblib;
import os;

from tqdm.notebook import tqdm;
import seaborn as sns;
import matplotlib.pyplot as plt;
from matplotlib.colors import ListedColormap as LCM;
%matplotlib inline

from pprint import pprint;
from functools import partial;

import ctypes
libc = ctypes.CDLL("libc.so.6")

print();
collect();
clear_output();

In [None]:
%%time

# Importing model and pipeline specifics:-
from category_encoders import OrdinalEncoder, OneHotEncoder;

# Pipeline specifics:-
from sklearn.preprocessing import (RobustScaler,
                                   MinMaxScaler,
                                   StandardScaler,
                                   FunctionTransformer as FT,
                                   PowerTransformer,
                                  );
from sklearn.impute import SimpleImputer as SI;
from sklearn.model_selection import (RepeatedStratifiedKFold as RSKF,
                                     StratifiedKFold as SKF,
                                     StratifiedGroupKFold as SGKF,
                                     KFold,
                                     RepeatedKFold as RKF,
                                     cross_val_score,
                                     cross_val_predict
                                    );
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline, make_pipeline;
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin;
from sklearn.compose import ColumnTransformer;

# ML Model training:-
from sklearn.metrics import cohen_kappa_score, accuracy_score;
from xgboost import DMatrix, XGBRegressor as XGBR;
from lightgbm import log_evaluation, early_stopping, LGBMRegressor as LGBMR;
from catboost import CatBoostRegressor as CBR, Pool;
from sklearn.ensemble import HistGradientBoostingRegressor as HGBR, RandomForestRegressor as RFR;

# Ensemble and tuning:-
import optuna;
from optuna import Trial, trial, create_study;
from optuna.pruners import HyperbandPruner;
from optuna.samplers import TPESampler, CmaEsSampler;
# `set_verbosity` is a function and has to be CALLED. Assigning to it (as was done before)
# replaces the function with an int and leaves the Optuna log level untouched.
optuna.logging.set_verbosity(optuna.logging.ERROR);
optuna.logging.disable_default_handler()

clear_output();
print();
collect();

In [None]:
%%time

# Setting rc parameters in seaborn for plots and graphs-
# Reference - https://matplotlib.org/stable/tutorials/introductory/customizing.html:-
# To alter this, refer to matplotlib.rcParams.keys()

sns.set({"axes.facecolor"       : "#ffffff",
         "figure.facecolor"     : "#ffffff",
         "axes.edgecolor"       : "#000000",
         "grid.color"           : "#ffffff",
         "font.family"          : ['Cambria'],
         "axes.labelcolor"      : "#000000",
         "xtick.color"          : "#000000",
         "ytick.color"          : "#000000",
         "grid.linewidth"       : 0.75,
         "grid.linestyle"       : "--",
         "axes.titlecolor"      : '#0099e6',
         'axes.titlesize'       : 8.5,
         'axes.labelweight'     : "bold",
         'legend.fontsize'      : 7.0,
         'legend.title_fontsize': 7.0,
         'font.size'            : 7.5,
         'xtick.labelsize'      : 7.5,
         'ytick.labelsize'      : 7.5,
        });

# Setting global configuration for polars
pl.Config.activate_decimals(True).set_tbl_hide_column_data_types(True)
pl.Config(**dict(tbl_formatting = 'ASCII_FULL_CONDENSED',
                 tbl_hide_column_data_types = True,
                 tbl_hide_dataframe_shape = True,
                 fmt_float = "mixed",
                 tbl_cell_alignment = 'CENTER',
                 tbl_hide_dtype_separator = True,
                 tbl_cols = 100,
                 tbl_rows = 50,
                 fmt_str_lengths = 100,
                )
         )

# Making sklearn pipeline outputs as dataframe:-
from sklearn import set_config;
set_config(transform_output = "pandas");
pd.set_option('display.max_columns', 50);
pd.set_option('display.max_rows', 50);

print();
collect();


<a id="2"></a>
# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color: white; font-size:120%; text-align:left;padding:3.0px; background: maroon; border-bottom: 8px solid black" > INTRODUCTION<br><div>

<a id="2.1"></a>
## <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color: black; font-size:120%; text-align:left;padding:3.0px; background: #c2d6d6; border-bottom: 8px solid black" > UTILITIES<br><div>

In [None]:
%%time

class Utility:
    """
    Small helper collection used throughout the notebook:
    1. Coloured console printing
    2. Competition metric calculation
    3. Garbage collection with a RAM usage report
    """

    def PrintColor(self, text: str, color = Fore.BLUE, style = Style.BRIGHT):
        "Prints `text` prefixed by the colorama style and colour, then resets the terminal style"
        message = "".join([style, color, text, Style.RESET_ALL])
        print(message)

    def ScoreMetric(self, ytrue, ypred) -> float:
        """
        Quadratic-weighted Cohen kappa between the truth and the predictions.
        Inputs- ytrue, ypred:- truth and (possibly continuous) predictions
        Output- float:- competition metric

        Predictions are clipped to [1, 6] and rounded before scoring; both inputs are cast to uint8.
        """
        rounded_pred = np.around(np.clip(ypred, a_min = 1, a_max = 6))
        return cohen_kappa_score(np.uint8(ytrue),
                                 np.uint8(rounded_pred),
                                 weights = "quadratic",
                                )

    def CleanMemory(self):
        """
        Runs the garbage collector, asks libc to trim the malloc heap and
        returns (does not print) a string with the RAM usage of the current process in GB
        """
        collect()
        libc.malloc_trim(0)

        # First field of psutil's memory_info() tuple, converted from bytes to GB
        this_process = Process(getpid())
        memory_use   = this_process.memory_info()[0] / 2. ** 30
        return f"\nRAM usage = {memory_use :.4} GB"

Utils = Utility()
print();

<a id="2.2"></a>
## <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color: black; font-size:120%; text-align:left;padding:3.0px; background: #c2d6d6; border-bottom: 8px solid black" > CONFIGURATION<br><div>

| Parameter         | Description                                             | Possible value choices|
| ---               | ---                                                     | :-:                   |
|  exp_nb           | Experiment Number                                       | integer               |
|  version_nb       | Version Number                                          | integer               |
|  test_req         | Are we testing syntax here?                             | Y/N                   |  
|  test_sample_frac | Sample size for syntax test                             | float(0-1)/ int       |     
|  gpu_switch       | GPU switch                                              | ON/OFF                |
|  state            | Random state for most purposes                          | integer               |
|  target           | Target column names                                     | string                |    
|  path             | Path for input data files                               |                       |
|  dtl_preproc_req  | Preprocessing required                                  | Y/N                   |    
|  ftre_plots_req   | Feature plots required                                  | Y/N                   |
|  ftre_imp_req     | Feature importance required                             | Y/N                   |   
|  ML               | Machine Learning Models                                 | Y/N                   |
|  nb_models        | Number of models                                        | integer               |
|  n_splits         | Number of CV splits                                     | integer               |
|  n_repeats        | Number of CV repeats                                    | integer               |
|  nbrnd_erly_stp   | Number of early stopping rounds                         | integer               |
|  mdlcv_mthd       | Model CV method name                                    | RKF/ RSKF/ SKF/ KFold |
|  ensemble_req     | Ensemble required                                       | Y/N                   |
|  metric_obj       | Metric objective                                        | maximize/ minimize    |  
|  ntrials          | Number of trials                                        | int                   |  

In [None]:
%%time

class CFG:
    """
    Central configuration for data preparation, CV strategy, model training and ensembling.
    This is a general-purpose configuration class, so some attributes may be unused in this kernel.
    """

    # Data preparation:-
    exp_nb, version_nb = 1, 10
    test_req           = "N"
    test_sample_frac   = 0.025
    gpu_switch         = "ON"
    state              = 42
    target             = "score"
    path               = "/input/learning-agency-lab-automated-essay-scoring-2"
    op_path            = "/content/drive/MyDrive/AES24/Interim"

    dtl_preproc_req    = "Y"
    ftre_plots_req     = "Y"
    ftre_imp_req       = "Y"

    # Model Training:-
    ML                 = "Y"
    nb_models          = 10
    # Fewer CV splits when only testing the syntax (test_req == "Y")
    n_splits           = {"Y": 3}.get(test_req, 12)
    n_repeats          = 1
    nbrnd_erly_stp     = 100
    mdlcv_mthd         = "RSKF"

    # Constants forwarded to the LightGBM custom metric/objective helper
    a, b               = 2.998, 1.042

    # Ensemble:-
    ensemble_req       = "N"
    metric_obj         = "maximize"
    # Fewer Optuna trials when only testing the syntax (test_req == "Y")
    ntrials            = {"Y": 10}.get(test_req, 250)

    # Global variables for plotting:-
    grid_specs  = dict(visible = True, which = 'both', linestyle = '--',
                       color = 'lightgrey', linewidth = 0.75,
                      )
    title_specs = dict(fontsize = 9, fontweight = 'bold', color = '#992600')

print();
Utils.PrintColor(f"--> Configuration done!\n");
collect();

<a id="2.3"></a>
## <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color: black; font-size:120%; text-align:left;padding:3.0px; background: #c2d6d6; border-bottom: 8px solid black" > FOREWORD<br><div>

This competition aims to grade essays into 6 grades from 1-6 using a training corpus data. We are asked to use **Quadratic Kappa Score** as the metric. <br>
Scoring rubric is explained in detail [here](https://storage.googleapis.com/kaggle-forum-message-attachments/2733927/20538/Rubric_%20Holistic%20Essay%20Scoring.pdf) as part of the competition overview and evaluation guidelines <br>

### **KERNEL OBJECTIVE** <br>
I delve into the data a bit here, use the public notebook to generate features and use them effectively to train ML models. <br>
This is the training kernel. I shall infer this separately and submit later <br>

### **KERNEL SOURCES** <br>
https://www.kaggle.com/code/ye11725/tfidf-lgbm-baseline-with-code-comments <br>

### **MY CONTRIBUTION** <br>
1. I created a Utility script to collate all preprocessing functions and use them in the train and inference kernels separately <br>
2. I have created a class for the vectorizer instead of the function and lambda expression <br>
3. I added a few more models and created an Optuna ensemble, CV score is better this way <br>

<a id="2.4"></a>
## <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color: black; font-size:120%; text-align:left;padding:3.0px; background: #c2d6d6; border-bottom: 8px solid black" > VERSION DETAILS<br><div>

|Experiment <br> Number| Version<br>Number | Version Details | CV score| Single/ Ensemble|Public LB Score|
|----| :-: | --- | :-: | :-: |:-:|
|1 | **V1** |* Public notebook features - 3361 features <br> * ML model training <br> * Optuna ensemble <br> * 10x1 RSKF|0.80812|Ensemble <br> Optuna|0.796|
|1 | **V2** |* Public notebook features - 3237 features <br> * ML model training <br> * Optuna ensemble <br> * 10x1 RSKF|0.80631|Ensemble <br> Optuna|0.796|
|1 | **V3** |* Public notebook features - 3237 features <br> * ML model training <br> * Optuna ensemble <br> * 10x1 Repeated K Fold |0.80683|Ensemble <br> Optuna|0.799|
|1 | **V4** |* Public notebook features - 3237 features <br> * Retained infinity values <br> * ML model training <br> * Optuna ensemble <br> * 5x1 Repeated K Fold |0.80453|Ensemble <br> Optuna|0.796|
|1 | **V5** |* Public notebook features - 3361 features <br> * Retained infinity values <br> * ML model training, 1 model with 10 random states <br> * Optuna ensemble <br> * 5x1 Repeated K Fold |0.80738|Ensemble <br> Optuna|0.796|
|1 | **V6** |* Public notebook features - 8742 features <br> * Retained infinity values <br> * ML model training, 4 models <br> * Optuna ensemble <br> * 5x1 RSKF |0.80684|Ensemble <br> Optuna|0.796|
|1 | **V9** |* Public notebook features - 13786 features <br> * Retained infinity values <br> * ML model training, 1 model <br> * Stacked with DeBERTa large model <br> * 18x1 RSKF ||Stacking||

<a id="3"></a>
# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color: white; font-size:120%; text-align:left;padding:3.0px; background: maroon; border-bottom: 8px solid black" > PREPROCESSING<br><div>

In [None]:
%%writefile "fe.py"

# Feature engineering functions and classes to be used in the inference kernel also
import polars as pl, pandas as pd, numpy as np, re, joblib;
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer;
from sklearn.base import RegressorMixin, ClassifierMixin, BaseEstimator, TransformerMixin
from sklearn.metrics import cohen_kappa_score
import lightgbm as lgb
import os

print(f"\n---> Sourced from 'https://www.kaggle.com/code/ye11725/tfidf-lgbm-baseline-with-code-comments'\n")

def removeHTML(x):
    "Removes every `<...>` span (shortest match, within a single line) from the string x"
    html=re.compile(r'<.*?>')
    return html.sub(r'',x)

def dataPreprocessing(x):
    """
    Normalizes a raw text string. Steps, in order:
    lower-case -> drop `<...>` tags -> drop @mentions -> drop digits (with an optional leading
    apostrophe) -> drop tokens starting with 'http' followed by word characters ->
    collapse whitespace runs to one space -> collapse runs of '.' and ',' -> strip both ends.

    Input  - x: str
    Output - str: cleaned text
    """
    x = x.lower()
    x = removeHTML(x)
    # Raw strings are required here: "\w" and "\d" inside a normal string literal are invalid
    # escape sequences (SyntaxWarning on Python >= 3.12). The patterns themselves are unchanged.
    x = re.sub(r"@\w+", '',x)
    x = re.sub(r"'\d+", '',x)
    x = re.sub(r"\d+", '',x)
    # Note: "http://..." is NOT matched, because ':' is not a word character
    x = re.sub(r"http\w+", '',x)
    x = re.sub(r"\s+", " ", x)
    x = re.sub(r"\.+", ".", x)
    x = re.sub(r"\,+", ",", x)
    x = x.strip()
    return x

def Paragraph_Preprocess(tmp):
    """
    Turns the list column 'paragraph' into one row per paragraph, cleans each paragraph with
    dataPreprocessing and adds three per-paragraph columns:
    paragraph_len (characters), paragraph_sentence_cnt (pieces after splitting on '.') and
    paragraph_word_cnt (pieces after splitting on a single space).
    """
    paragraph = pl.col('paragraph')

    exploded = tmp.explode('paragraph')
    cleaned  = exploded.with_columns(paragraph.map_elements(dataPreprocessing))
    measured = cleaned.with_columns(
        paragraph.map_elements(lambda x: len(x)).alias("paragraph_len")
    )
    return measured.with_columns(
        paragraph.map_elements(lambda x: len(x.split('.'))).alias("paragraph_sentence_cnt"),
        paragraph.map_elements(lambda x: len(x.split(' '))).alias("paragraph_word_cnt"),
    )

def Paragraph_Eng(train_tmp):
    """
    Aggregates the per-paragraph rows into one row per essay_id and returns a pandas DataFrame
    sorted by essay_id. Features, in column order:
    - number of paragraphs with length >= each 'long' cutoff
    - number of paragraphs with length <= each 'short' cutoff
    - max / mean / min / first / last of each paragraph-level measure
    """
    paragraph_fea = ['paragraph_len','paragraph_sentence_cnt','paragraph_word_cnt']
    long_cutoffs  = [50,75,100,125,150,175,200,250,300,350,400,500,600,700]
    short_cutoffs = [25,49]

    aggs = []
    for cutoff in long_cutoffs:
        aggs.append(pl.col('paragraph')
                      .filter(pl.col('paragraph_len') >= cutoff)
                      .count()
                      .alias(f"paragraph_{cutoff}_cnt")
                   )
    for cutoff in short_cutoffs:
        aggs.append(pl.col('paragraph')
                      .filter(pl.col('paragraph_len') <= cutoff)
                      .count()
                      .alias(f"paragraph_{cutoff}_cnt")
                   )

    # Statistic-major order: all "_max" columns first, then "_mean", "_min", "_first", "_last"
    for stat in ("max", "mean", "min", "first", "last"):
        for fea in paragraph_fea:
            aggs.append(getattr(pl.col(fea), stat)().alias(f"{fea}_{stat}"))

    per_essay = train_tmp.group_by(['essay_id'], maintain_order=True).agg(aggs)
    return per_essay.sort("essay_id").to_pandas()

def Sentence_Preprocess(tmp):
    """
    Cleans 'full_text' with dataPreprocessing, splits it on '.' and returns one row per sentence
    with its length ('sentence_len') and space-separated piece count ('sentence_word_cnt').
    Sentences shorter than 15 characters are dropped before the word count is computed.
    """
    sentence = pl.col('sentence')
    split_text = (pl.col('full_text')
                    .map_elements(dataPreprocessing)
                    .str.split(by=".")
                    .alias("sentence")
                 )

    return (tmp
            .with_columns(split_text)
            .explode('sentence')
            .with_columns(sentence.map_elements(lambda x: len(x)).alias("sentence_len"))
            .filter(pl.col('sentence_len')>=15)
            .with_columns(sentence.map_elements(lambda x: len(x.split(' '))).alias("sentence_word_cnt"))
           )

def Sentence_Eng(train_tmp):
    """
    Aggregates the per-sentence rows into one row per essay_id and returns a pandas DataFrame
    sorted by essay_id. Features, in column order:
    - number of sentences with length >= each cutoff
    - max / mean / min / first / last of sentence_len and sentence_word_cnt
    """
    sentence_fea = ['sentence_len','sentence_word_cnt']
    cutoffs      = [15,50,100,150,200,250,300]

    aggs = []
    for cutoff in cutoffs:
        aggs.append(pl.col('sentence')
                      .filter(pl.col('sentence_len') >= cutoff)
                      .count()
                      .alias(f"sentence_{cutoff}_cnt")
                   )

    # Statistic-major order: all "_max" columns first, then "_mean", "_min", "_first", "_last"
    for stat in ("max", "mean", "min", "first", "last"):
        for fea in sentence_fea:
            aggs.append(getattr(pl.col(fea), stat)().alias(f"{fea}_{stat}"))

    per_essay = train_tmp.group_by(['essay_id'], maintain_order=True).agg(aggs)
    return per_essay.sort("essay_id").to_pandas()

def Word_Preprocess(tmp):
    """
    Cleans 'full_text' with dataPreprocessing, splits it on single spaces and returns one row
    per word with its length in 'word_len'. Zero-length words are removed.
    """
    split_text = (pl.col('full_text')
                    .map_elements(dataPreprocessing)
                    .str.split(by=" ")
                    .alias("word")
                 )

    return (tmp
            .with_columns(split_text)
            .explode('word')
            .with_columns(pl.col('word').map_elements(lambda x: len(x)).alias("word_len"))
            .filter(pl.col('word_len')!=0)
           )

def Word_Eng(train_tmp):
    """
    Aggregates the per-word rows into one row per essay_id and returns a pandas DataFrame
    sorted by essay_id. Features, in column order:
    - number of words with length >= n for n = 1..15
    - max, mean, std and the 0.25 / 0.50 / 0.75 quantiles of the word length
    """
    word_len = pl.col('word_len')

    aggs = [pl.col('word').filter(word_len >= min_len).count().alias(f"word_{min_len}_cnt")
            for min_len in range(1, 16)
           ]
    aggs.extend([word_len.max().alias("word_len_max"),
                 word_len.mean().alias("word_len_mean"),
                 word_len.std().alias("word_len_std"),
                 word_len.quantile(0.25).alias("word_len_q1"),
                 word_len.quantile(0.50).alias("word_len_q2"),
                 word_len.quantile(0.75).alias("word_len_q3"),
                ])

    per_essay = train_tmp.group_by(['essay_id'], maintain_order=True).agg(aggs)
    return per_essay.sort("essay_id").to_pandas()

def _maketk(x): return x
def _makepp(x): return x

class TFIDFMaker:
    """
    Wraps a TF-IDF vectorizer and a count vectorizer that are either created fresh (train mode)
    or loaded from disk (any other mode), and applies both to the 'full_text' column
    """

    def __init__(self, lbl: str, path: str, maketk = _maketk, makepp = _makepp):
        """
        Creates the two vectorizers in train mode, otherwise loads the saved ones
        Inputs -
        lbl    - string - "train" (case-insensitive) builds new vectorizers, anything else loads them
        path   - string - folder used to save/ load the vectorizers (files "vec" and "cnt_vec")
        maketk - callable - tokenizer handed to both vectorizers (train mode only)
        makepp - callable - preprocessor handed to both vectorizers (train mode only)
        """

        self.lbl   = lbl
        self.path  = path

        if self.lbl.lower() != "train":
            # NOTE(review): joblib.load unpickles the file - only load artefacts you created yourself
            self.vectorizer     = joblib.load(os.path.join(self.path, "vec"))
            self.vectorizer_cnt = joblib.load(os.path.join(self.path, "cnt_vec"))
            return

        # Settings common to both vectorizers
        shared = dict(tokenizer     = maketk,
                      preprocessor  = makepp,
                      token_pattern = None,
                      strip_accents = 'unicode',
                      analyzer      = 'word',
                     )

        self.vectorizer = TfidfVectorizer(ngram_range  = (3,6),
                                          min_df       = 0.05,
                                          max_df       = 0.95,
                                          sublinear_tf = True,
                                          **shared
                                         )
        self.vectorizer_cnt = CountVectorizer(ngram_range = (2,3),
                                              min_df      = 0.10,
                                              max_df      = 0.85,
                                              **shared
                                             )

    def fit(self, X, y = None, **params):
        "Fits both vectorizers on X['full_text'] and dumps each one to `path` right after fitting"
        texts = list(X['full_text'])

        self.vectorizer.fit(texts)
        joblib.dump(self.vectorizer, os.path.join(self.path, "vec"))

        self.vectorizer_cnt.fit(texts)
        joblib.dump(self.vectorizer_cnt, os.path.join(self.path, "cnt_vec"))
        return self

    def transform(self, X, y = None, **params):
        """
        Applies both vectorizers to X['full_text']
        Output - tuple of two dense float32 pandas DataFrames:
        (TF-IDF features prefixed "tfidf_", count features prefixed "tfid_cnt_")
        """
        texts = list(X['full_text'])

        tfidf_matrix = self.vectorizer.transform(texts)
        count_matrix = self.vectorizer_cnt.transform(texts)

        tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), dtype = np.float32).add_prefix("tfidf_")
        count_df = pd.DataFrame(count_matrix.toarray(), dtype = np.float32).add_prefix("tfid_cnt_")
        return (tfidf_df, count_df)

    def fit_transform(self, X, y = None, **params):
        "Fits both vectorizers on X and returns the transform of the same X"
        return self.fit(X).transform(X)


def Make_Features(df, path: str, lbl: str,target: str = "score", maketk = _maketk, makepp = _makepp):
    """
    Builds the complete feature table for the provided dataset by chaining the paragraph,
    sentence, word and TF-IDF/ count-vectorizer feature builders defined in this module.

    Inputs -
    df     - polars DataFrame with columns 'essay_id', 'full_text', 'paragraph' (and `target` in train mode)
    path   - string - folder where the vectorizers are saved (train mode) or loaded from (other modes)
    lbl    - string - "train" (any casing) fits the vectorizers and attaches the target;
                      any other value only transforms with the saved vectorizers
    target - string - target column name
    maketk, makepp - callables forwarded to TFIDFMaker as tokenizer / preprocessor

    Output - pandas DataFrame with one row per essay_id (sorted by essay_id)
    """;

    print(f"\n{'=' * 15} PREPROCESSING- {lbl.upper()} MODE {'=' * 15}\n")

    # Same case-insensitive rule as TFIDFMaker, so both always agree on the mode
    is_train = lbl.lower() == "train"

    tmp   = Paragraph_Preprocess(df)
    feats = Paragraph_Eng(tmp)

    if is_train:
        # Paragraph_Eng returns rows sorted by essay_id, which need not be the row order of df.
        # Attach the target by key, not by position, so labels cannot be misaligned.
        feats = feats.merge(df.select(['essay_id', target]).to_pandas(), on='essay_id', how='left')

    print(f"1. Paragraph features = {feats.shape}")

    tmp   = Sentence_Preprocess(df)
    feats = feats.merge(Sentence_Eng(tmp), on='essay_id', how='left')
    print(f"2. Sentence features = {feats.shape}")

    tmp   = Word_Preprocess(df)
    feats = feats.merge(Word_Eng(tmp), on='essay_id', how='left')
    print(f"3. Word features = {feats.shape}")

    # Forward the caller's tokenizer/ preprocessor (previously the module defaults were always used)
    tfidf = TFIDFMaker(lbl, path, maketk = maketk, makepp = makepp)
    if is_train:
        df_tfidf, df_cnt = tfidf.fit_transform(df)
    else:
        df_tfidf, df_cnt = tfidf.transform(df)

    # The vectorizer outputs follow the row order of df, so the ids are attached positionally
    # here and the actual alignment with feats is done by the key-based merges below
    df_tfidf["essay_id"] =  df["essay_id"]
    df_cnt["essay_id"]   =  df["essay_id"]
    feats = feats.merge(df_tfidf, on='essay_id', how='left')
    feats = feats.merge(df_cnt, on='essay_id', how='left')
    print(f"4. TFIDF features and count vectorizer = {feats.shape}")

    return feats

def ReduceMem(df: pd.DataFrame, allow_float16: bool = False):
    """
    Downcasts the numeric columns of the dataframe to the smallest dtype that holds their range.
    The dataframe is modified in place and also returned.

    Inputs -
    df            - pandas DataFrame
    allow_float16 - bool - when True, float columns whose range fits float16 are stored as float16.
                    Default False: floats are stored as float32 (or float64 when out of range).

    Why the flag: the earlier implementation cast float columns to float16 and then, because of
    an `if` that should have been an `elif`, immediately re-cast them to float32. The values were
    rounded to float16 precision while still occupying float32 memory. The default now keeps the
    float32 output dtype without the precision loss; float16 storage is opt-in.

    Output - the same dataframe with reduced dtypes
    """;

    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64', "uint16", "uint32", "uint64"];
    start_mem = df.memory_usage().sum() / 1024**2;

    for col in df.columns:
        col_type = df[col].dtypes

        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if "int" in str(col_type):
            # Smallest signed integer type whose range contains [c_min, c_max); untouched if none fits
            for int_type in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(int_type)
                if c_min >= limits.min and c_max < limits.max:
                    df[col] = df[col].astype(int_type)
                    break
        else:
            f16, f32 = np.finfo(np.float16), np.finfo(np.float32)
            if allow_float16 and c_min >= f16.min and c_max < f16.max:
                df[col] = df[col].astype(np.float16)
            elif c_min >= f32.min and c_max < f32.max:
                df[col] = df[col].astype(np.float32)
            else:
                # Also reached for inf / all-NaN columns, where the range comparisons are False
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2

    print(f"Start - end memory:- {start_mem:5.2f} - {end_mem:5.2f} Mb");
    return df;

In [None]:
from shutil import copyfile
import fe
from fe import *

In [None]:
%%time

# Read the train/ test CSV files from CFG.path with polars and add a list column "paragraph"
# obtained by splitting "full_text" on blank lines ("\n\n")
train = \
pl.read_csv(os.path.join(CFG.path, "train.csv")).\
with_columns([( pl.col("full_text").str.split(by="\n\n").alias("paragraph"))])

Utils.PrintColor(f"\nTrain data sample\n")
display(train.head(1))

test = \
pl.read_csv(os.path.join(CFG.path, "test.csv")).\
with_columns([( pl.col("full_text").str.split(by="\n\n").alias("paragraph"))])

Utils.PrintColor(f"\nTest data sample\n")
display(test.head(1))

# Utility script for feature engineering
# Train mode: fits the vectorizers, saves them under CFG.op_path and attaches the target column
XYtrain = Make_Features(df = train,
                        path = CFG.op_path,
                        lbl = "Train",
                        target = CFG.target,
                       )

# Downcast the numeric columns to reduce the memory footprint
XYtrain = ReduceMem(XYtrain)

print();
# Any label other than train: the vectorizers saved above are loaded from CFG.op_path and only
# applied, so this call has to run after the train call
Xtest = Make_Features(df = test,
                      lbl = "test",
                      target = CFG.target,
                      path = CFG.op_path,
                     )
Xtest = ReduceMem(Xtest)

In [None]:
%%time

# Stacking the DeBERTA large model results herewith:-
# The pickled object is loaded as the data of a dataframe indexed by XYtrain's essay_id; every
# column gets the "deberta_" prefix.
# NOTE(review): the rows are attached positionally - this assumes the saved OOF rows are in the
# same order as XYtrain (sorted by essay_id) - TODO confirm against the kernel that produced oof.pkl
# NOTE(review): joblib.load unpickles the file - only load artefacts from a trusted source
df = \
pd.DataFrame(data = joblib.load('/content/drive/MyDrive/AES24/Imports/oof.pkl'),
             index = XYtrain["essay_id"],
             ).add_prefix("deberta_")

XYtrain = XYtrain.merge(df, how = "left", left_on = "essay_id", right_index = True)

# The test set receives a constant 0.5 in every DeBERTa column here
# (presumably a placeholder until real test predictions are available in the inference kernel)
for col in df.columns:
    Xtest[col] = 0.5

# Saving the train set for future usage:-
XYtrain.to_parquet(os.path.join(CFG.op_path, f"XYtrain_E{CFG.exp_nb}.parquet"))

del df;
# CleanMemory returns the RAM usage string; it is deliberately discarded here
_ = Utils.CleanMemory()

In [None]:
%time

fig, ax = plt.subplots(1,1, figsize = (7, 5))

XYtrain[CFG.target].value_counts().sort_index(ascending = True).plot.bar(ax=ax, color = "tab:blue")
ax.set(xlabel = "", ylabel = "");
ax.set_title(f"Target plot", **CFG.title_specs)
ax.grid(**CFG.grid_specs)
ax.set_yticks(range(0, 7001, 250), labels = range(0, 7001, 250), fontsize = 7)
ax.set_xticks(range(0,6,1), labels = range(1,7,1), rotation = 0)

plt.tight_layout()
plt.show()

<a id="4"></a>
# <div style= "font-family: Cambria; font-weight:bold; letter-spacing: 0px; color: white; font-size:120%; text-align:left;padding:3.0px; background: maroon; border-bottom: 8px solid black" > MODEL TRAINING<br><div>

In [None]:
%%time

class OptunaEnsembler:
    """
    This is the Optuna ensemble class-
    Source- https://www.kaggle.com/code/arunklenin/ps3e26-cirrhosis-survial-prediction-multiclass

    It searches for blending weights (one per model, each in [0, 1]) that optimize the chosen
    metric on the supplied predictions. After `fit`, the weights are available in the instance
    attribute `weights` (a list). The former `weights()` method was removed: it was shadowed by
    that attribute and could never be called on an instance.

    Predictions may be supplied as
    - a list of per-model prediction arrays (models are combined along axis 0), or
    - a pandas DataFrame / numpy array with one column per model (combined along axis 1)
    """;

    def __init__(self, model_mode:str = "Regression"):
        """
        Key parameter:-
        model_mode- string :-
        Regression- uses the Cohen Kappa metric (Utils.ScoreMetric)
        Classification - uses Utils.ClsfMetric

        Raises AttributeError when a non-regression mode is requested but Utils has no ClsfMetric
        """

        self.study        = None;
        self.weights      = None;
        self.random_state = CFG.state;
        self.n_trials     = CFG.ntrials;
        self.direction    = CFG.metric_obj;
        self.model_mode   = model_mode;

        if self.model_mode == "Regression":
            self.ScoreMetric = Utils.ScoreMetric
        else:
            clsf_metric = getattr(Utils, "ClsfMetric", None)
            if clsf_metric is None:
                raise AttributeError(
                    f"model_mode = '{model_mode}' needs a 'ClsfMetric' method on Utils, which is not defined"
                )
            self.ScoreMetric = clsf_metric

    @staticmethod
    def _prediction_layout(y_preds, strict: bool = False):
        """
        Returns (number of models, axis along which the models are averaged) for y_preds.
        list -> (len, 0); anything else -> (last dimension, 1).
        With strict = True, only list/ DataFrame/ ndarray are accepted, otherwise TypeError is raised.
        """
        if isinstance(y_preds, list):
            return len(y_preds), 0
        if strict and not isinstance(y_preds, (pd.DataFrame, np.ndarray)):
            raise TypeError(
                f"y_preds must be a list, pandas DataFrame or numpy array, got {type(y_preds).__name__}"
            )
        return y_preds.shape[-1], 1

    def _objective(self, trial, y_true, y_preds):
        """
        This method defines the objective function for the ensemble:
        one weight in [0, 1] is suggested per model, the weighted average of the predictions is
        scored with the selected metric and the score is returned.
        Raises TypeError for unsupported prediction containers (this used to surface as an
        UnboundLocalError).
        """;

        nb_models, axis = self._prediction_layout(y_preds, strict = True)
        weights = [trial.suggest_float(f"weight{n}", 0, 1) for n in range(nb_models)];

        # Calculating the weighted prediction:-
        weighted_pred  = np.average(np.array(y_preds), axis = axis, weights = weights);
        score          = self.ScoreMetric(y_true, weighted_pred);
        return score;

    def fit(self, y_true, y_preds):
        "This method fits the Optuna objective on the fold level data and stores the best weights";

        # Silence Optuna's logging. `set_verbosity` has to be called, not assigned to. If the
        # attribute no longer holds the function (an earlier assignment replaced it), fall back
        # to the standard-library logger named "optuna".
        set_verbosity = optuna.logging.set_verbosity
        if callable(set_verbosity):
            set_verbosity(optuna.logging.ERROR)
        else:
            logging.getLogger("optuna").setLevel(logging.ERROR)

        self.study = \
        optuna.create_study(sampler    = TPESampler(seed = self.random_state),
                            pruner     = HyperbandPruner(),
                            study_name = "Ensemble",
                            direction  = self.direction,
                           );

        obj = partial(self._objective, y_true = y_true, y_preds = y_preds);
        self.study.optimize(obj, n_trials = self.n_trials);

        nb_models, _ = self._prediction_layout(y_preds)
        self.weights = [self.study.best_params[f"weight{n}"] for n in range(nb_models)];
        clear_output();

    def predict(self, y_preds):
        "This method returns the weighted average of y_preds using the fitted weights";

        assert self.weights is not None, 'OptunaWeights error, must be fitted before predict';

        _, axis = self._prediction_layout(y_preds)
        weighted_pred = np.average(np.array(y_preds), axis = axis, weights = self.weights);
        return weighted_pred;

    def fit_predict(self, y_true, y_preds):
        """
        This method fits the Optuna objective on the provided predictions and returns the
        weighted average of those same predictions
        """;
        self.fit(y_true, y_preds);
        return self.predict(y_preds);

print();
collect();

In [None]:
%%time

class LGBMSupport:
    """
    Provides the custom evaluation metric and the custom objective used for the LGBM training.
    Both work on targets shifted by the constant `a`; `b` enters the objective only.
    """

    def __init__(self, a, b):
        "Stores the two constants used by the metric and the objective"
        self.a, self.b = a, b

    def _LGBMetric(self, y_true, y_pred):
        """
        Custom metric used in evaluation and early stopping.
        Adds `a` back to truth and predictions, clips the predictions to [1, 6], rounds both and
        returns the tuple ('QWK', quadratic weighted kappa, True)
        """
        truth      = y_true + self.a
        prediction = (y_pred + self.a).clip(1, 6).round()

        score = cohen_kappa_score(np.around(truth, 0), prediction, weights="quadratic")
        return 'QWK', score, True

    def _LGBObj(self, y_true, y_pred):
        """
        Custom objective function for the LGBM.
        Returns (grad, hess): the gradient of the ratio f/g scaled by the number of rows, where
        f = 0.5 * sum((preds - labels)^2) and g = 0.5 * sum((preds - a)^2 + b),
        and a hessian of ones.
        """
        labels = y_true + self.a
        preds  = (y_pred + self.a).clip(1, 6)

        residual = preds - labels
        offset   = preds - self.a

        f = 1/2 * np.sum(residual**2)
        g = 1/2 * np.sum(offset**2 + self.b)

        nb_rows = len(labels)
        grad = (residual/g - f*offset/g**2)*nb_rows
        return grad, np.ones(nb_rows)

class MyLogger:
    """
    This class helps to suppress logs in lightgbm and Optuna
    Source - https://github.com/microsoft/LightGBM/issues/6014

    info and warning messages are dropped, error messages are forwarded to a standard-library
    logger whose level is set to ERROR.
    """

    def __init__(self, logging_lbl: str = "MyLogger"):
        """
        Binds the underlying logger at construction time, so that `error` works on every instance.
        (Previously only `init` existed; a freshly constructed object had no `logger` attribute
        and `error` raised AttributeError until `init` was called by hand.)
        """
        self.init(logging_lbl)

    def init(self, logging_lbl: str):
        "(Re)binds the instance to the logger named `logging_lbl` and sets that logger to ERROR level"
        self.logger = logging.getLogger(logging_lbl)
        self.logger.setLevel(logging.ERROR)

    def info(self, message):
        "Info messages are discarded"
        pass

    def warning(self, message):
        "Warning messages are discarded"
        pass

    def error(self, message):
        "Error messages are forwarded to the bound logger"
        self.logger.error(message)

class VotingModelMaker(BaseEstimator, RegressorMixin):
    """
    This class wraps a list of estimators and blends their predictions row-wise
    The estimators are used as supplied - fit is a no-op, so they are expected to be fitted already
    """

    def __init__(self, estimators: list, weights: list, a = CFG.a, b = CFG.b):
        """
        estimators - estimators whose predictions are blended
        weights    - one weight per estimator; an empty list requests a plain mean
        a          - constant added to every estimator's prediction in mdlpredict
        b          - stored on the object; not used by the methods of this class
        """
        super().__init__()
        self.estimators = estimators
        self.weights    = weights
        self.a = a
        self.b = b

    def fit(self, X, y=None):
        "No-op fit that only returns self"
        return self

    def _Blend(self, y_preds):
        """
        This method combines the per-estimator prediction columns into one prediction per row
        An empty weights container gives a plain mean; any other value is passed on to np.average
        """
        weights = self.weights

        # len() is used instead of `weights != []` as the latter is ambiguous for numpy arrays
        if weights is not None and len(weights) == 0:
            return np.mean(y_preds, axis=1,)
        return np.average(y_preds, axis=1, weights = weights)

    def mdlpredict(self, X):
        """
        This method returns the blended regression predictions for X, each shifted by the constant a
        """
        y_preds = \
        pd.DataFrame(columns = [f"Est{i}" for i in range(len(self.estimators))],
                     index = range(len(X))
                    )

        for i, estm in enumerate(self.estimators):
            y_preds[f"Est{i}"] = estm.predict(X) + self.a

        return self._Blend(y_preds)

    def mdlpredictproba(self, X):
        """
        This method returns the blended values of column 1 of each estimator's predict_proba output
        """
        # Fixed: the column labels were built with range(len(range(self.estimators))), which raises
        # a TypeError as range() cannot be called on a list
        y_preds = \
        pd.DataFrame(columns = [f"Est{i}" for i in range(len(self.estimators))],
                     index = range(len(X))
                    )

        for i, estm in enumerate(self.estimators):
            y_preds[f"Est{i}"] = estm.predict_proba(X)[:,1]

        return self._Blend(y_preds)

# Module-level helper instance: its _LGBObj / _LGBMetric methods are referenced by the
# LGBM regressor definition and by the fit call in MdlDeveloper
mdlsupport = LGBMSupport(a = CFG.a,b = CFG.b)

In [None]:
%%time

class MdlDeveloper(CFG):
    """
    This class implements the training pipeline elements-
    1. Initializes the Model predictions
    2. Trains and infers models
    3. Returns the OOF and model test set predictions
    """;

    def __init__(self, Xtrain, ytrain, ygrp, Xtest, sel_cols, cat_cols, model_mode,
                 **kwarg
                ):
        """
        In this method, we initialize the below-
        1. Train-test data, selected columns
        2. Metric, custom scorer, model and cv object
        3. Output tables for score and predictions

        Inputs-
        Xtrain, ytrain - training features and target
        ygrp           - labels passed as the second argument to the cv splitter
        Xtest          - test features
        sel_cols       - columns selected from Xtrain / Xtest for training
        cat_cols       - stored on the object; not used elsewhere in this class
        model_mode     - "Regression" keeps the models whose label ends with "R";
                         any other value keeps the models whose label ends with "C"
        """;

        self.Xtrain      = Xtrain
        self.ytrain      = ytrain
        self.y_grp       = ygrp
        self.Xtest       = Xtest
        self.sel_cols    = sel_cols
        self.cat_cols    = cat_cols
        self.model_mode  = model_mode
        self.num_class   = 6

        is_regression    = self.model_mode == "Regression"
        self.ScoreMetric = Utils.ScoreMetric if is_regression else Utils.ClsfMetric

        self._DefineModels();
        self.cv          = self.all_cv[self.mdlcv_mthd];

        # Only the models matching the current mode are trained
        mode_suffix  = "R" if is_regression else "C"
        self.methods = [c for c in self.Mdl_Master.keys() if c.endswith(mode_suffix)]

        if is_regression:
            self.OOF_Preds = pd.DataFrame();
            self.Mdl_Preds = pd.DataFrame();
        else:
            # Classification outputs hold one probability column per class
            class_cols     = [f"class{i}" for i in range(self.num_class)]
            self.OOF_Preds = pd.DataFrame(columns = class_cols)
            self.Mdl_Preds = pd.DataFrame(columns = class_cols)

        # One row per fold
        fold_index = range(self.n_splits * self.n_repeats)
        self.Scores      = pd.DataFrame(columns = self.methods + ["Ensemble"], index = fold_index);
        self.TrainScores = pd.DataFrame(columns = self.methods, index = fold_index);

        self.AllFittedModels = []

        Utils.PrintColor(f"\n---> Selected model options-");
        try:
            with np.printoptions(linewidth = 150):
                pprint(np.array(self.methods), depth = 1, width = 100, indent = 5);
        except Exception:
            # Fall back to printing the plain list; Exception (not a bare except) keeps
            # KeyboardInterrupt / SystemExit from being swallowed
            pprint(self.methods, depth = 1, width = 100, indent = 5);

    def _DefineModels(self):
        """
        This method builds the two lookup tables used by the training pipeline-
        1. all_cv     - cross-validation splitters keyed by a short label
        2. Mdl_Master - model objects keyed by method label (labels end with R for regressors and C for classifiers)
        """;

        seed           = self.state;
        folds, repeats = self.n_splits, self.n_repeats;

        # Commonly used CV strategies for later usage:-
        self.all_cv = dict(
            KF   = KFold(n_splits = folds, shuffle = True, random_state = seed),
            RKF  = RKF(n_splits = folds, n_repeats = repeats, random_state = seed),
            RSKF = RSKF(n_splits = folds, n_repeats = repeats, random_state = seed),
            SKF  = SKF(n_splits = folds, shuffle = True, random_state = seed),
            SGKF = SGKF(n_splits = folds, shuffle = True, random_state = seed),
        );

        # Settings shared by the LGBM regressor and classifier:-
        lgbm_shared = dict(
            learning_rate    = 0.10,
            max_depth        = 6,
            num_leaves       = 10,
            colsample_bytree = 0.50,
            reg_alpha        = 0.10,
            reg_lambda       = 0.80,
            n_estimators     = 1024,
            random_state     = seed,
            extra_trees      = True,
            class_weight     = 'balanced',
            verbosity        = - 1,
            device           = "gpu" if self.gpu_switch == "ON" else "cpu",
        );

        self.Mdl_Master = {
            # Regressor with the custom objective; built-in metrics are switched off
            'LGBM1R': LGBMR(objective = mdlsupport._LGBObj,
                            metrics   = 'None',
                            **lgbm_shared
                           ),

            # Multiclass classifier
            'LGBM1C': LGBMC(objective = "multiclass",
                            metrics   = 'auc_mu',
                            num_class = self.num_class,
                            **lgbm_shared
                           ),
        };
        return self;

    def PostProcessPred(self, ypred):
        """
        This is an optional post-processing hook for the predictions
        It is currently a pass-through - ypred is returned unchanged and no clipping is applied here
        """;
        return ypred;

    def _FitModel(self, method: str, model, Xtr, ytr, Xdev, ydev):
        """
        This method fits one estimator on the training split of a fold
        The fit call is chosen from the method label and the dev split is passed as the
        evaluation set in every branch that accepts one
        """;

        if "CB" in method:
            model.fit(Xtr, ytr,
                      eval_set = [(Xdev, ydev)],
                      verbose = 0,
                      early_stopping_rounds = CFG.nbrnd_erly_stp,
                     );

        elif "LGBM" in method and method.endswith("R"):
            # The regressor is evaluated / early-stopped on the custom metric
            model.fit(Xtr, ytr,
                      eval_set = [(Xdev, ydev)],
                      eval_names = [("Dev")],
                      eval_metric = mdlsupport._LGBMetric,
                      callbacks = [log_evaluation(250),
                                   early_stopping(stopping_rounds = self.nbrnd_erly_stp,
                                                  verbose = False,),
                                  ],
                     );

        elif "LGBM" in method and method.endswith("C"):
            model.fit(Xtr, ytr,
                      eval_set = [(Xdev, ydev)],
                      eval_names = [("Dev")],
                      callbacks = [log_evaluation(250),
                                   early_stopping(stopping_rounds = self.nbrnd_erly_stp,
                                                  verbose = False,),
                                  ],
                     );

        elif "XGB" in method:
            model.fit(Xtr, ytr,
                      eval_set = [(Xdev, ydev)],
                      verbose  = 0,
                     );

        else:
            model.fit(Xtr, ytr);

        return model;

    def _SaveFinalModels(self):
        """
        This method dumps the models collected over all folds into op_path
        With ensembling, the list of fitted fold models is dumped as-is; otherwise the list is
        wrapped into a single unweighted VotingModelMaker. Nothing is written for an unknown model mode
        """;

        prefix = {"Regression": "VR", "Classification": "VC"}.get(self.model_mode);
        if prefix is None:
            return;

        if self.ensemble_req == "Y":
            final_models = self.AllFittedModels;
        else:
            final_models = [VotingModelMaker(estimators = self.AllFittedModels, weights = [])];

        joblib.dump(final_models,
                    os.path.join(self.op_path, f"{prefix}_E{self.exp_nb}V{self.version_nb}")
                   );

    def TrainMdl(self, target: str, test_preds_req: str = "Y", save_models = "N",):
        """
        This method trains and infers from the model suite and returns the predictions and scores
        It optionally predicts the test set too, if desired by the user

        Inputs-
        target         - target column label (not used inside this method; kept for the existing callers)
        test_preds_req - "Y" to also predict the test set in every fold
        save_models    - "Y" to dump the per-fold voting model (done in the Regression + ensemble path only)

        Returns-
        OOF_Preds, Mdl_Preds, Scores, TrainScores
        """;

        # Initializing I-O:-
        X,y  = self.Xtrain[self.sel_cols], self.ytrain.copy(deep = True);
        Xt   = self.Xtest[self.sel_cols];

        cols_drop  = ["Source", "essay_id",];
        ens        = OptunaEnsembler(model_mode = self.model_mode);

        self.FtreImp = pd.DataFrame(columns = self.methods,
                                    index   = [c for c in self.sel_cols if c not in cols_drop]
                                   ).fillna(0);

        # Making CV folds:-
        for fold_nb, (train_idx, dev_idx) in tqdm(enumerate(self.cv.split(X, self.y_grp))):
            Xtr  = X.iloc[train_idx].drop(columns = cols_drop, errors = 'ignore');
            Xdev = X.iloc[dev_idx].drop(columns = cols_drop, errors = 'ignore');
            ytr  = y.loc[y.index.isin(Xtr.index)];
            ydev = y.loc[y.index.isin(Xdev.index)];

            fitted_models = [];

            # Initializing the OOF and test set predictions:-
            if self.model_mode == "Regression":
                oof_preds = pd.DataFrame(columns = self.methods, index = Xdev.index);
                mdl_preds = pd.DataFrame(columns = self.methods, index = Xt.index);

            else:
                oof_preds = []
                mdl_preds = []

            Utils.PrintColor(f"\n{'=' * 5} FOLD {fold_nb + 1} {'=' * 5}\n");

            # Initializing models across methods:-
            for method in tqdm(self.methods):

                # Every fold trains its own copy of the template model. Refitting the shared object
                # made every model stored for earlier folds point to the last fold's fit, so the
                # final dump held the same model repeated once per fold
                model = deepcopy(self.Mdl_Master.get(method));

                # Fitting the model:-
                self._FitModel(method, model, Xtr, ytr, Xdev, ydev);

                # Collating feature importance (best effort - skipped when the model cannot provide it):-
                try:
                    self.FtreImp[method] += model.feature_importances_;
                except Exception:
                    pass;

                fitted_models.append(model);

                # Collecting predictions and scores and post-processing OOF based on model method:-
                if method.endswith("R"):
                    dev_preds    = self.PostProcessPred(model.predict(Xdev)) + self.a ;
                    train_preds  = self.PostProcessPred(model.predict(Xtr)) + self.a ;
                    tr_score     = self.ScoreMetric(ytr.values.flatten()  + self.a, train_preds,);
                    score        = self.ScoreMetric(ydev.values.flatten() + self.a, dev_preds);

                    oof_preds[method] = dev_preds;

                    if test_preds_req == "Y":
                        mdl_preds[method] = \
                        self.PostProcessPred(model.predict(Xt.drop(columns = cols_drop, errors = "ignore"))) + self.a;

                else:
                    dev_preds    = model.predict_proba(Xdev) ;
                    train_preds  = model.predict_proba(Xtr) ;
                    tr_score     = self.ScoreMetric(ytr.values, train_preds,);
                    score        = self.ScoreMetric(ydev.values, dev_preds);

                    oof_preds.append(dev_preds)

                    if test_preds_req == "Y":
                        test_preds = model.predict_proba(Xt.drop(columns = cols_drop, errors = "ignore"))
                        mdl_preds.append(test_preds)
                        del test_preds

                Utils.PrintColor(f"OOF = {score:.5f} | Train = {tr_score:.5f} | {method}",color = Fore.CYAN);

                # Integrating the predictions and scores:-
                self.Scores.at[fold_nb, method]      = np.round(score, decimals= 6);
                self.TrainScores.at[fold_nb, method] = np.round(tr_score, decimals= 6);

            # The names below do not exist when no method was trained in this fold
            try:
                del dev_preds, train_preds, tr_score, score;
            except NameError:
                pass;

            # Ensembling the predictions with post-processing:-
            if self.ensemble_req == "Y" and self.model_mode == "Regression":
                oof_preds["Ensemble"]  = self.PostProcessPred(ens.fit_predict(ydev + self.a, oof_preds[self.methods]));
                score                  = self.ScoreMetric(ydev + self.a, oof_preds["Ensemble"].values);
                self.OOF_Preds         = pd.concat([self.OOF_Preds, oof_preds], axis = 0, ignore_index = False);
                self.Scores.at[fold_nb, "Ensemble"] = np.round(score,6);

                if test_preds_req == "Y":
                    mdl_preds["Ensemble"] = ens.predict(mdl_preds[self.methods]);
                    self.Mdl_Preds        = pd.concat([self.Mdl_Preds, mdl_preds], axis = 0, ignore_index = False);

                vote_model = VotingModelMaker(estimators = fitted_models, weights = ens.weights)

                # This branch only runs in Regression mode, hence only the VR file is written here
                if save_models == "Y":
                    joblib.dump(vote_model,
                                os.path.join(self.op_path, f"VR_E{self.exp_nb}V{self.version_nb}_Fold{fold_nb}.model")
                               );

                self.AllFittedModels.append(vote_model)

            elif self.ensemble_req == "Y" and self.model_mode == "Classification":
                # NOTE(review): this path neither saves a per-fold model nor adds anything to
                # AllFittedModels, so the final dump for Classification + ensemble is an empty list -
                # confirm whether the classifiers are meant to be saved here
                ens_preds       = ens.fit_predict(ydev, oof_preds);
                score           = self.ScoreMetric(ydev, ens_preds);
                ens_preds       = pd.DataFrame(ens_preds, index = ydev.index, columns = self.OOF_Preds.columns)
                self.OOF_Preds  = pd.concat([self.OOF_Preds, ens_preds], axis = 0, ignore_index = False)
                self.Scores.at[fold_nb, "Ensemble"] = np.round(score,6);

                if test_preds_req == "Y":
                    test_preds      = pd.DataFrame(ens.predict(mdl_preds), index = Xt.index, columns = self.Mdl_Preds.columns)
                    self.Mdl_Preds  = pd.concat([self.Mdl_Preds, test_preds], axis = 0, ignore_index = False);

            elif self.ensemble_req != "Y" and self.model_mode == "Regression":
                # Keep every model fitted in this fold (previously only the last method's model was kept)
                self.AllFittedModels.extend(fitted_models)
                self.OOF_Preds = pd.concat([self.OOF_Preds, oof_preds], axis = 0, ignore_index = False)

                if test_preds_req == "Y":
                    # NOTE(review): ens is never fitted on this path and the model predictions already
                    # include self.a; the "Ensemble" column created here is dropped by the final
                    # averaging below - confirm whether this line is still needed
                    mdl_preds[f"Ensemble"]  = ens.predict(mdl_preds[self.methods]) + self.a
                    self.Mdl_Preds = pd.concat([self.Mdl_Preds , mdl_preds], axis = 0, ignore_index = False)

            elif self.ensemble_req != "Y" and self.model_mode == "Classification":
                # Keep every model fitted in this fold (previously only the last method's model was kept)
                self.AllFittedModels.extend(fitted_models)

                # Only the first method's OOF probabilities are stored on this path
                self.OOF_Preds = pd.concat([self.OOF_Preds,
                                            pd.DataFrame(oof_preds[0],
                                                         index = ydev.index,
                                                         columns = self.OOF_Preds.columns
                                                         )
                                            ],
                                           axis = 0, ignore_index = False
                                           )

                if test_preds_req == "Y":
                    self.Mdl_Preds  = pd.concat([self.Mdl_Preds,
                                                 pd.DataFrame(ens.predict(mdl_preds),
                                                              index = Xt.index,
                                                              columns = self.Mdl_Preds.columns
                                                              )
                                                 ] , axis=0, ignore_index = False
                                                )

        # Averaging the predictions after all folds:-
        self.OOF_Preds = self.OOF_Preds.groupby(level = 0).mean();

        if test_preds_req == "Y":
            if self.model_mode == "Regression":
                # Fixed: this check used self.ensemble, while every other check in this class reads self.ensemble_req
                pred_cols      = (self.methods + ["Ensemble"]) if self.ensemble_req == "Y" else self.methods;
                self.Mdl_Preds = self.Mdl_Preds[pred_cols].groupby(level=0).mean();

            elif self.model_mode == "Classification":
                self.Mdl_Preds = self.Mdl_Preds.groupby(level=0).mean();

        # Saving the models collected across folds (done irrespective of save_models):-
        self._SaveFinalModels();

        return self.OOF_Preds, self.Mdl_Preds, self.Scores, self.TrainScores;

# End of cell: emit a blank line in the cell output and run garbage collection
print();
collect();

In [None]:
%%time

class DisplayMaker:
    """
    This class plots the final scores and generates adjutant model utilities
    """;

    def __init__(self, target):
        """
        target - target column label, stored on the object
        The ensemble switch is read from CFG.ensemble_req
        """;
        self.target   = target;
        self.ensemble = CFG.ensemble_req

    def DisplayAdjTbl(self, *args):
        """
        This function displays pandas tables in an adjacent manner, sourced from the below link-
        https://stackoverflow.com/questions/38783027/jupyter-notebook-display-two-pandas-tables-side-by-side

        Every argument needs a to_html method (e.g. a DataFrame or a Styler)
        """;

        html_str = ''.join(df.to_html() for df in args);

        # Only opening <table tags get the inline style; replacing every 'table' substring
        # also rewrote the closing tags and any text containing the word
        display_html(html_str.replace('<table', '<table style="display:inline"'), raw = True);
        collect();

    def _StyleScores(self, tbl: pd.DataFrame, subset, caption: str, **gradient_kwargs):
        "This method applies the common score-table styling: 5 decimals, mako gradient, caption, centred text";

        return tbl.style.format(precision = 5).\
               background_gradient(cmap = "mako", subset = subset, **gradient_kwargs).\
               set_caption(caption).\
               set_properties(**{'text-align': 'center'});

    def DisplayScores(self, Scores: pd.DataFrame, TrainScores: pd.DataFrame, methods: list):
        """
        This method displays the OOF and train scores side by side, followed by the OOF mean scores

        Scores      - OOF scores, one row per fold and one column per method (plus Ensemble when enabled)
        TrainScores - train scores, one row per fold and one column per method
        methods     - method labels to highlight
        """;

        cols = methods + ["Ensemble"] if self.ensemble == "Y" else methods;

        # 'center' is the valid CSS value for text-align - the earlier 'centre' is ignored by browsers
        self.DisplayAdjTbl(
            self._StyleScores(Scores, cols, f"\nOOF scores across methods and folds\n"),
            self._StyleScores(TrainScores, methods, f"\nTrain scores across methods and folds\n"),
        );

        print('\n');
        display(self._StyleScores(Scores.mean().to_frame().transpose(),
                                  Scores.columns,
                                  f"\nOOF mean scores across methods and folds\n",
                                  axis = 1,
                                 )
               );

    def MakeMLPlots(self, methods: list, FtreImp: pd.DataFrame):
        """
        This method makes plots for the ML models, including feature importance
        One bar chart is drawn per method, showing its 50 largest importances

        methods - method labels, each one a column of FtreImp
        FtreImp - feature importance table with the features on the index
        """;

        # squeeze = False always returns a 2-D array of axes, so a single method needs no special case
        fig, axes = plt.subplots(len(methods), 1, figsize = (12, len(methods) * 6),
                                 gridspec_kw = {'hspace': 0.8, 'wspace': 0.2},
                                 squeeze = False,
                                );

        for ax, col in zip(axes[:, 0], methods):
            FtreImp[[col]].\
            sort_values(col, ascending = False).head(50).\
            plot.bar(ax = ax, color = '#0073e6');
            ax.set_title(f"{col} Importances", **CFG.title_specs);
            ax.set(xlabel = '', ylabel = '');

        plt.tight_layout();
        plt.show();

# End of cell: run garbage collection and emit a blank line in the cell output
collect();
print();

In [None]:
%%time

# Register a MyLogger instance with LightGBM: it discards info / warning messages and
# forwards only errors to the "lightgbm_custom" logger
l = MyLogger()
l.init(logging_lbl = "lightgbm_custom")
lgb.register_logger(l)

# Stage 1 - classification run
# The target is shifted by 1 and used both as the training target and as the labels given to the cv splitter
# test_preds_req = "N" skips the test set predictions, so only the OOF outputs and scores are produced
if CFG.ML == "Y":
    md = MdlDeveloper(XYtrain.drop(columns = CFG.target),
                        XYtrain[CFG.target] - 1,
                        XYtrain[CFG.target] - 1,
                        Xtest,
                        sel_cols = list(XYtrain.drop(columns = CFG.target).columns),
                        cat_cols = [],
                        model_mode = "Classification",
                        );
    OOF_Preds, Mdl_Preds, Scores, TrainScores = \
    md.TrainMdl(test_preds_req = "N", target = CFG.target, save_models = "Y");

    # Score tables and feature importance plots for the classification models
    print("\n\n\n");
    disp = DisplayMaker(CFG.target);
    disp.DisplayScores(Scores, TrainScores, methods = md.methods);
    disp.MakeMLPlots(methods = list(md.methods), FtreImp = md.FtreImp)
    _ = Utils.CleanMemory()

In [None]:
%%time

# Stage 2 - regression run on the features plus the stage 1 out-of-fold outputs
if CFG.ML == "Y":
    # The stage 1 OOF columns are appended to the training data as extra features
    # NOTE(review): XYtrain and Xtest are modified in place, so re-running this cell appends the
    # OOF columns again - run it once per kernel session
    XYtrain = pd.concat([XYtrain, OOF_Preds], axis=1)

    # The test set gets the same class0..class5 columns filled with the constant 0.5
    # (no test predictions were requested from stage 1)
    for col in [f"class{i}" for i in range(6)]:
        Xtest[col] = 0.5

    Utils.PrintColor(f"\nTrain Test data shape = {XYtrain.shape} | {Xtest.shape}\n")

    # The training target is shifted by CFG.a; the unshifted target cast to uint8 is
    # what the cv splitter receives as its labels
    md = MdlDeveloper(XYtrain.drop(columns = CFG.target),
                      XYtrain[CFG.target] - CFG.a,
                      XYtrain[CFG.target].astype(np.uint8),
                      Xtest,
                      sel_cols = list(XYtrain.drop(columns = CFG.target).columns),
                      cat_cols = [],
                      model_mode = "Regression",
                      );

    OOF_Preds, Mdl_Preds, Scores, TrainScores = \
    md.TrainMdl(test_preds_req = "N", target = CFG.target, save_models = "Y");

    # Saving the OOF predictions (with the target attached) and the feature importances
    OOF_Preds[CFG.target] = XYtrain[CFG.target].values
    OOF_Preds.to_csv(os.path.join(CFG.op_path, f"OOF_Preds_E{CFG.exp_nb}V{CFG.version_nb}.csv"))
    md.FtreImp.to_csv(os.path.join(CFG.op_path,f"FtreImp_E{CFG.exp_nb}V{CFG.version_nb}.csv"))

    disp = DisplayMaker(CFG.target);
    disp.DisplayScores(Scores, TrainScores, methods = list(md.methods));

    print("\n\n")
    disp.MakeMLPlots(methods = list(md.methods), FtreImp = md.FtreImp)

# Runs irrespective of the CFG.ML switch
_ = Utils.CleanMemory()
