In [1]:
import re
import torch
import polars as pl
import pandas as pd
import numpy as np
import warnings
from autogluon.tabular import TabularPredictor
import seaborn as sns
from typing import Any
from functools import partial
from datasets import load_dataset, Dataset
import optuna
from optuna.samplers import TPESampler
from nltk.corpus import stopwords
from string import punctuation as PUNCT
from dostoevsky.tokenization import RegexTokenizer
from dostoevsky.models import FastTextSocialNetworkModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    BertForSequenceClassification,
    TrainingArguments,
    Trainer,
    EvalPrediction,
    EarlyStoppingCallback,
)
from transformers.modeling_outputs import SequenceClassifierOutput

from sklearn.metrics import (
    f1_score,
    roc_auc_score,
    accuracy_score,
    multilabel_confusion_matrix,
    confusion_matrix,
    ConfusionMatrixDisplay,
    classification_report,
    hamming_loss,
    precision_score,
    recall_score,
)
from torch import nn
from torch.autograd import Variable

In [2]:
warnings.filterwarnings("ignore")

# Dataset

In [3]:
dataset = pl.read_csv("preprocessing.csv")
dataset.head()

Направление,Факультет,ID студента,Оценка,Категория,Тег,Комментарий,Статус,Neutral,Positive,Negative,Exclamations,have_code,Neutral_NLP,Positive_NLP,Negative_NLP,Speech_NLP
str,f64,f64,f64,str,str,str,str,f64,f64,f64,i64,bool,f64,f64,f64,f64
"""C""",113.0,1493.0,1.0,"""Видео""","""VP2""","""Видео лагает""",,0.0,0.0,0.0,0,False,1.00001,0.010663,1e-05,1e-05
"""C""",113.0,5580.0,5.0,"""ДЗ""","""H3 D""","""Торгом Бабаян!…",,0.0,0.0,0.0,5,False,0.437833,0.056662,0.140346,0.051855
"""E""",126.0,5619.0,5.0,"""ДЗ""","""H3""","""Спасибо)""",,0.0,0.0,0.0,0,False,1e-05,1e-05,1e-05,1.00001
"""E""",123.0,310.0,3.0,"""ДЗ""","""H2 E1""","""комментарий со…",,0.0,0.0,0.0,0,False,0.930468,0.025189,0.119213,0.000378
"""E""",123.0,1913.0,5.0,"""ДЗ""","""H3 D""","""Жонибек, хочу …",,0.0,0.0,0.0,2,False,0.069552,0.217348,0.019134,0.507822


In [4]:
dataset["Тег"].value_counts()

Тег,count
str,u32
"""VC2 VP2 VC3 VC…",1
"""VC3 H3 D""",1
"""S1 H1 VC2""",1
"""T1 T3""",2
"""H3 S1 VC2""",1
…,…
"""S1 VC1 VP1 VC2…",1
"""VC1 VC2 H2 VC4…",2
"""VC1 VP1 VP4 VC…",1
"""VP2 """,7


In [5]:
dataset = dataset.with_columns(
    (pl.col("Тег").apply(lambda x: " ".join(re.findall(r"[A-Z]{1,2}\d|LMS", x)))).alias("corrected_tag")
    )

In [6]:
null_filter = (
    (pl.col("corrected_tag").eq(""))
)

dataset = dataset.filter(~null_filter)
dataset = dataset.filter(~(pl.col("Комментарий").is_null()))

In [7]:
dataset = dataset.with_columns(
    pl.col("corrected_tag")
    .str.replace_all(r"VC4|VP4|VC5|S4|T4|H4|EA1", "")
    .str.strip()
    .str.replace(r"\s\s+", " ")
    .str.replace(r"GH3", "H3")
    .str.replace(r"HH3", "H3")
    .str.replace(r"BP3", "VP3")
    .str.replace(r"V3", "VC3")
    .str.replace(r"V2", "VP2")
)

dataset = dataset.filter(~(pl.col("corrected_tag").eq("")))

In [8]:
dataset["corrected_tag"].str.split(by = " ").explode().value_counts(sort=True)

corrected_tag,count
str,u32
"""H3""",20391
"""VC2""",14414
"""VC3""",8111
"""VP3""",4972
"""VP2""",4897
…,…
"""LMS""",669
"""T2""",548
"""T1""",399
"""T3""",234


In [9]:
dataset = dataset.filter(~pl.col("corrected_tag").str.contains("E2"))

In [10]:
def remove_sub_tags(tags: str):
    split = tags.split(sep=" ")
    new_tag = [x[:-1] if x[-1].isdigit() else x for x in split]
    return " ".join(new_tag)

dataset = dataset.with_columns(
    pl.col("corrected_tag").apply(remove_sub_tags)
)

dataset["corrected_tag"].str.split(by = " ").explode().value_counts(sort=True)

corrected_tag,count
str,u32
"""VC""",26304
"""H""",24437
"""VP""",11560
"""S""",1973
"""E""",1785
"""T""",1181
"""LMS""",669


In [11]:
target = dataset["corrected_tag"].str.split(by = " ").explode().unique().sort().to_list()
target = dict(zip(target, range(len(target))))
reverse_target = {v : k for k, v in target.items()}

In [12]:
def vectorize(tags: str) -> list[float]:
    """Turn str with tags into list with digit labels.

    Args:
        tags (str): tag text representation.

    Returns:
        list[float]: numeric labels.
    """
    split = tags.split(sep = " ")
    res = np.zeros(len(target))
    for x in split:
        res[target[x]] = 1
    return res.tolist()

In [13]:
dataset = dataset.with_columns(pl.col("corrected_tag").apply(vectorize).alias("labels"))

In [14]:
dataset

Направление,Факультет,ID студента,Оценка,Категория,Тег,Комментарий,Статус,Neutral,Positive,Negative,Exclamations,have_code,Neutral_NLP,Positive_NLP,Negative_NLP,Speech_NLP,corrected_tag,labels
str,f64,f64,f64,str,str,str,str,f64,f64,f64,i64,bool,f64,f64,f64,f64,str,list[f64]
"""C""",113.0,1493.0,1.0,"""Видео""","""VP2""","""Видео лагает""",,0.0,0.0,0.0,0,false,1.00001,0.010663,0.00001,0.00001,"""VP""","[0.0, 0.0, … 1.0]"
"""C""",113.0,5580.0,5.0,"""ДЗ""","""H3 D""","""Торгом Бабаян!…",,0.0,0.0,0.0,5,false,0.437833,0.056662,0.140346,0.051855,"""H""","[0.0, 1.0, … 0.0]"
"""E""",126.0,5619.0,5.0,"""ДЗ""","""H3""","""Спасибо)""",,0.0,0.0,0.0,0,false,0.00001,0.00001,0.00001,1.00001,"""H""","[0.0, 1.0, … 0.0]"
"""E""",123.0,310.0,3.0,"""ДЗ""","""H2 E1""","""комментарий со…",,0.0,0.0,0.0,0,false,0.930468,0.025189,0.119213,0.000378,"""H E""","[1.0, 1.0, … 0.0]"
"""E""",123.0,1913.0,5.0,"""ДЗ""","""H3 D""","""Жонибек, хочу …",,0.0,0.0,0.0,2,false,0.069552,0.217348,0.019134,0.507822,"""H""","[0.0, 1.0, … 0.0]"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Z""",133.0,,3.0,"""ДЗ""","""H2""","""требуемый форм…",,0.0,0.0,0.0,0,false,0.822199,0.013647,0.020974,0.00001,"""H""","[0.0, 1.0, … 0.0]"
"""Z""",,,0.0,,"""S1""","""заплатила и да…",,0.0,0.0,0.0,0,false,0.228166,0.042098,0.644235,0.006108,"""S""","[0.0, 0.0, … 0.0]"
"""Z""",,,7.0,,"""LMS""","""Крайне раздраж…",,0.0,0.0,0.0,0,false,0.692652,0.073706,0.262852,0.00523,"""LMS""","[0.0, 0.0, … 0.0]"
"""Z""",,,,,"""VC2 VP2""","""Аналитик данны…",,0.0,0.0,0.0,0,true,0.705795,0.053413,0.320831,0.001511,"""VC VP""","[0.0, 0.0, … 1.0]"


In [15]:
clear_dataset = dataset.select(
    pl.col("Комментарий"),
    pl.col("Направление"),
    pl.col("Факультет"),
    pl.col("Оценка"),
    pl.col("Neutral"),
    pl.col("Positive"),
    pl.col("Negative"),
    pl.col("Exclamations"),
    pl.col("have_code"),
    pl.col("Neutral_NLP"),
    pl.col("Positive_NLP"),
    pl.col("Negative_NLP"),
    pl.col("Speech_NLP"),
    pl.col("corrected_tag"),
    pl.col("labels"),
    pl.col("corrected_tag").str.split(by=" ").alias("temp"),
)
clear_dataset = clear_dataset.explode(columns=["temp"])

In [16]:
train_df, test_df = train_test_split(
    clear_dataset,
    test_size=0.1,
    random_state=3317,
    stratify=clear_dataset["temp"],
)

train_df = train_df.drop(columns=["corrected_tag", "temp"])
test_df = test_df.drop(columns=["corrected_tag", "temp"])

train_df = train_df.rename({"Комментарий": "text"})
test_df = test_df.rename({"Комментарий": "text"})

In [17]:
train_dataset = Dataset.from_pandas(train_df.to_pandas(), split="train")
test_dataset = Dataset.from_pandas(test_df.to_pandas(), split="test")

In [18]:
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")


def preprocess_data(sample: dict[str, Any]) -> dict[str, Any]:
    """Encode input text into sequence of tokens.
    Also add corresponding labels.

    Args:
        sample (dict[str, Any]): raw input text.

    Returns:
        dict[str, Any]: transformed sample with tokenized text and labels.
    """
    text = sample["text"]
    encoding = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=512,
    )
    encoding["labels"] = sample["labels"]
    return encoding

In [19]:
encoded_train = train_dataset.map(
    preprocess_data, batched=True, remove_columns=train_dataset.column_names
)
encoded_test = test_dataset.map(
    preprocess_data, batched=True, remove_columns=test_dataset.column_names
)
encoded_train.set_format("torch")
encoded_test.set_format("torch")

Map:   0%|          | 0/61118 [00:00<?, ? examples/s]

Map:   0%|          | 0/6791 [00:00<?, ? examples/s]

# Bert Training with FocalLoss

In [20]:
class FocalLoss(nn.Module):
    def __init__(self, gamma=0, alpha=None, size_average=True):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha
        if isinstance(alpha,(float,int)): self.alpha = torch.Tensor([alpha,1-alpha])
        if isinstance(alpha,list): self.alpha = torch.Tensor(alpha)
        self.size_average = size_average

    def forward(self, input, target):
        if input.dim()>2:
            input = input.view(input.size(0),input.size(1),-1)  # N,C,H,W => N,C,H*W
            input = input.transpose(1,2)    # N,C,H*W => N,H*W,C
            input = input.contiguous().view(-1,input.size(2))   # N,H*W,C => N*H*W,C
        target = target.view(-1,1)

        logpt = F.log_softmax(input)
        logpt = logpt.gather(1,target)
        logpt = logpt.view(-1)
        pt = Variable(logpt.data.exp())

        if self.alpha is not None:
            if self.alpha.type()!=input.data.type():
                self.alpha = self.alpha.type_as(input.data)
            at = self.alpha.gather(0,target.data.view(-1))
            logpt = logpt * Variable(at)

        loss = -1 * (1-pt)**self.gamma * logpt
        if self.size_average: return loss.mean()
        else: return loss.sum()

In [21]:
class MultiLabelFocalLoss(nn.Module):
    def __init__(self, alpha=None, gamma=2, ignore_index=-100, reduction='mean'):
        super().__init__()
        self.CE = nn.CrossEntropyLoss(reduction='none', ignore_index=ignore_index)
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, input, target):
        minus_logpt = self.CE(input, target)
        pt = torch.exp(-minus_logpt) 
        focal_loss = (1-pt)**self.gamma * minus_logpt

        if self.alpha != None:
            focal_loss *= self.alpha.gather(0, target)
        
        if self.reduction == 'mean':
            focal_loss = focal_loss.mean()
        elif self.reduction == 'sum':
            focal_loss = focal_loss.sum()
        return focal_loss

In [22]:
class FocalBert(BertForSequenceClassification):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = nn.MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = MultiLabelFocalLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = nn.BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


In [23]:
def multi_label_metrics(
    predictions: np.ndarray, labels: np.ndarray, threshold: float = 0.5
) -> dict[str, float]:
    """Compute mltilabel metrics.

    Args:
        predictions (np.ndarray): logits array
        labels (np.ndarray): labels array
        threshold (float, optional): activation threshold. Defaults to 0.5.

    Returns:
        dict[str, float]: metrics dict
    """
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    f1_micro_average = f1_score(y_true=labels, y_pred=y_pred, average="micro")
    roc_auc = roc_auc_score(labels, y_pred, average="micro")
    accuracy = accuracy_score(labels, y_pred)
    metrics = {"f1": f1_micro_average, "roc_auc": roc_auc, "accuracy": accuracy}
    return metrics


def compute_metrics(p: EvalPrediction) -> dict[str, float]:
    """Metrics computation wrapper.

    Args:
        p (EvalPrediction): hf model output

    Returns:
        dict[str, float]: metrics dict
    """
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_label_metrics(predictions=preds, labels=p.label_ids)
    return result

In [24]:
def make_training_pipeline(
    exp_name: str,
    tokenizer: AutoTokenizer,
    train_dataset: Dataset,
    eval_dataset: Dataset,
    batch_size: int = 64,
    lr: float = 2e-5,
    epochs_num: int = 20,
    model_name=None
) -> Trainer:
    """Training process wrapper.

    Args:
        exp_name (str): name of the local folder
        for saving model checkpoints.
        tokenizer (AutoTokenizer): model tokenizer
        train_dataset (Dataset): train dataset split
        eval_dataset (Dataset): test dataset split
        batch_size (int, optional): number of samples
        in sigle batch. Defaults to 32.
        lr (float, optional): model's learning rate. Defaults to 2e-5.
        epochs_num (int, optional):
        number of training iterations. Defaults to 20.

    Returns:
        Trainer: hf training pipeline abstraction class.
    """
    args = TrainingArguments(
        exp_name,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs_num,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        fp16=True,
    )

    model = FocalBert.from_pretrained(
        "cointegrated/rubert-tiny2",
        problem_type="multi_label_classification",
        num_labels=len(target),
        id2label=target,
        label2id=reverse_target
    )
    if model_name is not None:
        model.load_state_dict(torch.load(model_name))

    trainer = Trainer(
        model,
        args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )
    return trainer

In [25]:
BATCH_SIZE = 190
EPOCHS = 75
LR = 2e-5

In [26]:
trainer = make_training_pipeline("f_focal", tokenizer, encoded_train, encoded_test, batch_size=BATCH_SIZE, epochs_num=EPOCHS, lr=LR)

Some weights of FocalBert were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.352816,0.622865,0.742203,0.452511
2,No log,0.27609,0.739378,0.821514,0.556766
3,No log,0.240829,0.776942,0.851053,0.583566
4,0.345200,0.220402,0.795464,0.863187,0.612723
5,0.345200,0.206655,0.8062,0.86993,0.624503
6,0.345200,0.198192,0.81074,0.871455,0.635547
7,0.210700,0.190963,0.818527,0.877186,0.645119
8,0.210700,0.18607,0.822552,0.879243,0.652334
9,0.210700,0.181839,0.828879,0.885139,0.661464
10,0.175800,0.177369,0.831437,0.884065,0.666029


TrainOutput(global_step=9821, training_loss=0.10332825630311507, metrics={'train_runtime': 5649.8407, 'train_samples_per_second': 811.324, 'train_steps_per_second': 2.137, 'total_flos': 2.751042768246989e+16, 'train_loss': 0.10332825630311507, 'epoch': 61.0})

In [28]:
trainer.evaluate()

{'eval_loss': 0.16114938259124756,
 'eval_f1': 0.8859830473433946,
 'eval_roc_auc': 0.93231951687974,
 'eval_accuracy': 0.7904579590634664,
 'eval_runtime': 4.2563,
 'eval_samples_per_second': 1595.508,
 'eval_steps_per_second': 4.229,
 'epoch': 61.0}

In [29]:
train_preds = trainer.predict(encoded_train)

In [30]:
test_preds = trainer.predict(encoded_test)

In [31]:
print(compute_metrics(test_preds))

{'f1': 0.8859830473433946, 'roc_auc': 0.93231951687974, 'accuracy': 0.7904579590634664}


In [38]:
trainer = make_training_pipeline("v", tokenizer, encoded_train, encoded_test, batch_size=BATCH_SIZE, epochs_num=EPOCHS, lr=LR, model_name="f/checkpoint-4590/pytorch_model.bin")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
train_preds = trainer.predict(encoded_train)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [40]:
test_preds = trainer.predict(encoded_test)

# Meta Model Training 

Сделать сравнение с Focal Loss и без него

In [34]:
train_dataset

Dataset({
    features: ['text', 'Направление', 'Факультет', 'Оценка', 'Neutral', 'Positive', 'Negative', 'Exclamations', 'have_code', 'Neutral_NLP', 'Positive_NLP', 'Negative_NLP', 'Speech_NLP', 'labels'],
    num_rows: 61118
})

In [35]:
directions = pd.get_dummies(train_df.to_pandas()["Направление"])
departments = pd.get_dummies(train_df.to_pandas()["Факультет"])
meta_dataset_train = train_df.select(pl.exclude("Направление", "Факультет")).to_pandas()
meta_dataset_train = pd.concat([meta_dataset_train, directions, departments, pd.DataFrame(train_preds.predictions)], axis=1)
meta_dataset_train = meta_dataset_train.drop(columns=["text"])

meta_dataset_test = test_df.select(pl.exclude("Направление", "Факультет")).to_pandas()
meta_dataset_test = pd.concat([meta_dataset_test, directions, departments, pd.DataFrame(test_preds.predictions)], axis=1)
meta_dataset_test = meta_dataset_test.drop(columns=["text"])

meta_dataset_test = meta_dataset_test.dropna(subset=["labels"])


In [32]:
from catboost import CatBoostClassifier, Pool

In [36]:
X_train, y_train = meta_dataset_train.drop('labels', axis=1), np.array(meta_dataset_train["labels"].to_list())
X_test, y_test = meta_dataset_test.drop('labels', axis=1), np.array(meta_dataset_test["labels"].to_list())


In [37]:
train_pool = Pool(X_train, y_train)
test_pool = Pool(X_test, y_test)

In [39]:
def objective(trial):
    model = CatBoostClassifier(
        iterations=trial.suggest_int("iterations", 500, 2000),
        learning_rate=trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
        depth=trial.suggest_int("depth", 4, 10),
        l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
        bootstrap_type=trial.suggest_categorical("bootstrap_type", ["Bayesian"]),
        random_strength=trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
        bagging_temperature=trial.suggest_float("bagging_temperature", 0.0, 10.0),
        od_type=trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
        od_wait=trial.suggest_int("od_wait", 10, 50),
        verbose=False,
        task_type="GPU",
        devices='0',
        loss_function = trial.suggest_categorical("loss_function", ["MultiCrossEntropy", "MultiLogloss"]))
    model.fit(train_pool, eval_set=test_pool)
    y_pred = model.predict(test_pool)
    return hamming_loss(y_test, y_pred)


In [40]:
sampler = TPESampler(seed=1337)
study = optuna.create_study(study_name="catboost", direction="minimize", sampler=sampler)
study.optimize(objective, n_trials=50)


[I 2024-05-23 00:20:46,405] A new study created in memory with name: catboost
[I 2024-05-23 00:20:52,849] Trial 0 finished with value: 0.04768916843721733 and parameters: {'iterations': 893, 'learning_rate': 0.0020766721769608226, 'depth': 5, 'l2_leaf_reg': 0.00039189423261207457, 'bootstrap_type': 'Bayesian', 'random_strength': 7.74470472863429e-06, 'bagging_temperature': 5.183928205975371, 'od_type': 'Iter', 'od_wait': 40, 'loss_function': 'MultiLogloss'}. Best is trial 0 with value: 0.04768916843721733.
[I 2024-05-23 00:21:25,381] Trial 1 finished with value: 0.04653217493741717 and parameters: {'iterations': 1443, 'learning_rate': 0.0017787538482591226, 'depth': 10, 'l2_leaf_reg': 0.00027055071527787685, 'bootstrap_type': 'Bayesian', 'random_strength': 0.12765125185466167, 'bagging_temperature': 7.941185757915866, 'od_type': 'Iter', 'od_wait': 33, 'loss_function': 'MultiCrossEntropy'}. Best is trial 1 with value: 0.04653217493741717.
[I 2024-05-23 00:21:26,498] Trial 2 finished wit

In [41]:
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))


Number of finished trials:  50
Best trial:
  Value:  0.04609041378294802
  Params: 
    iterations: 1699
    learning_rate: 0.08949523912226792
    depth: 10
    l2_leaf_reg: 7.674731241803363e-05
    bootstrap_type: Bayesian
    random_strength: 1.8084451393310196e-07
    bagging_temperature: 7.710292433867646
    od_type: Iter
    od_wait: 23
    loss_function: MultiCrossEntropy


In [42]:
model = CatBoostClassifier(**trial.params, verbose=True)
model.fit(train_pool, eval_set=test_pool)
pred_labels = model.predict(test_pool)


0:	learn: 0.4319112	test: 0.4491328	best: 0.4491328 (0)	total: 228ms	remaining: 6m 26s
1:	learn: 0.2655963	test: 0.3014698	best: 0.3014698 (1)	total: 464ms	remaining: 6m 33s
2:	learn: 0.1758874	test: 0.2264591	best: 0.2264591 (2)	total: 697ms	remaining: 6m 33s
3:	learn: 0.1199314	test: 0.1836601	best: 0.1836601 (3)	total: 920ms	remaining: 6m 29s
4:	learn: 0.0883319	test: 0.1635228	best: 0.1635228 (4)	total: 1.16s	remaining: 6m 32s
5:	learn: 0.0701446	test: 0.1556353	best: 0.1556353 (5)	total: 1.4s	remaining: 6m 33s
6:	learn: 0.0568605	test: 0.1515256	best: 0.1515256 (6)	total: 1.62s	remaining: 6m 30s
7:	learn: 0.0481014	test: 0.1515955	best: 0.1515256 (6)	total: 1.85s	remaining: 6m 30s
8:	learn: 0.0426349	test: 0.1527650	best: 0.1515256 (6)	total: 2.08s	remaining: 6m 31s
9:	learn: 0.0401586	test: 0.1538804	best: 0.1515256 (6)	total: 2.24s	remaining: 6m 18s
10:	learn: 0.0379284	test: 0.1551610	best: 0.1515256 (6)	total: 2.38s	remaining: 6m 4s
11:	learn: 0.0363394	test: 0.1562246	best: 0

In [43]:
cr = classification_report(np.array(meta_dataset_test["labels"].to_list()), pred_labels, output_dict=True)
cr = pd.DataFrame(cr).T
print(cr)

              precision    recall  f1-score  support
0              0.738739  0.747720  0.743202    329.0
1              0.875122  0.922523  0.898198   2917.0
2              0.894737  0.531250  0.666667    128.0
3              0.862319  0.860241  0.861279    415.0
4              0.731092  0.595890  0.656604    146.0
5              0.883407  0.914762  0.898811   3719.0
6              0.882719  0.893382  0.888019   1904.0
micro avg      0.873099  0.894748  0.883791   9558.0
macro avg      0.838305  0.780824  0.801826   9558.0
weighted avg   0.872671  0.894748  0.882679   9558.0
samples avg    0.857532  0.875853  0.858406   9558.0
