In [1]:
import re
import torch
import polars as pl
import pandas as pd
import numpy as np
import warnings
from autogluon.tabular import TabularPredictor
import seaborn as sns
from typing import Any
from functools import partial
from datasets import load_dataset, Dataset
import optuna
from optuna.samplers import TPESampler
from nltk.corpus import stopwords
from string import punctuation as PUNCT
from dostoevsky.tokenization import RegexTokenizer
from dostoevsky.models import FastTextSocialNetworkModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    BertForSequenceClassification,
    TrainingArguments,
    Trainer,
    EvalPrediction,
    EarlyStoppingCallback,
)
from transformers.modeling_outputs import SequenceClassifierOutput

from sklearn.metrics import (
    f1_score,
    roc_auc_score,
    accuracy_score,
    multilabel_confusion_matrix,
    confusion_matrix,
    ConfusionMatrixDisplay,
    classification_report,
    hamming_loss,
    precision_score,
    recall_score,
)
from torch import nn
from torch.autograd import Variable

In [2]:
warnings.filterwarnings("ignore")

# Dataset

In [3]:
dataset = pl.read_csv("preprocessing.csv")
dataset.head()

Направление,Факультет,ID студента,Оценка,Категория,Тег,Комментарий,Статус,Neutral,Positive,Negative,Exclamations,have_code,Neutral_NLP,Positive_NLP,Negative_NLP,Speech_NLP
str,f64,f64,f64,str,str,str,str,f64,f64,f64,i64,bool,f64,f64,f64,f64
"""C""",113.0,1493.0,1.0,"""Видео""","""VP2""","""Видео лагает""",,0.0,0.0,0.0,0,False,1.00001,0.010663,1e-05,1e-05
"""C""",113.0,5580.0,5.0,"""ДЗ""","""H3 D""","""Торгом Бабаян!…",,0.0,0.0,0.0,5,False,0.437833,0.056662,0.140346,0.051855
"""E""",126.0,5619.0,5.0,"""ДЗ""","""H3""","""Спасибо)""",,0.0,0.0,0.0,0,False,1e-05,1e-05,1e-05,1.00001
"""E""",123.0,310.0,3.0,"""ДЗ""","""H2 E1""","""комментарий со…",,0.0,0.0,0.0,0,False,0.930468,0.025189,0.119213,0.000378
"""E""",123.0,1913.0,5.0,"""ДЗ""","""H3 D""","""Жонибек, хочу …",,0.0,0.0,0.0,2,False,0.069552,0.217348,0.019134,0.507822


In [4]:
dataset["Тег"].value_counts()

Тег,count
str,u32
"""S3 VC3 LMS""",2
"""VP3 S1""",20
"""VC2 S4""",10
"""VC1 VP2 T1""",1
"""H3 VC3 S1 VC2""",1
…,…
"""S1 S3 S4""",1
"""S3 VC1 VC2 S1…",1
"""VC2 H1 VC4""",1
"""S3 VP3 VC2""",2


In [5]:
dataset = dataset.with_columns(
    (pl.col("Тег").apply(lambda x: " ".join(re.findall(r"[A-Z]{1,2}\d|LMS", x)))).alias("corrected_tag")
    )

In [6]:
null_filter = (
    (pl.col("corrected_tag").eq(""))
)

dataset = dataset.filter(~null_filter)
dataset = dataset.filter(~(pl.col("Комментарий").is_null()))

In [7]:
dataset = dataset.with_columns(
    pl.col("corrected_tag")
    .str.replace_all(r"VC4|VP4|VC5|S4|T4|H4|EA1", "")
    .str.strip()
    .str.replace(r"\s\s+", " ")
    .str.replace(r"GH3", "H3")
    .str.replace(r"HH3", "H3")
    .str.replace(r"BP3", "VP3")
    .str.replace(r"V3", "VC3")
    .str.replace(r"V2", "VP2")
)

dataset = dataset.filter(~(pl.col("corrected_tag").eq("")))

In [8]:
dataset["corrected_tag"].str.split(by = " ").explode().value_counts(sort=True)

corrected_tag,count
str,u32
"""H3""",20391
"""VC2""",14414
"""VC3""",8111
"""VP3""",4972
"""VP2""",4897
…,…
"""LMS""",669
"""T2""",548
"""T1""",399
"""T3""",234


In [9]:
dataset = dataset.filter(~pl.col("corrected_tag").str.contains("E2"))

In [10]:
def remove_sub_tags(tags: str):
    split = tags.split(sep=" ")
    new_tag = [x[:-1] if x[-1].isdigit() else x for x in split]
    return " ".join(new_tag)

dataset = dataset.with_columns(
    pl.col("corrected_tag").apply(remove_sub_tags)
)

dataset["corrected_tag"].str.split(by = " ").explode().value_counts(sort=True)

corrected_tag,count
str,u32
"""VC""",26304
"""H""",24437
"""VP""",11560
"""S""",1973
"""E""",1785
"""T""",1181
"""LMS""",669


In [11]:
target = dataset["corrected_tag"].str.split(by = " ").explode().unique().sort().to_list()
target = dict(zip(target, range(len(target))))
reverse_target = {v : k for k, v in target.items()}

In [12]:
def vectorize(tags: str) -> list[float]:
    """Turn str with tags into list with digit labels.

    Args:
        tags (str): tag text representation.

    Returns:
        list[float]: numeric labels.
    """
    split = tags.split(sep = " ")
    res = np.zeros(len(target))
    for x in split:
        res[target[x]] = 1
    return res.tolist()

In [13]:
dataset = dataset.with_columns(pl.col("corrected_tag").apply(vectorize).alias("labels"))

In [14]:
dataset

Направление,Факультет,ID студента,Оценка,Категория,Тег,Комментарий,Статус,Neutral,Positive,Negative,Exclamations,have_code,Neutral_NLP,Positive_NLP,Negative_NLP,Speech_NLP,corrected_tag,labels
str,f64,f64,f64,str,str,str,str,f64,f64,f64,i64,bool,f64,f64,f64,f64,str,list[f64]
"""C""",113.0,1493.0,1.0,"""Видео""","""VP2""","""Видео лагает""",,0.0,0.0,0.0,0,false,1.00001,0.010663,0.00001,0.00001,"""VP""","[0.0, 0.0, … 1.0]"
"""C""",113.0,5580.0,5.0,"""ДЗ""","""H3 D""","""Торгом Бабаян!…",,0.0,0.0,0.0,5,false,0.437833,0.056662,0.140346,0.051855,"""H""","[0.0, 1.0, … 0.0]"
"""E""",126.0,5619.0,5.0,"""ДЗ""","""H3""","""Спасибо)""",,0.0,0.0,0.0,0,false,0.00001,0.00001,0.00001,1.00001,"""H""","[0.0, 1.0, … 0.0]"
"""E""",123.0,310.0,3.0,"""ДЗ""","""H2 E1""","""комментарий со…",,0.0,0.0,0.0,0,false,0.930468,0.025189,0.119213,0.000378,"""H E""","[1.0, 1.0, … 0.0]"
"""E""",123.0,1913.0,5.0,"""ДЗ""","""H3 D""","""Жонибек, хочу …",,0.0,0.0,0.0,2,false,0.069552,0.217348,0.019134,0.507822,"""H""","[0.0, 1.0, … 0.0]"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Z""",133.0,,3.0,"""ДЗ""","""H2""","""требуемый форм…",,0.0,0.0,0.0,0,false,0.822199,0.013647,0.020974,0.00001,"""H""","[0.0, 1.0, … 0.0]"
"""Z""",,,0.0,,"""S1""","""заплатила и да…",,0.0,0.0,0.0,0,false,0.228166,0.042098,0.644235,0.006108,"""S""","[0.0, 0.0, … 0.0]"
"""Z""",,,7.0,,"""LMS""","""Крайне раздраж…",,0.0,0.0,0.0,0,false,0.692652,0.073706,0.262852,0.00523,"""LMS""","[0.0, 0.0, … 0.0]"
"""Z""",,,,,"""VC2 VP2""","""Аналитик данны…",,0.0,0.0,0.0,0,true,0.705795,0.053413,0.320831,0.001511,"""VC VP""","[0.0, 0.0, … 1.0]"


In [15]:
clear_dataset = dataset.select(
    pl.col("Комментарий"),
    pl.col("Направление"),
    pl.col("Факультет"),
    pl.col("Оценка"),
    pl.col("Neutral"),
    pl.col("Positive"),
    pl.col("Negative"),
    pl.col("Exclamations"),
    pl.col("have_code"),
    pl.col("Neutral_NLP"),
    pl.col("Positive_NLP"),
    pl.col("Negative_NLP"),
    pl.col("Speech_NLP"),
    pl.col("corrected_tag"),
    pl.col("labels"),
    pl.col("corrected_tag").str.split(by=" ").alias("temp"),
)
clear_dataset = clear_dataset.explode(columns=["temp"])

In [16]:
train_df, test_df = train_test_split(
    clear_dataset,
    test_size=0.1,
    random_state=3317,
    stratify=clear_dataset["temp"],
)

train_df = train_df.drop(columns=["corrected_tag", "temp"])
test_df = test_df.drop(columns=["corrected_tag", "temp"])

train_df = train_df.rename({"Комментарий": "text"})
test_df = test_df.rename({"Комментарий": "text"})

In [17]:
train_dataset = Dataset.from_pandas(train_df.to_pandas(), split="train")
test_dataset = Dataset.from_pandas(test_df.to_pandas(), split="test")

In [18]:
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")


def preprocess_data(sample: dict[str, Any]) -> dict[str, Any]:
    """Encode input text into sequence of tokens.
    Also add corresponding labels.

    Args:
        sample (dict[str, Any]): raw input text.

    Returns:
        dict[str, Any]: transformed sample with tokenized text and labels.
    """
    text = sample["text"]
    encoding = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=512,
    )
    encoding["labels"] = sample["labels"]
    return encoding

In [19]:
encoded_train = train_dataset.map(
    preprocess_data, batched=True, remove_columns=train_dataset.column_names
)
encoded_test = test_dataset.map(
    preprocess_data, batched=True, remove_columns=test_dataset.column_names
)
encoded_train.set_format("torch")
encoded_test.set_format("torch")

Map:   0%|          | 0/61118 [00:00<?, ? examples/s]

Map:   0%|          | 0/6791 [00:00<?, ? examples/s]

# Bert Training with FocalLoss

In [20]:
class FocalLoss(nn.Module):
    def __init__(self, gamma=0, alpha=None, size_average=True):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.alpha = alpha
        if isinstance(alpha,(float,int)): self.alpha = torch.Tensor([alpha,1-alpha])
        if isinstance(alpha,list): self.alpha = torch.Tensor(alpha)
        self.size_average = size_average

    def forward(self, input, target):
        if input.dim()>2:
            input = input.view(input.size(0),input.size(1),-1)  # N,C,H,W => N,C,H*W
            input = input.transpose(1,2)    # N,C,H*W => N,H*W,C
            input = input.contiguous().view(-1,input.size(2))   # N,H*W,C => N*H*W,C
        target = target.view(-1,1)

        logpt = F.log_softmax(input)
        logpt = logpt.gather(1,target)
        logpt = logpt.view(-1)
        pt = Variable(logpt.data.exp())

        if self.alpha is not None:
            if self.alpha.type()!=input.data.type():
                self.alpha = self.alpha.type_as(input.data)
            at = self.alpha.gather(0,target.data.view(-1))
            logpt = logpt * Variable(at)

        loss = -1 * (1-pt)**self.gamma * logpt
        if self.size_average: return loss.mean()
        else: return loss.sum()

In [21]:
class MultiLabelFocalLoss(nn.Module):
    def __init__(self, alpha=None, gamma=2, ignore_index=-100, reduction='mean'):
        super().__init__()
        self.CE = nn.CrossEntropyLoss(reduction='none', ignore_index=ignore_index)
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, input, target):
        minus_logpt = self.CE(input, target)
        pt = torch.exp(-minus_logpt) 
        focal_loss = (1-pt)**self.gamma * minus_logpt

        if self.alpha != None:
            focal_loss *= self.alpha.gather(0, target)
        
        if self.reduction == 'mean':
            focal_loss = focal_loss.mean()
        elif self.reduction == 'sum':
            focal_loss = focal_loss.sum()
        return focal_loss

In [22]:
class FocalBert(BertForSequenceClassification):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = nn.MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = MultiLabelFocalLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = nn.BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


In [23]:
def multi_label_metrics(
    predictions: np.ndarray, labels: np.ndarray, threshold: float = 0.5
) -> dict[str, float]:
    """Compute mltilabel metrics.

    Args:
        predictions (np.ndarray): logits array
        labels (np.ndarray): labels array
        threshold (float, optional): activation threshold. Defaults to 0.5.

    Returns:
        dict[str, float]: metrics dict
    """
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    f1_micro_average = f1_score(y_true=labels, y_pred=y_pred, average="micro")
    roc_auc = roc_auc_score(labels, y_pred, average="micro")
    accuracy = accuracy_score(labels, y_pred)
    metrics = {"f1": f1_micro_average, "roc_auc": roc_auc, "accuracy": accuracy}
    return metrics


def compute_metrics(p: EvalPrediction) -> dict[str, float]:
    """Metrics computation wrapper.

    Args:
        p (EvalPrediction): hf model output

    Returns:
        dict[str, float]: metrics dict
    """
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_label_metrics(predictions=preds, labels=p.label_ids)
    return result

In [24]:
def make_training_pipeline(
    exp_name: str,
    tokenizer: AutoTokenizer,
    train_dataset: Dataset,
    eval_dataset: Dataset,
    batch_size: int = 64,
    lr: float = 2e-5,
    epochs_num: int = 20,
    model_name=None
) -> Trainer:
    """Training process wrapper.

    Args:
        exp_name (str): name of the local folder
        for saving model checkpoints.
        tokenizer (AutoTokenizer): model tokenizer
        train_dataset (Dataset): train dataset split
        eval_dataset (Dataset): test dataset split
        batch_size (int, optional): number of samples
        in sigle batch. Defaults to 32.
        lr (float, optional): model's learning rate. Defaults to 2e-5.
        epochs_num (int, optional):
        number of training iterations. Defaults to 20.

    Returns:
        Trainer: hf training pipeline abstraction class.
    """
    args = TrainingArguments(
        exp_name,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs_num,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        fp16=True,
    )

    model = FocalBert.from_pretrained(
        "cointegrated/rubert-tiny2",
        problem_type="multi_label_classification",
        num_labels=len(target),
        id2label=target,
        label2id=reverse_target
    )
    if model_name is not None:
        model.load_state_dict(torch.load(model_name))

    trainer = Trainer(
        model,
        args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )
    return trainer

In [25]:
BATCH_SIZE = 180
EPOCHS = 75
LR = 2e-5

In [26]:
trainer = make_training_pipeline("f_focal", tokenizer, encoded_train, encoded_test, batch_size=BATCH_SIZE, epochs_num=EPOCHS, lr=LR)

Some weights of FocalBert were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.340161,0.636617,0.749821,0.462229
2,No log,0.273522,0.740604,0.823965,0.544692
3,0.348400,0.244477,0.777118,0.853319,0.580769
4,0.348400,0.220424,0.793016,0.863309,0.601826


In [None]:
trainer.evaluate()

{'eval_loss': 0.15058404207229614,
 'eval_f1': 0.8803578859758635,
 'eval_roc_auc': 0.9268149647197956,
 'eval_accuracy': 0.768811662494478,
 'eval_runtime': 8.3648,
 'eval_samples_per_second': 811.851,
 'eval_steps_per_second': 6.456,
 'epoch': 30.0}

In [None]:
train_preds = trainer.predict(encoded_train)

In [None]:
test_preds = trainer.predict(encoded_test)

In [31]:
print(compute_metrics(test_preds))

{'f1': 0.8803578859758635, 'roc_auc': 0.9268149647197956, 'accuracy': 0.768811662494478}


In [38]:
trainer = make_training_pipeline("v", tokenizer, encoded_train, encoded_test, batch_size=BATCH_SIZE, epochs_num=EPOCHS, lr=LR, model_name="f/checkpoint-4590/pytorch_model.bin")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
train_preds = trainer.predict(encoded_train)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [40]:
test_preds = trainer.predict(encoded_test)

# Meta Model Training 

Сделать сравнение с Focal Loss и без него

In [41]:
train_dataset

Dataset({
    features: ['text', 'Направление', 'Факультет', 'Оценка', 'Neutral', 'Positive', 'Negative', 'Exclamations', 'have_code', 'Neutral_NLP', 'Positive_NLP', 'Negative_NLP', 'Speech_NLP', 'labels'],
    num_rows: 61118
})

In [63]:
directions = pd.get_dummies(train_df.to_pandas()["Направление"])
departments = pd.get_dummies(train_df.to_pandas()["Факультет"])
meta_dataset_train = train_df.select(pl.exclude("Направление", "Факультет")).to_pandas()
meta_dataset_train = pd.concat([meta_dataset_train, directions, departments, pd.DataFrame(train_preds.predictions)], axis=1)
meta_dataset_train = meta_dataset_train.drop(columns=["text"])

meta_dataset_test = test_df.select(pl.exclude("Направление", "Факультет")).to_pandas()
meta_dataset_test = pd.concat([meta_dataset_test, directions, departments, pd.DataFrame(test_preds.predictions)], axis=1)
meta_dataset_test = meta_dataset_test.drop(columns=["text"])

meta_dataset_test = meta_dataset_test.dropna(subset=["labels"])


In [64]:
meta_dataset_train[[f"label{i}" for i in range(7)]] = np.array([el for el in meta_dataset_train["labels"].to_numpy()])
meta_dataset_test[[f"label{i}" for i in range(7)]] = np.array([el for el in meta_dataset_test["labels"].to_numpy()])

In [65]:
meta_dataset_train = meta_dataset_train.drop(columns=["labels"])
meta_dataset_test = meta_dataset_test.drop(columns=["labels"])

In [66]:
from autogluon.tabular import TabularDataset, TabularPredictor
from autogluon.common.utils.utils import setup_outputdir
from autogluon.core.utils.loaders import load_pkl
from autogluon.core.utils.savers import save_pkl
import os.path

class MultilabelPredictor():
    """ Tabular Predictor for predicting multiple columns in table.
        Creates multiple TabularPredictor objects which you can also use individually.
        You can access the TabularPredictor for a particular label via: `multilabel_predictor.get_predictor(label_i)`

        Parameters
        ----------
        labels : List[str]
            The ith element of this list is the column (i.e. `label`) predicted by the ith TabularPredictor stored in this object.
        path : str, default = None
            Path to directory where models and intermediate outputs should be saved.
            If unspecified, a time-stamped folder called "AutogluonModels/ag-[TIMESTAMP]" will be created in the working directory to store all models.
            Note: To call `fit()` twice and save all results of each fit, you must specify different `path` locations or don't specify `path` at all.
            Otherwise files from first `fit()` will be overwritten by second `fit()`.
            Caution: when predicting many labels, this directory may grow large as it needs to store many TabularPredictors.
        problem_types : List[str], default = None
            The ith element is the `problem_type` for the ith TabularPredictor stored in this object.
        eval_metrics : List[str], default = None
            The ith element is the `eval_metric` for the ith TabularPredictor stored in this object.
        consider_labels_correlation : bool, default = True
            Whether the predictions of multiple labels should account for label correlations or predict each label independently of the others.
            If True, the ordering of `labels` may affect resulting accuracy as each label is predicted conditional on the previous labels appearing earlier in this list (i.e. in an auto-regressive fashion).
            Set to False if during inference you may want to individually use just the ith TabularPredictor without predicting all the other labels.
        kwargs :
            Arguments passed into the initialization of each TabularPredictor.

    """

    multi_predictor_file = 'multilabel_predictor.pkl'

    def __init__(self, labels, path=None, problem_types=None, eval_metrics=None, consider_labels_correlation=True, **kwargs):
        if len(labels) < 2:
            raise ValueError("MultilabelPredictor is only intended for predicting MULTIPLE labels (columns), use TabularPredictor for predicting one label (column).")
        if (problem_types is not None) and (len(problem_types) != len(labels)):
            raise ValueError("If provided, `problem_types` must have same length as `labels`")
        if (eval_metrics is not None) and (len(eval_metrics) != len(labels)):
            raise ValueError("If provided, `eval_metrics` must have same length as `labels`")
        self.path = setup_outputdir(path, warn_if_exist=False)
        self.labels = labels
        self.consider_labels_correlation = consider_labels_correlation
        self.predictors = {}  # key = label, value = TabularPredictor or str path to the TabularPredictor for this label
        if eval_metrics is None:
            self.eval_metrics = {}
        else:
            self.eval_metrics = {labels[i] : eval_metrics[i] for i in range(len(labels))}
        problem_type = None
        eval_metric = None
        for i in range(len(labels)):
            label = labels[i]
            path_i = self.path + "Predictor_" + label
            if problem_types is not None:
                problem_type = problem_types[i]
            if eval_metrics is not None:
                eval_metric = eval_metrics[i]
            self.predictors[label] = TabularPredictor(label=label, problem_type=problem_type, eval_metric=eval_metric, path=path_i, **kwargs)

    def fit(self, train_data, tuning_data=None, **kwargs):
        """ Fits a separate TabularPredictor to predict each of the labels.

            Parameters
            ----------
            train_data, tuning_data : str or autogluon.tabular.TabularDataset or pd.DataFrame
                See documentation for `TabularPredictor.fit()`.
            kwargs :
                Arguments passed into the `fit()` call for each TabularPredictor.
        """
        if isinstance(train_data, str):
            train_data = TabularDataset(train_data)
        if tuning_data is not None and isinstance(tuning_data, str):
            tuning_data = TabularDataset(tuning_data)
        train_data_og = train_data.copy()
        if tuning_data is not None:
            tuning_data_og = tuning_data.copy()
        else:
            tuning_data_og = None
        save_metrics = len(self.eval_metrics) == 0
        for i in range(len(self.labels)):
            label = self.labels[i]
            predictor = self.get_predictor(label)
            if not self.consider_labels_correlation:
                labels_to_drop = [l for l in self.labels if l != label]
            else:
                labels_to_drop = [self.labels[j] for j in range(i+1, len(self.labels))]
            train_data = train_data_og.drop(labels_to_drop, axis=1)
            if tuning_data is not None:
                tuning_data = tuning_data_og.drop(labels_to_drop, axis=1)
            print(f"Fitting TabularPredictor for label: {label} ...")
            predictor.fit(train_data=train_data, tuning_data=tuning_data, **kwargs)
            self.predictors[label] = predictor.path
            if save_metrics:
                self.eval_metrics[label] = predictor.eval_metric
        self.save()

    def predict(self, data, **kwargs):
        """ Returns DataFrame with label columns containing predictions for each label.

            Parameters
            ----------
            data : str or autogluon.tabular.TabularDataset or pd.DataFrame
                Data to make predictions for. If label columns are present in this data, they will be ignored. See documentation for `TabularPredictor.predict()`.
            kwargs :
                Arguments passed into the predict() call for each TabularPredictor.
        """
        return self._predict(data, as_proba=False, **kwargs)

    def predict_proba(self, data, **kwargs):
        """ Returns dict where each key is a label and the corresponding value is the `predict_proba()` output for just that label.

            Parameters
            ----------
            data : str or autogluon.tabular.TabularDataset or pd.DataFrame
                Data to make predictions for. See documentation for `TabularPredictor.predict()` and `TabularPredictor.predict_proba()`.
            kwargs :
                Arguments passed into the `predict_proba()` call for each TabularPredictor (also passed into a `predict()` call).
        """
        return self._predict(data, as_proba=True, **kwargs)

    def evaluate(self, data, **kwargs):
        """ Returns dict where each key is a label and the corresponding value is the `evaluate()` output for just that label.

            Parameters
            ----------
            data : str or autogluon.tabular.TabularDataset or pd.DataFrame
                Data to evalate predictions of all labels for, must contain all labels as columns. See documentation for `TabularPredictor.evaluate()`.
            kwargs :
                Arguments passed into the `evaluate()` call for each TabularPredictor (also passed into the `predict()` call).
        """
        data = self._get_data(data)
        eval_dict = {}
        for label in self.labels:
            print(f"Evaluating TabularPredictor for label: {label} ...")
            predictor = self.get_predictor(label)
            eval_dict[label] = predictor.evaluate(data, **kwargs)
            if self.consider_labels_correlation:
                data[label] = predictor.predict(data, **kwargs)
        return eval_dict

    def save(self):
        """ Save MultilabelPredictor to disk. """
        for label in self.labels:
            if not isinstance(self.predictors[label], str):
                self.predictors[label] = self.predictors[label].path
        save_pkl.save(path=self.path+self.multi_predictor_file, object=self)
        print(f"MultilabelPredictor saved to disk. Load with: MultilabelPredictor.load('{self.path}')")

    @classmethod
    def load(cls, path):
        """ Load MultilabelPredictor from disk `path` previously specified when creating this MultilabelPredictor. """
        path = os.path.expanduser(path)
        if path[-1] != os.path.sep:
            path = path + os.path.sep
        return load_pkl.load(path=path+cls.multi_predictor_file)

    def get_predictor(self, label):
        """ Returns TabularPredictor which is used to predict this label. """
        predictor = self.predictors[label]
        if isinstance(predictor, str):
            return TabularPredictor.load(path=predictor)
        return predictor

    def _get_data(self, data):
        if isinstance(data, str):
            return TabularDataset(data)
        return data.copy()

    def _predict(self, data, as_proba=False, **kwargs):
        data = self._get_data(data)
        if as_proba:
            predproba_dict = {}
        for label in self.labels:
            print(f"Predicting with TabularPredictor for label: {label} ...")
            predictor = self.get_predictor(label)
            if as_proba:
                predproba_dict[label] = predictor.predict_proba(data, as_multiclass=True, **kwargs)
            data[label] = predictor.predict(data, **kwargs)
        if not as_proba:
            return data[self.labels]
        else:
            return predproba_dict

In [69]:
save_path = "agModels-predictClass"
labels = [f"label{i}" for i in range(7)]
problem_types = [f"binary" for i in range(7)]
eval_metrics = [f"roc_auc" for i in range(7)]

In [70]:
predictor = MultilabelPredictor(labels=labels, problem_types=problem_types, eval_metrics=eval_metrics, path=save_path).fit(meta_dataset_train, presets='best_quality')
results = predictor.fit_summary()

Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 3600 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: agModels-predictClassPredictor_label0/ds_sub_fit/sub_fit_ho.
Beginning AutoGluon training ... Time limit = 900s
AutoGluon will save models to "agModels-predictClassPredictor_label0/ds_sub_fit/sub_fit_ho"
AutoGluon Version:  1.0.0
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #112-Ubuntu SMP Tue Ma

Fitting TabularPredictor for label: label0 ...


			Note: Converting 65 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFeatureGenerator...
	Useless Original Features (Count: 1): ['U']
		These features carry no predictive signal and should be manually investigated.
		This is typically a feature which has the same value for all rows.
		These features do not need to be present at inference time.
	Unused Original Features (Count: 19): ['H', 'M', 'N', 'Q', 'S', 'T', 'V', 'W', 'X', '100.0', '109.0', '112.0', '113.0', '114.0', '115.0', '125.0', '127.0', '129.0', '132.0']
		These features were not used to generate any of the output features. Add a feature generator compatible with these features to utilize them.
		Features can also be unused if they carry very little information, such as being categorical b

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

	0.9824	 = Validation score   (roc_auc)
	11.83s	 = Training   runtime
	0.8s	 = Validation runtime
Fitting model: LightGBM_BAG_L1 ... Training model for up to 582.27s of the 882.14s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy (8 workers, per: cpus=8, gpus=0, memory=0.06%)
	0.9838	 = Validation score   (roc_auc)
	5.05s	 = Training   runtime
	0.37s	 = Validation runtime
Fitting model: RandomForestGini_BAG_L1 ... Training model for up to 575.85s of the 875.72s of remaining time.
	0.9865	 = Validation score   (roc_auc)
	0.95s	 = Training   runtime
	1.26s	 = Validation runtime
Fitting model: RandomForestEntr_BAG_L1 ... Training model for up to 573.43s of the 873.31s of remaining time.
	0.9867	 = Validation score   (roc_auc)
	0.89s	 = Training   runtime
	1.13s	 = Validation runtime
Fitting model: CatBoost_BAG_L1 ... Training model for up to 571.24s of the 871.11s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting wit

In [None]:
model = RandomForestClassifier(max_depth=10, criterion="gini", n_estimators=150, class_weight="balanced")

In [None]:
model.fit(X=train_df.loc[:, train_df.columns != 'Категория'], y=train_df["Категория"])

In [None]:
pred_labels = model.predict(X=test_df.loc[:, train_df.columns != 'Категория'])
gt_labels = test_df["Категория"].values
pred_labels = [id2label[x] for x in pred_labels]
gt_labels = [id2label[x] for x in gt_labels]

In [None]:
cr = classification_report(gt_labels, pred_labels, output_dict=True)
cr = pd.DataFrame(cr).T
print(cr)

cm = confusion_matrix(gt_labels, pred_labels, labels=list(label2id.keys()))

              precision    recall  f1-score      support
Видео          0.938500  0.865975  0.900780  4335.000000
ДЗ             0.962364  0.918466  0.939903  4538.000000
Лонгрид        0.265690  0.622549  0.372434   408.000000
Тест           0.824786  0.804167  0.814346   240.000000
accuracy       0.879004  0.879004  0.879004     0.879004
macro avg      0.747835  0.802789  0.756866  9521.000000
weighted avg   0.918176  0.879004  0.894607  9521.000000


In [None]:
import plotly.figure_factory as ff
x = list(label2id.keys())
y = list(reversed(label2id.keys()))
fig = ff.create_annotated_heatmap(np.flipud(cm), x=x, y=y, colorscale="Viridis")
fig.update_layout(title_text="Confusion matrix")
fig.add_annotation(
    dict(
        x=0.5,
        y=-0.15,
        showarrow=False,
        text="Predicted value",
        xref="paper",
        yref="paper",
    )
)

fig.add_annotation(
    dict(
        x=-0.16,
        y=0.5,
        showarrow=False,
        text="Real value",
        textangle=-90,
        xref="paper",
        yref="paper",
    )
)

fig["data"][0]["showscale"] = True
fig.show()

In [None]:
import eli5

In [None]:
eli5.explain_weights(model, target_names=id2label, feature_names=train_df.loc[:, train_df.columns != 'Категория'].columns.values)

Weight,Feature
0.2307  ± 0.2420,3
0.1977  ± 0.2157,0
0.1900  ± 0.2752,1
0.1808  ± 0.2737,2
0.0364  ± 0.0942,Speech_NLP
0.0330  ± 0.0774,Оценка
0.0312  ± 0.0638,Neutral_NLP
0.0236  ± 0.0336,Positive_NLP
0.0202  ± 0.0415,Negative_NLP
0.0087  ± 0.0238,Exclamations
