In [1]:
import re
import torch
import polars as pl
import pandas as pd
import numpy as np
import warnings
from autogluon.tabular import TabularPredictor
import seaborn as sns
from typing import Any
from functools import partial
from datasets import load_dataset, Dataset
import optuna
from optuna.samplers import TPESampler
from nltk.corpus import stopwords
from string import punctuation as PUNCT
from dostoevsky.tokenization import RegexTokenizer
from dostoevsky.models import FastTextSocialNetworkModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EvalPrediction,
    EarlyStoppingCallback,
)
from sklearn.metrics import (
    f1_score,
    roc_auc_score,
    accuracy_score,
    multilabel_confusion_matrix,
    confusion_matrix,
    ConfusionMatrixDisplay,
    classification_report,
    hamming_loss,
    precision_score,
    recall_score,
)

In [2]:
warnings.filterwarnings("ignore")

# Dataset

In [3]:
dataset = pl.read_csv("preprocessing.csv")
dataset.head()

Направление,Факультет,ID студента,Оценка,Категория,Тег,Комментарий,Статус,Neutral,Positive,Negative,Exclamations,have_code,Neutral_NLP,Positive_NLP,Negative_NLP,Speech_NLP
str,f64,f64,f64,str,str,str,str,f64,f64,f64,i64,bool,f64,f64,f64,f64
"""C""",113.0,1493.0,1.0,"""Видео""","""VP2""","""Видео лагает""",,0.0,0.0,0.0,0,False,1.00001,0.010663,1e-05,1e-05
"""C""",113.0,5580.0,5.0,"""ДЗ""","""H3 D""","""Торгом Бабаян!…",,0.0,0.0,0.0,5,False,0.437833,0.056662,0.140346,0.051855
"""E""",126.0,5619.0,5.0,"""ДЗ""","""H3""","""Спасибо)""",,0.0,0.0,0.0,0,False,1e-05,1e-05,1e-05,1.00001
"""E""",123.0,310.0,3.0,"""ДЗ""","""H2 E1""","""комментарий со…",,0.0,0.0,0.0,0,False,0.930468,0.025189,0.119213,0.000378
"""E""",123.0,1913.0,5.0,"""ДЗ""","""H3 D""","""Жонибек, хочу …",,0.0,0.0,0.0,2,False,0.069552,0.217348,0.019134,0.507822


In [4]:
dataset["Тег"].value_counts()

Тег,count
str,u32
"""VC2 S1 S4""",1
"""VC2 VP2 H2 VC4…",1
""" VC1 H1 VC3 H3…",1
"""VP3 H3 LMS VC3…",1
"""VC3 VC2 H1""",2
…,…
"""VC2 VC4""",1
"""S3 VP3 VC3 VP2…",1
"""S3 H3 VC3 VP2""",1
"""VP3 T3""",3


In [5]:
dataset = dataset.with_columns(
    (pl.col("Тег").apply(lambda x: " ".join(re.findall(r"[A-Z]{1,2}\d|LMS", x)))).alias("corrected_tag")
    )

In [6]:
null_filter = (
    (pl.col("corrected_tag").eq(""))
)

dataset = dataset.filter(~null_filter)
dataset = dataset.filter(~(pl.col("Комментарий").is_null()))

In [7]:
dataset = dataset.with_columns(
    pl.col("corrected_tag")
    .str.replace_all(r"VC4|VP4|VC5|S4|T4|H4|EA1", "")
    .str.strip()
    .str.replace(r"\s\s+", " ")
    .str.replace(r"GH3", "H3")
    .str.replace(r"HH3", "H3")
    .str.replace(r"BP3", "VP3")
    .str.replace(r"V3", "VC3")
    .str.replace(r"V2", "VP2")
)

dataset = dataset.filter(~(pl.col("corrected_tag").eq("")))

In [8]:
dataset["corrected_tag"].str.split(by = " ").explode().value_counts(sort=True)

corrected_tag,count
str,u32
"""H3""",20391
"""VC2""",14414
"""VC3""",8111
"""VP3""",4972
"""VP2""",4897
…,…
"""LMS""",669
"""T2""",548
"""T1""",399
"""T3""",234


In [9]:
dataset = dataset.filter(~pl.col("corrected_tag").str.contains("E2"))

In [10]:
def remove_sub_tags(tags: str):
    split = tags.split(sep=" ")
    new_tag = [x[:-1] if x[-1].isdigit() else x for x in split]
    return " ".join(new_tag)

dataset = dataset.with_columns(
    pl.col("corrected_tag").apply(remove_sub_tags)
)

dataset["corrected_tag"].str.split(by = " ").explode().value_counts(sort=True)

corrected_tag,count
str,u32
"""VC""",26304
"""H""",24437
"""VP""",11560
"""S""",1973
"""E""",1785
"""T""",1181
"""LMS""",669


In [11]:
target = dataset["corrected_tag"].str.split(by = " ").explode().unique().sort().to_list()
target = dict(zip(target, range(len(target))))
reverse_target = {v : k for k, v in target.items()}

In [12]:
def vectorize(tags: str) -> list[float]:
    """Turn str with tags into list with digit labels.

    Args:
        tags (str): tag text representation.

    Returns:
        list[float]: numeric labels.
    """
    split = tags.split(sep = " ")
    res = np.zeros(len(target))
    for x in split:
        res[target[x]] = 1
    return res.tolist()

In [13]:
dataset = dataset.with_columns(pl.col("corrected_tag").apply(vectorize).alias("labels"))

In [14]:
dataset

Направление,Факультет,ID студента,Оценка,Категория,Тег,Комментарий,Статус,Neutral,Positive,Negative,Exclamations,have_code,Neutral_NLP,Positive_NLP,Negative_NLP,Speech_NLP,corrected_tag,labels
str,f64,f64,f64,str,str,str,str,f64,f64,f64,i64,bool,f64,f64,f64,f64,str,list[f64]
"""C""",113.0,1493.0,1.0,"""Видео""","""VP2""","""Видео лагает""",,0.0,0.0,0.0,0,false,1.00001,0.010663,0.00001,0.00001,"""VP""","[0.0, 0.0, … 1.0]"
"""C""",113.0,5580.0,5.0,"""ДЗ""","""H3 D""","""Торгом Бабаян!…",,0.0,0.0,0.0,5,false,0.437833,0.056662,0.140346,0.051855,"""H""","[0.0, 1.0, … 0.0]"
"""E""",126.0,5619.0,5.0,"""ДЗ""","""H3""","""Спасибо)""",,0.0,0.0,0.0,0,false,0.00001,0.00001,0.00001,1.00001,"""H""","[0.0, 1.0, … 0.0]"
"""E""",123.0,310.0,3.0,"""ДЗ""","""H2 E1""","""комментарий со…",,0.0,0.0,0.0,0,false,0.930468,0.025189,0.119213,0.000378,"""H E""","[1.0, 1.0, … 0.0]"
"""E""",123.0,1913.0,5.0,"""ДЗ""","""H3 D""","""Жонибек, хочу …",,0.0,0.0,0.0,2,false,0.069552,0.217348,0.019134,0.507822,"""H""","[0.0, 1.0, … 0.0]"
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Z""",133.0,,3.0,"""ДЗ""","""H2""","""требуемый форм…",,0.0,0.0,0.0,0,false,0.822199,0.013647,0.020974,0.00001,"""H""","[0.0, 1.0, … 0.0]"
"""Z""",,,0.0,,"""S1""","""заплатила и да…",,0.0,0.0,0.0,0,false,0.228166,0.042098,0.644235,0.006108,"""S""","[0.0, 0.0, … 0.0]"
"""Z""",,,7.0,,"""LMS""","""Крайне раздраж…",,0.0,0.0,0.0,0,false,0.692652,0.073706,0.262852,0.00523,"""LMS""","[0.0, 0.0, … 0.0]"
"""Z""",,,,,"""VC2 VP2""","""Аналитик данны…",,0.0,0.0,0.0,0,true,0.705795,0.053413,0.320831,0.001511,"""VC VP""","[0.0, 0.0, … 1.0]"


In [15]:
clear_dataset = dataset.select(
    pl.col("Комментарий"),
    pl.col("Направление"),
    pl.col("Факультет"),
    pl.col("Оценка"),
    pl.col("Neutral"),
    pl.col("Positive"),
    pl.col("Negative"),
    pl.col("Exclamations"),
    pl.col("have_code"),
    pl.col("Neutral_NLP"),
    pl.col("Positive_NLP"),
    pl.col("Negative_NLP"),
    pl.col("Speech_NLP"),
    pl.col("corrected_tag"),
    pl.col("labels"),
    pl.col("corrected_tag").str.split(by=" ").alias("temp"),
)
clear_dataset = clear_dataset.explode(columns=["temp"])

In [16]:
train_df, test_df = train_test_split(
    clear_dataset,
    test_size=0.1,
    random_state=3317,
    stratify=clear_dataset["temp"],
)

train_df = train_df.drop(columns=["corrected_tag", "temp"])
test_df = test_df.drop(columns=["corrected_tag", "temp"])

train_df = train_df.rename({"Комментарий": "text"})
test_df = test_df.rename({"Комментарий": "text"})

In [17]:
train_dataset = Dataset.from_pandas(train_df.to_pandas(), split="train")
test_dataset = Dataset.from_pandas(test_df.to_pandas(), split="test")

In [18]:
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")


def preprocess_data(sample: dict[str, Any]) -> dict[str, Any]:
    """Encode input text into sequence of tokens.
    Also add corresponding labels.

    Args:
        sample (dict[str, Any]): raw input text.

    Returns:
        dict[str, Any]: transformed sample with tokenized text and labels.
    """
    text = sample["text"]
    encoding = tokenizer(
        text,
        padding="max_length",
        truncation=True,
        max_length=512,
    )
    encoding["labels"] = sample["labels"]
    return encoding

In [19]:
encoded_train = train_dataset.map(
    preprocess_data, batched=True, remove_columns=train_dataset.column_names
)
encoded_test = test_dataset.map(
    preprocess_data, batched=True, remove_columns=test_dataset.column_names
)
encoded_train.set_format("torch")
encoded_test.set_format("torch")

Map:   0%|          | 0/61118 [00:00<?, ? examples/s]

Map:   0%|          | 0/6791 [00:00<?, ? examples/s]

# Bert Training 

In [20]:
def multi_label_metrics(
    predictions: np.ndarray, labels: np.ndarray, threshold: float = 0.5
) -> dict[str, float]:
    """Compute mltilabel metrics.

    Args:
        predictions (np.ndarray): logits array
        labels (np.ndarray): labels array
        threshold (float, optional): activation threshold. Defaults to 0.5.

    Returns:
        dict[str, float]: metrics dict
    """
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    f1_micro_average = f1_score(y_true=labels, y_pred=y_pred, average="micro")
    roc_auc = roc_auc_score(labels, y_pred, average="micro")
    accuracy = accuracy_score(labels, y_pred)
    metrics = {"f1": f1_micro_average, "roc_auc": roc_auc, "accuracy": accuracy}
    return metrics


def compute_metrics(p: EvalPrediction) -> dict[str, float]:
    """Metrics computation wrapper.

    Args:
        p (EvalPrediction): hf model output

    Returns:
        dict[str, float]: metrics dict
    """
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    result = multi_label_metrics(predictions=preds, labels=p.label_ids)
    return result

In [21]:
def make_training_pipeline(
    exp_name: str,
    tokenizer: AutoTokenizer,
    train_dataset: Dataset,
    eval_dataset: Dataset,
    batch_size: int = 64,
    lr: float = 2e-5,
    epochs_num: int = 20,
    model_name=None
) -> Trainer:
    """Training process wrapper.

    Args:
        exp_name (str): name of the local folder
        for saving model checkpoints.
        tokenizer (AutoTokenizer): model tokenizer
        train_dataset (Dataset): train dataset split
        eval_dataset (Dataset): test dataset split
        batch_size (int, optional): number of samples
        in sigle batch. Defaults to 32.
        lr (float, optional): model's learning rate. Defaults to 2e-5.
        epochs_num (int, optional):
        number of training iterations. Defaults to 20.

    Returns:
        Trainer: hf training pipeline abstraction class.
    """
    args = TrainingArguments(
        exp_name,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=lr,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=epochs_num,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        fp16=True,
    )

    model = AutoModelForSequenceClassification.from_pretrained(
        "cointegrated/rubert-tiny2",
        problem_type="multi_label_classification",
        num_labels=len(target),
        id2label=target,
        label2id=reverse_target
    )
    if model_name is not None:
        model.load_state_dict(torch.load(model_name))

    trainer = Trainer(
        model,
        args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )
    return trainer

In [22]:
BATCH_SIZE = 180
EPOCHS = 50
LR = 2e-5

In [23]:
trainer = make_training_pipeline("f_without_focal", tokenizer, encoded_train, encoded_test, batch_size=BATCH_SIZE, epochs_num=EPOCHS, lr=LR)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.346003,0.616881,0.736454,0.438374
2,No log,0.271092,0.741702,0.822785,0.554263
3,0.344500,0.235977,0.779079,0.853521,0.582094
4,0.344500,0.216731,0.797074,0.86624,0.60963
5,0.344500,0.204721,0.805586,0.870827,0.622441
6,0.211200,0.196783,0.811218,0.87272,0.633044
7,0.211200,0.190647,0.817693,0.878968,0.643351
8,0.211200,0.185663,0.821176,0.880308,0.649831
9,0.178000,0.181578,0.828007,0.886296,0.659255
10,0.178000,0.177097,0.832052,0.887559,0.667648


TrainOutput(global_step=7480, training_loss=0.12872666955631684, metrics={'train_runtime': 4105.645, 'train_samples_per_second': 744.317, 'train_steps_per_second': 2.07, 'total_flos': 1.984358718079795e+16, 'train_loss': 0.12872666955631684, 'epoch': 44.0})

In [31]:
trainer.evaluate()

{'eval_loss': 0.15414921939373016,
 'eval_f1': 0.8772913745650932,
 'eval_roc_auc': 0.9253983535608915,
 'eval_accuracy': 0.7629215137682226,
 'eval_runtime': 4.2929,
 'eval_samples_per_second': 1581.908,
 'eval_steps_per_second': 4.426,
 'epoch': 44.0}

In [32]:
train_preds = trainer.predict(encoded_train)

In [33]:
test_preds = trainer.predict(encoded_test)

In [28]:
print(compute_metrics(test_preds))

{'f1': 0.8772913745650932, 'roc_auc': 0.9253983535608915, 'accuracy': 0.7629215137682226}


In [23]:
trainer = make_training_pipeline("v", tokenizer, encoded_train, encoded_test, batch_size=BATCH_SIZE, epochs_num=EPOCHS, lr=LR, model_name="f/checkpoint-14340/pytorch_model.bin")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
train_preds = trainer.predict(encoded_train)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [25]:
test_preds = trainer.predict(encoded_test)

# Meta Model Training 

Сделать сравнение с Focal Loss и без него

In [37]:
train_dataset

Dataset({
    features: ['text', 'Направление', 'Факультет', 'Оценка', 'Neutral', 'Positive', 'Negative', 'Exclamations', 'have_code', 'Neutral_NLP', 'Positive_NLP', 'Negative_NLP', 'Speech_NLP', 'labels'],
    num_rows: 61118
})

In [38]:
directions = pd.get_dummies(train_df.to_pandas()["Направление"])
departments = pd.get_dummies(train_df.to_pandas()["Факультет"])
meta_dataset_train = train_df.select(pl.exclude("Направление", "Факультет")).to_pandas()
meta_dataset_train = pd.concat([meta_dataset_train, directions, departments, pd.DataFrame(train_preds.predictions)], axis=1)
meta_dataset_train = meta_dataset_train.drop(columns=["text"])

meta_dataset_test = test_df.select(pl.exclude("Направление", "Факультет")).to_pandas()
meta_dataset_test = pd.concat([meta_dataset_test, directions, departments, pd.DataFrame(test_preds.predictions)], axis=1)
meta_dataset_test = meta_dataset_test.drop(columns=["text"])

meta_dataset_test = meta_dataset_test.dropna(subset=["labels"])


In [39]:
from catboost import CatBoostClassifier, Pool

In [40]:
X_train, y_train = meta_dataset_train.drop('labels', axis=1), np.array(meta_dataset_train["labels"].to_list())
X_test, y_test = meta_dataset_test.drop('labels', axis=1), np.array(meta_dataset_test["labels"].to_list())


In [41]:
train_pool = Pool(X_train, y_train)
test_pool = Pool(X_test, y_test)

In [42]:
def objective(trial):
    model = CatBoostClassifier(
        iterations=trial.suggest_int("iterations", 500, 2000),
        learning_rate=trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
        depth=trial.suggest_int("depth", 4, 10),
        l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1e-8, 100.0, log=True),
        bootstrap_type=trial.suggest_categorical("bootstrap_type", ["Bayesian"]),
        random_strength=trial.suggest_float("random_strength", 1e-8, 10.0, log=True),
        bagging_temperature=trial.suggest_float("bagging_temperature", 0.0, 10.0),
        od_type=trial.suggest_categorical("od_type", ["IncToDec", "Iter"]),
        od_wait=trial.suggest_int("od_wait", 10, 50),
        verbose=False,
        task_type="GPU",
        devices='0',
        loss_function = trial.suggest_categorical("loss_function", ["MultiCrossEntropy", "MultiLogloss"]))
    model.fit(train_pool, eval_set=test_pool)
    y_pred = model.predict(test_pool)
    return hamming_loss(y_test, y_pred)


In [43]:
sampler = TPESampler(seed=1337)
study = optuna.create_study(study_name="catboost", direction="minimize", sampler=sampler)
study.optimize(objective, n_trials=50)


[I 2024-05-22 16:42:19,114] A new study created in memory with name: catboost
[I 2024-05-22 16:42:28,049] Trial 0 finished with value: 0.04989797420956308 and parameters: {'iterations': 893, 'learning_rate': 0.0020766721769608226, 'depth': 5, 'l2_leaf_reg': 0.00039189423261207457, 'bootstrap_type': 'Bayesian', 'random_strength': 7.74470472863429e-06, 'bagging_temperature': 5.183928205975371, 'od_type': 'Iter', 'od_wait': 40, 'loss_function': 'MultiLogloss'}. Best is trial 0 with value: 0.04989797420956308.
[I 2024-05-22 16:43:09,065] Trial 1 finished with value: 0.04935103182783937 and parameters: {'iterations': 1443, 'learning_rate': 0.0017787538482591226, 'depth': 10, 'l2_leaf_reg': 0.00027055071527787685, 'bootstrap_type': 'Bayesian', 'random_strength': 0.12765125185466167, 'bagging_temperature': 7.941185757915866, 'od_type': 'Iter', 'od_wait': 33, 'loss_function': 'MultiCrossEntropy'}. Best is trial 1 with value: 0.04935103182783937.
[I 2024-05-22 16:43:10,584] Trial 2 finished wit

In [44]:
print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))


Number of finished trials:  50
Best trial:
  Value:  0.047899530891726445
  Params: 
    iterations: 1411
    learning_rate: 0.006015379212439724
    depth: 10
    l2_leaf_reg: 0.00012435060455436673
    bootstrap_type: Bayesian
    random_strength: 1.2756152905883538e-05
    bagging_temperature: 1.9744454459953242
    od_type: Iter
    od_wait: 20
    loss_function: MultiCrossEntropy


In [46]:
model = CatBoostClassifier(**trial.params, verbose=True)
model.fit(train_pool, eval_set=test_pool)
pred_labels = model.predict(test_pool)


0:	learn: 0.6758143	test: 0.6766364	best: 0.6766364 (0)	total: 231ms	remaining: 5m 25s
1:	learn: 0.6587770	test: 0.6605332	best: 0.6605332 (1)	total: 469ms	remaining: 5m 30s
2:	learn: 0.6422147	test: 0.6448593	best: 0.6448593 (2)	total: 704ms	remaining: 5m 30s
3:	learn: 0.6261522	test: 0.6297606	best: 0.6297606 (3)	total: 934ms	remaining: 5m 28s
4:	learn: 0.6101038	test: 0.6145738	best: 0.6145738 (4)	total: 1.16s	remaining: 5m 26s
5:	learn: 0.5951995	test: 0.6005239	best: 0.6005239 (5)	total: 1.4s	remaining: 5m 28s
6:	learn: 0.5797521	test: 0.5858151	best: 0.5858151 (6)	total: 1.64s	remaining: 5m 29s
7:	learn: 0.5657393	test: 0.5726938	best: 0.5726938 (7)	total: 1.88s	remaining: 5m 29s
8:	learn: 0.5518817	test: 0.5597522	best: 0.5597522 (8)	total: 2.12s	remaining: 5m 30s
9:	learn: 0.5374216	test: 0.5462356	best: 0.5462356 (9)	total: 2.37s	remaining: 5m 31s
10:	learn: 0.5244859	test: 0.5342135	best: 0.5342135 (10)	total: 2.61s	remaining: 5m 32s
11:	learn: 0.5114694	test: 0.5220304	best:

In [47]:
cr = classification_report(np.array(meta_dataset_test["labels"].to_list()), pred_labels, output_dict=True)
cr = pd.DataFrame(cr).T
print(cr)

              precision    recall  f1-score  support
0              0.714765  0.647416  0.679426    329.0
1              0.894897  0.907782  0.901293   2917.0
2              0.886076  0.546875  0.676329    128.0
3              0.892670  0.821687  0.855709    415.0
4              0.767442  0.678082  0.720000    146.0
5              0.881835  0.915031  0.898126   3719.0
6              0.860051  0.887605  0.873611   1904.0
micro avg      0.875194  0.885541  0.880337   9558.0
macro avg      0.842534  0.772068  0.800642   9558.0
weighted avg   0.874511  0.885541  0.879148   9558.0
samples avg    0.859706  0.870417  0.855579   9558.0
