In [1]:
!pip install spacy
!pip install razdel
!pip install nltk
!python -m spacy download ru_core_news_lg
!pip install gensim
!pip install sentence-transformers
!pip install transformers
!pip install torch

Collecting razdel
  Downloading razdel-0.5.0-py3-none-any.whl.metadata (10.0 kB)
Downloading razdel-0.5.0-py3-none-any.whl (21 kB)
Installing collected packages: razdel
Successfully installed razdel-0.5.0
Collecting ru-core-news-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_lg-3.8.0/ru_core_news_lg-3.8.0-py3-none-any.whl (513.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m513.4/513.4 MB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pymorphy3>=1.0.0 (from ru-core-news-lg==3.8.0)
  Downloading pymorphy3-2.0.6-py3-none-any.whl.metadata (2.4 kB)
Collecting dawg2-python>=0.8.0 (from pymorphy3>=1.0.0->ru-core-news-lg==3.8.0)
  Downloading dawg2_python-0.9.0-py3-none-any.whl.metadata (7.5 kB)
Collecting pymorphy3-dicts-ru (from pymorphy3>=1.0.0->ru-core-news-lg==3.8.0)
  Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl.metadata (2.0 kB)
Downloading pymorphy3-2.0.6-py3-none-any.whl (53 

In [2]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import torch
from sentence_transformers import SentenceTransformer

In [3]:
import re
import math
import json
import string
import numpy as np
from collections import Counter
from typing import List, Dict

import spacy
from razdel import tokenize, sentenize
import ru_core_news_lg
from gensim import corpora, models
from sentence_transformers import SentenceTransformer

In [4]:
!pip install catboost
!pip install xgboost
!pip install lightgbm

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [5]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, f1_score, accuracy_score
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier

In [6]:
nlp = ru_core_news_lg.load()
VOWELS = "аеёиоуыэюя"

In [7]:
class BaseFeatureExtractor:
    """
    Базовый абстрактный класс для всех извлекателей признаков
    """

    def extract(self, text: str) -> Dict:
        # Основной метод извлечения признаков
        raise NotImplementedError

    def get_tokens(self, text: str) -> List[str]:
        # Получение списка слов
        return [t.text.lower() for t in tokenize(text) if t.text.isalpha()]

    def get_sentences(self, text: str) -> List[str]:
        # Разбиение текста на предложения
        return [s.text for s in sentenize(text)]

    def count_syllables(self, word: str) -> int:
        # Подсчёт количества слогов в слове
        return sum(1 for ch in word.lower() if ch in VOWELS)

In [8]:
class ReadabilityFeatures(BaseFeatureExtractor):

    """
    Индексы читабельности
    """

    #  Flesch–Kincaid readability index
    def flesch_kincaid(self, text):
        words = self.get_tokens(text)
        sents = self.get_sentences(text)

        if not words or not sents:
            return 0

        syllables = sum(self.count_syllables(w) for w in words)
        ASL = len(words) / len(sents)
        ASW = syllables / len(words)

        return 0.5 * ASL + 8.4 * ASW - 15.59

    # Coleman–Liau index
    def coleman_liau(self, text):
        words = self.get_tokens(text)
        sents = self.get_sentences(text)

        if not words:
            return 0

        chars = sum(len(w) for w in words)
        L = (chars / len(words)) * 100
        S = (len(sents) / len(words)) * 100

        return 0.055 * L - 0.35 * S - 20.33

    # Automated Readability Index
    def ari(self, text):
        words = self.get_tokens(text)
        sents = self.get_sentences(text)

        if not words or not sents:
            return 0

        chars = sum(len(w) for w in words)

        return 6.26 * (chars / len(words)) + 0.2805 * (len(words) / len(sents)) - 31.04

    # SMOG grade
    def smog(self, text):
        words = self.get_tokens(text)
        sents = self.get_sentences(text)

        if not sents:
            return 0

        polysyll = sum(1 for w in words if self.count_syllables(w) >= 3)

        return 1.1 * math.sqrt(polysyll * (64.6 / len(sents))) + 0.05


    def extract(self, text):
        return {
        "flesch_kincaid": self.flesch_kincaid(text),
        "ari": self.ari(text),
        "coleman_liau": self.coleman_liau(text),
        "smog": self.smog(text),
        }

In [9]:
class TraditionalFeatures(BaseFeatureExtractor):

    """
    Базовые  признаки
    """

    def extract(self, text):
        tokens = self.get_tokens(text)
        sents = self.get_sentences(text)

        sent_lens = [len(self.get_tokens(s)) for s in sents]
        word_lens = [len(w) for w in tokens]

        vocab = set(tokens)

        long_words = [w for w in tokens if self.count_syllables(w) > 4]

        doc = nlp(text)

        nouns = [t.text for t in doc if t.pos_ == "NOUN"]
        adjs = [t.text for t in doc if t.pos_ == "ADJ"]
        verbs = [t.text for t in doc if t.pos_ == "VERB"]

        # Type/token ratio
        def ttr(lst):
            return len(set(lst)) / max(1, len(lst))

        # NAV: TTR for Nouns only plus TTR for Adjectives only divided by
        # TTR for Verbs only
        nav = (ttr(nouns) + ttr(adjs)) / max(1e-5, ttr(verbs))

        return {
            "avg_sent_len": np.mean(sent_lens) if sent_lens else 0,
            "med_sent_len": np.median(sent_lens) if sent_lens else 0,
            "avg_word_len": np.mean(word_lens) if word_lens else 0,
            "med_word_len": np.median(word_lens) if word_lens else 0,
            "long_word_ratio": len(long_words) / max(1, len(tokens)),
            "ttr": len(vocab) / max(1, len(tokens)),
            "nav_ratio": nav,
            }

In [10]:
class MorphologicalFeatures(BaseFeatureExtractor):

    """
    Морфологические признаки
    """

    def extract(self, text):
        doc = nlp(text)

        pos_counts = Counter()
        cases = Counter()
        aspects = Counter()
        tenses = Counter()
        voices = Counter()
        animacy = Counter()
        participles = 0
        gerunds = 0

        total = 0

        for t in doc:
            if not t.is_alpha:
                continue

            total += 1
            pos_counts[t.pos_] += 1

            morph = t.morph

            # Падеж
            if "Case" in morph:
                cases[morph.get("Case")[0]] += 1
            # Вид
            if "Aspect" in morph:
                aspects[morph.get("Aspect")[0]] += 1

            # Время
            if "Tense" in morph:
                tenses[morph.get("Tense")[0]] += 1

            # Залог
            if "Voice" in morph:
                voices[morph.get("Voice")[0]] += 1

            # Одушевленность
            if "Animacy" in morph:
                animacy[morph.get("Animacy")[0]] += 1

            # Причастия
            if "VerbForm=Part" in str(morph):
                participles += 1

            # Деепричастия
            if "VerbForm=Conv" in str(morph):
                gerunds += 1

        feats = {}
        total = max(1, total)

        for k, v in pos_counts.items():
            feats[f"pos_{k}"] = v / max(1, total)

        for k, v in cases.items():
            feats[f"case_{k}"] = v / max(1, total)

        for k, v in aspects.items():
            feats[f"aspect_{k}"] = v / max(1, total)

        for k, v in tenses.items():
            feats[f"tense_{k}"] = v / max(1, total)

        for k, v in voices.items():
            feats[f"voice_{k}"] = v / max(1, total)

        for k, v in animacy.items():
            feats[f"animacy_{k}"] = v / max(1, total)

        feats["participle_ratio"] = participles / total
        feats["gerund_ratio"] = gerunds / total

        return feats

In [11]:
class PunctuationFeatures(BaseFeatureExtractor):
    """
    Пунктуационные признаки
    """
    def extract(self, text):
        tokens = list(tokenize(text))
        total = len(tokens)

        punct = [t.text for t in tokens if t.text in string.punctuation]
        semicolons = [p for p in punct if p == ";"]

        return {
        "punct_ratio": len(punct) / max(1, total),
        "semicolon_ratio": len(semicolons) / max(1, total),
        }

In [12]:
class SyntacticFeatures(BaseFeatureExtractor):
    """
    Синтаксические признаки
    """

    def extract(self, text):
        doc = nlp(text)

        depths = []             # syntactic tree depth
        dep_dists = []          # distance between a node and its descendant

        clauses = 0
        advcl = 0               # adverbial clause modifiers
        acl = 0                 # adnominal clauses
        ccomp = 0               # clausal complements
        xcomp = 0               # open clausal complements
        nominal_modifiers = 0
        max_nominal_chain = 0



In [13]:
class SyntacticFeatures(BaseFeatureExtractor):

    """
    Синтаксические признаки
    """

    def extract(self, text):
        doc = nlp(text)

        tree_depths = []          # глубина деревьев
        dependency_distances = []  # расстояния head-token

        clause_counts = {
            "ccomp": 0,             # clausal complements
            "xcomp": 0,             # open clausal complements
            "advcl": 0,             # adverbial clause modifiers
            "acl": 0                # adnominal clauses
        }

        nominal_modifiers = 0
        max_nominal_chain = 0

        sent_count = 0

        def get_depth(token):
            """
            Рекурсивно считает глубину поддерева
            """

            if not list(token.children):
                return 1

            return 1 + max(get_depth(child) for child in token.children)

        for sent in doc.sents:
            sent_count += 1
            roots = [t for t in sent if t.head == t]

            if roots:
                depth = max(get_depth(root) for root in roots)
                tree_depths.append(depth)

            for t in sent:
                # Расстояние в дереве
                if t.head != t:
                    dist = abs(t.i - t.head.i)
                    dependency_distances.append(dist)

                # Клаузы
                if t.dep_ in clause_counts:
                    clause_counts[t.dep_] += 1

                # Именные группы
                if t.pos_ == "NOUN":

                    modifiers = []

                    for child in t.children:
                        if child.dep_ in {
                            "amod",
                            "nmod",
                            "acl",
                            "det",
                            "nummod",
                            "compound"
                        }:
                            modifiers.append(child)

                    nominal_modifiers += len(modifiers)

                    # Цепочки модификаторов
                    for m in modifiers:
                        length = 1
                        current = m

                        while True:
                            next_mod = None

                            for ch in current.children:

                                if ch.dep_ in {
                                    "amod",
                                    "nmod",
                                    "compound"
                                }:
                                    next_mod = ch
                                    break

                            if next_mod:
                                length += 1
                                current = next_mod
                            else:
                                break

                        max_nominal_chain = max(
                            max_nominal_chain,
                            length
                        )

        feats = {}

        # Глубина дерева
        feats["tree_depth_mean"] = np.mean(tree_depths) if tree_depths else 0
        feats["tree_depth_median"] = np.median(tree_depths) if tree_depths else 0
        feats["tree_depth_max"] = max(tree_depths) if tree_depths else 0

        # Расстояния зависимостей
        feats["dep_dist_mean"] = np.mean(dependency_distances) if dependency_distances else 0
        feats["dep_dist_median"] = np.median(dependency_distances) if dependency_distances else 0
        feats["dep_dist_max"] = max(dependency_distances) if dependency_distances else 0

        # Клаузы
        for k, v in clause_counts.items():
            feats[f"clause_{k}"] = v

        feats["clause_total"] = sum(clause_counts.values())

        # Именные модификаторы
        feats["nominal_modifiers_count"] = nominal_modifiers
        feats["max_nominal_chain"] = max_nominal_chain

        # Нормализация по предложениям
        feats["sent_count"] = sent_count

        if sent_count > 0:
            feats["nominal_modifiers_per_sent"] = (
                nominal_modifiers / sent_count
            )

            feats["clauses_per_sent"] = (
                feats["clause_total"] / sent_count
            )

        else:
            feats["nominal_modifiers_per_sent"] = 0
            feats["clauses_per_sent"] = 0

        return feats

In [14]:
class NERFeatures(BaseFeatureExtractor):
    """
    Именованные сущности
    """
    def extract(self, text):

        doc = nlp(text)

        ents = Counter(ent.label_ for ent in doc.ents)
        total = sum(ents.values())

        feats = {}

        #  Именнованные сущности по типам
        for k, v in ents.items():
            feats[f"ner_{k}"] = v / max(1, total)

        feats["ner_total"] = total

        return feats

In [15]:
class LDAFeatures(BaseFeatureExtractor):
    """
    Тематические признаки LDA
    """

    def __init__(self, num_topics=100):
        self.num_topics = num_topics
        self.lda = None
        self.dictionary = None


    def fit(self, texts: List[str]):
        tokenized = [self.get_tokens(t) for t in texts]

        self.dictionary = corpora.Dictionary(tokenized)
        corpus = [self.dictionary.doc2bow(t) for t in tokenized]

        self.lda = models.LdaModel(
            corpus,
            num_topics=self.num_topics,
            id2word=self.dictionary,
            passes=10
            )

    def extract(self, text):
        if self.lda is None:
            raise ValueError()

        bow = self.dictionary.doc2bow(self.get_tokens(text))
        topics = self.lda.get_document_topics(bow)
        vec = np.zeros(self.num_topics)
        for i, w in topics:
            vec[i] = w

        return {f"topic_{i}": vec[i] for i in range(self.num_topics)}

In [16]:
class EmbeddingFeaturesWithPCA(BaseFeatureExtractor):
    """
    Класс для эмбеддингов с PCA уменьшением размерности
    """

    def __init__(self,
                 model_name='cointegrated/rubert-tiny2',
                 n_components=128):

        super().__init__()

        self.model = SentenceTransformer(model_name)
        self.original_dim = self.model.get_sentence_embedding_dimension()
        self.n_components = n_components

        self.pca = PCA(n_components=n_components)
        self.pca_fitted = False

    def fit_pca(self, texts: list):

        # Получаем эмбеддинги для всех текстов
        embeddings = []
        batch_size = 32

        for i in range(0, len(texts), batch_size):
            batch = texts[i:i+batch_size]
            batch_embeddings = self.model.encode(
                batch,
                convert_to_tensor=False,
                show_progress_bar=False
            )
            embeddings.append(batch_embeddings)

        # Объединяем все эмбеддинги
        embeddings = np.vstack(embeddings)

        # Обучаем PCA
        self.pca.fit(embeddings)
        self.pca_fitted = True


    def extract(self, text: str) -> Dict:
        # Получаем оригинальный эмбеддинг
        embedding = self.model.encode(
            text,
            convert_to_tensor=False,
            show_progress_bar=False
        )

        # Применяем PCA, если он обучен
        if self.pca_fitted:
            embedding = self.pca.transform(embedding.reshape(1, -1)).flatten()
        else:
            embedding = embedding[:self.n_components]

        features = {}
        for i in range(len(embedding)):
            features[f"emb_pca_{i}"] = float(embedding[i])

        return features

In [17]:
class FeaturePipeline:
    """
    Конвейер признаков
    """
    def __init__(self, extractors: List[BaseFeatureExtractor]):
        self.extractors = extractors

    def extract(self, text: str) -> Dict:
        features = {}
        for ext in self.extractors:
            features.update(ext.extract(text))
        return features

    def extract_batch(self, texts: List[str], batch_size: int = 64) -> pd.DataFrame:
        all_features = []

        for i in tqdm(range(0, len(texts), batch_size), desc="Извлечение признаков"):
            batch = texts[i:i+batch_size]
            batch_features = []

            for text in batch:
                feats = self.extract(str(text))
                batch_features.append(feats)

            all_features.extend(batch_features)

        features_df = pd.DataFrame(all_features)

        features_df = features_df.fillna(0)

        return features_df

In [19]:
train_df = pd.read_csv('train_dataset.csv')
test_df = pd.read_csv('test_dataset.csv')
val_df = pd.read_csv('val_dataset.csv')

all_texts = pd.concat([train_df['text'], test_df['text'], val_df['text']]).reset_index(drop=True)
all_texts_df = pd.DataFrame({'text': all_texts})

In [20]:
n_train = len(train_df)
n_test = len(test_df)
n_val = len(val_df)

In [21]:
# Инициализация LDA
lda_extractor = LDAFeatures(num_topics=30)
lda_extractor.fit(all_texts_df['text'].tolist())

In [22]:
# Инициализация эмбеддингов
emb_extractor = EmbeddingFeaturesWithPCA()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/118M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/401 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [23]:
# Создание пайплайна
pipeline = FeaturePipeline([
    ReadabilityFeatures(),
    TraditionalFeatures(),
    MorphologicalFeatures(),
    PunctuationFeatures(),
    NERFeatures(),
    SyntacticFeatures(),
    lda_extractor,
    emb_extractor
])

In [24]:
all_features = pipeline.extract_batch(all_texts_df['text'].tolist(), batch_size=64)

Извлечение признаков:   0%|          | 0/414 [00:00<?, ?it/s]

In [25]:
X_train = all_features.iloc[:n_train].reset_index(drop=True)
X_test = all_features.iloc[n_train:n_train+n_test].reset_index(drop=True)
X_val = all_features.iloc[n_train+n_test:n_train+n_test+n_val].reset_index(drop=True)
print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"X_val: {X_val.shape}")

X_train: (21163, 209)
X_test: (2645, 209)
X_val: (2646, 209)


In [26]:
le = LabelEncoder()
y_train_enc = le.fit_transform(train_df['cefr_level'])
y_test_enc = le.transform(test_df['cefr_level'])
y_val_enc = le.transform(val_df['cefr_level'])

In [27]:
# Инициализируем модели
models = {
    'CatBoost': CatBoostClassifier(
        iterations=500,
        learning_rate=0.05,
        depth=6,
        loss_function='MultiClass',
        verbose=0,
        random_seed=42
    ),
    'XGBoost': XGBClassifier(
        n_estimators=500,
        max_depth=6,
        learning_rate=0.05,
        random_state=42,
        use_label_encoder=False,
        eval_metric='mlogloss',
        n_jobs=-1
    ),
    'LightGBM': LGBMClassifier(
        n_estimators=500,
        max_depth=6,
        learning_rate=0.05,
        random_state=42,
        n_jobs=-1
    ),
    'RandomForest': RandomForestClassifier(
        n_estimators=500,
        random_state=42,
        n_jobs=-1
    )
}

In [28]:
val_results = []

for name, model in tqdm(models.items(), desc="Обучение моделей"):
    print(f"\n{'='*40}")
    print(f"Модель: {name}")
    print(f"{'='*40}")

    model.fit(X_train, y_train_enc)

    y_val_pred = model.predict(X_val)

    val_accuracy = accuracy_score(y_val_enc, y_val_pred)
    val_f1_weighted = f1_score(y_val_enc, y_val_pred, average='weighted')
    val_f1_macro = f1_score(y_val_enc, y_val_pred, average='macro')

    val_results.append({
        'model_name': name,
        'model': model,
        'val_accuracy': val_accuracy,
        'val_f1_weighted': val_f1_weighted,
        'val_f1_macro': val_f1_macro
    })

    print(f"\nРезультаты на валидации:")
    print(f"  Accuracy: {val_accuracy:.4f}")
    print(f"  F1 Weighted: {val_f1_weighted:.4f}")
    print(f"  F1 Macro: {val_f1_macro:.4f}")

Обучение моделей:   0%|          | 0/4 [00:00<?, ?it/s]


Модель: CatBoost

Результаты на валидации:
  Accuracy: 0.4924
  F1 Weighted: 0.4887
  F1 Macro: 0.4887

Модель: XGBoost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Результаты на валидации:
  Accuracy: 0.5163
  F1 Weighted: 0.5151
  F1 Macro: 0.5151

Модель: LightGBM
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.074679 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 44853
[LightGBM] [Info] Number of data points in the train set: 21163, number of used features: 208
[LightGBM] [Info] Start training from score -1.791523
[LightGBM] [Info] Start training from score -1.791807
[LightGBM] [Info] Start training from score -1.791807
[LightGBM] [Info] Start training from score -1.791807
[LightGBM] [Info] Start training from score -1.791807
[LightGBM] [Info] Start training from score -1.791807

Результаты на валидации:
  Accuracy: 0.5325
  F1 Weighted: 0.5313
  F1 Macro: 0.5313

Модель: RandomForest

Результаты на валидации:
  Accuracy: 0.4630
  F1 Weighted: 0.4587
  F1 Macro: 0.4587


In [29]:
results_df = pd.DataFrame(val_results)
results_df = results_df.sort_values('val_f1_weighted', ascending=False)

print("Результаты на валидации:")
print("-" * 60)
print(results_df[['model_name', 'val_accuracy', 'val_f1_weighted', 'val_f1_macro']]
      .to_string(index=False))

Результаты на валидации:
------------------------------------------------------------
  model_name  val_accuracy  val_f1_weighted  val_f1_macro
    LightGBM      0.532502         0.531302      0.531302
     XGBoost      0.516251         0.515126      0.515126
    CatBoost      0.492441         0.488703      0.488703
RandomForest      0.462963         0.458735      0.458735


In [30]:
best_result = results_df.iloc[0]
best_model_name = best_result['model_name']
print(f'Лучшая модель: {best_model_name}')

Лучшая модель: LightGBM


In [31]:
X_train_full = pd.concat([X_train, X_val], ignore_index=True)
y_train_full = np.concatenate([y_train_enc, y_val_enc])

print(f"X_train_full: {X_train_full.shape}")
print(f"y_train_full: {y_train_full.shape}")

X_train_full: (23809, 209)
y_train_full: (23809,)


In [32]:
final_model = XGBClassifier(
    n_estimators=750,
    max_depth=8,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=0.1,
    reg_alpha=0.1,
    reg_lambda=1,
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss',
    early_stopping_rounds=100,
    n_jobs=-1
)

In [33]:
final_model.fit(
    X_train_full,
    y_train_full,
    eval_set=[(X_test, y_test_enc)],
    verbose=100
)

Parameters: { "use_label_encoder" } are not used.

  self.starting_round = model.num_boosted_rounds()


[0]	validation_0-mlogloss:1.77775
[100]	validation_0-mlogloss:1.35847
[200]	validation_0-mlogloss:1.27786
[300]	validation_0-mlogloss:1.24194
[400]	validation_0-mlogloss:1.22471
[500]	validation_0-mlogloss:1.21516
[600]	validation_0-mlogloss:1.21023
[700]	validation_0-mlogloss:1.20790
[749]	validation_0-mlogloss:1.20710


In [34]:
print(f"Best iteration: {final_model.best_iteration}")

Best iteration: 748


In [35]:
y_pred = final_model.predict(X_test, iteration_range=(0, final_model.best_iteration + 1))

In [36]:
print("Классификационный отчет (TEST)")
print("=" * 60)
print(classification_report(
    y_test_enc,
    y_pred,
    target_names=le.classes_,
    digits=4))

Классификационный отчет (TEST)
              precision    recall  f1-score   support

          A1     0.5219    0.5409    0.5312       440
          A2     0.4347    0.4376    0.4362       441
          B1     0.4745    0.4218    0.4466       441
          B2     0.5558    0.5986    0.5764       441
          C1     0.6437    0.6349    0.6393       441
          C2     0.5847    0.5873    0.5860       441

    accuracy                         0.5369      2645
   macro avg     0.5359    0.5369    0.5359      2645
weighted avg     0.5359    0.5369    0.5359      2645



In [37]:
final_model.save_model(f'XGBClassifier_best_{final_model.best_iteration}.json')

In [38]:
accuracy = accuracy_score(y_test_enc, y_pred)
f1_weighted = f1_score(y_test_enc, y_pred, average='weighted')
f1_macro = f1_score(y_test_enc, y_pred, average='macro')

report_data = {
    'model_info': {
        'model_type': 'XGBoost',
        'best_iteration': int(final_model.best_iteration),
        'best_score': float(final_model.best_score),
        'n_features': X_train_full.shape[1],
        'classes': le.classes_.tolist()
    },
    'dataset_info': {
        'train_samples': len(X_train),
        'val_samples': len(X_val),
        'test_samples': len(X_test),
        'train_full_samples': len(X_train_full)
    },
    'metrics': {
        'accuracy': float(accuracy),
        'f1_weighted': float(f1_weighted),
        'f1_macro': float(f1_macro),
    },
    "classification_report": classification_report(y_test_enc,
                                                   y_pred,
                                                   target_names=le.classes_,
                                                   digits=4,
                                                   output_dict=True)
}

report_filename = f'XGBClassifier_best_{final_model.best_iteration}_report.json'
with open(report_filename, 'w', encoding='utf-8') as f:
    json.dump(report_data, f, indent=2, ensure_ascii=False)

In [40]:
X_train.to_csv('X_train_features.csv', index=False)
X_test.to_csv('X_test_features.csv', index=False)
X_val.to_csv('X_val_features.csv', index=False)

np.save('y_train_enc.npy', y_train_enc)
np.save('y_test_enc.npy', y_test_enc)
np.save('y_val_enc.npy', y_val_enc)

In [42]:
X_train_full.to_csv('X_train_full_features.csv', index=False)
np.save('y_train_full_enc.npy', y_train_full)