In [1]:
from typing import List, Tuple
import numpy as np
import re
import spacy
from spacy.lang.ru.stop_words import STOP_WORDS
from datasets import load_dataset
from gensim.models import KeyedVectors
from gensim.models.phrases import Phrases, Phraser

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_sib200_ru():
    trainset = load_dataset('Davlan/sib200', 'rus_Cyrl', split='train')
    valset = load_dataset('Davlan/sib200', 'rus_Cyrl', split='validation')
    testset = load_dataset('Davlan/sib200', 'rus_Cyrl', split='test')

    X_train, y_train = trainset['text'], trainset['category']
    X_val, y_val = valset['text'], valset['category']
    X_test, y_test = testset['text'], testset['category']

    categories = sorted(list(set(y_train)))
    y_train = [categories.index(it) for it in y_train]
    y_val = [categories.index(it) for it in y_val]
    y_test = [categories.index(it) for it in y_test]

    return (X_train, y_train), (X_val, y_val), (X_test, y_test), categories

train_data, val_data, test_data, classes_list = load_sib200_ru()


nlp = spacy.load("ru_core_news_sm", disable=["ner", "parser"])

# заменяем ссылки, почты и числа спецтокенами.
def normalize_text(s: str, nlp_pipeline: spacy.Language):
    s = re.sub(r'https?://\S+|www\.\S+', '<URL>', s)
    s = re.sub(r'\S+@\S+', '<EMAIL>', s)
    s = re.sub(r'\d+', '<NUM>', s)
    doc = nlp_pipeline(s)
    tokens = []
    for token in doc:
        if token.is_punct or token.is_space or token.lower_ in STOP_WORDS:
            continue
        lemma = token.lemma_.lower()
        if len(lemma) <= 2:
            continue
        tokens.append(lemma)
    return tokens

print(" Нормализация текста...")
X_train_norm = [normalize_text(it, nlp) for it in train_data[0]]
X_val_norm = [normalize_text(it, nlp) for it in val_data[0]]
X_test_norm = [normalize_text(it, nlp) for it in test_data[0]]

# Биграммы / триграммы

print(" Построение биграмм/триграмм...")
bigram = Phrases(X_train_norm, min_count=5, threshold=10, delimiter='_')
trigram = Phrases(bigram[X_train_norm], min_count=5, threshold=10, delimiter='_')
bigram_mod = Phraser(bigram)
trigram_mod = Phraser(trigram)

def make_ngrams(texts):
    return [trigram_mod[bigram_mod[toks]] for toks in texts]

X_train_norm = make_ngrams(X_train_norm)
X_val_norm = make_ngrams(X_val_norm)
X_test_norm = make_ngrams(X_test_norm)

print(X_train_norm[0])

 Нормализация текста...
 Построение биграмм/триграмм...
['турция', 'три', 'сторона', 'окружить', 'морями', 'запад', 'эгейским', 'север', 'чёрный', 'средиземный']


In [10]:
print(" Обучение TF-IDF...")
tfidf = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False)
tfidf.fit(X_train_norm)
idf_dict = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))

print(" Загрузка FastText...")
MODEL_PATH_VEC = "./models/cc.ru.300.vec"
w2v = KeyedVectors.load_word2vec_format(MODEL_PATH_VEC, binary=False)

def sentence_vector_tfidf(tokens):
    vecs, weights = [], []
    for w in tokens:
        if w in w2v and w in idf_dict:
            vecs.append(w2v[w])
            weights.append(idf_dict[w])
    if not vecs:
        return np.zeros(w2v.vector_size)
    vecs = np.array(vecs)
    weights = np.array(weights).reshape(-1, 1)
    return np.sum(vecs * weights, axis=0) / np.sum(weights)






 Обучение TF-IDF...
 Загрузка FastText...




In [11]:
from sklearn.preprocessing import normalize

print(" Векторизация...")
X_train_vec = np.array([sentence_vector_tfidf(toks) for toks in X_train_norm])
X_val_vec = np.array([sentence_vector_tfidf(toks) for toks in X_val_norm])
X_test_vec = np.array([sentence_vector_tfidf(toks) for toks in X_test_norm])

pca = PCA(n_components=120, random_state=42)
X_train_vec = pca.fit_transform(X_train_vec)
X_val_vec = pca.transform(X_val_vec)
X_test_vec = pca.transform(X_test_vec)


X_train_vec = normalize(X_train_vec, norm='l2')
X_val_vec = normalize(X_val_vec, norm='l2')
X_test_vec = normalize(X_test_vec, norm='l2')

 Векторизация...


In [12]:


models = {
    'SVM': (LinearSVC(random_state=42),
            {
                'cls__C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10],
                'cls__class_weight': ['balanced'],
                'cls__max_iter': [30000, 10000, 15000, 20000, 100000]
            }),
    
}

best_models = {}

for name, (model, params) in models.items():
    print(f"\n Обучаем {name}...")
    pipeline = Pipeline([
        ('scaler', StandardScaler()), # Нормализует признаки(усредняет и масштабирует до стандартного отклонения 1)
        ('cls', model)
    ])

    grid = GridSearchCV(
        estimator=pipeline,
        param_grid=params,
        scoring='f1_macro',
        cv=5,
        n_jobs=-1,
        verbose=1
    )

    grid.fit(X_train_vec, train_data[1])
    best_models[name] = grid.best_estimator_

    print(f"\n Лучшие параметры для {name}: {grid.best_params_}")
    print(f" Лучший F1-macro (train CV): {grid.best_score_:.4f}")

    # Валидация
    print(f"\n Валидация ({name}):")
    y_val_pred = grid.predict(X_val_vec)
    print(classification_report(val_data[1], y_val_pred, target_names=classes_list))

# Финальная модель и тест

best_model_name = max(best_models.keys(), key=lambda n: best_models[n].score(X_val_vec, val_data[1]))
final_model = best_models[best_model_name]

print(f"\n Лучшая модель: {best_model_name}")

print("\n Тест:")
y_test_pred = final_model.predict(X_test_vec)
print(classification_report(test_data[1], y_test_pred, target_names=classes_list))


 Обучаем SVM...
Fitting 5 folds for each of 35 candidates, totalling 175 fits

 Лучшие параметры для SVM: {'cls__C': 0.01, 'cls__class_weight': 'balanced', 'cls__max_iter': 30000}
 Лучший F1-macro (train CV): 0.7402

 Валидация (SVM):
                    precision    recall  f1-score   support

     entertainment       0.83      0.56      0.67         9
         geography       0.46      0.75      0.57         8
            health       0.64      0.82      0.72        11
          politics       0.90      0.64      0.75        14
science/technology       0.71      0.80      0.75        25
            sports       0.71      0.83      0.77        12
            travel       0.71      0.50      0.59        20

          accuracy                           0.70        99
         macro avg       0.71      0.70      0.69        99
      weighted avg       0.72      0.70      0.70        99


 Лучшая модель: SVM

 Тест:
                    precision    recall  f1-score   support

     entert