In [1]:
from typing import List, Tuple
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import spacy
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_sib200_ru() -> Tuple[Tuple[List[str], List[int]], Tuple[List[str], List[int]], Tuple[List[str], List[int]], List[str]]:
    trainset = load_dataset('Davlan/sib200', 'rus_Cyrl', split='train')
    X_train = trainset['text']
    y_train = trainset['category']
    valset = load_dataset('Davlan/sib200', 'rus_Cyrl', split='validation')
    X_val = valset['text']
    y_val = valset['category']
    testset = load_dataset('Davlan/sib200', 'rus_Cyrl', split='test')
    X_test = testset['text']
    y_test = testset['category']
    categories = set(y_train)
    unknown_categories = set(y_val) - categories
    if len(unknown_categories) > 0:
        err_msg = f'The categories {unknown_categories} are represented in the validation set, but they are not represented in the training set.'
        raise RuntimeError(err_msg)
    unknown_categories = set(y_test) - categories
    if len(unknown_categories) > 0:
        err_msg = f'The categories {unknown_categories} are represented in the test set, but they are not represented in the training set.'
        raise RuntimeError(err_msg)
    categories = sorted(list(categories))
    y_train = [categories.index(it) for it in y_train]
    y_val = [categories.index(it) for it in y_val]
    y_test = [categories.index(it) for it in y_test]
    return (X_train, y_train), (X_val, y_val), (X_test, y_test), categories

def normalize_text(s: str, nlp_pipeline: spacy.Language) -> str:
    doc = nlp_pipeline(s)
    lemmas = [('<NUM>' if token.like_num else token.lemma_.lower()) for token in filter(lambda it1: not it1.is_punct, doc)]
    if len(lemmas) == 0:
        return ''
    return ' '.join(lemmas)

In [3]:
train_data, val_data, test_data, classes_list = load_sib200_ru()

In [9]:
print(f'Categories: {classes_list}')
print(f'Amout of Categories: {len(classes_list)}')
print()
print(f'X_train length: {len(train_data[0])}')
print(f'y_train length: {len(train_data[1])}')
print()
print(f'X_val length: {len(val_data[0])}')
print(f'y_val length: {len(val_data[1])}')
print()
print(f'X_test length: {len(test_data[0])}')
print(f'y_test length: {len(test_data[1])}')

Categories: ['entertainment', 'geography', 'health', 'politics', 'science/technology', 'sports', 'travel']
Amout of Categories: 7

X_train length: 701
y_train length: 701

X_val length: 99
y_val length: 99

X_test length: 204
y_test length: 204


In [None]:
from nltk.corpus import stopwords
STOP_WORDS = stopwords.words('russian')
print(STOP_WORDS)
print(f'STOP len: {len(STOP_WORDS)}')

['и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с', 'со', 'как', 'а', 'то', 'все', 'она', 'так', 'его', 'но', 'да', 'ты', 'к', 'у', 'же', 'вы', 'за', 'бы', 'по', 'только', 'ее', 'мне', 'было', 'вот', 'от', 'меня', 'еще', 'нет', 'о', 'из', 'ему', 'теперь', 'когда', 'даже', 'ну', 'вдруг', 'ли', 'если', 'уже', 'или', 'ни', 'быть', 'был', 'него', 'до', 'вас', 'нибудь', 'опять', 'уж', 'вам', 'ведь', 'там', 'потом', 'себя', 'ничего', 'ей', 'может', 'они', 'тут', 'где', 'есть', 'надо', 'ней', 'для', 'мы', 'тебя', 'их', 'чем', 'была', 'сам', 'чтоб', 'без', 'будто', 'чего', 'раз', 'тоже', 'себе', 'под', 'будет', 'ж', 'тогда', 'кто', 'этот', 'того', 'потому', 'этого', 'какой', 'совсем', 'ним', 'здесь', 'этом', 'один', 'почти', 'мой', 'тем', 'чтобы', 'нее', 'сейчас', 'были', 'куда', 'зачем', 'всех', 'никогда', 'можно', 'при', 'наконец', 'два', 'об', 'другой', 'хоть', 'после', 'над', 'больше', 'тот', 'через', 'эти', 'нас', 'про', 'всего', 'них', 'какая', 'много', 'разве', 'три', 'эту', 'моя', 'впр

In [14]:
nlp = spacy.load('ru_core_news_sm')

In [15]:
class_probability = 1.0 / len(classes_list)
max_df = 1.0 - 0.2 * class_probability
print(f'Maximal document frequency of term is {max_df}.')

Maximal document frequency of term is 0.9714285714285714.


In [None]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import FeatureUnion
from tqdm import tqdm # Для отслеживания прогресса
import gensim

In [47]:
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w+', 
    max_df=max_df,
    min_df=1, # Фильтруем редкий шум (слова, которые встречаются менее 5 раз)
    stop_words=STOP_WORDS # <-- Используем стоп-слова только здесь!
)

# 2. Настройки для Character N-грамм (Морфология)
char_vectorizer = TfidfVectorizer(
    analyzer='char_wb',
    # min_df и max_df оставим более мягкими, чтобы не отсечь важные части слов
    max_df=1.0, 
    min_df=1 
    # НЕ используем stop_words и token_pattern, т.к. analyzer='char_wb'
)

# --- ФИНАЛЬНЫЙ ПАЙПЛАЙН ---

feature_union = FeatureUnion([
    ('word_feats', word_vectorizer),
    ('char_feats', char_vectorizer)
])

classifier_final = Pipeline(steps=[
    ('union', feature_union), # Шаг 1: Объединение признаков
    ('cls', LinearSVC(
        random_state=42, 
        max_iter=10000, 
        penalty='l2', 
        multi_class='ovr',
        # C и loss будут тюниться в сетке, но можно задать начальное значение
        C=1, 
        loss='squared_hinge'
    )) 
])

# --- СЕТКА ГИПЕРПАРАМЕТРОВ ---

param_grid_final = {
    # Тюнинг LinearSVC (Твои лучшие параметры C=1, loss='squared_hinge')
    'cls__C': [0.5, 1, 2], 
    
    # Тюнинг Word N-грамм (Твои лучшие были (1, 2))
    'union__word_feats__ngram_range': [(1, 2), (1, 3)],

    # Тюнинг Character N-грамм (Новые признаки!)
    'union__char_feats__ngram_range': [ (3, 5), (2, 6) ], 
    
    # Тюнинг min_df для слов (чтобы проверить, не слишком ли агрессивно min_df=5)
    'union__word_feats__min_df': [1, 2, 5]
}

cv_final = GridSearchCV(
    estimator=classifier_final,
    param_grid=param_grid_final,
    scoring='f1_macro',
    cv=5,
    refit=True,
    n_jobs=-1,
    verbose=True
)


In [48]:
X_train_norm = [normalize_text(s, nlp) for s in train_data[0]]
cv_final.fit(X_train_norm, train_data[1])

Fitting 5 folds for each of 36 candidates, totalling 180 fits


0,1,2
,estimator,Pipeline(step...m_state=42))])
,param_grid,"{'cls__C': [0.5, 1, ...], 'union__char_feats__ngram_range': [(3, ...), (2, ...)], 'union__word_feats__min_df': [1, 2, ...], 'union__word_feats__ngram_range': [(1, ...), (1, ...)]}"
,scoring,'f1_macro'
,n_jobs,-1
,refit,True
,cv,5
,verbose,True
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformer_list,"[('word_feats', ...), ('char_feats', ...)]"
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,"['и', 'в', ...]"
,token_pattern,'\\w+'

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'char_wb'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,2
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [49]:
print('Best parameters:')
print(cv_final.best_params_)

print('Best F1-macro:')
print(cv_final.best_score_)

Best parameters:
{'cls__C': 2, 'union__char_feats__ngram_range': (3, 5), 'union__word_feats__min_df': 1, 'union__word_feats__ngram_range': (1, 3)}
Best F1-macro:
0.6725164746756854


In [50]:
best_model = cv_final.best_estimator_
final_report = classification_report(test_data[1], best_model.predict(test_data[0]), target_names=classes_list)
print(final_report)

                    precision    recall  f1-score   support

     entertainment       0.60      0.32      0.41        19
         geography       0.75      0.53      0.62        17
            health       0.81      0.59      0.68        22
          politics       0.80      0.80      0.80        30
science/technology       0.60      0.90      0.72        51
            sports       1.00      0.60      0.75        25
            travel       0.61      0.68      0.64        40

          accuracy                           0.69       204
         macro avg       0.74      0.63      0.66       204
      weighted avg       0.72      0.69      0.68       204

