In [11]:
# ================================================
# 1. Library imports
# ================================================
from typing import List, Tuple
import numpy as np
import re
import spacy
from spacy.lang.ru.stop_words import STOP_WORDS
from datasets import load_dataset
from gensim.models import KeyedVectors
from gensim.models.phrases import Phrases, Phraser

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression


# ================================================
# 2. Dataset loading
# ================================================
def load_sib200_ru():
    """Load the Russian (``rus_Cyrl``) configuration of ``Davlan/sib200``.

    The train, validation and test splits are fetched with ``load_dataset``.
    String categories are converted to integer ids, where an id is the
    position of the category name in the sorted list of categories seen in
    the training split.

    Returns:
        A 4-tuple ``((X_train, y_train), (X_val, y_val), (X_test, y_test),
        categories)``: the ``text`` column and the list of integer label ids
        for each split, followed by the sorted category names.

    Raises:
        ValueError: if a split contains a category that does not occur in
            the training split (raised by ``list.index``).
    """
    splits = {
        split_name: load_dataset('Davlan/sib200', 'rus_Cyrl', split=split_name)
        for split_name in ('train', 'validation', 'test')
    }

    # The label vocabulary is derived from the training split only.
    categories = sorted(set(splits['train']['category']))

    def as_pair(dataset):
        """Return ``(texts, label_ids)`` for one split."""
        label_ids = [categories.index(label) for label in dataset['category']]
        return dataset['text'], label_ids

    return (
        as_pair(splits['train']),
        as_pair(splits['validation']),
        as_pair(splits['test']),
        categories,
    )

# Each *_data value is a (texts, label_ids) pair; classes_list holds the
# sorted category names that the label ids index into.
train_data, val_data, test_data, classes_list = load_sib200_ru()


# ================================================
# 3. Preprocessing: plain lemmatization
# ================================================
# Load the small Russian spaCy pipeline with the "ner" and "parser"
# components disabled.
nlp = spacy.load("ru_core_news_sm", disable=["ner", "parser"])

def normalize_text(s: str, nlp_pipeline: spacy.Language):
    """Turn a raw string into a list of lower-cased lemmas.

    URLs, e-mail-like tokens and digit runs are first replaced (in that
    order) by the placeholders ``<URL>``, ``<EMAIL>`` and ``<NUM>``. The
    result is passed through ``nlp_pipeline``; punctuation, whitespace
    tokens and tokens whose lower-cased form is in ``STOP_WORDS`` are
    discarded, as are lemmas of two characters or fewer.

    Args:
        s: The text to normalize.
        nlp_pipeline: Callable pipeline that produces the tokens to filter.

    Returns:
        The list of surviving lemmas, lower-cased, in document order.
    """
    placeholder_rules = (
        (r'https?://\S+|www\.\S+', '<URL>'),
        (r'\S+@\S+', '<EMAIL>'),
        (r'\d+', '<NUM>'),
    )
    for pattern, placeholder in placeholder_rules:
        s = re.sub(pattern, placeholder, s)

    # Lemmas of every token that is not punctuation, whitespace or a stop word.
    lemmas = (
        token.lemma_.lower()
        for token in nlp_pipeline(s)
        if not (token.is_punct or token.is_space or token.lower_ in STOP_WORDS)
    )
    # Very short lemmas (length <= 2) are dropped as well.
    return [lemma for lemma in lemmas if len(lemma) > 2]

# Run every text of each split through normalize_text with the shared
# pipeline; each text becomes a list of lemma strings.
print("üßπ –ù–æ—Ä–º–∞–ª–∏–∑–∞—Ü–∏—è —Ç–µ–∫—Å—Ç–∞...")
X_train_norm = [normalize_text(it, nlp) for it in train_data[0]]
X_val_norm = [normalize_text(it, nlp) for it in val_data[0]]
X_test_norm = [normalize_text(it, nlp) for it in test_data[0]]


# ================================================
# 4. Bigrams / trigrams
# ================================================
print("üîó –ü–æ—Å—Ç—Ä–æ–µ–Ω–∏–µ –±–∏–≥—Ä–∞–º–º/—Ç—Ä–∏–≥—Ä–∞–º–º...")
# Phrase models are learned from the training tokens only. The second model
# is trained on the bigram-merged training stream, so it can join an already
# merged pair with a further token.
bigram = Phrases(X_train_norm, min_count=5, threshold=10, delimiter='_')
trigram = Phrases(bigram[X_train_norm], min_count=5, threshold=10, delimiter='_')
# Phraser wrappers around the trained models; these are what make_ngrams
# applies to the token lists.
bigram_mod = Phraser(bigram)
trigram_mod = Phraser(trigram)

def make_ngrams(texts):
    """Apply the bigram model and then the trigram model to each token list.

    Args:
        texts: Iterable of token lists.

    Returns:
        A list with one phrase-merged token sequence per input item.
    """
    merged = []
    for token_list in texts:
        with_bigrams = bigram_mod[token_list]
        merged.append(trigram_mod[with_bigrams])
    return merged

# Rebind each split to its phrase-merged version.
X_train_norm = make_ngrams(X_train_norm)
X_val_norm = make_ngrams(X_val_norm)
X_test_norm = make_ngrams(X_test_norm)


# ================================================
# 5. TF-IDF + FastText
# ================================================
print("üìà –û–±—É—á–µ–Ω–∏–µ TF-IDF...")
# The documents are already token lists, so the tokenizer is the identity
# function and lowercasing is switched off.
# NOTE(review): scikit-learn may warn that token_pattern is unused when a
# custom tokenizer is supplied — confirm and pass token_pattern=None if so.
tfidf = TfidfVectorizer(
    tokenizer=lambda x: x,
    lowercase=False,
    sublinear_tf=True,
    min_df=3,
    max_df=0.8
)
# Fit on the training split only; the vectorizer is used solely as a source
# of per-term IDF weights.
tfidf.fit(X_train_norm)
# Map each vocabulary term to its IDF weight.
idf_dict = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))

print("üíæ –ó–∞–≥—Ä—É–∑–∫–∞ FastText...")
# Word vectors are read from a local text-format (binary=False) word2vec file.
MODEL_PATH_VEC = "./models/cc.ru.300.vec"
w2v = KeyedVectors.load_word2vec_format(MODEL_PATH_VEC, binary=False)

def sentence_vector_tfidf(tokens):
    """Return the IDF-weighted mean of the word vectors of ``tokens``.

    Only tokens present both in ``w2v`` and in ``idf_dict`` contribute; each
    vector is weighted by the token's IDF value and the weighted sum is
    divided by the sum of the weights.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A 1-D array of length ``w2v.vector_size``; all zeros when no token
        is usable.
    """
    usable = [w for w in tokens if w in w2v and w in idf_dict]
    if not usable:
        return np.zeros(w2v.vector_size)

    vecs = np.array([w2v[w] for w in usable])
    # Column vector so the weights broadcast across the embedding dimension.
    weights = np.array([idf_dict[w] for w in usable]).reshape(-1, 1)
    weighted_sum = np.sum(vecs * weights, axis=0)
    return weighted_sum / np.sum(weights)


print("üìê –í–µ–∫—Ç–æ—Ä–∏–∑–∞—Ü–∏—è...")
# One IDF-weighted sentence vector per document, stacked into a 2-D array.
X_train_vec = np.array([sentence_vector_tfidf(toks) for toks in X_train_norm])
X_val_vec = np.array([sentence_vector_tfidf(toks) for toks in X_val_norm])
X_test_vec = np.array([sentence_vector_tfidf(toks) for toks in X_test_norm])

# PCA + normalization
# The projection is fitted on the training split only and then applied
# unchanged to the validation and test splits.
pca = PCA(n_components=150, random_state=42)
X_train_vec = pca.fit_transform(X_train_vec)
X_val_vec = pca.transform(X_val_vec)
X_test_vec = pca.transform(X_test_vec)

# Scale every row to unit L2 norm.
X_train_vec = normalize(X_train_vec, norm='l2')
X_val_vec = normalize(X_val_vec, norm='l2')
X_test_vec = normalize(X_test_vec, norm='l2')


# ================================================
# 6. Logistic Regression (instead of SVM)
# ================================================
print("\nüöÄ –û–±—É—á–∞–µ–º Logistic Regression...")
# Single-step pipeline; the step name 'cls' is the prefix used by the
# parameter grid below.
pipeline = Pipeline([
    ('cls', LogisticRegression(max_iter=20000, class_weight='balanced', solver='liblinear'))
])

# Only the regularization strength C is varied; the penalty is fixed to l2.
params = {
    'cls__C': [0.01, 0.05, 0.1, 0.5, 1],
    'cls__penalty': ['l2']
}

# 5-fold cross-validated search scored by macro-averaged F1, using all
# available cores (n_jobs=-1).
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring='f1_macro',
    cv=5,
    n_jobs=-1,
    verbose=2
)

# The search sees the training split only; validation data is held out.
grid.fit(X_train_vec, train_data[1])
best_model = grid.best_estimator_

print(f"\n‚úÖ –õ—É—á—à–∏–µ –ø–∞—Ä–∞–º–µ—Ç—Ä—ã: {grid.best_params_}")
print(f"–õ—É—á—à–∏–π F1-macro (train CV): {grid.best_score_:.4f}")

# Evaluate the selected model on the held-out validation split.
print("\nüìä –í–∞–ª–∏–¥–∞—Ü–∏—è:")
y_val_pred = best_model.predict(X_val_vec)
print(classification_report(val_data[1], y_val_pred, target_names=classes_list))


# ================================================
# 7. Final model and test
# ================================================
# Combine train + val


üßπ –ù–æ—Ä–º–∞–ª–∏–∑–∞—Ü–∏—è —Ç–µ–∫—Å—Ç–∞...
üîó –ü–æ—Å—Ç—Ä–æ–µ–Ω–∏–µ –±–∏–≥—Ä–∞–º–º/—Ç—Ä–∏–≥—Ä–∞–º–º...
üìà –û–±—É—á–µ–Ω–∏–µ TF-IDF...
üíæ –ó–∞–≥—Ä—É–∑–∫–∞ FastText...




üìê –í–µ–∫—Ç–æ—Ä–∏–∑–∞—Ü–∏—è...

üöÄ –û–±—É—á–∞–µ–º Logistic Regression...
Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] END .......................cls__C=0.01, cls__penalty=l2; total time=   0.0s
[CV] END .......................cls__C=0.01, cls__penalty=l2; total time=   0.0s
[CV] END .......................cls__C=0.01, cls__penalty=l2; total time=   0.0s
[CV] END .......................cls__C=0.01, cls__penalty=l2; total time=   0.0s
[CV] END ........................cls__C=0.1, cls__penalty=l2; total time=   0.0s
[CV] END .......................cls__C=0.05, cls__penalty=l2; total time=   0.0s
[CV] END ........................cls__C=0.5, cls__penalty=l2; total time=   0.0s
[CV] END .......................cls__C=0.05, cls__penalty=l2; total time=   0.0s
[CV] END ........................cls__C=0.1, cls__penalty=l2; total time=   0.0s
[CV] END .......................cls__C=0.05, cls__penalty=l2; total time=   0.0s
[CV] END .......................cls__C=0.05, cls__pen

