# Загрузка + разбиение

In [None]:
import gzip
from dataclasses import dataclass

@dataclass
class Text:
    """One corpus record: a category label plus the title and body of a document."""

    label: str  # category name; later used as the classification target
    title: str  # headline of the document
    text: str   # body of the document


def read_texts(fn):
    """Lazily yield one ``Text`` per non-blank line of a gzip-compressed TSV file.

    Each line is expected to look like ``label<TAB>title<TAB>text``.

    Args:
        fn: Path to the gzip file; it is decoded as UTF-8.

    Yields:
        Text: the parsed record for each non-blank line, in file order.

    Raises:
        TypeError: if a non-blank line has fewer than three tab-separated fields.
    """
    with gzip.open(fn, "rt", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing empty line) carry no record.
                continue
            # Split on the first two tabs only, so a tab inside the body
            # stays part of `text` instead of producing surplus fields.
            yield Text(*line.split("\t", 2))

# Read the whole corpus into memory so it can be traversed more than once.
texts = [*read_texts("./data/news.txt.gz")]

# Targets are the record labels; every document is the title and the body
# joined by a single space.
labels = [item.label for item in texts]
docs = [" ".join((item.title, item.text)) for item in texts]

# Cell output: (number of documents, number of distinct labels).
len(docs), len(set(labels))


(10000, 10)

In [2]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the documents for testing. The split is stratified on the
# labels and uses a fixed random_state so it is the same on every run.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    docs,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

# Cell output: sizes of the training and the test part.
len(X_train_raw), len(X_test_raw)


(8000, 2000)

# Предобработка

In [3]:
import re
import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available locally.
nltk.download("stopwords")

# NOTE: this rebinds `stopwords`. From here on the name refers to the set of
# Russian stop words below, not to the nltk corpus reader imported above.
# `tokenize` relies on this module-level set.
stopwords = {*stopwords.words("russian")}

def tokenize(text):
    """Split a raw document into lowercase Cyrillic/digit tokens.

    The text is lowercased, and every run of characters other than the
    letters а-я, ё and the digits 0-9 is replaced by a single space, so
    Latin letters and punctuation act as separators. The result is split on
    whitespace; tokens shorter than two characters and tokens found in the
    module-level ``stopwords`` set are dropped.

    Args:
        text: Raw document string.

    Returns:
        list[str]: the surviving tokens in their original order.
    """
    cleaned = re.sub(r"[^а-яё0-9]+", " ", text.lower(), flags=re.IGNORECASE)
    return [
        token
        for token in cleaned.split()
        if len(token) >= 2 and token not in stopwords
    ]


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alex\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
import pymorphy3
# A single analyzer instance shared by every lemmatize() call.
morph = pymorphy3.MorphAnalyzer()

def lemmatize(tokens):
    """Map each token to the normal form of its first morphological parse.

    Args:
        tokens: Iterable of word strings.

    Returns:
        list[str]: one lemma per input token, in the same order.
    """
    lemmas = []
    for token in tokens:
        # morph.parse returns a list of candidate analyses; only the first is used.
        first_parse = morph.parse(token)[0]
        lemmas.append(first_parse.normal_form)
    return lemmas

# Tokenise and then lemmatise every raw document of both splits.
X_train_tok = [lemmatize(toks) for toks in map(tokenize, X_train_raw)]
X_test_tok = [lemmatize(toks) for toks in map(tokenize, X_test_raw)]

# Обучаем word embeddings

In [5]:
from gensim.models import Word2Vec

# Dimensionality of the learned word vectors.
EMB_SIZE = 200

# The embeddings are trained on the training split only.
# NOTE(review): the gensim docs state that training with more than one worker
# thread is not exactly repeatable between runs. Use workers=1 if bit-for-bit
# reproducibility matters.
w2v = Word2Vec(
    sentences=X_train_tok,
    vector_size=EMB_SIZE,
    sg=1,          # skip-gram training algorithm
    negative=10,   # number of negative samples
    window=5,      # context window size
    min_count=2,   # ignore words seen fewer than 2 times
    epochs=10,
    workers=4,
)


In [6]:
import numpy as np

def doc_vector_mean(model, tokens):
    """Represent a document as the average of its known word vectors.

    Args:
        model: Object exposing ``wv`` (supports ``in`` and ``[]`` lookup by
            word) and ``vector_size``.
        tokens: Iterable of word strings.

    Returns:
        numpy.ndarray: float32 mean of the vectors of all tokens present in
        ``model.wv``, or a float32 zero vector of length ``model.vector_size``
        when none of the tokens is present.
    """
    vectors = model.wv
    known = [vectors[token] for token in tokens if token in vectors]
    if known:
        return np.mean(known, axis=0).astype(np.float32)
    # No token is in the vocabulary: fall back to an all-zero vector.
    return np.zeros(model.vector_size, dtype=np.float32)

# Stack the per-document mean vectors into one matrix per split
# (one row per document), training split first.
Xtr_mean, Xte_mean = (
    np.vstack([doc_vector_mean(w2v, toks) for toks in corpus])
    for corpus in (X_train_tok, X_test_tok)
)

# Cell output: shapes of the two feature matrices.
Xtr_mean.shape, Xte_mean.shape

((8000, 200), (2000, 200))

# Альтернатива через RuBERT

In [7]:
from sentence_transformers import SentenceTransformer

# Pretrained sentence encoder. It is applied to the raw (untokenised) documents.
sbert = SentenceTransformer("cointegrated/rubert-tiny2")

# Encode the training split first, then the test split, with identical settings.
XtrBert, XteBert = (
    sbert.encode(
        raw_docs,
        batch_size=64,
        show_progress_bar=True,
        normalize_embeddings=True,
    )
    for raw_docs in (X_train_raw, X_test_raw)
)

  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 125/125 [03:14<00:00,  1.56s/it]
Batches: 100%|██████████| 32/32 [00:43<00:00,  1.35s/it]


# Классификация + оценка

In [8]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score

def train_eval(Xtr, Xte, name):
    """Fit a scaled linear SVM on one feature set and print its test metrics.

    A ``StandardScaler`` + ``LinearSVC`` pipeline is fitted on ``Xtr`` with
    the module-level ``y_train``. Predictions for ``Xte`` are scored against
    the module-level ``y_test``.

    Args:
        Xtr: Training feature matrix, row-aligned with ``y_train``.
        Xte: Test feature matrix, row-aligned with ``y_test``.
        name: Title printed above the metrics.

    Returns:
        None. The accuracy and the per-class report are printed.
    """
    model = make_pipeline(StandardScaler(), LinearSVC(C=1.0, random_state=42))
    y_pred = model.fit(Xtr, y_train).predict(Xte)

    print(f"\n=== {name} ===")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred, digits=4))

# Evaluate the classifier on the averaged Word2Vec document vectors.
train_eval(Xtr_mean, Xte_mean, "Word2vec")



=== Word2vec ===
Accuracy: 0.835
              precision    recall  f1-score   support

    business     0.5686    0.4028    0.4715        72
     culture     0.8624    0.9211    0.8908       279
   economics     0.8167    0.8909    0.8522       275
      forces     0.7987    0.7987    0.7987       154
        life     0.8090    0.7912    0.8000       273
       media     0.8098    0.8373    0.8233       295
     science     0.8382    0.7972    0.8172       286
       sport     0.9550    0.9583    0.9567       288
       style     0.8485    0.7179    0.7778        39
      travel     0.6774    0.5385    0.6000        39

    accuracy                         0.8350      2000
   macro avg     0.7984    0.7654    0.7788      2000
weighted avg     0.8316    0.8350    0.8320      2000



In [9]:
# Same classifier and report, this time on the sentence-transformer embeddings.
train_eval(XtrBert, XteBert, "RuBert")


=== RuBert ===
Accuracy: 0.8335
              precision    recall  f1-score   support

    business     0.6571    0.6389    0.6479        72
     culture     0.8472    0.9140    0.8793       279
   economics     0.8375    0.8618    0.8495       275
      forces     0.7888    0.8247    0.8063       154
        life     0.8165    0.7985    0.8074       273
       media     0.8121    0.7763    0.7938       295
     science     0.8479    0.7797    0.8124       286
       sport     0.9310    0.9375    0.9343       288
       style     0.8000    0.8205    0.8101        39
      travel     0.6977    0.7692    0.7317        39

    accuracy                         0.8335      2000
   macro avg     0.8036    0.8121    0.8073      2000
weighted avg     0.8335    0.8335    0.8329      2000

