**Обучение векторных моделей слов**:
   - Модели **Word2Vec** и **Doc2Vec** были обучены с помощью библиотеки `gensim` для извлечения семантической информации из текста.
   - В методе `Word2Vec` документ представляется путем усреднения векторов всех слов в документе.
   - В методе `Doc2Vec` векторы уровня документа генерируются напрямую.

## Экспериментальные результаты

- При представлении документов с помощью Word2Vec точность классификации (accuracy) составляет 70 %, а взвешенный показатель F1 — 69 %.
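- При использовании Doc2Vec для представления документов точность классификации повышается до 79 %, а взвешенный показатель F1 — до 79 %; прирост особенно заметен для небольших категорий (например, для `travel` F1 вырос с 0,12 до 0,55, для `business` — с 0,20 до 0,48).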

In [None]:
import os
import numpy as np
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from nltk.corpus import stopwords

def train_or_load_model(data_path, model_path="word2vec.model"):
    """Return a Word2Vec model, loading it from disk or training a new one.

    If ``model_path`` already exists the saved model is loaded and returned
    unchanged; ``data_path`` is not read in that case.  Otherwise the corpus
    is read from ``data_path`` (UTF-8, tab-separated, text in the third
    column), each text is lower-cased and tokenized with NLTK's
    ``word_tokenize``, and a new model is trained and saved to
    ``model_path``.

    Args:
        data_path: Path to the tab-separated corpus file.
        model_path: Where the trained model is cached.

    Returns:
        The loaded or newly trained ``Word2Vec`` model.
    """
    if os.path.exists(model_path):
        print(f"Loading Models: {model_path}")
        return Word2Vec.load(model_path)

    sentences = []
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.split('\t')
            # Rows with fewer than three fields have no text column; indexing
            # them blindly would raise IndexError, so they are skipped.
            if len(fields) < 3:
                continue
            sentences.append(word_tokenize(fields[2].lower()))

    print("Training a new Word2Vec model...")
    model = Word2Vec(sentences, vector_size=100, window=5, min_count=2, workers=4)
    model.save(model_path)
    print(f"Model saved: {model_path}")
    return model

def document_to_vector(doc, model):
    """Embed a document as the mean of its in-vocabulary word vectors.

    The text is lower-cased and tokenized with NLTK's ``word_tokenize``;
    tokens that are not present in ``model.wv`` are ignored.  When none of
    the tokens is known, a zero vector of length ``model.vector_size`` is
    returned instead.
    """
    keyed_vectors = model.wv
    known = []
    for token in word_tokenize(doc.lower()):
        if token in keyed_vectors:
            known.append(keyed_vectors[token])

    if known:
        return np.mean(known, axis=0)
    return np.zeros(model.vector_size)

def classify_texts(data_path, model):
    """Train and evaluate a linear SVM on averaged Word2Vec document vectors.

    The corpus is read from ``data_path`` (UTF-8, tab-separated; label in the
    first column, text in the third).  Russian stop words are removed from
    each text before it is embedded with ``document_to_vector``.  The data is
    split 80/20 with a fixed random seed, a linear ``SVC`` is fitted on the
    training part, and a classification report for the held-out part is
    printed.

    Args:
        data_path: Path to the tab-separated corpus file.
        model: Trained Word2Vec model passed on to ``document_to_vector``.

    Raises:
        ValueError: If the file contains no row with at least three fields.
    """
    stop_words = set(stopwords.words("russian"))

    labels, texts = [], []
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split('\t')
            # strip() can remove a trailing empty text column, leaving fewer
            # than three fields; such rows are skipped instead of raising
            # IndexError.
            if len(fields) < 3:
                continue
            labels.append(fields[0])
            texts.append(fields[2])

    if not texts:
        raise ValueError(f"No labelled documents found in {data_path}")

    vectors = np.array([
        document_to_vector(
            " ".join(word for word in word_tokenize(text.lower()) if word not in stop_words),
            model
        )
        for text in texts
    ])

    X_train, X_test, y_train, y_test = train_test_split(vectors, labels, test_size=0.2, random_state=42)

    classifier = SVC(kernel='linear')
    print("Train the classifier...")
    classifier.fit(X_train, y_train)

    y_pred = classifier.predict(X_test)
    print("\nClassification report:")
    print(classification_report(y_test, y_pred))

if __name__ == "__main__":
    # The same corpus file is used both to train the embeddings and to
    # build the classification data set.
    data_path, model_path = '../data/news.txt', 'word2vec.model'

    model = train_or_load_model(data_path, model_path=model_path)
    classify_texts(data_path, model=model)


Loading Models: word2vec.model
Train the classifier...

Classification report:
              precision    recall  f1-score   support

    business       0.69      0.11      0.20        79
     culture       0.69      0.68      0.68       279
   economics       0.66      0.86      0.74       266
      forces       0.57      0.70      0.63       149
        life       0.66      0.72      0.69       288
       media       0.69      0.62      0.65       299
     science       0.75      0.73      0.74       288
       sport       0.89      0.89      0.89       276
       style       0.70      0.55      0.62        38
      travel       0.30      0.08      0.12        38

    accuracy                           0.70      2000
   macro avg       0.66      0.59      0.60      2000
weighted avg       0.70      0.70      0.69      2000



In [None]:
from nltk.corpus import stopwords
import os
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import numpy as np

def prepare_tagged_documents(data_path):
    """Read the corpus and build Doc2Vec training documents with labels.

    Each line of ``data_path`` (UTF-8) is stripped and split on tabs; only
    lines with exactly three fields are used, with the first field taken as
    the label and the third as the text.  The text is lower-cased, tokenized
    with ``word_tokenize`` and filtered against the Russian stop-word list.
    Every kept document is tagged ``doc_<line index>``, where the index
    counts all lines of the file, including skipped ones.

    Returns:
        A ``(documents, labels)`` pair of equally long lists.
    """
    russian_stops = set(stopwords.words("russian"))
    tagged, targets = [], []

    with open(data_path, 'r', encoding='utf-8') as handle:
        for line_no, raw in enumerate(handle):
            fields = raw.strip().split('\t')
            if len(fields) != 3:
                continue
            category, _, body = fields
            kept = [tok for tok in word_tokenize(body.lower()) if tok not in russian_stops]
            tagged.append(TaggedDocument(words=kept, tags=[f'doc_{line_no}']))
            targets.append(category)

    return tagged, targets

def train_or_load_doc2vec(documents, model_path="doc2vec.model", vector_size=100):
    """Return a Doc2Vec model, loading it from disk or training a new one.

    If ``model_path`` exists, the saved model is loaded and returned as-is:
    ``documents`` and ``vector_size`` are not consulted in that case.
    Otherwise a PV-DM model (``dm=1``) is trained on ``documents`` for 20
    epochs and saved to ``model_path``.

    Args:
        documents: Tagged documents used for training a new model.
        model_path: Where the trained model is cached.
        vector_size: Dimensionality of a newly trained model.

    Returns:
        The loaded or newly trained ``Doc2Vec`` model.
    """
    if os.path.exists(model_path):
        print(f"Loading Models: {model_path}")
        return Doc2Vec.load(model_path)

    print("Training a new Doc2Vec model...")
    hyperparams = dict(
        vector_size=vector_size,
        window=5,
        min_count=2,
        workers=4,
        epochs=20,
        dm=1,  # PV-DM model
    )
    trained = Doc2Vec(documents, **hyperparams)
    trained.save(model_path)
    print(f"Model saved: {model_path}")
    return trained

def get_document_vectors(model, documents):
    """Infer one vector per document and stack them into a NumPy array.

    ``model.infer_vector`` is called on the ``words`` attribute of each
    document, in the order the documents are given.
    """
    inferred = [model.infer_vector(item.words) for item in documents]
    return np.array(inferred)

def classify_texts_with_doc2vec(data_path, model_path="doc2vec.model"):
    """Train and evaluate an RBF SVM on Doc2Vec document vectors.

    The corpus is loaded with ``prepare_tagged_documents`` and split 80/20
    with a fixed random seed.  The Doc2Vec model is obtained from
    ``train_or_load_doc2vec`` using the training documents, vectors are
    inferred for both parts, an ``SVC`` (RBF kernel, ``C=10``) is fitted on
    the training vectors, and a classification report for the held-out part
    is printed.
    """
    tagged_docs, targets = prepare_tagged_documents(data_path)

    split = train_test_split(tagged_docs, targets, test_size=0.2, random_state=42)
    docs_fit, docs_eval, y_fit, y_eval = split

    doc2vec = train_or_load_doc2vec(docs_fit, model_path=model_path)

    features_fit = get_document_vectors(doc2vec, docs_fit)
    features_eval = get_document_vectors(doc2vec, docs_eval)

    print("Train the classifier...")
    svm = SVC(kernel='rbf', C=10)
    svm.fit(features_fit, y_fit)

    predictions = svm.predict(features_eval)
    print("\nClassification report:")
    print(classification_report(y_eval, predictions))

if __name__ == "__main__":
    # Corpus location and the cache file for the Doc2Vec model.
    data_path, model_path = "../data/news.txt", "doc2vec.model"
    classify_texts_with_doc2vec(data_path, model_path=model_path)


Loading Models: doc2vec.model
Train the classifier...

Classification report:
              precision    recall  f1-score   support

    business       0.57      0.42      0.48        79
     culture       0.85      0.79      0.82       279
   economics       0.78      0.87      0.82       266
      forces       0.76      0.73      0.74       149
        life       0.72      0.80      0.75       288
       media       0.77      0.72      0.75       299
     science       0.81      0.86      0.83       288
       sport       0.92      0.93      0.92       276
       style       0.92      0.63      0.75        38
      travel       0.64      0.47      0.55        38

    accuracy                           0.79      2000
   macro avg       0.77      0.72      0.74      2000
weighted avg       0.79      0.79      0.79      2000

