In [2]:
import pandas as pd

df = pd.read_csv("../data/rusentitweet_full.csv")

print(df.head())

print(df["label"].value_counts())

   Unnamed: 0                                               text     label  \
0           0                               @varlamov @McFaul –ù–∞      skip   
1           1  –≤–µ–ª–ª –æ–Ω–∏  –≤—Å—ë —Ä–∞–≤–Ω–æ —á—Ç–æ –º—É—Å–æ—Ä —Ç–∞–∫ —á—Ç–æ –Ω–∏—á–µ–≥–æ —Å...  negative   
2           2  "—Ç—Ä–µ–∑–≤–∞—è –∂–∏–∑–Ω—å –∫–∞–∫–∞—è-—Ç–æ —Ç–∞–∫–∞—è —Å—Ç—Ä—ë–º–Ω–∞—è"\r\n(—Å)...  negative   
3           3  –û–π –∫–∞–∫–∏–µ –Ω–µ–æ–∂–∏–¥–∞–Ω–Ω—ã–µ —Ä–µ–∑—É–ª—å—Ç–∞—Ç—ã ü§≠ https://t.co...   neutral   
4           4  @Shvonder_chief @dimsmirnov175 –ù–∞ –∑–∞–±–æ—Ä–µ —Ç–æ–∂–µ ...   neutral   

                    id  
0  1327934765807308801  
1  1252943181387350017  
2  1323610669061677056  
3  1336231661160247297  
4  1292421736454127617  
label
neutral     5341
negative    3298
positive    2414
skip        1843
speech       496
Name: count, dtype: int64


In [None]:
df = df[df["label"].isin(["positive", "neutral", "negative"])].reset_index(drop=True)

print(df["label"].value_counts())

label
neutral     5341
negative    3298
positive    2414
Name: count, dtype: int64


In [None]:
import re

def clean_text(text: str) -> str:
    text = text.lower()
    text = re.sub(r"http\S+", "", text)       
    text = re.sub(r"@\w+", "", text)           
    text = re.sub(r"[^–∞-—è—ë\s]", "", text)      
    text = re.sub(r"\s+", " ", text).strip()
    return text


In [8]:
df["text"] = df["text"].apply(clean_text)


In [4]:
from sklearn.model_selection import train_test_split

X = df["text"]
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print(f"Train size: {len(X_train)}, Test size: {len(X_test)}")

Train size: 8842, Test size: 2211


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib
import os

vectorizer = TfidfVectorizer(
    max_features=30000,
    ngram_range=(1, 2),
    min_df=3,        
    max_df=0.9      
)

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

base_model = LogisticRegression(max_iter=2000, class_weight='balanced', solver='lbfgs')
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

model = OneVsRestClassifier(
    LogisticRegression(
        max_iter=3000,
        class_weight="balanced"
    )
)

model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred, digits=3))

os.makedirs("models", exist_ok=True)

joblib.dump(model, "models/logreg_model.pkl")
joblib.dump(vectorizer, "models/tfidf_vectorizer.pkl")


Accuracy: 0.6024423337856174
              precision    recall  f1-score   support

    negative       0.55      0.59      0.57       660
     neutral       0.66      0.65      0.65      1068
    positive       0.55      0.52      0.54       483

    accuracy                           0.60      2211
   macro avg       0.59      0.59      0.59      2211
weighted avg       0.60      0.60      0.60      2211

              precision    recall  f1-score   support

    negative      0.551     0.586     0.568       660
     neutral      0.660     0.648     0.654      1068
    positive      0.549     0.524     0.536       483

    accuracy                          0.602      2211
   macro avg      0.587     0.586     0.586      2211
weighted avg      0.603     0.602     0.603      2211



['models/tfidf_vectorizer.pkl']