In [1]:
!pip install pymorphy3



In [2]:
import re
import string
from collections import defaultdict
from time import time

import numpy as np
import pandas as pd
import sklearn
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import Normalizer

# Global scikit-learn setting: transformers that support it return pandas objects
# from transform() / fit_transform() instead of plain arrays.
sklearn.set_config(transform_output='pandas')

In [3]:
import json

# Read the JSON Lines file (one JSON object per line) explicitly as UTF-8.
# Without `encoding=` Python falls back to the platform default, which on some
# systems is not UTF-8 and would garble or fail to decode the Cyrillic reviews.
data = []
with open('healthcare_facilities_reviews.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        # Skip empty / whitespace-only lines instead of crashing in json.loads.
        if line.strip():
            data.append(json.loads(line))

df = pd.DataFrame(data)
# Keep only the review text and its sentiment label.
df = df[['content', 'sentiment']]
df.head()

Unnamed: 0,content,sentiment
0,Огромное спасибо за чудесное удаление двух зуб...,positive
1,Хочу выразить особую благодарность замечательн...,positive
2,Добрый вечер! Хотелось бы поблагодарить сотруд...,positive
3,Женщины советского образца в регистратуре не и...,negative
4,У меня с детства очень плохие зубы (тонкая и х...,positive


In [4]:
# Summarize row count, dtypes and non-null counts of the two kept columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70597 entries, 0 to 70596
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   content    70597 non-null  object
 1   sentiment  70597 non-null  object
dtypes: object(2)
memory usage: 1.1+ MB


In [5]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import pymorphy3

In [6]:
# Fetch NLTK's stop-word corpus; it is read later via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
# Morphological analyzer; lemmatize_tokens uses it to get each token's normal form.
morph = pymorphy3.MorphAnalyzer()

# header=None treats the first line as data and names the single column 0
# (presumably the file holds one stop word per line — verify against the file).
russian_stop_words = set(pd.read_csv('stopwords-ru.txt', header=None)[0])

In [65]:
def clean_text_transformer(X):
    """Run `clean_text` over every element of `X` and return the result.

    Thin adapter so `clean_text` can be used as a `FunctionTransformer` step.
    `X` must provide `.apply` (presumably a pandas Series of raw review strings).
    """
    cleaned = X.apply(clean_text)
    return cleaned

def tokenize_text_transformer(X):
    """Run `tokenize_text` over every element of `X` and return the result.

    Thin adapter so `tokenize_text` can be used as a `FunctionTransformer` step.
    `X` must provide `.apply` (presumably a pandas Series of cleaned strings).
    """
    tokenized = X.apply(tokenize_text)
    return tokenized

def remove_stopwords_transformer(X):
    """Run `remove_stopwords` over every element of `X` and return the result.

    Thin adapter so `remove_stopwords` can be used as a `FunctionTransformer` step.
    `X` must provide `.apply` (presumably a pandas Series of token lists).
    """
    filtered = X.apply(remove_stopwords)
    return filtered

def lemmatize_tokens_transformer(X):
    """Run `lemmatize_tokens` over every element of `X` and return the result.

    Thin adapter so `lemmatize_tokens` can be used as a `FunctionTransformer` step.
    `X` must provide `.apply` (presumably a pandas Series of token lists).
    """
    lemmatized = X.apply(lemmatize_tokens)
    return lemmatized

def join_tokens_transformer(X):
    """Run `join_tokens` over every element of `X` and return the result.

    Thin adapter so `join_tokens` can be used as a `FunctionTransformer` step.
    `X` must provide `.apply` (presumably a pandas Series of token lists).
    """
    joined = X.apply(join_tokens)
    return joined

In [66]:
# Code-point ranges that make up the character class used by remove_emojis.
_EMOJI_RANGES = (
    "\U0001F600-\U0001F64F",  # emoticons
    "\U0001F300-\U0001F5FF",  # symbols & pictographs
    "\U0001F680-\U0001F6FF",  # transport & map symbols
    "\U0001F1E0-\U0001F1FF",  # flags (iOS)
    "\U00002702-\U000027B0",
    # NOTE(review): this last range is very wide (U+24C2..U+1F251); besides emoji it
    # also spans e.g. CJK ideographs, so those characters are replaced as well.
    "\U000024C2-\U0001F251",
)

# Built once when the cell runs; matches one or more consecutive characters from the ranges.
_EMOJI_PATTERN = re.compile("[" + "".join(_EMOJI_RANGES) + "]+", flags=re.UNICODE)


def remove_emojis(text):
    """Replace every run of characters from the emoji ranges with a single space.

    Args:
        text: Input string.

    Returns:
        A copy of `text` in which each maximal run of matching characters has been
        substituted by one space; all other characters are left untouched.
    """
    return _EMOJI_PATTERN.sub(' ', text)

def clean_text(text):
    """Normalize a raw review string before tokenization.

    Steps, in order: lowercase; drop HTML tags; drop URLs, @mentions, #hashtags
    and digit runs; replace emoji runs with a space (via `remove_emojis`); delete
    ASCII punctuation; collapse whitespace and trim.

    Args:
        text: Raw review text.

    Returns:
        The cleaned, lowercased string with single spaces between words and no
        leading or trailing whitespace.
    """
    text = text.lower()
    # HTML tags must be removed before punctuation is stripped: '<' and '>' are
    # part of string.punctuation, so stripping punctuation first deletes the
    # delimiters and the tag pattern can never match (tag names then leak into
    # the text as ordinary words).
    text = re.sub(r'<.*?>', ' ', text)  # html tags
    text = re.sub(r'http\S+', " ", text)  # URLs
    text = re.sub(r'@\w+', ' ', text)  # @mentions
    text = re.sub(r'#\w+', ' ', text)  # #hashtags
    text = re.sub(r'\d+', ' ', text)  # digit runs
    text = remove_emojis(text)
    # Punctuation is deleted (not replaced by a space), as before.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Collapse whitespace last so that gaps introduced by the steps above
    # (e.g. a standalone punctuation mark between two spaces) are normalized too.
    text = re.sub(r'\s+', ' ', text).strip()  # extra white-space
    return text

def tokenize_text(text):
    """Split `text` into word tokens.

    A token is a maximal run of regex word characters (letters, digits and
    underscore, Unicode-aware). This is what a non-gap `RegexpTokenizer` with the
    same pattern returns, but without constructing a tokenizer object on every
    call, and with a raw-string pattern so the backslash is not treated as an
    (invalid) string escape.

    Args:
        text: Input string.

    Returns:
        List of tokens in order of appearance; empty list if there are none.
    """
    return re.findall(r'\w+', text)

# Combined English + Russian stop-word set, built once when this cell runs.
# Previously it was rebuilt (including a read of the NLTK corpus) for every single
# review, which is pure overhead because the set never changes between calls.
_STOP_WORDS = frozenset(stopwords.words('english')).union(russian_stop_words)


def remove_stopwords(tokens):
    """Drop stop words from a token list.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A new list with the tokens that are not in the combined English/Russian
        stop-word set, in their original order.
    """
    return [word for word in tokens if word not in _STOP_WORDS]

def lemmatize_tokens(tokens):
    """Map each token to the normal form of its first `morph.parse` result.

    Args:
        tokens: Iterable of token strings.

    Returns:
        List of normal forms, one per input token, in the same order.
    """
    lemmas = []
    for token in tokens:
        first_parse = morph.parse(token)[0]
        lemmas.append(first_parse.normal_form)
    return lemmas

def join_tokens(tokens):
    """Concatenate `tokens` into one string, separated by single spaces."""
    separator = ' '
    return separator.join(tokens)

# End-to-end model: text preprocessing -> TF-IDF -> SVD + normalization -> classifier.
# FunctionTransformer and LogisticRegression are imported explicitly at the top of the
# notebook; reaching them as attributes of the bare `sklearn` package only works when
# the submodule happens to have been loaded already by some other import.
pipeline = Pipeline([
    # Plain-Python preprocessing steps, each applied element-wise to the input.
    ('clean_text', FunctionTransformer(clean_text_transformer)),
    ('tokenize_text', FunctionTransformer(tokenize_text_transformer)),
    ('remove_stopwords', FunctionTransformer(remove_stopwords_transformer)),
    ('lemmatize_tokens', FunctionTransformer(lemmatize_tokens_transformer)),
    ('join_tokens', FunctionTransformer(join_tokens_transformer)),
    # Uni- and bigram TF-IDF over the re-joined lemmas.
    ('vectorizer', TfidfVectorizer(
        ngram_range=(1, 2),
        max_df=0.95,
        min_df=2,
        stop_words=list(russian_stop_words),
    )),
    # Reduce to 100 components, then normalize each sample vector.
    ('svd', make_pipeline(
        TruncatedSVD(n_components=100, random_state=42),
        Normalizer(copy=False),
    )),
    ('classifier', LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)),
])

In [67]:
# Hold out 20% of the reviews for evaluation; the fixed seed keeps the split reproducible.
# train_test_split is imported explicitly at the top of the notebook instead of being
# reached through the bare `sklearn` package attribute.
X_train, X_test, y_train, y_test = train_test_split(
    df['content'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
)

In [68]:
# Integer encoding of the sentiment classes used as the classifier target.
label_mapping = {
    'negative': 0,
    'positive': 1,
}
# Series.map yields NaN for any label that is not a key of label_mapping.
y_train_numeric = y_train.map(label_mapping)

In [69]:
# Fit every pipeline step (preprocessing, TF-IDF, SVD, classifier) on the training
# reviews, using the 0/1-encoded sentiment labels as the target.
pipeline.fit(X_train, y_train_numeric)



In [70]:
# Numeric class predictions for the held-out reviews (encoded as in label_mapping).
y_pred = pipeline.predict(X_test)

In [71]:
# Display the raw numeric predictions.
y_pred

array([0, 1, 0, ..., 0, 1, 0])

In [72]:
# Decode the numeric predictions with the inverse of label_mapping rather than a
# hard-coded "0 -> negative, anything else -> positive" rule, so the decoding cannot
# drift out of sync with the encoding used for training.
reverse_label_mapping = {v: k for k, v in label_mapping.items()}
y_pred_labels = [reverse_label_mapping[pred] for pred in y_pred]

In [73]:
# Spot-check the first and last three decoded labels.
y_pred_labels[:3], y_pred_labels[-3:]

(['negative', 'positive', 'negative'], ['negative', 'positive', 'negative'])

In [74]:
# Macro-averaged F1 on the test split, comparing string labels with string predictions.
f1_macro = metrics.f1_score(y_test, y_pred_labels, average='macro')

In [75]:
# Compute the metrics first, then print them in the same order and format as before.
accuracy = metrics.accuracy_score(y_test, y_pred_labels)
report = metrics.classification_report(y_test, y_pred_labels)

print("Accuracy:", accuracy)
print(report)
print(f"F1 Macro Score: {f1_macro}")

Accuracy: 0.9102691218130312
              precision    recall  f1-score   support

    negative       0.87      0.92      0.89      5778
    positive       0.94      0.90      0.92      8342

    accuracy                           0.91     14120
   macro avg       0.91      0.91      0.91     14120
weighted avg       0.91      0.91      0.91     14120

F1 Macro Score: 0.9079969163762083


In [76]:
# Sanity check: classify one hand-written review.
# Inverse of label_mapping: numeric class -> label string.
reverse_label_mapping = dict(zip(label_mapping.values(), label_mapping.keys()))

new_review = "Очень хорошая поликлиника, мне все понравилось!"
# The preprocessing steps call `.apply` on their input, so wrap the string in a Series.
review_series = pd.Series([new_review])

predicted_numeric = pipeline.predict(review_series)[0]
predicted_label = reverse_label_mapping[predicted_numeric]
(predicted_numeric, predicted_label)

(1, 'positive')

In [77]:
import joblib
import json

# Persist the fitted pipeline.
# NOTE(review): the FunctionTransformer steps wrap functions defined in this notebook;
# pickle stores functions by reference, so those functions presumably have to be
# defined under the same names wherever this file is loaded — confirm before deploying.
joblib.dump(pipeline, 'sentiment_model_pipeline.pkl')

# Store the label encoding next to the model so predictions can be decoded later.
with open('label_mapping.json', 'w') as mapping_file:
    mapping_file.write(json.dumps(label_mapping))