In [1]:
!pip install torchutils
!pip install torchmetrics
!pip install pymorphy3

Collecting torchutils
  Downloading torchutils-0.0.4-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.0.0->torchutils)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.0.0->torchutils)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.0.0->torchutils)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.0.0->torchutils)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.0.0->torchutils)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch>=1.0.0->torchut

In [2]:
%load_ext autoreload
%autoreload 2

In [9]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import re
import string
from collections import Counter

from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

import torch
from torch.utils.data import DataLoader, TensorDataset
import torch.nn as nn
import torchutils as tu
from torchmetrics.classification import BinaryAccuracy

import sklearn
sklearn.set_config(transform_output='pandas')

import pymorphy3
import json

import time

from dataclasses import dataclass
from typing import Union
import nltk

In [10]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [11]:
russian_stop_words = pd.read_csv('stopwords-ru.txt', header=None)
russian_stop_words = set(russian_stop_words[0])

In [12]:
stop_words = set(set(stopwords.words('english')).union(russian_stop_words))

In [13]:
data = []
with open('healthcare_facilities_reviews.jsonl', 'r') as file:
    for line in file:
        data.append(json.loads(line))

df = pd.DataFrame(data)
df = df[['content', 'sentiment']]
df.head()

Unnamed: 0,content,sentiment
0,Огромное спасибо за чудесное удаление двух зуб...,positive
1,Хочу выразить особую благодарность замечательн...,positive
2,Добрый вечер! Хотелось бы поблагодарить сотруд...,positive
3,Женщины советского образца в регистратуре не и...,negative
4,У меня с детства очень плохие зубы (тонкая и х...,positive


In [15]:
def data_preprocessing(text: str) -> str:
    text = text.lower()
    text = re.sub("<.*?>", "", text)  # html tags
    text = "".join([c for c in text if c not in string.punctuation])
    splitted_text = [word for word in text.split() if word not in stop_words]
    text = " ".join(splitted_text)
    return text


df["cleaned_content"] = df["content"].apply(data_preprocessing)
df.head()

Unnamed: 0,content,sentiment,cleaned_content
0,Огромное спасибо за чудесное удаление двух зуб...,positive,огромное чудесное удаление зубов мудрости мгно...
1,Хочу выразить особую благодарность замечательн...,positive,хочу выразить особую благодарность замечательн...
2,Добрый вечер! Хотелось бы поблагодарить сотруд...,positive,добрый хотелось поблагодарить сотрудников рент...
3,Женщины советского образца в регистратуре не и...,negative,женщины советского образца регистратуре имеют ...
4,У меня с детства очень плохие зубы (тонкая и х...,positive,детства плохие зубы тонкая хрупкая эмаль криви...


In [16]:
corpus = [word for text in df['cleaned_content'] for word in text.split()]
count_words = Counter(corpus)

sorted_words = count_words.most_common()

In [17]:
sorted_words[-10:]

[('эльасваду', 1),
 ('автошколой', 1),
 ('автошколами', 1),
 ('постсоюза', 1),
 ('крохотное', 1),
 ('фгуп', 1),
 ('врачасадиста', 1),
 ('№063260', 1),
 ('сертифакат', 1),
 ('№051925', 1)]

In [18]:
def get_words_by_freq(sorted_words: list, n: int = 10) -> list:
    return list(filter(lambda x: x[1] > n, sorted_words))

In [19]:
sorted_words = get_words_by_freq(sorted_words, 1000)

In [20]:
sorted_words[-10:]

[('решили', 1015),
 ('нашего', 1015),
 ('вполне', 1014),
 ('сутки', 1013),
 ('смогла', 1012),
 ('возможность', 1010),
 ('ощущение', 1009),
 ('диагнозом', 1007),
 ('стоимость', 1004),
 ('хирурга', 1003)]

In [21]:
vocab_to_int = {w:i+1 for i, (w,c) in enumerate(sorted_words)}

In [24]:
reviews_int = []
for text in df['cleaned_content']:

    r = [vocab_to_int[word] for word in text.split() if vocab_to_int.get(word)]
    reviews_int.append(r)

In [25]:
df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

In [26]:
review_len = [len(x) for x in reviews_int]
df['Review len'] = review_len
df.head()

Unnamed: 0,content,sentiment,cleaned_content,Review len
0,Огромное спасибо за чудесное удаление двух зуб...,1,огромное чудесное удаление зубов мудрости мгно...,9
1,Хочу выразить особую благодарность замечательн...,1,хочу выразить особую благодарность замечательн...,8
2,Добрый вечер! Хотелось бы поблагодарить сотруд...,1,добрый хотелось поблагодарить сотрудников рент...,19
3,Женщины советского образца в регистратуре не и...,0,женщины советского образца регистратуре имеют ...,42
4,У меня с детства очень плохие зубы (тонкая и х...,1,детства плохие зубы тонкая хрупкая эмаль криви...,15


In [27]:
def padding(review_int: list, seq_len: int) -> np.array:
    features = np.zeros((len(reviews_int), seq_len), dtype = int)
    for i, review in enumerate(review_int):
        if len(review) <= seq_len:
            zeros = list(np.zeros(seq_len - len(review)))
            new = zeros + review
        else:
            new = review[: seq_len]
        features[i, :] = np.array(new)

    return features

In [28]:
def preprocess_single_string(input_string: str, seq_len: int, vocab_to_int: dict = vocab_to_int) -> list:
    preprocessed_string = data_preprocessing(input_string)
    result_list = []
    for word in preprocessed_string.split():
        try:
            result_list.append(vocab_to_int[word])
        except KeyError as e:
            print(f'{e}: not in dictionary!')
    result_padded = padding([result_list], seq_len)[0]

    return torch.tensor(result_padded)

In [29]:
SEQ_LEN = 32
features = padding(reviews_int, SEQ_LEN)
print(features[3, :])

[159   5  63 155  23 177 242  86   2  20 159 163 431 313 363 112 253  78
  58   6 313 429 333   5  63 105 372 282 314  63  25   7]


In [30]:
X_train, X_valid, y_train, y_valid = train_test_split(features, df['sentiment'].to_numpy(), test_size=0.2, random_state=1)

In [31]:
train_data = TensorDataset(torch.from_numpy(X_train), torch.from_numpy(y_train))
valid_data = TensorDataset(torch.from_numpy(X_valid), torch.from_numpy(y_valid))


BATCH_SIZE = 4

train_loader = DataLoader(train_data, shuffle=True, batch_size=BATCH_SIZE, drop_last=False)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=BATCH_SIZE, drop_last=False)

In [32]:
VOCAB_SIZE = len(vocab_to_int)+1

In [33]:
dataiter = iter(train_loader)
sample_x, sample_y = next(dataiter)

In [34]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [93]:
@dataclass
class ConfigRNN:
    vocab_size: int
    device: str
    n_layers: int
    embedding_dim: int
    hidden_size: int
    seq_len: int
    bidirectional: Union[bool, int]

In [94]:
net_config = ConfigRNN(
    vocab_size=len(vocab_to_int) + 1,
    device='cuda',
    n_layers=2,
    embedding_dim=16,
    hidden_size=32,
    seq_len=SEQ_LEN,
    bidirectional=True,
)
net_config

ConfigRNN(vocab_size=516, device='cuda', n_layers=2, embedding_dim=16, hidden_size=32, seq_len=32, bidirectional=True)

In [95]:
class LSTMClassifier(nn.Module):
    def __init__(self, rnn_conf=net_config) -> None:
        super().__init__()

        self.embedding_dim = rnn_conf.embedding_dim
        self.hidden_size = rnn_conf.hidden_size
        self.bidirectional = rnn_conf.bidirectional
        self.n_layers = rnn_conf.n_layers

        self.embedding = nn.Embedding(rnn_conf.vocab_size, self.embedding_dim)
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.hidden_size,
            bidirectional=self.bidirectional,
            batch_first=True,
            num_layers=self.n_layers,
        )
        self.bidirect_factor = 2 if self.bidirectional else 1
        self.clf = nn.Sequential(
            nn.Linear(self.hidden_size * self.bidirect_factor, 32),
            nn.Tanh(),
            nn.Dropout(),
            nn.Linear(32, 1)
        )

    def model_description(self):
        direction = "bidirect" if self.bidirectional else "onedirect"
        return f"lstm_{direction}_{self.n_layers}"

    def forward(self, x: torch.Tensor):
        embeddings = self.embedding(x)
        out, _ = self.lstm(embeddings)
        out = out[:, -1, :]
        out = self.clf(out)
        return out


model_lstm = LSTMClassifier(net_config)
tu.get_model_summary(model_lstm, sample_x)

Layer              Kernel       Output      Params        FLOPs
0_embedding       [16, 516]   [4, 32, 16]    8,256          128
1_lstm                    -   [4, 32, 64]   37,888   27,164,672
2_clf.Linear_0     [64, 32]       [4, 32]    2,080       16,256
3_clf.Tanh_1              -       [4, 32]        0          640
4_clf.Dropout_2           -       [4, 32]        0            0
5_clf.Linear_3      [32, 1]        [4, 1]       33          252
Total params: 48,257
Trainable params: 48,257
Non-trainable params: 0
Total FLOPs: 27,181,948 / 27.18 MFLOPs
---------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.08
Params size (MB): 0.18
Estimated Total Size (MB): 0.27


In [106]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_lstm.to(device)

LSTMClassifier(
  (embedding): Embedding(516, 16)
  (lstm): LSTM(16, 32, num_layers=2, batch_first=True, bidirectional=True)
  (clf): Sequential(
    (0): Linear(in_features=64, out_features=32, bias=True)
    (1): Tanh()
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=32, out_features=1, bias=True)
  )
)

In [96]:
optimizer_lstm = torch.optim.Adam(model_lstm.parameters(), lr=0.0001)
criterion = nn.BCEWithLogitsLoss()
metric = BinaryAccuracy()

In [97]:
def train(
        epochs: int,
        model: torch.nn.Module,
        train_loader: DataLoader,
        valid_loader: DataLoader,
        optimizer: torch.optim.Optimizer,
        criterion,
        rnn_conf=None
        ) -> tuple:

    epoch_train_losses = []
    epoch_valid_losses = []
    epoch_train_f1 = []
    epoch_valid_f1 = []
    time_start = time.time()

    for epoch in range(epochs):
        batch_losses = []
        all_preds = []
        all_labels = []

        model.train()
        model.to(device)

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            output = model(inputs).squeeze(-1)
            loss = criterion(output, labels.float())

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_losses.append(loss.item())

            preds = torch.round(torch.sigmoid(output))
            all_preds.extend(preds.detach().cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

        train_f1 = sklearn.metrics.f1_score(all_labels, all_preds, average='macro')
        epoch_train_losses.append(np.mean(batch_losses))
        epoch_train_f1.append(train_f1)

        batch_losses = []
        all_preds = []
        all_labels = []

        model.eval()
        model.to(device)

        for inputs, labels in valid_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            with torch.no_grad():
                output = model(inputs).squeeze(-1)

            loss = criterion(output, labels.float())
            batch_losses.append(loss.item())

            preds = torch.round(torch.sigmoid(output))
            all_preds.extend(preds.detach().cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

        valid_f1 = sklearn.metrics.f1_score(all_labels, all_preds, average='macro')
        epoch_valid_losses.append(np.mean(batch_losses))
        epoch_valid_f1.append(valid_f1)

        print(f'Epoch {epoch+1}')
        print(f'Train Loss: {epoch_train_losses[-1]:.4f}, Train F1-Macro: {epoch_train_f1[-1]:.4f}')
        print(f'Valid Loss: {epoch_valid_losses[-1]:.4f}, Valid F1-Macro: {epoch_valid_f1[-1]:.4f}')
        print(25 * '==')

    training_time = time.time() - time_start
    return (epoch_train_losses, epoch_valid_losses, epoch_train_f1, epoch_valid_f1, training_time)

In [98]:
train_losses_lstm, val_losses_lstm, train_f1_lstm, val_f1_lstm, lstm_time = train(
    epochs=10,
    model=model_lstm.to(device),
    train_loader=train_loader,
    valid_loader=valid_loader,
    optimizer=optimizer_lstm,
    criterion=criterion,
    rnn_conf=net_config,
)

Epoch 1
Train Loss: 0.5073, Train F1-Macro: 0.7233
Valid Loss: 0.3863, Valid F1-Macro: 0.8227
Epoch 2
Train Loss: 0.3593, Train F1-Macro: 0.8345
Valid Loss: 0.3443, Valid F1-Macro: 0.8442
Epoch 3
Train Loss: 0.3233, Train F1-Macro: 0.8562
Valid Loss: 0.3143, Valid F1-Macro: 0.8607
Epoch 4
Train Loss: 0.3056, Train F1-Macro: 0.8654
Valid Loss: 0.3086, Valid F1-Macro: 0.8636
Epoch 5
Train Loss: 0.2942, Train F1-Macro: 0.8729
Valid Loss: 0.2960, Valid F1-Macro: 0.8718
Epoch 6
Train Loss: 0.2873, Train F1-Macro: 0.8760
Valid Loss: 0.2940, Valid F1-Macro: 0.8740
Epoch 7
Train Loss: 0.2827, Train F1-Macro: 0.8773
Valid Loss: 0.2925, Valid F1-Macro: 0.8738
Epoch 8
Train Loss: 0.2787, Train F1-Macro: 0.8798
Valid Loss: 0.2859, Valid F1-Macro: 0.8766
Epoch 9
Train Loss: 0.2750, Train F1-Macro: 0.8823
Valid Loss: 0.2873, Valid F1-Macro: 0.8734
Epoch 10
Train Loss: 0.2715, Train F1-Macro: 0.8830
Valid Loss: 0.2845, Valid F1-Macro: 0.8760


In [99]:
lstm_time

514.5222752094269

In [119]:
def predict_sentence(text: str, model: nn.Module, seq_len: int, vocab_to_int: dict, device: str = 'cuda') -> str:
    input_tensor = preprocess_single_string(text, seq_len, vocab_to_int).unsqueeze(0).to(device)
    model.eval()
    with torch.no_grad():
        output = model(input_tensor)

    prediction = torch.sigmoid(output).item()

    result = "positive" if prediction >= 0.5 else "negative"

    return result

In [121]:
result = predict_sentence("Очень хорошая поликлиника, мне все понравилось!", model_lstm, SEQ_LEN, vocab_to_int)
print(result)

positive


In [118]:
torch.save(model_lstm.state_dict(), 'lstm_model_weights.pth')

In [122]:
with open('vocab_to_int.json', 'w') as f:
    json.dump(vocab_to_int, f)

In [123]:
import pickle

In [124]:
with open('lstm_model_config.pkl', 'wb') as f:
    pickle.dump(net_config, f)