In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import torch
from torch import nn
import matplotlib.pyplot as plt

# импортируем трансформеры
import transformers
import warnings
warnings.filterwarnings('ignore')

import json
import sklearn
sklearn.set_config(transform_output='pandas')

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [3]:
data = []
with open('healthcare_facilities_reviews.jsonl', 'r') as file:
    for line in file:
        data.append(json.loads(line))

df = pd.DataFrame(data)
df = df[['content', 'sentiment']]
df.head()

Unnamed: 0,content,sentiment
0,Огромное спасибо за чудесное удаление двух зуб...,positive
1,Хочу выразить особую благодарность замечательн...,positive
2,Добрый вечер! Хотелось бы поблагодарить сотруд...,positive
3,Женщины советского образца в регистратуре не и...,negative
4,У меня с детства очень плохие зубы (тонкая и х...,positive


In [42]:
tokenizer = transformers.BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')
model = transformers.BertModel.from_pretrained("DeepPavlov/rubert-base-cased")
model.to(device)

tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=Fals

In [43]:
encoded_content = df['content'].apply(
    lambda x: tokenizer(x, max_length=64, truncation=True, padding='max_length')
).values

In [44]:
class BertInputs(torch.utils.data.Dataset):
    def __init__(self, encoded_content):
        super().__init__()
        self.inputs = encoded_content

    def __len__(self):
        return self.inputs.shape[0]

    def __getitem__(self, idx):
        return (torch.Tensor(self.inputs[idx]['input_ids']).cuda().long(),
                torch.Tensor(self.inputs[idx]['attention_mask']).cuda().long())

dataset = BertInputs(encoded_content)

In [45]:
loader = torch.utils.data.DataLoader(dataset, batch_size=128, shuffle=False)

In [46]:
%%time
features = []
for inputs, masks in loader:
    with torch.inference_mode():
        model_out = model(inputs, attention_mask=masks)
        vectors = model_out.last_hidden_state[:, 0, :]
    features.extend(vectors.cpu().numpy())
len(features)

CPU times: user 4min 3s, sys: 277 ms, total: 4min 4s
Wall time: 4min 5s


70597

In [47]:
%%time
X = np.array(features)
y = df['sentiment']

X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, test_size=0.2, random_state=42
)

clf = LogisticRegression()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

CPU times: user 12 s, sys: 830 ms, total: 12.8 s
Wall time: 6.88 s


0.8538243626062323

In [48]:
y_pred = clf.predict(X_test)

In [49]:
f1 = sklearn.metrics.f1_score(y_test, y_pred, average='macro')
f1

0.8490545867575332

In [50]:
def preprocess_sentence(sentence: str, tokenizer, max_len: int, device: str = 'cuda'):
    encoded_input = tokenizer(
        sentence,
        max_length=max_len,
        truncation=True,
        padding='max_length',
        return_tensors='pt'
    )
    input_ids = encoded_input['input_ids'].to(device)
    attention_mask = encoded_input['attention_mask'].to(device)
    return input_ids, attention_mask

def extract_features(sentence: str, model, tokenizer, max_len: int, device: str = 'cuda'):
    input_ids, attention_mask = preprocess_sentence(sentence, tokenizer, max_len, device)
    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        cls_embedding = outputs.last_hidden_state[:, 0, :]
    return cls_embedding.cpu().numpy()

In [61]:
def predict_sentiment(sentence: str, bert_model, tokenizer, logistic_model, max_len: int, device: str = 'cuda'):
    features = extract_features(sentence, bert_model, tokenizer, max_len, device)

    prediction_prob = logistic_model.predict_proba(features)

    prediction = logistic_model.predict(features)

    predicted_class = np.argmax(prediction_prob, axis=1)
    sentiment = 'positive' if predicted_class[0] == 1 else 'negative'

    return sentiment

In [63]:
sentence = "Очень хорошая поликлиника, мне все понравилось!"

sentiment = predict_sentiment(sentence, model, tokenizer, clf, max_len=64, device=device)
sentiment

'positive'

In [64]:
model.save_pretrained("bert_model")
tokenizer.save_pretrained("bert_tokenizer")

('bert_tokenizer/tokenizer_config.json',
 'bert_tokenizer/special_tokens_map.json',
 'bert_tokenizer/vocab.txt',
 'bert_tokenizer/added_tokens.json')

In [65]:
import joblib
joblib.dump(clf, 'logistic_regression_model.pkl')

['logistic_regression_model.pkl']