# Моделирование

Импортируем библиотеки

In [285]:
import os
import pandas as pd
import numpy as np
import re
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import tqdm as notebook_tqdm
from tqdm import tqdm
import random
import string
from nltk.corpus import stopwords

In [286]:
torch.mps.empty_cache()

## Загружаем данные

In [268]:
DATA_DIR = "../data"
SOURCE_PATH = os.path.join(DATA_DIR, "cleaned_data.csv")

data = pd.read_csv(SOURCE_PATH)

In [269]:
data.head(5)

Unnamed: 0,target,ids,date,user,text
0,0,1467810369,2009-04-06 22:19:45,_TheSpecialOne_,awww thats a bummer you shoulda got david carr...
1,0,1467810672,2009-04-06 22:19:49,scotthamilton,is upset that he cant update his facebook by t...
2,0,1467810917,2009-04-06 22:19:53,mattycus,i dived many times for the ball managed to sav...
3,0,1467811184,2009-04-06 22:19:57,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,2009-04-06 22:19:57,Karoli,no its not behaving at all im mad why am i her...


In [288]:
# TODO: Обучить на всей выборке
data = data.sample(frac=0.1, random_state=42)

In [289]:
# Разбиваем данные на тренировочную и тестовую выборки
train_texts, val_texts, train_labels, val_labels = train_test_split( \
    data['text'].tolist(), data['target'].tolist(), test_size=0.2, random_state=42)

In [290]:
# Токенизируем данные
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

In [291]:
# Создаем вспомогательный класс для BERT-модели
class Sentiment140Dataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [295]:
train_dataset = Sentiment140Dataset(train_encodings, train_labels)
val_dataset = Sentiment140Dataset(val_encodings, val_labels)

In [296]:
# Загружаем модель
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', output_attentions=True, num_labels=2, attn_implementation="eager"`)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [297]:
# Создаем тренировочный пайплайн
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

In [298]:
epochs = 3

total_steps = len(train_loader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = total_steps
)

In [299]:
for epoch in range(epochs):
    model.train()
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")
    total_loss = 0
    num_batches = len(train_loader)
    
    for batch in progress_bar:
        optimizer.zero_grad()
        inputs = {key: val.to(device) for key, val in batch.items()}
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    avg_loss = total_loss / num_batches
    scheduler.step()
    print(f"Epoch {epoch+1} completed - Average Loss: {avg_loss:.4f}")

Epoch 1/3: 100%|███████████████████████████████████████████████████████████████████████████████| 760/760 [10:16<00:00,  1.23it/s, loss=0.63]


Epoch 1 completed


Epoch 2/3: 100%|███████████████████████████████████████████████████████████████████████████████| 760/760 [13:52<00:00,  1.09s/it, loss=0.19]


Epoch 2 completed


Epoch 3/3: 100%|██████████████████████████████████████████████████████████████████████████████| 760/760 [12:12<00:00,  1.04it/s, loss=0.259]

Epoch 3 completed





In [300]:
# Оцениваем модель
model.eval()
preds, true_labels = [], []

for batch in val_loader:
    inputs = {key: val.to(device) for key, val in batch.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    preds.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
    true_labels.extend(inputs['labels'].cpu().numpy())

In [301]:
# Расчитываем Accuracy
accuracy = accuracy_score(true_labels, preds)
print(f"Validation Accuracy: {accuracy:.4f}")

Validation Accuracy: 0.7753


In [44]:
# Сохраняем модель и токенизатор
MODEL_DIR = "../models"

model.save_pretrained(MODEL_DIR)
tokenizer.save_pretrained(MODEL_DIR)

('../models/tokenizer_config.json',
 '../models/special_tokens_map.json',
 '../models/vocab.txt',
 '../models/added_tokens.json')

## Использование модели

In [302]:
MODEL_DIR = "../models"

# Загружаем модель
model = BertForSequenceClassification.from_pretrained(MODEL_DIR)

# Загружаем токенизатор
tokenizer = BertTokenizer.from_pretrained(MODEL_DIR)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print("✅ Sentiment Model loaded successfully!")

✅ Sentiment Model loaded successfully!


In [309]:
# Предсказываем тональнось текста
stop_words = set(stopwords.words("english"))

def predict_sentiment(text):
    # Токенизируем входной текст
    inputs = tokenizer(text, truncation=True, padding=True, max_length=128, return_tensors="pt")

    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Выполняем предсказание
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.softmax(outputs.logits, dim=1).cpu().numpy()

    attention_weights = outputs.attentions  # Получаем веса внимания

    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    
    # Получаем предсказание
    predicted_class = torch.argmax(outputs.logits, dim=1).item()

    negative_prob, positive_prob = probs[0].tolist()  # Достаем вероятности

    # Определяем класс с учетом нейтрального диапазона
    # Такой диапазон подобран на глазок, так как в данных не содержатся нейтральные примеры
    if 0.1 <= positive_prob <= 0.9:
        sentiment_label = "Neutral"
    else:
        sentiment_label = "Positive" if positive_prob > 0.5 else "Negative"

    # Получаем тензоры внимания для каждого слоя модели
    attentions = outputs.attentions

    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    # Аггрегируем значения внимания
    word_importance = {token: 0.0 for token in tokens}

    num_layers = len(attentions)
    num_heads = attentions[0].shape[1]

    for layer in range(num_layers):
        attn_layer = attentions[layer][0]  # Форма тензора: (num_heads, seq_len, seq_len)
        avg_attn = attn_layer.mean(dim=0) # Усреднение по всем вершинам -> (seq_len, seq_len)

        # Суммируем значения внимания, полученные каждым токеном (исключая диагональ)
        for i, token in enumerate(tokens):
            if token in ["[CLS]", "[SEP]"] or token in string.punctuation or token.lower() in stop_words:
                continue
            word_importance[token] += avg_attn[:, i].sum().item()  # Sum of incoming attention

    # Нормализация значений важности
    max_importance = max(word_importance.values()) if word_importance else 1.0
    word_importance = {token: round(score / max_importance, 2) for token, score in word_importance.items()}

    # Сортируем по важности
    sorted_importance = sorted(word_importance.items(), key=lambda x: x[1], reverse=True)
    # Отфильтровываем ненужные слова
    filtered_importance = [
        x for x in sorted_importance
        if x[0] not in {"[CLS]", "[SEP]"} 
        and x[0] not in string.punctuation 
        and x[0].lower() not in stop_words
    ]
    
    return {
        "sentiment": sentiment_label,
        "importance": filtered_importance
    }

In [310]:
print(predict_sentiment("The movie was absolutely amazing, I loved every part of it!"))

{'sentiment': 'Positive', 'importance': [('movie', 1.0), ('loved', 0.84), ('every', 0.56), ('amazing', 0.53), ('part', 0.49), ('absolutely', 0.43)]}


In [318]:
print(predict_sentiment("I hate the product!"))

{'sentiment': 'Negative', 'importance': [('hate', 1.0), ('product', 0.76)]}


In [319]:
print(predict_sentiment("Sky is blue"))

{'sentiment': 'Neutral', 'importance': [('blue', 1.0), ('sky', 0.62)]}
