# Моделирование

Импортируем библиотеки

In [10]:
import os
import pandas as pd
import numpy as np
import re
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Загружаем данные

In [2]:
# Paths are relative to the notebook's working directory.
DATA_DIR = "../data"
SOURCE_PATH = os.path.join(DATA_DIR, "cleaned_data.csv")

# Read the whole CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer=SOURCE_PATH)

In [3]:
# Preview the first rows (head() returns 5 rows by default).
data.head()

Unnamed: 0,target,ids,date,user,text
0,0,1467810369,2009-04-06 22:19:45,_TheSpecialOne_,awww thats a bummer you shoulda got david carr...
1,0,1467810672,2009-04-06 22:19:49,scotthamilton,is upset that he cant update his facebook by t...
2,0,1467810917,2009-04-06 22:19:53,mattycus,i dived many times for the ball managed to sav...
3,0,1467811184,2009-04-06 22:19:57,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,2009-04-06 22:19:57,Karoli,no its not behaving at all im mad why am i her...


In [4]:
# TODO: train on the full dataset
# NOTE: this cell rebinds `data`, so running it again samples 1% of the
# already-reduced frame; reload the CSV before re-running it.
SAMPLE_FRACTION = 0.01
data = data.sample(frac=SAMPLE_FRACTION, random_state=42)

In [5]:
# Split the data into training and validation parts (80% / 20%)
all_texts = data['text'].tolist()
all_labels = data['target'].tolist()

train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Tokenize the data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Both splits are encoded with the same settings: truncate to at most 128
# tokens and pad within each call.
encode_kwargs = dict(truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(train_texts, **encode_kwargs)
val_encodings = tokenizer(val_texts, **encode_kwargs)

In [7]:
# Helper dataset class that feeds tokenized samples to the BERT model
class Sentiment140Dataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping from field name to a per-sample indexable
            collection (it is consumed through ``.items()``).
        labels: Indexable sequence holding one label per sample.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of samples, defined by the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, including ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [8]:
# Wrap each split's encodings and labels into a dataset object.
train_dataset = Sentiment140Dataset(encodings=train_encodings, labels=train_labels)
val_dataset = Sentiment140Dataset(encodings=val_encodings, labels=val_labels)

In [9]:
# Load the model: pretrained BERT with a 2-class classification head
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Build the training pipeline
BATCH_SIZE = 16
LEARNING_RATE = 5e-5

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Only the training batches are shuffled; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

In [12]:
# Training loop
n_epoch = 3

for epoch in range(n_epoch):
    model.train()  # make sure the model is in training mode for this epoch
    for batch in train_loader:
        # Move every tensor of the batch onto the same device as the model.
        batch_on_device = {name: tensor.to(device) for name, tensor in batch.items()}

        optimizer.zero_grad()
        # The batch carries 'labels', so the model output exposes a loss.
        model(**batch_on_device).loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1} completed")

Epoch 1 completed
Epoch 2 completed
Epoch 3 completed


In [13]:
# Evaluate the model
model.eval()
preds = []
true_labels = []

# No gradients are needed while collecting predictions.
with torch.no_grad():
    for batch in val_loader:
        inputs = {name: tensor.to(device) for name, tensor in batch.items()}
        logits = model(**inputs).logits
        # Predicted class = index of the largest logit in each row.
        preds.extend(logits.argmax(dim=1).cpu().numpy())
        true_labels.extend(inputs['labels'].cpu().numpy())

In [14]:
# Compute accuracy on the validation split
accuracy = accuracy_score(y_true=true_labels, y_pred=preds)
print(f"Validation Accuracy: {accuracy:.4f}")

Validation Accuracy: 0.7875


In [15]:
# Save the model and the tokenizer into the same directory
MODEL_DIR = "../models"

model.save_pretrained(save_directory=MODEL_DIR)
# Last expression of the cell: its return value is shown as the cell output.
tokenizer.save_pretrained(save_directory=MODEL_DIR)

('../models/tokenizer_config.json',
 '../models/special_tokens_map.json',
 '../models/vocab.txt',
 '../models/added_tokens.json')

In [25]:
# Predict the sentiment of a text
def predict_sentiment(text: str) -> str:
    """Classify a single text as "Negative" or "Positive".

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: The text to classify.

    Returns:
        "Negative" when the predicted class index is 0, "Positive" when it is 1.

    Side effects:
        Puts ``model`` into evaluation mode.
    """
    # Do not depend on an earlier cell having called model.eval(): if the
    # training loop ran last, the model would still be in training mode and
    # dropout would make the prediction non-deterministic.
    model.eval()

    # NOTE(review): the training texts come from cleaned_data.csv; presumably
    # the same cleaning should be applied to `text` here - confirm.
    # Tokenize the input text
    inputs = tokenizer(text, truncation=True, padding=True, max_length=128, return_tensors="pt")

    # Move the encoded tensors to the device the model lives on.
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Run the forward pass without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs)

    # Predicted class = index of the largest logit
    predicted_class = torch.argmax(outputs.logits, dim=1).item()

    label_map = {0: "Negative", 1: "Positive"}
    return label_map[predicted_class]

In [28]:
text1 = "Fuck reading and anyone who can do it"
text2 = "Kamala seems like a very nice human  I just wanna say sorry to her kids"
text3 = "Trumps back in office. Ye’s back a billionaire. The world might just be ok"

# Print the predicted sentiment for each sample text, in order.
for text in (text1, text2, text3):
    print(f"Text: {text} -> Sentiment: {predict_sentiment(text)}")

Text: Fuck reading and anyone who can do it -> Sentiment: Positive
Text: Kamala seems like a very nice human  I just wanna say sorry to her kids -> Sentiment: Negative
Text: Trumps back in office. Ye’s back a billionaire. The world might just be ok -> Sentiment: Positive
