# Классификация отзывов на фильмы с помощью LSTM

In [25]:
import numpy as np
from string import punctuation
from collections import Counter
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
from tqdm import tqdm
from torchtext.data.utils import get_tokenizer

import random
# Seed Python's `random` module and PyTorch's global generator so that
# shuffling, the train/validation split and weight initialisation can be
# reproduced when the notebook is run top to bottom.
random.seed(33)
torch.manual_seed(0)

<torch._C.Generator at 0x7f276d905870>

In [9]:
# Read the raw corpus from disk: `reviews.txt` holds the review texts and
# `labels.txt` the matching sentiment labels. Both are kept as single
# strings here and split into lines by later cells.
with open('reviews.txt') as reviews_file:
    reviews = reviews_file.read()

with open('labels.txt') as labels_file:
    labels = labels_file.read()

In [10]:
# Peek at the first characters of each raw string to sanity-check the load.
print(reviews[:100], labels[:9], sep="\n")

bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life
positive



In [29]:
# Tokenizer callable obtained from torchtext's "spacy" backend; `preprocess`
# applies it to every line of the punctuation-free text.
tokenizer = get_tokenizer("spacy")

def preprocess(text):
    """Strip punctuation from ``text`` and tokenize it line by line.

    Args:
        text: raw string; individual reviews are separated by newlines.

    Returns:
        A tuple ``(all_reviews, vocab)`` where ``all_reviews`` is the
        punctuation-free text split on ``"\\n"`` and ``vocab`` is the flat
        list of every token (repetitions included) that the module-level
        ``tokenizer`` produced for those lines, in order of appearance.
    """
    # Delete every character listed in string.punctuation in one pass.
    stripped = text.translate(str.maketrans("", "", punctuation))
    all_reviews = stripped.split("\n")

    vocab = []
    for line in all_reviews:
        vocab.extend(tokenizer(line))

    return all_reviews, vocab

# Clean and tokenize the whole corpus once; `vocab` is the flat token stream.
all_reviews, vocab = preprocess(reviews)
print('Общее число отзывов: ', len(all_reviews))

Общее число отзывов:  25001


In [30]:
# Show a couple of cleaned reviews and the first few tokens.
for caption, sample in (('Первые 2 отзыва: ', all_reviews[:2]),
                        ('Первые 5 слов: ', vocab[:5])):
    print(caption, sample)

Первые 2 отзыва:  ['bromwell high is a cartoon comedy  it ran at the same time as some other programs about school life  such as  teachers   my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers   the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students  when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled          at           high  a classic line inspector i  m here to sack one of your teachers  student welcome to bromwell high  i expect that many adults of my age think that bromwell high is far fetched  what a pity that it isn  t   ', 'story of a man who has unnatural feelings for a pig  starts out with a opening scene that is a terrific example of absurd comedy  a formal orchestra audience is turned int

In [31]:
# Token -> occurrence count over the whole corpus.
corpus = Counter(vocab)

# Rank tokens from most to least frequent and keep ranks 20..999: the 20 most
# common tokens are skipped and everything beyond rank 1000 is dropped.
# (This is a slice by rank, not a min/max frequency threshold.)
ranked_tokens = sorted(corpus, key=corpus.get, reverse=True)
corpus_ = ranked_tokens[20:1000]

# Ids start at 1 so that 0 stays free for padding.
vocab_to_int = {token: rank for rank, token in enumerate(corpus_, start=1)}
print('Уникальных слов: ', len(vocab_to_int))

# Encode each review as the ids of its in-vocabulary words; other words are
# silently dropped.
# NOTE(review): the vocabulary is built from `tokenizer` output, while the
# words here come from whitespace splitting, so the two may not line up
# for every token - confirm this is intended.
encoded_reviews = [
    [vocab_to_int[word] for word in sent.lower().split() if word in vocab_to_int]
    for sent in all_reviews
]
print('Пример закодированного ревью: ', encoded_reviews[0])

Уникальных слов:  980
Пример закодированного ревью:  [294, 192, 15, 156, 40, 33, 65, 27, 368, 94, 124, 43, 140, 462, 54, 245, 294, 58, 599, 57, 19, 32, 49, 189, 129, 50, 207, 871, 14, 54, 672, 50, 37, 202, 369, 45, 769, 165, 368, 15, 294, 335, 327, 125, 111, 13, 113, 294, 515, 93, 43, 530, 86, 294, 212, 31, 200, 6]


In [32]:
# One label per line; "positive" maps to 1 and anything else (including an
# empty trailing line) maps to 0.
all_labels = labels.split("\n")
encoded_labels = [int(label == "positive") for label in all_labels]

print('Число отзывов и число лейблов: ', len(all_reviews), len(all_labels))

Число отзывов и число лейблов:  25001 25001


In [33]:
# Flag the reviews that still contain at least one in-vocabulary token, then
# filter labels and reviews with the same flags so they stay aligned.
keep_flags = [len(review) > 0 for review in encoded_reviews]
encoded_labels = np.array(
    [label for idx, label in enumerate(encoded_labels) if keep_flags[idx]]
)
encoded_reviews = [
    review for review, keep in zip(encoded_reviews, keep_flags) if keep
]

print(len(encoded_labels), len(encoded_reviews))

25000 25000


In [34]:
def pad_text(encoded_reviews, seq_length):
    """Bring every encoded review to exactly ``seq_length`` ids.

    Reviews shorter than ``seq_length`` are left-padded with zeros; longer
    ones keep only their first ``seq_length`` ids.

    Args:
        encoded_reviews: list of lists of integer token ids.
        seq_length: target length of every row.

    Returns:
        ``np.ndarray`` with one row per review.
    """
    def _fit(tokens):
        missing = seq_length - len(tokens)
        if missing > 0:
            return [0] * missing + tokens
        return tokens[:seq_length]

    return np.array([_fit(tokens) for tokens in encoded_reviews])

# Left-pad with zeros / truncate every encoded review to 200 ids so they can
# be stacked into one 2-D array.
padded_reviews = pad_text(encoded_reviews, seq_length = 200)

In [35]:
# Wrap the padded id matrix and the label vector as tensors sharing the first
# dimension, so the dataset yields (review, label) pairs.
review_tensor = torch.from_numpy(padded_reviews)
label_tensor = torch.from_numpy(encoded_labels)
dataset = TensorDataset(review_tensor, label_tensor)

In [37]:
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [48]:
batch_size = 128

# Randomly hold out 5000 samples for validation; the rest is used for training.
n_val = 5000
n_train = len(dataset) - n_val
train_set, val_set = torch.utils.data.random_split(dataset, [n_train, n_val])

# Only the training loader reshuffles its samples every epoch.
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False)

# Задаем модель

In [63]:
class SentimentRNN(nn.Module):
    """Bidirectional multi-layer LSTM binary classifier over token-id sequences.

    Pipeline: embedding -> bidirectional LSTM -> concatenation of the final
    forward and backward hidden states of the last layer -> dropout ->
    linear layer with one output -> sigmoid.

    Args:
        vocab_size: number of rows of the embedding table (largest id + 1).
        embedding_dim: size of each token embedding.
        hidden_dim: LSTM hidden size per direction.
        num_layers: number of stacked LSTM layers.
        perc: dropout probability, used both between LSTM layers and before
            the final linear layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, perc):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            batch_first=True,
            num_layers=num_layers,
            dropout=perc,
            bidirectional=True,
        )

        self.dropout = nn.Dropout(perc)
        # Two directions are concatenated, hence 2 * hidden_dim input features.
        self.fc = nn.Linear(hidden_dim * 2, 1)
        self.sig = nn.Sigmoid()

    def init_hidden(self, batch_size, device=None):
        """Return zero-filled initial ``(h_0, c_0)`` states for the LSTM.

        Args:
            batch_size: number of sequences in the batch.
            device: device to allocate the states on; ``None`` keeps
                PyTorch's default device (the previous behaviour).

        Returns:
            Tuple ``(h, c)``, each of shape
            ``(num_layers * 2, batch_size, hidden_dim)``.
        """
        shape = (self.num_layers * 2, batch_size, self.hidden_dim)
        h = torch.zeros(shape, device=device)
        c = torch.zeros(shape, device=device)
        return h, c

    def forward(self, x):
        """Compute the positive-class probability for each sequence.

        Args:
            x: integer tensor of token ids with the batch in dimension 0
                (the LSTM is built with ``batch_first=True``).

        Returns:
            Tensor of shape ``(batch, 1)`` with values in ``[0, 1]``.
        """
        batch_size = x.size(0)
        x = x.long()

        # Allocate the initial state on the same device as the input instead
        # of relying on a notebook-global `device` variable.
        h_0, c_0 = self.init_hidden(batch_size, device=x.device)

        embeds = self.embedding(x)
        _, (h_t, _) = self.lstm(embeds, (h_0, c_0))

        # h_t[-2] / h_t[-1]: final hidden states of the last layer's forward
        # and backward directions.
        out = torch.cat((h_t[-2, :, :], h_t[-1, :, :]), dim=1)

        out = self.dropout(out)
        out = self.fc(out)
        return self.sig(out)

In [64]:
# Hyperparameters.
embedding_dim, hidden_dim, num_layers = 50, 256, 2
# +1 because vocabulary ids start at 1 and id 0 is used for padding.
vocab_size = len(vocab_to_int) + 1
lr = 0.001

# Build the network (dropout probability 0.5) and move it to the chosen device.
model = SentimentRNN(vocab_size, embedding_dim, hidden_dim, num_layers, 0.5).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=lr)
# A StepLR scheduler (step_size=5, gamma=0.1) was tried here and is disabled:
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size = 5, gamma = 0.1)

# Обучаем

In [68]:
def log(mode, epoch, loss, accuracy, best_perf=None):
    """Print a one-line progress report for an epoch.

    Args:
        mode: label shown in brackets, e.g. ``"Training"``.
        epoch: epoch number (formatted with two decimals).
        loss: mean loss for the epoch.
        accuracy: accuracy as a fraction; it is printed as a percentage.
        best_perf: optional best score so far. When given (including ``0``),
            it is appended to the same line.
    """
    # Keep the cursor on the line so the optional suffix lands on it too.
    print(
        f"[{mode}] Epoch: {epoch:0.2f}. "
        f"Loss: {loss:.2f}. "
        f"Accuracy: {100*accuracy:.2f}% ", end="")

    # `is not None` rather than truthiness: a best score of 0.0 is still valid.
    if best_perf is not None:
        print(f"[best: {best_perf:0.2f}]%", end="")

    # Always terminate the line so consecutive reports never run together.
    print()

In [41]:
def accuracy(outputs, labels):
    """Fraction of predictions that match ``labels`` after rounding.

    Args:
        outputs: tensor of probabilities, one per sample; any shape holding
            exactly one value per sample (e.g. ``(batch, 1)`` or ``(batch,)``).
        labels: tensor of 0/1 targets with the same number of elements.

    Returns:
        0-dim CPU tensor with the accuracy in ``[0, 1]``.

    Raises:
        ZeroDivisionError: if ``outputs`` is empty.
    """
    # Flatten instead of squeeze(): squeeze() turns a single-sample batch into
    # a 0-dim tensor, on which len() raises.
    preds = torch.round(outputs.reshape(-1))
    targets = labels.reshape(-1)
    n_correct = torch.sum(preds == targets).item()
    return torch.tensor(n_correct / preds.numel())

In [76]:
def trainval(model, train, val, optimizer, epochs=5):
    """Train ``model`` for ``epochs`` epochs, validating after each one.

    Relies on the module-level ``accuracy`` and ``log`` helpers.

    Args:
        model: network returning one probability per sample (its output is
            fed to ``nn.BCELoss``).
        train: iterable of ``(texts, labels)`` training batches.
        val: iterable of ``(texts, labels)`` validation batches.
        optimizer: optimizer already bound to ``model``'s parameters.
        epochs: number of passes over ``train``.

    Returns:
        ``(loss_meter, accuracy_meter)``: two dicts with the keys
        ``'training'`` and ``'validation'``, each holding one entry per epoch.
        Entries are per-sample averages, i.e. every batch is weighted by its
        size so a short last batch does not skew the epoch metric.
    """
    loss_meter = {'training': [], 'validation': []}
    accuracy_meter = {'training': [], 'validation': []}

    # Build the criterion once instead of once per batch.
    criterion = nn.BCELoss()
    # Follow the model's own device rather than a notebook-global variable.
    run_device = next(model.parameters()).device

    for epoch in range(epochs):
        # ---- training pass ----
        model.train()
        loss_sum, acc_sum, n_seen = 0.0, 0.0, 0
        for texts, labels in train:
            texts = texts.to(run_device)
            labels = labels.to(run_device)

            preds = model(texts)
            # reshape(-1) keeps a batch dimension even for a single-sample
            # batch, where squeeze() would yield a 0-dim tensor that BCELoss
            # rejects against a (1,) target.
            loss = criterion(preds.reshape(-1), labels.float().reshape(-1))

            optimizer.zero_grad()
            loss.backward()
            # Clip the global gradient norm to 5 to guard against exploding
            # gradients.
            nn.utils.clip_grad_norm_(model.parameters(), 5)
            optimizer.step()

            n_batch = labels.size(0)
            loss_sum += loss.item() * n_batch
            acc_sum += accuracy(preds, labels) * n_batch
            n_seen += n_batch

        train_loss = loss_sum / n_seen
        train_acc = acc_sum / n_seen
        loss_meter["training"].append(train_loss)
        accuracy_meter["training"].append(train_acc)
        log("Training", epoch + 1, train_loss, train_acc)

        # ---- validation pass (no gradients, dropout disabled) ----
        model.eval()
        loss_sum, acc_sum, n_seen = 0.0, 0.0, 0
        with torch.no_grad():
            for texts_val, labels_val in val:
                texts_val = texts_val.to(run_device)
                labels_val = labels_val.to(run_device)

                preds = model(texts_val)
                loss = criterion(preds.reshape(-1), labels_val.float().reshape(-1))

                n_batch = labels_val.size(0)
                loss_sum += loss.item() * n_batch
                acc_sum += accuracy(preds, labels_val) * n_batch
                n_seen += n_batch

        val_loss = loss_sum / n_seen
        val_acc = acc_sum / n_seen
        loss_meter["validation"].append(val_loss)
        accuracy_meter["validation"].append(val_acc)
        log("Validation", epoch + 1, val_loss, val_acc)

    return loss_meter, accuracy_meter

In [77]:
# NOTE(review): the saved output below stops with a NameError, apparently from
# an earlier version of `trainval` that still used the name `loss_track`.
# The current definition uses `loss_meter` throughout, so re-run this cell to
# refresh the output and obtain the meters plotted further down.
loss_meter, accuracy_meter = trainval(model, train_loader, val_loader, optimizer, epochs=40)

[Training] Epoch: 1.00. Loss: 0.46. Accuracy: 78.90% 
[Validation] Epoch: 1.00. Loss: 0.52. Accuracy: 73.65% 
[Training] Epoch: 2.00. Loss: 0.43. Accuracy: 80.74% 
[Validation] Epoch: 2.00. Loss: 0.56. Accuracy: 78.40% 
[Training] Epoch: 3.00. Loss: 0.40. Accuracy: 82.67% 
[Validation] Epoch: 3.00. Loss: 0.41. Accuracy: 82.36% 
[Training] Epoch: 4.00. Loss: 0.36. Accuracy: 84.67% 
[Validation] Epoch: 4.00. Loss: 0.38. Accuracy: 82.58% 
[Training] Epoch: 5.00. Loss: 0.33. Accuracy: 85.90% 
[Validation] Epoch: 5.00. Loss: 0.35. Accuracy: 84.63% 
[Training] Epoch: 6.00. Loss: 0.33. Accuracy: 86.13% 
[Validation] Epoch: 6.00. Loss: 0.36. Accuracy: 84.57% 
[Training] Epoch: 7.00. Loss: 0.30. Accuracy: 87.76% 
[Validation] Epoch: 7.00. Loss: 0.37. Accuracy: 83.38% 
[Training] Epoch: 8.00. Loss: 0.28. Accuracy: 88.33% 
[Validation] Epoch: 8.00. Loss: 0.36. Accuracy: 84.86% 
[Training] Epoch: 9.00. Loss: 0.26. Accuracy: 89.41% 
[Validation] Epoch: 9.00. Loss: 0.37. Accuracy: 85.18% 
[Training]

NameError: ignored

In [78]:
# Save only the learned parameters. `state_dict` must be *called*: passing the
# bound method itself would pickle the whole model object instead of the
# parameter dictionary that `load_state_dict` expects.
torch.save(model.state_dict(), "lstm.pt")

# Смотрим результаты

In [80]:
from matplotlib import pyplot as plt
%matplotlib inline
plt.plot(accuracy_meter['training'], label='train')
plt.plot(accuracy_meter['validation'], label='val')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.grid()
plt.legend()

In [80]:
# Loss per epoch for both splits on one set of axes.
for split_key, legend_name in (('training', 'train'), ('validation', 'val')):
    plt.plot(loss_meter[split_key], label=legend_name)
plt.xlabel('epoch')
plt.ylabel('loss')
plt.grid()
plt.legend()

In [81]:
def predict(model, review, seq_length = 200):
    """Print ``review`` followed by the sentiment the model assigns to it.

    The text is encoded the same way as the training data: lower-cased,
    stripped of ``string.punctuation``, split on whitespace, mapped through
    the module-level ``vocab_to_int`` (unknown words are dropped) and
    left-padded / truncated to ``seq_length`` with ``pad_text``.

    Args:
        model: trained classifier returning one probability per sequence.
        review: raw review text.
        seq_length: length the encoded review is padded or truncated to.

    Returns:
        None; the review and the verdict are printed.
    """
    print(review)
    # Run on whatever device the model lives on.
    device = next(model.parameters()).device

    # Training ids came from whitespace-split words, so split the same way
    # here; tokenizing differently can produce ids that never occurred in the
    # training sequences.
    cleaned = "".join(char for char in review.lower() if char not in punctuation)
    encoded_words = [vocab_to_int[word] for word in cleaned.split() if word in vocab_to_int]
    # NOTE(review): if no word is in the vocabulary the model sees only padding.
    padded_words = torch.from_numpy(pad_text([encoded_words], seq_length)).to(device)

    model.eval()
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        output = model(padded_words)
    pred = torch.round(output.squeeze())
    out = "This is a positive review." if pred == 1 else "This is a negative review."
    print(out, '\n')


# A few hand-written reviews to probe the trained model.
review1 = "Twin Peaks is a very good film to watch with a family. Even five year old child will understand David Lynch masterpiece"
review2 = "It made me cry"
review3 = "It made me cry - I never seen such an awful acting before"
review4 = "Vulgarity. Ringing vulgarity"
review5 = "Garbage"

for sample_review in (review1, review2, review3, review4, review5):
    predict(model, sample_review)

Twin Peaks is a very good film to watch with a family. Even five year old child will understand David Lynch masterpiece
This is a positive review. 

It made me cry
This is a positive review. 

It made me cry - I never seen such an awful acting before
This is a negative review. 

Vulgarity. Ringing vulgarity
This is a negative review. 

Garbage
This is a negative review. 

