<a href="https://colab.research.google.com/github/tumanov-a/imdb_sentiment_analys/blob/main/IMDB_pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
reviews = pd.read_csv('/content/drive/MyDrive/IMDB Dataset.csv.zip', compression='zip')

In [3]:
reviews

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [7]:
def replace_(fragment):
  fragment = re.sub('<br />', '', fragment)
  return fragment

In [8]:
reviews['review'] = reviews['review'].apply(replace_)
reviews['review'] = reviews['review'].apply(lambda x: x[:512])
reviews['sentiment'] = reviews['sentiment'].map({'positive': 1, 'negative': 0})

In [9]:
reviews['review'] = reviews['review'].apply(lambda x: '[CLS] ' + x + ' [SEP]')

In [10]:
reviews

Unnamed: 0,review,sentiment
0,[CLS] One of the other reviewers has mentioned...,1
1,[CLS] A wonderful little production. The filmi...,1
2,[CLS] I thought this was a wonderful way to sp...,1
3,[CLS] Basically there's a family where a littl...,0
4,"[CLS] Petter Mattei's ""Love in the Time of Mon...",1
...,...,...
49995,[CLS] I thought this movie did a down right go...,1
49996,"[CLS] Bad plot, bad dialogue, bad acting, idio...",0
49997,[CLS] I am a Catholic taught in parochial elem...,0
49998,[CLS] I'm going to have to disagree with the p...,0


In [11]:
x, y = reviews.review, reviews.sentiment

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [14]:
from pytorch_transformers import BertTokenizer, BertConfig

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenized_texts = [tokenizer.tokenize(sent) for sent in x_train]

100%|██████████| 231508/231508 [00:00<00:00, 14519415.20B/s]


input ids - последовательность чисел, отождествляющих каждый токен с его номером в словаре.

labels - вектор из нулей и единиц. нули - это негативная окраска, единицы - положительная.

segment_mask - последовательность нулей и единиц, которая говорит состоит ли наш текст из нескольких предложений или из одного.

attention_mask - последовательность нулей и единиц, где единицы обозначают токены предложений, а нули - паддинги.

In [15]:
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

In [16]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [17]:
input_ids = pad_sequences(input_ids, 
                          dtype='long', 
                          padding='post', 
                          truncating='post')

attention_masks = [[float(i>0) for i in seq] for seq in input_ids]

In [18]:
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, y_train, random_state=42, test_size=0.1)

train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids, random_state=42, test_size=0.1)

In [19]:
train_labels, validation_labels = np.array(train_labels), np.array(validation_labels)

In [20]:
train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)

validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_masks)

In [21]:
train_labels

tensor([0, 0, 0,  ..., 0, 1, 1])

In [22]:
train_data = torch.utils.data.TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = torch.utils.data.DataLoader(
    train_data,
    batch_size=32,
    sampler=torch.utils.data.RandomSampler(train_data)
)

validation_data = torch.utils.data.TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_dataloader = torch.utils.data.DataLoader(
    validation_data,
    batch_size=32,
    sampler=torch.utils.data.SequentialSampler(validation_data)
)

In [23]:
from pytorch_transformers import AdamW, BertForSequenceClassification

In [24]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

100%|██████████| 433/433 [00:00<00:00, 154197.12B/s]
100%|██████████| 440473133/440473133 [00:05<00:00, 84106219.36B/s]


In [25]:
optimizer = AdamW(model.parameters(), lr=2e-5)

In [26]:
model.cuda()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [28]:
model.train()
train_loss_set = []
train_loss = 0

for step, batch in enumerate(train_dataloader):
  batch = tuple(t.to(device) for t in batch)
  b_input_ids, b_input_mask, b_labels = batch
  optimizer.zero_grad()
  loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
  train_loss_set.append(loss[0].item())
  loss[0].backward()
  optimizer.step()
  train_loss += loss[0].item()

print('Loss на обучении:', train_loss / len(train_dataloader))

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:882.)
  exp_avg.mul_(beta1).add_(1.0 - beta1, grad)


Loss на обучении: 0.3233172806462056


In [35]:
model.eval()
test_preds, test_labels = [], []

for step, batch in enumerate(validation_dataloader):
  batch = tuple(t.to(device) for t in batch)
  b_input_ids, b_input_mask, b_labels = batch
  with torch.no_grad():
    logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

  logits = logits[0].detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()

  logits = logits.argmax(axis=1)
  test_labels.extend(label_ids)
  test_preds.extend(logits)

print(accuracy_score(test_labels, test_preds))

0.8888571428571429


In [38]:
from sklearn.metrics import classification_report, f1_score

print(classification_report(test_labels, test_preds))

              precision    recall  f1-score   support

           0       0.89      0.89      0.89      1758
           1       0.89      0.89      0.89      1742

    accuracy                           0.89      3500
   macro avg       0.89      0.89      0.89      3500
weighted avg       0.89      0.89      0.89      3500



In [39]:
f1_score(test_labels, test_preds)

0.8885706101403609