In [1]:
!nvidia-smi

Thu Sep 12 19:52:07 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L4                      Off | 00000000:00:03.0 Off |                    0 |
| N/A   45C    P8              16W /  72W |      1MiB / 23034MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [1]:
import pandas as pd
from pathlib import Path

import torch
from torch import nn

print(torch.__version__)

2.4.0+cu121


In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [3]:
from sklearn.model_selection import train_test_split

try:
  from datasets import Dataset
except ModuleNotFoundError:
  !pip install datasets
  from datasets import Dataset

try:
  import evaluate
except ModuleNotFoundError:
  !pip install evaluate
  import evaluate

try:
  from tqdm.auto import tqdm
except ModuleNotFoundError:
  !pip install tqdm
  from tqdm.auto import tqdm

In [53]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, DistilBertModel

tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased-finetuned-sst-2-english', do_lower_case=True)

class DistilBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistilBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert/distilbert-base-uncased-finetuned-sst-2-english")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, 8)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.Tanh()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

model = DistilBERTClass()
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DistilBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in

In [5]:
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  2893k      0  0:00:28  0:00:28 --:--:-- 5908k


In [6]:
!tar -xf aclImdb_v1.tar.gz

In [7]:
!rm aclImdb_v1.tar.gz

In [8]:
train_dir = Path('aclImdb/train')
test_dir = Path('aclImdb/test')

test_pos = test_dir / 'pos'
test_neg = test_dir / 'neg'
train_pos = train_dir / 'pos'
train_neg = train_dir / 'neg'
test_pos, train_neg

(PosixPath('aclImdb/test/pos'), PosixPath('aclImdb/train/neg'))

In [23]:
test_pos_list = {'review': [], 'label': []}
for file in test_pos.iterdir():
    with open(file, 'r') as text:
        test_pos_list['review'].append(text.read().replace("<br />", "").replace("<br /><br />", "").replace('\\', ''))
        test_pos_list['label'].append(int(file.name.split('_')[1][:-4]) - 2)

test_pos_list = pd.DataFrame(test_pos_list)
test_pos_list.head()

Unnamed: 0,review,label
0,King Vladislav (Angus Scrimm) of Romania is a ...,5
1,"After his earlier movie ""Videodrome"", which de...",6
2,"Having watched this film years ago, it never f...",8
3,"Yeah great cult TV series. Great atmosphere, t...",8
4,"Before I saw this film, I read the comment of ...",6


In [24]:
test_neg_list = {'review': [], 'label': []}
for file in test_neg.iterdir():
    with open(file, 'r') as text:
        test_neg_list['review'].append(text.read().replace("<br />", "").replace("<br /><br />", "").replace('\\', ''))
        test_neg_list['label'].append(int(file.name.split('_')[1][:-4]))

test_neg_list = pd.DataFrame(test_neg_list)
test_neg_list.head()

Unnamed: 0,review,label
0,Dracula 3000 is the epitome of painfully chees...,1
1,Junior and his dad start a new life in a new t...,4
2,"After watching this movie, I have nothing but ...",1
3,Other than it reassembled the characters from ...,4
4,Do you know that they want to escavate the Moo...,4


In [25]:
train_pos_list = {'review': [], 'label': []}
for file in train_pos.iterdir():
    with open(file, 'r') as text:
        train_pos_list['review'].append(text.read().replace("<br />", "").replace("<br /><br />", "").replace('\\', ''))
        train_pos_list['label'].append(int(file.name.split('_')[1][:-4]) - 2)

train_pos_list = pd.DataFrame(test_pos_list)
set(train_pos_list['label'])

{5, 6, 7, 8}

In [26]:
train_neg_list = {'review': [], 'label': []}
for file in train_neg.iterdir():
    with open(file, 'r') as text:
        train_neg_list['review'].append(text.read().replace("<br />", "").replace("<br /><br />", "").replace('\\', ''))
        train_neg_list['label'].append(int(file.name.split('_')[1][:-4]))

train_neg_list = pd.DataFrame(train_neg_list)
set(train_neg_list['label'])

{1, 2, 3, 4}

In [27]:
data = pd.concat([train_pos_list, train_neg_list, test_neg_list, test_pos_list], axis=0)
data.head()

Unnamed: 0,review,label
0,King Vladislav (Angus Scrimm) of Romania is a ...,5
1,"After his earlier movie ""Videodrome"", which de...",6
2,"Having watched this film years ago, it never f...",8
3,"Yeah great cult TV series. Great atmosphere, t...",8
4,"Before I saw this film, I read the comment of ...",6


In [28]:
data.describe()

Unnamed: 0,label
count,50000.0
mean,4.51142
std,2.572571
min,1.0
25%,2.0
50%,4.5
75%,7.0
max,8.0


In [29]:
df = data.sample(frac=1)
df

Unnamed: 0,review,label
5664,"This may have been made for the hell of it, bu...",1
796,I caught this on Cinemax very late at night......,1
8942,This film is enjoyable if you like poverty row...,4
2994,I chose this movie because I was looking for a...,4
6452,Milo is an overlooked & underrated horror flic...,6
...,...,...
2989,Andy McDermott (Tom Everett Scott) is a shy Am...,5
7384,"Really an amazing pile of pap! A predictable, ...",1
9860,Very poor quality and the acting is equally as...,2
294,This is a whodunnit in the Hitchcock tradition...,6


In [30]:
# Конвертируем датафрейм в Dataset
train_data, eval_data = train_test_split(df, test_size=0.2)
train_data = Dataset.from_pandas(train_data)
eval_data = Dataset.from_pandas(eval_data)

train_data

Dataset({
    features: ['review', 'label', '__index_level_0__'],
    num_rows: 40000
})

In [31]:
BATCH_SIZE = 16

In [32]:
def tokenize(sample):
  return tokenizer(sample['review'], padding='max_length', truncation=True, )

def prepare_data(data):
  data = data.map(tokenize)
  data = data.remove_columns(['review', '__index_level_0__'])
  data.set_format('torch')
  return data

tokenized_train, tokenized_eval = prepare_data(train_data), prepare_data(eval_data)

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [33]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(tokenized_train, shuffle=True, batch_size=BATCH_SIZE)
eval_dataloader = DataLoader(tokenized_eval, batch_size=BATCH_SIZE)

print(f'Len of train dataloader: {len(train_dataloader)}')
print(f'Len of eval dataloader: {len(eval_dataloader)}')

Len of train dataloader: 2500
Len of eval dataloader: 625


In [56]:
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-5,
    weight_decay=0.07
)

loss_fn = nn.BCEWithLogitsLoss()

In [57]:
from transformers import get_scheduler


NUM_EPOCHS = 3
NUM_TRAINING_STEPS = NUM_EPOCHS * len(train_dataloader)

lr_scheduler = get_scheduler(
    name='linear',
    optimizer=optimizer,
    num_warmup_steps=100,
    num_training_steps=NUM_TRAINING_STEPS
)
lr_scheduler.base_lrs

[1e-05]

In [58]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(NUM_TRAINING_STEPS))

model.train()
for epoch in range(NUM_EPOCHS):
    for batch in train_dataloader:
      batch = {k: v.to(device) for k, v in batch.items()}

      outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], token_type_ids=batch['attention_mask'])

      label = torch.zeros((BATCH_SIZE, 8), dtype=torch.float).to(device)
      for i in range(BATCH_SIZE):
        label[i][batch['label'][i] - 1] = 1.

      loss = loss_fn(outputs, label)
      loss.backward()

      optimizer.step()
      lr_scheduler.step()
      optimizer.zero_grad()
      progress_bar.update(1)

      if progress_bar.n % 100 == 0:
        print(f'Epoch: {epoch} | Step: {progress_bar.n} | Loss: {loss:.5f}')

      if progress_bar.n % 500 == 0:
        torch.save(model.state_dict(), f'/content/drive/MyDrive/model_{progress_bar.n}_{loss:.4f}_loss.pth')

  0%|          | 0/7500 [00:00<?, ?it/s]

Epoch: 0 | Step: 100 | Loss: 0.21694
Epoch: 0 | Step: 200 | Loss: 0.22259
Epoch: 0 | Step: 300 | Loss: 0.22579
Epoch: 0 | Step: 400 | Loss: 0.23908
Epoch: 0 | Step: 500 | Loss: 0.14605
Epoch: 0 | Step: 600 | Loss: 0.21844
Epoch: 0 | Step: 700 | Loss: 0.19856
Epoch: 0 | Step: 800 | Loss: 0.13348
Epoch: 0 | Step: 900 | Loss: 0.21149
Epoch: 0 | Step: 1000 | Loss: 0.12365
Epoch: 0 | Step: 1100 | Loss: 0.26526
Epoch: 0 | Step: 1200 | Loss: 0.22614
Epoch: 0 | Step: 1300 | Loss: 0.18268
Epoch: 0 | Step: 1400 | Loss: 0.16398
Epoch: 0 | Step: 1500 | Loss: 0.22243
Epoch: 0 | Step: 1600 | Loss: 0.21690
Epoch: 0 | Step: 1700 | Loss: 0.20265
Epoch: 0 | Step: 1800 | Loss: 0.22343
Epoch: 0 | Step: 1900 | Loss: 0.15028
Epoch: 0 | Step: 2000 | Loss: 0.10815
Epoch: 0 | Step: 2100 | Loss: 0.19034
Epoch: 0 | Step: 2200 | Loss: 0.22353
Epoch: 0 | Step: 2300 | Loss: 0.14005
Epoch: 0 | Step: 2400 | Loss: 0.15406
Epoch: 0 | Step: 2500 | Loss: 0.27477
Epoch: 1 | Step: 2600 | Loss: 0.12950
Epoch: 1 | Step: 2700

In [61]:
for batch in train_dataloader:
  batch = {k: v.to('cpu') for k, v in batch.items()}


In [72]:
best_model = DistilBERTClass()
best_model.load_state_dict(torch.load('/content/drive/MyDrive/model_5500_Loss_0.07871.pth', weights_only=True))
best_model.to(device)

DistilBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in

In [77]:
def check_tone(score):
  if 0 <= score <= 3:
    return 0
  else:
    return 1

In [84]:
import evaluate

eval_progress_bar = tqdm(range(len(eval_dataloader)))

accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")
f1_tone = evaluate.load("f1")

best_model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}

    with torch.inference_mode():
      outputs = best_model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], token_type_ids=batch['attention_mask'])

    tone_pred, tone_fact = [check_tone(x) for x in batch['label']-1], [check_tone(x) for x in outputs.argmax(dim=1)]

    accuracy.add_batch(references=tone_pred, predictions=tone_fact)

    f1_tone.add_batch(references=tone_pred, predictions=tone_fact)

    precision.add_batch(references=batch['label']-1, predictions=outputs.argmax(dim=1))

    recall.add_batch(references=batch['label']-1, predictions=outputs.argmax(dim=1))

    f1.add_batch(references=batch['label']-1, predictions=outputs.argmax(dim=1))

    eval_progress_bar.update(1)

accuracy_result = accuracy.compute()
precision_result = precision.compute(average='macro')
recall_result = recall.compute(average='macro')
f1_result = f1.compute(average='weighted')
f1_tone_result = f1_tone.compute(average='weighted')

print(f'Accuracy: {accuracy_result}')
print(f'F1 tone: {f1_tone_result}\n')
print(f'Precision: {precision_result}')
print(f'Recall: {recall_result}')
print(f'F1: {f1_result}')

  0%|          | 0/625 [00:00<?, ?it/s]

Accuracy: {'accuracy': 0.9555}
Precision: {'precision': 0.5629922552306412}
Recall: {'recall': 0.568462385605018}
F1: {'f1': 0.6094377253207054}


In [85]:
best_model = DistilBERTClass()
best_model.load_state_dict(torch.load('/content/drive/MyDrive/model_7000_Loss_0.09370.pth', weights_only=True))
best_model.to(device)

import evaluate

eval_progress_bar = tqdm(range(len(eval_dataloader)))

accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")
f1_tone = evaluate.load("f1")

best_model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}

    with torch.inference_mode():
      outputs = best_model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], token_type_ids=batch['attention_mask'])

    tone_pred, tone_fact = [check_tone(x) for x in batch['label']-1], [check_tone(x) for x in outputs.argmax(dim=1)]

    accuracy.add_batch(references=tone_pred, predictions=tone_fact)

    f1_tone.add_batch(references=tone_pred, predictions=tone_fact)

    precision.add_batch(references=batch['label']-1, predictions=outputs.argmax(dim=1))

    recall.add_batch(references=batch['label']-1, predictions=outputs.argmax(dim=1))

    f1.add_batch(references=batch['label']-1, predictions=outputs.argmax(dim=1))

    eval_progress_bar.update(1)

accuracy_result = accuracy.compute()
precision_result = precision.compute(average='macro')
recall_result = recall.compute(average='macro')
f1_result = f1.compute(average='weighted')
f1_tone_result = f1_tone.compute(average='weighted')

print(f'Accuracy: {accuracy_result}')
print(f'F1 tone: {f1_tone_result}\n')
print(f'Precision: {precision_result}')
print(f'Recall: {recall_result}')
print(f'F1: {f1_result}')

  0%|          | 0/625 [00:00<?, ?it/s]

Accuracy: {'accuracy': 0.9564}
F1 tone: {'f1': 0.9563998290879728}

Precision: {'precision': 0.5678614589866388}
Recall: {'recall': 0.5733979901067374}
F1: {'f1': 0.613808076232546}
