<a href="https://colab.research.google.com/github/wizard339/education/blob/main/cnn_nlp_pytorch_lighting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the NLP dependencies into the *kernel's* environment (%pip, not !pip);
# -q keeps the output short without hiding errors.
%pip install -q datasets tokenizers transformers

In [None]:
# Install PyTorch Lightning into the kernel's environment; -q keeps the output short.
%pip install -q pytorch_lightning

### importing

In [None]:
import random
import collections
import os

from sklearn.model_selection import KFold

import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl

import numpy as np
import pandas as pd

import datasets

from tqdm.auto import tqdm

import matplotlib.pyplot as plt

In [None]:
# Load the IMDB dataset; its "train" and "test" splits (columns "text" and
# "label") are used below for tokenizer training and for the model datasets.
text_dataset = datasets.load_dataset("imdb")

Downloading:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a...


Downloading:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a. Subsequent calls will reuse this data.


### training tokenizer

In [None]:
from tokenizers import SentencePieceBPETokenizer 

In [None]:
# Reserved tokens handed to the tokenizer trainer.
# NOTE(review): CustomDataset pads sequences with id 0; presumably "[PAD]"
# receives id 0 because it is listed first — confirm against the saved vocab.
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]

In [None]:
# Untrained SentencePiece-style BPE tokenizer; it is trained on the IMDB
# training texts in the next cell.
# NOTE(review): dropout=0.9 is far above the ~0.1 commonly used for BPE-dropout,
# and the tokenizer re-created from the saved files later in the notebook does
# not pass `dropout` at all — confirm this value is intentional.
tokenizer = SentencePieceBPETokenizer(
    unk_token="[UNK]",
    replacement="▁",
    add_prefix_space=True,
    dropout=0.9,
    fuse_unk=False
)

In [None]:
def batch_iterator(batch_length=1000, split=None):
    """Yield the texts of a dataset split in consecutive fixed-size batches.

    Args:
        batch_length: Number of examples per yielded batch (the last batch
            may be shorter). Must be positive.
        split: Object supporting ``len()`` and slice indexing, where a slice
            returns a mapping with a ``"text"`` entry. Defaults to
            ``text_dataset["train"]``, resolved when iteration starts.

    Yields:
        The ``"text"`` entry of each consecutive slice of ``split``.

    Raises:
        ValueError: If ``batch_length`` is not positive (raised on first
            iteration, since this is a generator).
    """
    if batch_length <= 0:
        raise ValueError(f"batch_length must be positive, got {batch_length}")
    if split is None:
        split = text_dataset["train"]
    for start in range(0, len(split), batch_length):
        yield split[start : start + batch_length]["text"]

# Train the BPE tokenizer on the batches of IMDB training texts produced by
# batch_iterator(), registering the reserved special tokens.
tokenizer.train_from_iterator(batch_iterator(),
                              vocab_size=30000,
                              min_frequency=2,
                              show_progress=True,
                              special_tokens=special_tokens
                              )

In [None]:
# Persist the trained tokenizer files (vocab.json, merges.txt) to the working directory.
tokenizer.save_model('./')

['./vocab.json', './merges.txt']

In [None]:
# Re-create the tokenizer from the saved vocabulary/merges files. No `dropout`
# argument is passed here, unlike the instance used for training above.
tokenizer = SentencePieceBPETokenizer('./vocab.json', './merges.txt', unk_token="[UNK]",
                                      replacement="▁", add_prefix_space=True,
                                      fuse_unk=False)

In [None]:
# Fixed number of token ids per example: CustomDataset truncates longer
# sequences to this length and zero-pads shorter ones.
# TODO: document how the value 697 was chosen.
max_len = 697

### data preparation

In [None]:
class CustomDataset:
    """Map-style dataset that tokenizes a text on access and pads/truncates it.

    Each item is a dict with:
        'text':  LongTensor of shape (max_len,) holding the token ids, truncated
                 to ``max_len`` and right-padded with zeros.
        'label': scalar LongTensor with the target of the example.

    Args:
        data: Indexable collection of raw texts.
        targets: Indexable collection of labels, aligned with ``data``.
        tokenizer: Object whose ``encode(text)`` result exposes an ``ids`` list.
        max_len: Length of every returned id sequence. Defaults to the
            notebook-level ``max_len`` constant.
    """

    def __init__(self, data, targets, tokenizer, max_len=max_len):
        self.data = data
        self.targets = targets
        self.tokenizer = tokenizer
        # Store the argument: previously it was ignored and the global
        # `max_len` was read instead, so a custom length had no effect.
        self.max_len = int(max_len)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]
        label = self.targets[idx]

        # Truncate to max_len, then copy into a zero-initialised buffer so
        # shorter sequences end up right-padded with id 0.
        token_ids = self.tokenizer.encode(text).ids[:self.max_len]
        input_ids = torch.zeros(self.max_len, dtype=torch.long)
        input_ids[:len(token_ids)] = torch.tensor(token_ids, dtype=torch.long)

        return {
            'text': input_ids,
            'label': torch.tensor(label, dtype=torch.long)
        }

In [None]:
# Wrap the raw IMDB splits; texts are tokenized lazily when an item is accessed.
train_dataset = CustomDataset(text_dataset['train']['text'], text_dataset['train']['label'], tokenizer)
test_dataset = CustomDataset(text_dataset['test']['text'], text_dataset['test']['label'], tokenizer)

In [None]:
# Module-level cross-entropy loss instance (default settings).
criterion = torch.nn.CrossEntropyLoss()

In [None]:
def accuracy(probs, targets):
    """Count how many rows of `probs` have their arg-max equal to the target.

    Note that this returns the *number* of correct predictions, not a ratio;
    callers divide by the batch size themselves.
    """
    predicted_classes = np.argmax(probs, axis=1)
    hits = predicted_classes == targets
    return np.sum(hits)

### Main part

In [None]:
class ConvNeuralNet(pl.LightningModule):
    """Multi-kernel convolutional text classifier.

    Token ids are embedded, convolved with several kernel heights spanning the
    full embedding width, max-pooled over the sequence, concatenated and fed to
    a linear layer that produces class logits.

    Args:
        input_dim: Vocabulary size (number of rows of the embedding table).
        embed_size: Dimensionality of the token embeddings.
        do_rate1: Dropout probability applied to the pooled features.
        n_classes: Number of output classes.
    """

    def __init__(self, input_dim, embed_size,
                 do_rate1=0.5, n_classes=2):
        super().__init__()

        self.vocab_size = input_dim
        self.embedding_size = embed_size
        self.kernel_num = 200              # feature maps per kernel height
        self.kernels_sizes = [2, 6, 10]    # kernel heights, in tokens

        self.embedding = nn.Embedding(self.vocab_size, embed_size)
        self.convs = nn.ModuleList([
            nn.Conv2d(1, self.kernel_num, (K, embed_size))
            for K in self.kernels_sizes
        ])
        # The pooled features are 2-D (batch, features). Feeding such input to
        # Dropout2d is deprecated in PyTorch, which recommends plain Dropout
        # to retain the same behaviour.
        self.dropout = nn.Dropout(do_rate1)
        self.fc1 = nn.Linear(len(self.kernels_sizes) * self.kernel_num, n_classes)

    def forward(self, x):
        """Return class logits of shape (batch, n_classes) for token ids `x`.

        `x` is expected to be a LongTensor of shape (batch, seq_len); seq_len
        must be at least the largest kernel height.
        """
        x = self.embedding(x)                  # (batch, seq_len, embed)
        x = x.unsqueeze(1)                     # (batch, 1, seq_len, embed)

        # Each conv spans the whole embedding width, leaving a trailing dim of 1.
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs]
        # Global max-pool over the remaining sequence positions.
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]
        x = torch.cat(x, 1)                    # (batch, n_kernels * kernel_num)

        x = self.dropout(x)
        logit = self.fc1(x)
        return logit

    def configure_optimizers(self):
        """Create the Adam optimizer over this module's own parameters."""
        # Use `self`, not the notebook-global `model`, so the module works
        # regardless of the name it is bound to.
        optimizer = torch.optim.Adam(self.parameters(), lr=3e-4, weight_decay=1e-4)
        return optimizer

    def training_step(self, train_batch, batch_idx):
        """Compute and log the cross-entropy loss for one training batch."""
        text, label = train_batch['text'], train_batch['label']
        logits = self(text)
        # Functional cross-entropy with default settings; keeps the module
        # independent of notebook-level globals.
        loss = F.cross_entropy(logits, label)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, val_batch, batch_idx):
        """Log the cross-entropy loss and accuracy for one validation batch."""
        text, label = val_batch['text'], val_batch['label']
        logits = self(text)
        loss = F.cross_entropy(logits, label)
        # Fraction of correct predictions in the batch, computed on-device.
        total_acc = (logits.argmax(dim=1) == label).float().mean()
        self.log('val_loss', loss)
        self.log('val_acc', total_acc)

In [None]:
# Number of examples per mini-batch for both loaders.
BATCH_SIZE = 32

# Training batches are reshuffled on every pass; validation batches (built from
# the IMDB test split) keep a fixed order.
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True)
valid_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE,
                              shuffle=False)

In [None]:
# Seed Python, NumPy and PyTorch RNGs so weight init and shuffling are repeatable.
pl.seed_everything(42)

# Vocabulary size comes from the trained tokenizer; 300 is the embedding width.
model = ConvNeuralNet(tokenizer.get_vocab_size(), 300)

# `gpus=` was removed in Lightning 2.0 and the notebook installs an unpinned
# version, so select hardware through `accelerator`/`devices`, which also
# falls back to CPU when no GPU is present.
trainer = pl.Trainer(accelerator="auto", devices=1, max_epochs=4)
trainer.fit(model, train_dataloader, valid_dataloader)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type       | Params
-----------------------------------------
0 | embedding | Embedding  | 9.0 M 
1 | convs     | ModuleList | 1.1 M 
2 | dropout   | Dropout2d  | 0     
3 | fc1       | Linear     | 1.2 K 
-----------------------------------------
10.1 M    Trainable params
0         Non-trainable params
10.1 M    Total params
40.327    Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

  f"Your {mode}_dataloader has `shuffle=True`, it is best practice to turn"


Training: -1it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
