In [60]:
# https://github.com/kswamy15/pytorch-lightning-imdb-bert/blob/master/Bert_NLP_Pytorch_IMDB_v3.ipynb

In [1]:
import os

import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset, RandomSampler, random_split
from torchvision import transforms
import pytorch_lightning as pl
from pytorch_lightning import loggers as pl_loggers
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor
import transformers
from nlp import load_dataset
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from argparse import ArgumentParser
import re
from typing import Optional

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Custom PyTorch Dataset that tokenizes each review with the BERT tokenizer when the item is accessed
class ImdbDataset(Dataset):
    """Map-style dataset that tokenizes one review per access.

    Args:
        notes: Indexable collection of review texts (e.g. a list or a pandas
            Series).
        targets: Indexable collection of integer class labels aligned with
            ``notes``.
        tokenizer: Object exposing ``encode_plus`` with the Hugging Face
            tokenizer keyword arguments used in ``__getitem__``.
        max_len: Every encoded sequence is truncated/padded to this length.
    """

    def __init__(self, notes, targets, tokenizer, max_len):
        self.notes = notes
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.notes)

    @staticmethod
    def _at(container, position):
        """Return the element stored at ``position`` (counted from zero).

        pandas objects resolve ``series[i]`` by index *label*, which only
        coincides with the position when the index is a fresh ``RangeIndex``.
        DataLoader samplers yield positions in ``range(len(dataset))``, so use
        ``.iloc`` whenever the container offers it and plain indexing otherwise.
        """
        positional = getattr(container, 'iloc', None)
        return container[position] if positional is None else positional[position]

    def __getitem__(self, idx):
        """Encode the review at position ``idx``.

        Returns:
            dict with 1-D tensors ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` (the tokenizer output flattened) plus a scalar
            ``label`` tensor of dtype ``torch.long``.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        note = str(self._at(self.notes, idx))
        target = self._at(self.targets, idx)

        encoding = self.tokenizer.encode_plus(
            note,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        # return_tensors='pt' yields a leading batch axis of size 1; flatten()
        # drops it so the DataLoader can stack items into (batch, max_len).
        return {
            'label': torch.tensor(target, dtype=torch.long),
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
        }

In [3]:
# Load the fast BERT tokenizer for 'bert-base-uncased' so it can be inspected in the next cell.
t = transformers.BertTokenizerFast.from_pretrained('bert-base-uncased')

In [4]:
# Show the vocabulary size of the tokenizer loaded above (displayed as the cell output).
t.vocab_size

30522

In [5]:
from tqdm.auto import tqdm


class DataModule(pl.LightningDataModule):
    """LightningDataModule for IMDB reviews stored as two flat text files.

    ``data_dir`` must contain ``full_train.txt`` and ``full_test.txt`` with one
    review per line.  In each file the first ``NUM_POSITIVE_REVIEWS`` lines are
    labelled positive (1) and all remaining lines negative (0).

    Args:
        data_dir: Directory holding the two review files.
        tokenizer: Name passed to ``BertTokenizerFast.from_pretrained``.
        max_len: Sequence length every review is truncated/padded to.
        batch_size: Batch size used by all three dataloaders.
        num_workers: Worker processes used by all three dataloaders.
    """

    # Number of leading lines per file that hold positive reviews.
    NUM_POSITIVE_REVIEWS = 12500

    def __init__(self,
                 data_dir: str,
                 tokenizer: str = 'bert-base-uncased',
                 max_len: int = 500,
                 batch_size: int = 64,
                 num_workers: int = 4,
                 *args,
                 **kwargs
                 ):
        super().__init__()
        self.save_hyperparameters()
        self.tokenizer = transformers.BertTokenizerFast.from_pretrained(self.hparams.tokenizer)

        # Raw string: same pattern as before, without the invalid escape
        # sequences (\s, \-, \/) a plain string literal would contain.
        self.REPLACE_WITH_SPACE = re.compile(r"(<br\s*/><br\s*/>)|(\-)|(\/)")
        self.NO_SPACE = ""
        self.SPACE = " "

    def _read_reviews(self, filename):
        """Return the whitespace-stripped lines of ``data_dir / filename``."""
        path = Path(self.hparams.data_dir) / filename
        # Explicit encoding so the result does not depend on the platform
        # default.  NOTE(review): assumes the files are UTF-8 - confirm.
        with open(path, 'r', encoding='utf-8') as f:
            return [line.strip() for line in f]

    def prepare_data(self) -> None:
        """Read both review files and cache the cleaned review lists."""
        ## Creates a list of reviews from the big text files containing all the reviews
        ## This code was taken from here https://github.com/aaronkub/machine-learning-examples/blob/master/imdb-sentiment-analysis/Sentiment%20Analysis%20Walkthrough%20Part%201.ipynb
        self.reviews_train_clean = self.preprocess_reviews(self._read_reviews('full_train.txt'))
        self.reviews_test_clean = self.preprocess_reviews(self._read_reviews('full_test.txt'))

    def preprocess_reviews(self, reviews):
        """Replace ``<br /><br />``, ``-`` and ``/`` with a space in every review."""
        return [self.REPLACE_WITH_SPACE.sub(self.SPACE, line) for line in reviews]

    def _labelled_frame(self, reviews):
        """Build a shuffled DataFrame with ``reviews`` and ``target`` columns."""
        frame = pd.DataFrame(reviews, columns=['reviews'])
        frame['target'] = np.where(frame.index < self.NUM_POSITIVE_REVIEWS, 1, 0)
        # The files are sorted by label, so shuffle before any splitting.
        return frame.sample(frac=1).reset_index(drop=True)

    def setup(self, stage: Optional[str] = None):
        """Label, shuffle and split the reviews into train / val / test frames."""
        # Lightning runs prepare_data() on a single process only, so state it
        # assigns may be missing here; load the reviews if that is the case.
        if not (hasattr(self, 'reviews_train_clean') and hasattr(self, 'reviews_test_clean')):
            self.prepare_data()

        train_frame = self._labelled_frame(self.reviews_train_clean)
        test_frame = self._labelled_frame(self.reviews_test_clean)

        # Hold out 25% of the training reviews for validation, keeping the
        # class balance identical in both parts.
        df_train, df_valid = train_test_split(train_frame, test_size=0.25,
                                              stratify=train_frame['target'])

        self.train = df_train.reset_index(drop=True)
        self.val = df_valid.reset_index(drop=True)
        self.test = test_frame

    def _make_loader(self, frame, shuffle=False):
        """Wrap a reviews/target DataFrame in an ``ImdbDataset`` DataLoader."""
        dataset = ImdbDataset(notes=frame['reviews'],
                              targets=frame['target'],
                              tokenizer=self.tokenizer,
                              max_len=self.hparams.max_len)
        return DataLoader(dataset,
                          batch_size=self.hparams.batch_size,
                          shuffle=shuffle,
                          num_workers=self.hparams.num_workers)

    def train_dataloader(self):
        # Reshuffle every epoch; the one-off shuffle in setup() alone would
        # present the batches in the same order each epoch.
        return self._make_loader(self.train, shuffle=True)

    def val_dataloader(self):
        return self._make_loader(self.val)

    def test_dataloader(self):
        return self._make_loader(self.test)



In [6]:
## The main Pytorch Lightning module
class ImdbModel(pl.LightningModule):
    """DistilBERT encoder with a small classification head for 2-class sentiment.

    The encoder is built from ``distilbert-base-uncased`` with a reduced config
    (one transformer layer, two attention heads).  The head is
    ``Linear -> ReLU -> Dropout -> Linear`` applied to the hidden state of the
    first token.

    Metrics are reported through ``self.log`` so that callbacks monitoring
    ``val_loss`` can find them.  The learning-rate scheduler is stepped by
    Lightning once per optimizer step (see ``configure_optimizers``); the model
    must not step it manually as well.

    Args:
        learning_rate: Initial ``lr`` handed to Adam.  NOTE(review): OneCycleLR
            rewrites the optimizer's lr from its own ``max_lr``, so this value
            does not shape the schedule - confirm that is intended.
    """

    def __init__(self,
                 learning_rate: float = 0.0001 * 8,
                 **kwargs):
        super().__init__()

        self.save_hyperparameters()

        self.num_labels = 2
        # NOTE(review): n_layers=1 keeps only the first pretrained layer and
        # n_heads=2 differs from the pretrained head count - confirm intended.
        config = transformers.DistilBertConfig(dropout=0.1, attention_dropout=0.2, n_layers=1, n_heads=2)
        self.bert = transformers.DistilBertModel.from_pretrained('distilbert-base-uncased', config=config)

        hidden_size = self.bert.config.hidden_size
        self.pre_classifier = torch.nn.Linear(hidden_size, hidden_size)
        self.classifier = torch.nn.Linear(hidden_size, self.num_labels)
        self.dropout = torch.nn.Dropout(self.bert.config.seq_classif_dropout)

        # relu activation function
        self.relu = torch.nn.ReLU()

    def get_outputs(self, input_ids, attention_mask):
        """Return the encoder's hidden state of the first token, shape (bs, dim)."""
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)
        hidden_state = outputs[0]  # (bs, seq_len, dim)
        return hidden_state[:, 0]  # (bs, dim)

    def forward(self, input_ids, attention_mask, labels=None):
        """Return class logits of shape (bs, num_labels).

        ``labels`` is accepted for backward compatibility and is not used.
        """
        pooled_output = self.get_outputs(input_ids, attention_mask)  # (bs, dim)
        pooled_output = self.pre_classifier(pooled_output)  # (bs, dim)
        pooled_output = self.relu(pooled_output)  # (bs, dim)
        pooled_output = self.dropout(pooled_output)  # (bs, dim)
        return self.classifier(pooled_output)  # (bs, num_labels)

    def _loss_and_accuracy(self, batch):
        """Run one forward pass and return (cross-entropy loss, batch accuracy)."""
        label = batch['label']
        logits = self(batch['input_ids'], batch['attention_mask'], label)
        loss = F.cross_entropy(logits.view(-1, self.num_labels), label.view(-1))
        # Accuracy is computed with tensor ops so nothing is copied to the CPU.
        accuracy = (logits.argmax(dim=1) == label).float().mean()
        return loss, accuracy

    def training_step(self, batch, batch_nb):
        """Compute the training loss and log it together with the current lr."""
        loss, _ = self._loss_and_accuracy(batch)
        self.log('train_loss', loss)
        self.log('learn_rate', self.optim.param_groups[0]['lr'])
        return {'loss': loss}

    def validation_step(self, batch, batch_nb):
        """Return per-batch validation loss and accuracy for epoch aggregation."""
        loss, val_acc = self._loss_and_accuracy(batch)
        return {'val_loss': loss, 'val_acc': val_acc}

    def validation_epoch_end(self, outputs):
        """Average the per-batch validation metrics and log them."""
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        avg_val_acc = torch.stack([x['val_acc'] for x in outputs]).mean()

        # Logged under the names the EarlyStopping/ModelCheckpoint callbacks monitor.
        self.log('val_loss', avg_loss, prog_bar=True)
        self.log('val_acc', avg_val_acc, prog_bar=True)

    def test_step(self, batch, batch_nb):
        """Return per-batch test loss and accuracy for epoch aggregation."""
        loss, test_acc = self._loss_and_accuracy(batch)
        return {'test_loss': loss, 'test_acc': test_acc}

    def test_epoch_end(self, outputs):
        """Average the per-batch test metrics and log them."""
        avg_loss = torch.stack([x['test_loss'] for x in outputs]).mean()
        avg_test_acc = torch.stack([x['test_acc'] for x in outputs]).mean()

        self.log('avg_test_loss', avg_loss, prog_bar=True)
        self.log('avg_test_acc', avg_test_acc, prog_bar=True)

    # ---------------------
    # TRAINING SETUP
    # ---------------------
    def _scheduler_total_steps(self, fallback=2000):
        """Return the number of optimizer steps the LR schedule should span.

        Uses the attached trainer's estimate; ``fallback`` is returned when no
        trainer is attached or the estimate is unbounded or non-positive.
        """
        try:
            estimate = self.trainer.estimated_stepping_batches
        except (AttributeError, RuntimeError):
            return fallback
        if estimate is None or estimate == float('inf') or estimate <= 0:
            return fallback
        return int(estimate)

    def configure_optimizers(self):
        """Create Adam plus a OneCycleLR schedule that Lightning steps per batch."""
        optimizer = torch.optim.Adam([p for p in self.parameters() if p.requires_grad],
                                     lr=self.hparams.learning_rate,
                                     eps=1e-08)
        # OneCycleLR raises once it is stepped more than total_steps times, so
        # the cycle length must cover the whole run.
        scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=2e-5,
                                                        total_steps=self._scheduler_total_steps())

        self.sched = scheduler
        self.optim = optimizer
        # interval='step' makes Lightning call scheduler.step() after every
        # optimizer step, which is what a one-cycle policy requires.
        return [optimizer], [{'scheduler': scheduler, 'interval': 'step'}]

In [7]:
# Seed Python, NumPy and PyTorch so the shuffle/split in the datamodule and the
# weight initialisation are repeatable after "Restart & Run All".
pl.seed_everything(42, workers=True)

tb_logger = pl_loggers.TensorBoardLogger('../.cache/logs/')

# callbacks
# Stop when val_loss has not improved for 3 consecutive validation runs.
early_stop = EarlyStopping(
    monitor='val_loss',
    min_delta=0.0,
    patience=3,
    verbose=True,
    mode='min',
    strict=True,
)

# Keep the checkpoint with the lowest val_loss.
checkpoint = ModelCheckpoint(
    dirpath='../.cache/best_model',
    verbose=False,
    monitor='val_loss',
    mode='min'
)

# The learning rate changes after every batch, so record it per step.
lr_logger = LearningRateMonitor(logging_interval='step')
# NOTE(review): the recorded run failed with "no kernel image is available for
# execution on the device"; that usually points at a PyTorch build that does
# not target this GPU rather than at the notebook code - confirm.
trainer = Trainer(logger=tb_logger, callbacks=[checkpoint, lr_logger, early_stop], accelerator='gpu', devices=1, max_epochs=10)

model = ImdbModel()
datamodule = DataModule('../data/imdb/movie_data', batch_size=12)
# trainer.fit() calls datamodule.prepare_data() and datamodule.setup() itself;
# calling them here as well would load and re-split the data a second time.
trainer.fit(model, datamodule=datamodule)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['distilbert.transformer.layer.4.sa_layer_norm.bias', 'distilbert.transformer.layer.3.ffn.lin1.weight', 'distilbert.transformer.layer.4.attention.k_lin.weight', 'distilbert.transformer.layer.3.ffn.lin2.bias', 'distilbert.transformer.layer.4.ffn.lin2.weight', 'distilbert.transformer.layer.4.ffn.lin1.bias', 'distilbert.transformer.layer.1.attention.q_lin.bias', 'distilbert.transformer.layer.1.attention.out_lin.weight', 'vocab_layer_norm.weight', 'distilbert.transformer.layer.1.sa_layer_norm.weight', 'distilbert.transformer.layer.4.attention.q_lin.bias', 'distilbert.transformer.layer.4.attention.v_lin.weight', 'distilbert.transformer.layer.5.attention.out_lin.weight', 'distilbert.transformer.layer.4.attention.k_lin.bias', 'distilbert.t

Sanity Checking: 0it [00:00, ?it/s]

RuntimeError: CUDA error: no kernel image is available for execution on the device
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.