# Finetuning BERT with IMDB Dataset

Dataset from:
https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [1]:
!pip install pytorch_pretrained_bert



In [2]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt

#set GPU
if torch.cuda.is_available():
    device = torch.device("cuda")

#1. Data Load

In [3]:
# read csv file
df = pd.read_csv('IMDB Dataset.csv')
df = df.rename(columns={'sentiment':'label'})

In [4]:
replace = {'positive':0,'negative':1}
df = df.replace({'label':replace})

In [5]:
df.head()

Unnamed: 0,review,label
0,One of the other reviewers has mentioned that ...,0
1,A wonderful little production. <br /><br />The...,0
2,I thought this was a wonderful way to spend ti...,0
3,Basically there's a family where a little boy ...,1
4,"Petter Mattei's ""Love in the Time of Money"" is...",0


In [6]:
!pip install datasets



In [7]:
from transformers import BertForSequenceClassification, BertTokenizerFast, Trainer, TrainingArguments
from datasets import Dataset
import torch
import numpy as np

In [8]:
dataset = Dataset.from_pandas(df[:1000])

In [9]:
# test train split
dataset = dataset.train_test_split(test_size=0.3)
train_set = dataset['train']
test_set = dataset['test']

In [10]:
# load tokenizer
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# tokenize sentences
def preprocess(data):
  tokenized_data = tokenizer(data['review'], padding=True, truncation=True)
  return tokenized_data

In [12]:
train_set = train_set.map(preprocess, batched=True, batch_size=len(train_set))
test_set = test_set.map(preprocess, batched=True, batch_size=len(test_set))

Map:   0%|          | 0/700 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [13]:
train_set.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
test_set.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

#2. Train Model

In [14]:
from transformers import Trainer, TrainingArguments

In [15]:
!pip install transformers[torch]



In [16]:
#set arguments
batch_size = 8
epochs = 2
warmup_steps = 500
weight_decay = 0.01

#define parameters
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=warmup_steps,
    weight_decay=weight_decay,
    logging_dir='./logs',
)

# define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=test_set
)

In [17]:
import tqdm

In [18]:
# train bert
trainer.train()

Step,Training Loss


TrainOutput(global_step=176, training_loss=0.5450306372209028, metrics={'train_runtime': 129.1948, 'train_samples_per_second': 10.836, 'train_steps_per_second': 1.362, 'total_flos': 368355477504000.0, 'train_loss': 0.5450306372209028, 'epoch': 2.0})

In [19]:
# save model
output_dir = '/content/model/model.pt'
torch.save(model.state_dict(),output_dir)

#3. prediction

In [20]:
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [29]:
# tokenize sample sentence
text = "As I deal with real history almost as it is, I get angry while watching it, but I'm immersed in it because it's cinematically interesting. Every time I watch it, it looks new. It's a work that has both meaning and fun!"
input_text_tokenized = tokenizer.encode(text,truncation=True,padding=True,return_tensors="pt").to(device)
input_text_tokenized

tensor([[  101,  2004,  1045,  3066,  2007,  2613,  2381,  2471,  2004,  2009,
          2003,  1010,  1045,  2131,  4854,  2096,  3666,  2009,  1010,  2021,
          1045,  1005,  1049, 26275,  1999,  2009,  2138,  2009,  1005,  1055,
         21014,  3973,  5875,  1012,  2296,  2051,  1045,  3422,  2009,  1010,
          2009,  3504,  2047,  1012,  2009,  1005,  1055,  1037,  2147,  2008,
          2038,  2119,  3574,  1998,  4569,   999,   102]], device='cuda:0')

In [37]:
# {'positive':0,'negative':1}
prediction = model(input_text_tokenized).logits
pred_class_idx = torch.argmax(prediction).item()

if pred_class_idx==0:
  print('positive')
else:
  print('negative')

positive
