# bert-for-sentiment

Use the "Run" button to execute the code.

In [None]:
#!pip install jovian --upgrade --quiet

In [None]:
#import jovian

In [None]:
# Execute this to save new versions of the notebook
#jovian.commit(project="bert-for-sentiment")

In [None]:
!pip install transformers --upgrade --quiet

In [None]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

In [None]:
%matplotlib inline

In [None]:
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

In [None]:
import os
print('Files present in this directory', os.listdir('/kaggle/input/privatetrain/'))

In [None]:
print('Files present in this directory', os.listdir('/'))

In [None]:
df = pd.read_csv('/kaggle/input/privatetrain/train.tsv', sep='\t')
df.head()

In [None]:
df.shape

In [None]:
# Call the method: a bare `df.info` only displays the bound-method repr
# instead of the column/dtype/non-null summary.
df.info()

In [None]:
df.groupby("Sentiment").Sentiment.count().plot.bar(ylim=0)

In [None]:
# keeping case helps keep some info.
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'

In [None]:
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

## example of what BERT Tokenizer does

In [None]:
sample_txt = 'When was I last outside? I am stuck at home for 2 weeks.'

In [None]:
tokens = tokenizer.tokenize(sample_txt)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

print(f' Sentence: {sample_txt}')
print(f'   Tokens: {tokens}')
print(f'Token IDs: {token_ids}')

In [None]:
# [SEP] - marker for ending of a sentence
tokenizer.sep_token, tokenizer.sep_token_id

In [None]:
# [CLS] - we must add this token to the start of each sentence, so BERT knows we’re doing classification
tokenizer.cls_token, tokenizer.cls_token_id

In [None]:
# Padding Token
tokenizer.pad_token, tokenizer.pad_token_id

In [None]:
# [UNK] (unknown) token:
tokenizer.unk_token, tokenizer.unk_token_id

In [None]:
#encode_plus - adds the above tokens where needed

# `padding='max_length'` is the supported replacement for the deprecated
# `pad_to_max_length=True`; `truncation=True` makes the max_length cut-off explicit.
encoding = tokenizer.encode_plus(
  sample_txt,
  max_length=32,
  add_special_tokens=True, # Add '[CLS]' and '[SEP]'
  return_token_type_ids=False,
  padding='max_length',
  truncation=True,
  return_attention_mask=True,
  return_tensors='pt',  # Return PyTorch tensors
)

encoding.keys()

In [None]:
type(encoding)

In [None]:
type(encoding['input_ids'])

In [None]:
encoding['input_ids'].shape

In [None]:
print(len(encoding['input_ids'][0]))
encoding['input_ids'][0]

In [None]:
print(len(encoding['attention_mask'][0]))
encoding['attention_mask']

In [None]:
tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])

## choosing Sequence length

In [None]:
# Token count (special tokens included) of every phrase, capped at 512.
# `truncation=True` makes the 512 cap explicit, and `str(...)` mirrors the
# coercion GPReviewDataset applies so non-string cells do not break encoding.
token_lens = [
  len(tokenizer.encode(str(txt), max_length=512, truncation=True))
  for txt in df.Phrase
]

In [None]:
max(token_lens)

In [None]:
plt.hist(token_lens,  bins=20)  # density=False would make counts
plt.ylabel('Count')
plt.xlabel('Length');

In [None]:
# max length is 84 but most are <40.
# Pad to 40 tokens and truncate anything longer

In [None]:
MAX_LEN = 40

## create data loader and data sets

In [None]:
class GPReviewDataset(Dataset):
  """Dataset that tokenizes one review per item on access.

  Args:
    reviews: indexable collection of review texts (each is passed through `str`).
    targets: indexable collection of integer class labels, aligned with `reviews`.
    tokenizer: object exposing `encode_plus(text, **kwargs)`.
    max_len: length every encoded sequence is padded / truncated to.
  """

  def __init__(self, reviews, targets, tokenizer, max_len):
    self.reviews = reviews
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Return the number of reviews."""
    return len(self.reviews)

  def __getitem__(self, item):
    """Return the raw text, encoded inputs and label for index `item`.

    Returns:
      dict with keys 'review_text' (str), 'input_ids' and 'attention_mask'
      (flattened tensors from the tokenizer) and 'targets' (long tensor).
    """
    review = str(self.reviews[item])
    target = self.targets[item]
    encoding = self.tokenizer.encode_plus(
      review,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
      padding='max_length',
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt',
    )

    return {
      'review_text': review,
      # flatten() collapses the tokenizer output to 1-D so the DataLoader
      # can stack items into a batch.
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }

In [None]:
# 90% of the rows go to training; the remaining 10% is a hold-out set ...
df_train, df_holdout = train_test_split(df, test_size=0.1, random_state=RANDOM_SEED)

# ... which is then halved into validation and test sets.
df_val, df_test = train_test_split(df_holdout, test_size=0.5, random_state=RANDOM_SEED)

In [None]:
df_train.shape, df_val.shape, df_test.shape

In [None]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
  """Wrap a dataframe's `Phrase` / `Sentiment` columns in a DataLoader.

  Args:
    df: dataframe with `Phrase` (text) and `Sentiment` (label) columns.
    tokenizer: tokenizer handed to GPReviewDataset.
    max_len: sequence length every phrase is padded / truncated to.
    batch_size: number of items per batch.
    shuffle: reshuffle the data every epoch (useful for the training split);
      defaults to False, which keeps the original fixed ordering.
    num_workers: number of loader worker processes; defaults to 4 as before.

  Returns:
    A DataLoader yielding the dict batches produced by GPReviewDataset.
  """
  ds = GPReviewDataset(
    reviews=df.Phrase.to_numpy(),
    targets=df.Sentiment.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=num_workers
  )

In [None]:
BATCH_SIZE = 16

In [None]:
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

In [None]:
data = next(iter(train_data_loader))
data.keys()

In [None]:
print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['targets'].shape)

## Explanation of what a Pretrained BERT model does

In [None]:
'''
Depending on the task you might want to use BertForSequenceClassification, BertForQuestionAnswering or something else.
We’ll use the basic BertModel and build our sentiment classifier on top of it. 
'''

In [None]:
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME) #PRE_TRAINED_MODEL_NAME = 'bert-base-cased'

In [None]:
#pass the tokens from the test sentence
# Read the two outputs by name: unpacking `.values()` into exactly two names
# breaks as soon as the model returns any additional field.
bert_outputs = bert_model(
  input_ids=encoding['input_ids'],
  attention_mask=encoding['attention_mask']
)
last_hidden_state = bert_outputs.last_hidden_state
pooled_output = bert_outputs.pooler_output

In [None]:
type(last_hidden_state)

In [None]:
last_hidden_state.shape

In [None]:
# 32 was the length we set for the test sentence
# But why 768? This is BERT's hidden size: the dimensionality of the vector produced for each token.
bert_model.config.hidden_size

In [None]:
#pooled_output as a summary of the content, according to BERT
# we would add a layer or more to convert to the classes we want
pooled_output.shape

## Create model and run a single batch

In [None]:
#this adds a Linear layer and a dropout layer (for regularization) to the pretrained BERT model
# this is an example of Transfer Learning. Most of the classification work is done by the pre-trained BertModel
'''
class SentimentClassifier(nn.Module):

  def __init__(self, n_classes):
    super(SentimentClassifier, self).__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    _, pooled_output = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask
    )

    output = self.drop(pooled_output)

    return self.out(output)
'''


class SentimentClassifier(nn.Module):
  """Pre-trained BERT encoder topped with dropout and a linear output layer."""

  def __init__(self, n_classes):
    """Load the pre-trained encoder and size the output layer for `n_classes`."""
    super().__init__()
    encoder = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
    self.bert = encoder
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(encoder.config.hidden_size, n_classes)

  def forward(self, input_ids, attention_mask):
    """Return one row of unnormalised class scores per input sequence."""
    bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    # Regularise BERT's pooled summary with dropout before classifying it.
    return self.out(self.drop(bert_outputs["pooler_output"]))

In [None]:
device

In [None]:
model = SentimentClassifier(5) # classifying to one of 5 sentiments
model = model.to(device)

In [None]:
# Execute this to save new versions of the notebook
#jovian.commit(project="bert-for-sentiment")

In [None]:
input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)

print(input_ids.shape) # batch size x seq length
print(attention_mask.shape) # batch size x seq length

In [None]:
output1 = model(input_ids, attention_mask)

In [None]:
type(output1)

In [None]:
output1.shape

In [None]:
nn.functional.softmax(output1, dim=1)

## Train

In [None]:
EPOCHS = 4

# Use PyTorch's AdamW (`optim` is torch.optim): the AdamW class shipped with
# transformers is deprecated in favour of it. weight_decay and eps are set to the
# defaults the transformers implementation used; note that torch's AdamW always
# applies bias correction, so `correct_bias=False` has no equivalent here.
optimizer = optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
total_steps = len(train_data_loader) * EPOCHS

# Linear decay of the learning rate to 0 over all training steps, no warm-up.
scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss().to(device)

'''
BERT Authors recomend:
The BERT authors have some recommendations for fine-tuning:

    Batch size: 16, 32
    Learning rate (Adam): 5e-5, 3e-5, 2e-5
    Number of epochs: 2, 3, 4
'''

In [None]:
def train_epoch(
  model,
  data_loader,
  loss_fn,
  optimizer,
  device,
  scheduler,
  n_examples
):
  """Train `model` for one pass over `data_loader`.

  Args:
    model: module called as `model(input_ids=..., attention_mask=...)`.
    data_loader: iterable of dict batches with 'input_ids', 'attention_mask'
      and 'targets' tensors.
    loss_fn: callable `loss_fn(outputs, targets)` returning a scalar tensor.
    optimizer: optimizer stepped once per batch.
    device: device the batch tensors are moved to.
    scheduler: learning-rate scheduler stepped once per batch.
    n_examples: denominator used for the accuracy.

  Returns:
    (accuracy, mean_loss): accuracy is a double tensor equal to
    correct predictions / n_examples; mean_loss is the mean of the per-batch
    losses, or NaN when the loader yields no batches.
  """
  model = model.train()
  losses = []

  # Tensor accumulator (rather than the int 0) so `.double()` below also
  # works when the loader yields no batches.
  correct_predictions = torch.zeros((), dtype=torch.long, device=device)

  for d in data_loader:
    input_ids = d["input_ids"].to(device)
    attention_mask = d["attention_mask"].to(device)
    targets = d["targets"].to(device)

    # Clear gradients before the forward/backward pass so nothing left over
    # from an earlier backward call leaks into this update.
    optimizer.zero_grad()

    outputs = model(
      input_ids=input_ids,
      attention_mask=attention_mask
    )

    _, preds = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, targets)
    correct_predictions += torch.sum(preds == targets)
    losses.append(loss.item())
    loss.backward()

    # Cap the gradient norm at 1.0 before the optimizer step.
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

    optimizer.step()
    scheduler.step()

  mean_loss = np.mean(losses) if losses else float("nan")
  return correct_predictions.double() / n_examples, mean_loss

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Evaluate `model` on `data_loader` without updating any weights.

  Args:
    model: module called as `model(input_ids=..., attention_mask=...)`.
    data_loader: iterable of dict batches with 'input_ids', 'attention_mask'
      and 'targets' tensors.
    loss_fn: callable `loss_fn(outputs, targets)` returning a scalar tensor.
    device: device the batch tensors are moved to.
    n_examples: denominator used for the accuracy.

  Returns:
    (accuracy, mean_loss): accuracy is a double tensor equal to
    correct predictions / n_examples; mean_loss is the mean of the per-batch
    losses, or NaN when the loader yields no batches.
  """
  model = model.eval()
  losses = []
  # Tensor accumulator (rather than the int 0) so `.double()` below also
  # works when the loader yields no batches.
  correct_predictions = torch.zeros((), dtype=torch.long, device=device)

  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      targets = d["targets"].to(device)
      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )

      # The predicted class is the index of the largest score in each row.
      _, preds = torch.max(outputs, dim=1)

      loss = loss_fn(outputs, targets)

      correct_predictions += torch.sum(preds == targets)

      losses.append(loss.item())

  mean_loss = np.mean(losses) if losses else float("nan")
  return correct_predictions.double() / n_examples, mean_loss

In [None]:
%%time
history = defaultdict(list)

best_accuracy = 0

for epoch in range(EPOCHS):
  print(f'Epoch {epoch + 1}/{EPOCHS}')
  print('-' * 10)
  train_acc, train_loss = train_epoch(
    model,
    train_data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    len(df_train)
  )
  # Accuracies come back as tensors (on the GPU when one is used); keep plain
  # Python floats so the history can be plotted with matplotlib later.
  train_acc = train_acc.item()

  print(f'Train loss {train_loss} accuracy {train_acc}')

  val_acc, val_loss = eval_model(
    model,
    val_data_loader,
    loss_fn,
    device,
    len(df_val)
  )
  val_acc = val_acc.item()

  print(f'Val   loss {val_loss} accuracy {val_acc}')
  print()

  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)

  # Checkpoint only when validation accuracy improves on the best so far.
  if val_acc > best_accuracy:
    torch.save(model.state_dict(), '/kaggle/working/best_model_state.bin')
    best_accuracy = val_acc

In [None]:
print('Done')

In [None]:
plt.plot(history['train_acc'], label='train accuracy')
plt.plot(history['val_acc'], label='validation accuracy')

plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()

plt.ylim([0, 1]);

## Evaluation

In [None]:
test_acc, _ = eval_model(
  model,
  test_data_loader,
  loss_fn,
  device,
  len(df_test)
)

test_acc.item()

In [None]:
def get_predictions(model, data_loader, device=None):
  """Run `model` over `data_loader` and collect texts, predictions and labels.

  Args:
    model: module called as `model(input_ids=..., attention_mask=...)`.
    data_loader: iterable of dict batches with 'review_text', 'input_ids',
      'attention_mask' and 'targets'.
    device: device the inputs are moved to. When omitted, the device of the
      model's first parameter is used (CPU if the model has no parameters).

  Returns:
    (review_texts, predictions, prediction_probs, real_values) where
    review_texts is a list of str and the other three are CPU tensors:
    predicted class indices, per-class softmax probabilities, and the labels.
    For an empty loader the list and the tensors are all empty.
  """
  model = model.eval()
  if device is None:
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

  review_texts = []
  predictions = []
  prediction_probs = []
  real_values = []

  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )

      _, preds = torch.max(outputs, dim=1)
      # The model emits raw scores; normalise them so the returned values
      # really are probabilities (each row sums to 1).
      probs = torch.softmax(outputs, dim=1)

      review_texts.extend(d["review_text"])
      # Move each batch to the CPU right away and concatenate once at the end.
      predictions.append(preds.cpu())
      prediction_probs.append(probs.cpu())
      real_values.append(d["targets"].cpu())

  if not predictions:
    return (
      review_texts,
      torch.empty(0, dtype=torch.long),
      torch.empty(0),
      torch.empty(0, dtype=torch.long),
    )

  predictions = torch.cat(predictions)
  prediction_probs = torch.cat(prediction_probs)
  real_values = torch.cat(real_values)
  return review_texts, predictions, prediction_probs, real_values

In [None]:
y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(
  model,
  test_data_loader
)

In [None]:
class_names = ['Negative','Somewhat Negative','Neutral','somewhat Positive','Positive']
print(classification_report(y_test, y_pred, target_names=class_names))

## More testing

In [None]:
review_text = "A tale told by idiots, full of sound and fury signifying nothing"

In [None]:
# `padding='max_length'` is the supported replacement for the deprecated
# `pad_to_max_length=True`.
encoded_review = tokenizer.encode_plus(
  review_text,
  max_length=MAX_LEN,
  add_special_tokens=True,
  return_token_type_ids=False,
  padding='max_length',
  truncation=True,
  return_attention_mask=True,
  return_tensors='pt',
)

In [None]:
input_ids = encoded_review['input_ids'].to(device)
attention_mask = encoded_review['attention_mask'].to(device)

# Make sure dropout is off and skip gradient tracking: this is inference only.
model.eval()
with torch.no_grad():
  output = model(input_ids, attention_mask)

_, prediction = torch.max(output, dim=1)

print(f'Review text: {review_text}')
# .item() turns the one-element tensor into a plain int list index.
print(f'Sentiment  : {class_names[prediction.item()]}')

## Load Saved model

In [None]:
review_text = "A tale told by idiots, full of sound and fury signifying nothing"

In [None]:
# `padding='max_length'` is the supported replacement for the deprecated
# `pad_to_max_length=True`.
encoded_review = tokenizer.encode_plus(
  review_text,
  max_length=MAX_LEN,
  add_special_tokens=True,
  return_token_type_ids=False,
  padding='max_length',
  truncation=True,
  return_attention_mask=True,
  return_tensors='pt',
)

In [None]:
input_ids_loaded = encoded_review['input_ids']
attention_mask_loaded = encoded_review['attention_mask']

In [None]:
#PRE_TRAINED_MODEL_NAME = 'bert-base-cased'

model_loaded = SentimentClassifier(5)

In [None]:
# model_loaded is never moved off the CPU, so map the saved tensors to the CPU;
# this also lets a checkpoint written on a GPU be loaded on a CPU-only machine.
m = torch.load('/kaggle/working/best_model_state.bin', map_location=torch.device('cpu'))
#m

In [None]:
model_loaded.load_state_dict(m)

In [None]:
input_ids_loaded.get_device()

In [None]:
# A freshly constructed module starts in training mode, which keeps dropout
# active and makes predictions random; switch to eval mode for inference.
model_loaded.eval()
with torch.no_grad():
  output = model_loaded(input_ids_loaded, attention_mask_loaded)

_, prediction = torch.max(output, dim=1)

print(f'Review text: {review_text}')
# .item() turns the one-element tensor into a plain int list index.
print(f'Sentiment  : {class_names[prediction.item()]}')