<a href="https://colab.research.google.com/github/v3xlrm1nOwo1/Google-Play-Sentiment-Analysis-With-BERT/blob/main/02_Text_Preprocessing_and_Create_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q -U watermark

In [None]:
%reload_ext watermark
%watermark -v -p numpy,pandas,torch,transformers

In [None]:
!pip install -qq transformers[torch]

In [6]:
import transformers
import torch

import io
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from textwrap import wrap
from collections import defaultdict
from accelerate import Accelerator

from torch import nn, optim
from torch.utils import data
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForSequenceClassification
import torch.nn.functional as F

In [None]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

sns.set(style='whitegrid', palette='muted', font_scale=1.2)

HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]

sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))

rcParams['figure.figsize'] = 6, 4

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

LEARN_RATE = 3e-6
RANDOM_SEED = 666
MAX_LEN = 160
BATCH_SIZE = 64
EPOCHS = 30
CHECKPOINT= 'bert-base-cased'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

## The Data

In [None]:
df = pd.read_csv('reviews.csv')

df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
# Bar chart of how many reviews carry each raw score.
score_ax = sns.countplot(x=df.score)
score_ax.set_xlabel('review score');

## Add More Columns

In [None]:
def to_sentiment(rating):
    """Collapse a review rating into a three-way sentiment label.

    The rating is first converted with ``int()``. Values of 2 or less map to
    0, exactly 3 maps to 1, and anything above 3 maps to 2.
    """
    stars = int(rating)
    if stars > 3:
        return 2
    return 1 if stars == 3 else 0

In [None]:
# Derive the integer sentiment label for every review from its score.
df['sentiment'] = df['score'].apply(to_sentiment)

In [None]:
df.head(n=2)

In [None]:
# Display names for the sentiment labels, listed in the order of the integer
# labels returned by to_sentiment (0, 1, 2).
class_names = ['negative', 'neutral', 'positive']

ax = sns.countplot(x=df.sentiment)
plt.xlabel('review sentiment')
ax.set_xticklabels(class_names);

## Data Preprocessing

In [None]:
tokenizer = transformers.AutoTokenizer.from_pretrained(CHECKPOINT)

In [None]:
text = 'Sharing pretrained models!'

print(text)

In [None]:
tokens = tokenizer.tokenize(text)

print(len(tokens))
print(tokens)

In [None]:
token_ids = tokenizer.convert_tokens_to_ids(tokens=tokens)

print(len(token_ids))
print(token_ids)

In [None]:
tokens = tokenizer.tokenize(text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

print(f' Sentence: {text}')
print(f'   Tokens: {tokens}')
print(f'Token IDs: {token_ids}')

In [None]:
tokenizer.sep_token, tokenizer.sep_token_id

In [None]:
tokenizer.cls_token, tokenizer.cls_token_id

In [None]:
tokenizer.pad_token, tokenizer.pad_token_id

In [None]:
tokenizer.unk_token, tokenizer.unk_token_id

In [None]:
encoding = tokenizer.encode_plus(
  text=text,
  max_length=10,
  add_special_tokens=True, # Add '[CLS]' and '[SEP]'
  return_token_type_ids=True,
  padding='max_length',  # replaces the deprecated pad_to_max_length=True
  truncation=True,  # make the max_length cut-off explicit
  return_attention_mask=True,
  return_tensors='pt',  # Return PyTorch tensors
)

encoding.keys()

In [None]:
print(len(encoding['input_ids'][0]))
encoding['input_ids'][0]

In [None]:
print(len(encoding['attention_mask'][0]))
encoding['attention_mask']

In [None]:
print(tokenizer.convert_ids_to_tokens(encoding['input_ids'][0]))

In [None]:
# Token count of every review, capped at 512 tokens.
# `truncation=True` makes the cap explicit (max_length alone only triggers a
# warning), and `str()` mirrors the coercion done in GPReviewDataset so that
# non-string cells do not break the tokenizer.
token_lens = [
  len(tokenizer.encode(str(txt), max_length=512, truncation=True))
  for txt in df.content
]

In [None]:
# histplot(kde=True, stat='density') is the replacement for the deprecated
# sns.distplot: a density-normalised histogram with a KDE curve on top.
sns.histplot(token_lens, kde=True, stat='density')
plt.xlim([0, 256]);
plt.xlabel('Token count');

## Create a Dataset

In [None]:
dataset_df = df[['content', 'sentiment']]

dataset_df.head(n=2)

In [None]:
# First carve off 20 % of the rows as a hold-out set, then split that hold-out
# in half to obtain the validation and test frames.
df_train, df_holdout = train_test_split(dataset_df, test_size=0.2, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_holdout, test_size=0.5, random_state=RANDOM_SEED)

In [None]:
df_train.head()

In [None]:
class GPReviewDataset(Dataset):
  """Dataset that tokenizes one review per item.

  Args:
    reviews: indexable collection of review texts.
    targets: indexable collection of labels, aligned with ``reviews``.
    tokenizer: object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
    max_len: sequence length requested from the tokenizer for padding/truncation.
    include_raw_text: when true, each item also carries the review string
      under the ``review_text`` key.
  """

  def __init__(self, reviews, targets, tokenizer, max_len, include_raw_text=False):
    self.reviews = reviews
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len
    self.include_raw_text = include_raw_text

  def __len__(self):
    """Number of reviews in the dataset."""
    return len(self.reviews)

  def __getitem__(self, item):
    """Tokenize review ``item`` and return it with its label.

    Returns a dict with ``input_ids`` and ``attention_mask`` (flattened
    tensors from the tokenizer), ``targets`` (``torch.long`` tensor) and,
    optionally, ``review_text``.
    """
    # str() guards against non-string entries (e.g. NaN read from a CSV).
    review = str(self.reviews[item])
    target = self.targets[item]

    encoding = self.tokenizer.encode_plus(
      review,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      padding='max_length',  # replaces the deprecated pad_to_max_length=True
      return_attention_mask=True,
      truncation=True,
      return_tensors='pt',
    )

    output = {
      # return_tensors='pt' yields a leading batch axis; flatten() drops it.
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }

    if self.include_raw_text:
      output['review_text'] = review

    return output

In [None]:
df_train.shape, df_val.shape, df_test.shape

In [None]:
# Collator configured with padding='longest'.
# NOTE(review): currently not wired in — the `collate_fn=data_collator` argument
# in create_data_loader is commented out, and GPReviewDataset already asks the
# tokenizer to pad every item itself.
data_collator = transformers.DataCollatorWithPadding(tokenizer=tokenizer, padding='longest')

In [None]:
def create_data_loader(df, tokenizer, max_len=MAX_LEN, batch_size=BATCH_SIZE, include_raw_text=False,
                       shuffle=False, num_workers=4):
  """Wrap a review DataFrame in a ``GPReviewDataset`` and a ``DataLoader``.

  Args:
    df: DataFrame with ``content`` (review text) and ``sentiment`` (label) columns.
    tokenizer: tokenizer forwarded to ``GPReviewDataset``.
    max_len: sequence length forwarded to ``GPReviewDataset``.
    batch_size: number of items per batch.
    include_raw_text: forwarded to ``GPReviewDataset``; adds ``review_text`` to each item.
    shuffle: reshuffle the data every epoch. Defaults to ``False`` (the
      previous behaviour); pass ``True`` for a training loader.
    num_workers: number of loader worker processes (previously fixed at 4).

  Returns:
    A ``DataLoader`` over the constructed dataset.
  """
  ds = GPReviewDataset(
    reviews=df.content.to_numpy(),
    targets=df.sentiment.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len,
    include_raw_text=include_raw_text
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=num_workers,
    # collate_fn=data_collator
  )

In [None]:
# create_data_loader builds sequential (unshuffled) loaders by default, which is
# what validation and test need. For training, re-wrap the same dataset in a
# shuffling DataLoader so that batches differ from epoch to epoch.
train_data_loader = DataLoader(
  create_data_loader(df=df_train, tokenizer=tokenizer, max_len=MAX_LEN, batch_size=BATCH_SIZE).dataset,
  batch_size=BATCH_SIZE,
  num_workers=4,
  shuffle=True,
)
val_data_loader = create_data_loader(df=df_val, tokenizer=tokenizer, max_len=MAX_LEN, batch_size=BATCH_SIZE, include_raw_text=True)
test_data_loader = create_data_loader(df=df_test, tokenizer=tokenizer, max_len=MAX_LEN, batch_size=BATCH_SIZE, include_raw_text=True)

In [None]:
data = next(iter(val_data_loader))
data.keys()

In [None]:
print(data['input_ids'].shape)
print(data['attention_mask'].shape)
print(data['targets'].shape)

In [None]:
class SentimentClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification layer.

    Args:
        n_classes: size of the output layer (number of logits per example).
    """

    def __init__(self, n_classes=3):
        super().__init__()
        self.model = transformers.BertModel.from_pretrained(CHECKPOINT)
        self.drop = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.model.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the raw (un-normalised) class logits for a batch."""
        encoder_outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        # Element 1 of the encoder's return value is used as the pooled representation.
        pooled = encoder_outputs[1]
        return self.out(self.drop(pooled))

In [None]:
model = SentimentClassifier(n_classes=3)
model = model.to(device)

In [None]:
input_ids = data['input_ids'].to(device)
attention_mask = data['attention_mask'].to(device)

print(input_ids.shape) # batch size x seq length
print(attention_mask.shape) # batch size x seq length

In [None]:
# F.softmax(model(input_ids, attention_mask), dim=1)

In [None]:
# transformers.AdamW is deprecated and no longer shipped by recent transformers
# releases, so use PyTorch's implementation. eps and weight_decay are set to the
# defaults the transformers optimizer used; torch's AdamW has no `correct_bias`
# switch and always applies bias correction.
optimizer = optim.AdamW(model.parameters(), lr=LEARN_RATE, eps=1e-6, weight_decay=0.0)

# One scheduler step is taken per batch, so the schedule spans every batch of every epoch.
total_steps = len(train_data_loader) * EPOCHS

scheduler = transformers.get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
def train_epoch(model, data_loader, loss_fn, optimizer, scheduler, n_examples, device=None):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)`` and
            returning per-class scores of shape (batch, n_classes).
        data_loader: iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``targets`` tensors.
        loss_fn: callable ``loss_fn(outputs, targets)`` returning a scalar loss.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
        n_examples: denominator used for the accuracy.
        device: device the batches are moved to. Defaults to the device of the
            model's first parameter.

    Returns:
        ``(accuracy, mean_loss)`` where ``accuracy`` is a 0-dim float64 CPU
        tensor (correct predictions / ``n_examples``) and ``mean_loss`` is the
        mean of the per-batch losses as a float (NaN if there were no batches).
    """
    model = model.train()

    if device is None:
        device = next(model.parameters()).device

    losses = []
    # Tensor accumulator so the result is well-defined even for an empty loader.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for dl in data_loader:
        input_ids = dl['input_ids'].to(device)
        attention_mask = dl['attention_mask'].to(device)
        targets = dl['targets'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)

        correct_predictions += torch.sum(preds == targets)
        # .item() stores a plain float: keeping the tensor would retain a
        # graph-attached (and possibly GPU-resident) object per batch and
        # np.mean cannot consume such tensors.
        losses.append(loss.item())

        loss.backward()
        # Clip the global gradient norm before the parameter update.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        scheduler.step()

        optimizer.zero_grad()

    mean_loss = np.mean(losses) if losses else float('nan')
    # Move the accuracy to the CPU so callers can log or plot it directly.
    return correct_predictions.double().cpu() / n_examples, mean_loss

In [None]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Evaluate ``model`` on ``data_loader`` without computing gradients.

  Args:
    model: module called as ``model(input_ids=..., attention_mask=...)`` and
      returning per-class scores of shape (batch, n_classes).
    data_loader: iterable of dict batches with ``input_ids``,
      ``attention_mask`` and ``targets`` tensors.
    loss_fn: callable ``loss_fn(outputs, targets)`` returning a scalar loss.
    device: device the batches are moved to.
    n_examples: denominator used for the accuracy.

  Returns:
    ``(accuracy, mean_loss)`` where ``accuracy`` is a 0-dim float64 CPU tensor
    (correct predictions / ``n_examples``) and ``mean_loss`` is the mean of the
    per-batch losses as a float (NaN if there were no batches).
  """
  model = model.eval()

  losses = []
  # Tensor accumulator so the result is well-defined even for an empty loader.
  correct_predictions = torch.zeros((), dtype=torch.long, device=device)

  with torch.no_grad():
    for d in data_loader:
      input_ids = d['input_ids'].to(device)
      attention_mask = d['attention_mask'].to(device)
      targets = d['targets'].to(device)

      outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
      )
      _, preds = torch.max(outputs, dim=1)

      loss = loss_fn(outputs, targets)

      correct_predictions += torch.sum(preds == targets)
      # Store a plain float; np.mean cannot consume GPU tensors.
      losses.append(loss.item())

  mean_loss = np.mean(losses) if losses else float('nan')
  # Move the accuracy to the CPU so callers can log or plot it directly.
  return correct_predictions.double().cpu() / n_examples, mean_loss

In [None]:
%%time

history = defaultdict(list)
best_accuracy = 0

for epoch in range(EPOCHS):

  print('=' * 50)
  print(f'Epoch {epoch + 1} / {EPOCHS}')

  train_acc, train_loss = train_epoch(model, train_data_loader, loss_fn, optimizer, scheduler, len(df_train))
  print(f'Train loss {train_loss}, accuracy {train_acc}')

  val_acc, val_loss = eval_model(model, val_data_loader, loss_fn, device, len(df_val))
  print(f'Val loss {val_loss}, accuracy {val_acc}')

  print(f'===> Epoch {epoch + 1} / {EPOCHS} | Train loss {train_loss}, accuracy {train_acc} | Val loss {val_loss}, accuracy {val_acc}')
  print('=' * 50)

  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)

  if val_acc > best_accuracy:
    torch.save(model.state_dict(), 'best_model_state.bin')
    best_accuracy = val_acc

In [None]:
# The history entries may be 0-dim tensors (possibly on the GPU), which
# matplotlib cannot convert; float() yields plain numbers in either case.
train_acc_curve = [float(acc) for acc in history['train_acc']]
val_acc_curve = [float(acc) for acc in history['val_acc']]

plt.plot(train_acc_curve, label='train accuracy')
plt.plot(val_acc_curve, label='validation accuracy')

plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0, 1]);