# Introduction

This notebook fine-tunes the BERT model for sequence classification.
The model is taken from the Hugging Face `transformers` library.
It is trained to classify tweets into five sentiment categories.
Here we use the cased base version of BERT (`bert-base-cased`).

**Importing**

In [None]:
import torch
# Run on a GPU if possible: Apple MPS is preferred, then CUDA, otherwise the CPU is used
device = torch.device('cpu')
if torch.backends.mps.is_available():
    device = torch.device('mps')
    print('Use Apple M2 GPU')
elif torch.cuda.is_available():
    device = torch.device('cuda')
    print('Use GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available!')
print('Device:', device)

**Prepare the Data**

In [None]:
import pandas as pd
   
# Read the training CSV with 'latin-1' instead of utf-8 (author's note: it tolerates
# a wider range of Twitter vocabulary)
df = pd.read_csv('../Data/Corona_NLP_train.csv',encoding='latin-1')
# Print the column summary; the author notes the data as 3798*6 -- TODO confirm against the file
df.info()

In [None]:
# Preview the first five rows
df.head(5)

**Preprocess the data**

In [4]:
# Sentiment names are encoded as integer class ids so they can be stored in tensors.
# The targets must be the contiguous ids 0..n-1.
def to_sentiment(Sentiment):
  """Return the integer class id for a sentiment name.

  'Extremely Negative' -> 4, 'Negative' -> 3, 'Neutral' -> 2, 'Positive' -> 1.
  Every other value (including 'Extremely Positive') maps to 0.
  """
  named_ids = (
      ('Extremely Negative', 4),
      ('Negative', 3),
      ('Neutral', 2),
      ('Positive', 1),
  )
  for name, class_id in named_ids:
    if Sentiment == name:
      return class_id
  return 0

# Overwrite the text labels with their integer ids (values to_sentiment does not recognise become 0)
df['Sentiment'] = df.Sentiment.apply(to_sentiment)

In [None]:
import re

# Replace every '#' by the written word 'hashtag ', line breaks/tabs by a space
# and web links by the placeholder 'url'.
df.OriginalTweet = df.OriginalTweet.replace(to_replace='#',value='hashtag ',regex=True)
df.OriginalTweet = df.OriginalTweet.replace(to_replace=r'\n+|\t+',value=' ',regex=True)
# Match only the link itself (up to the next whitespace). The previous pattern
# 'https://.+' was greedy and also swallowed all text following the first link;
# it additionally missed plain 'http://' links.
df.OriginalTweet = df.OriginalTweet.replace(to_replace=r'https?://\S+',value='url',regex=True)

# Check success
df.head(10)

In [6]:
import numpy as np

# Pull the tweet texts and their integer labels out of the dataframe as arrays.
sentences = df.OriginalTweet.values
labels = np.array(df.Sentiment.values)

# Class names, ordered so that the list index equals the integer label
classes = [
    'Extremely Positive',
    'Positive',
    'Neutral',
    'Negative',
    'Extremely Negative',
]

In [None]:
# Inspect samples to classify

def inspect_data(num:int, data=None):
  """Print `num` randomly chosen samples, drawn with replacement.

  Args:
    num: number of samples to print.
    data: sequence to draw from; defaults to the module-level `sentences`.

  Prints nothing when `data` is empty.
  """
  if data is None:
    data = sentences
  if len(data) == 0:
    return
  for i in range(num):
    # The upper bound of randint is exclusive, so len(data) makes every index
    # (including the last one) reachable.
    s = np.random.randint(0, len(data))
    print(f'Sample {i+1}: {data[s]}')

# Print ten randomly chosen tweets
inspect_data(10)

In [None]:
!pip install seaborn

import seaborn as sns
import matplotlib.pyplot as plt

# Length statistics of the tweets (len() of each entry in `sentences`)

len_list = list(map(len, sentences))
average_len = round(sum(len_list) / len(len_list))
print('Average length:',average_len)

max_len = max(len_list)
print('Max sentence length:', max_len)

# Encoding length: the mean length plus a margin of 80.
# It is passed to the tokenizer as max_length further down.
length = average_len + 80

# Histogram of the lengths, x-axis limited to 0..450
sns.set_style('dark')
sns.histplot(len_list,alpha=0.9,color='blue',bins=50)
plt.xlim([0,450])
plt.xlabel('Sentence length')
plt.ylabel('')

In [None]:
# Label distribution: one bar per integer label.
# The values are passed as `x=`; a positional argument is interpreted as `data`
# by current seaborn releases and no longer counts the values.
sns.countplot(x=labels)
# The axis label lists the class names in label order (index = integer label)
plt.xlabel(classes,fontsize='large')

**Bert Tokenizer**

In [None]:
from transformers import BertTokenizer
# Load the BERT tokenizer.
# The cased version is used because tweets likely carry meaning in capitalisation,
# e.g. "ANGRY!!" versus "angry".
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# Under the hood of tokenization: encode one sample tweet and print its tokens and ids
text = tokenizer.encode_plus(sentences[4],return_token_type_ids=True)
print('Original text: {:}'.format(sentences[4]))
print('text tokens: {:}'.format(tokenizer.tokenize(sentences[4])))
print('Input_ids: {:}'.format(text['input_ids']))

In [None]:
from tqdm import tqdm 
# Tokenize every tweet into a fixed-length row of token ids plus its attention mask
input_ids = []
# One list per tweet: 1 for a real token, 0 for padding (list of lists)
attention_masks = []

for sample in tqdm(sentences):
  coding = tokenizer.encode_plus(
      sample,
      max_length=length,
      add_special_tokens=True,
      return_token_type_ids=False,
      # `pad_to_max_length` is deprecated; pad every row to max_length explicitly
      padding='max_length',
      # Cut longer tweets to max_length explicitly instead of relying on the
      # tokenizer's fallback, so all rows have the same length
      truncation=True,
      return_attention_mask=True,
      return_tensors='np'
  )
  input_ids.append(coding['input_ids'][0])
  # Use the mask produced by the tokenizer instead of rebuilding it from the ids
  attention_masks.append(coding['attention_mask'][0].tolist())

# Array of arrays
input_ids = np.array(input_ids)


**Create Dataloader**

In [12]:
from sklearn.model_selection import train_test_split
# Split the data into a training and a validation set (80:20).
# Ids, masks and labels are split in ONE call so they share the same shuffle and
# stay aligned row by row, instead of relying on two calls using the same seed.
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(
    input_ids, attention_masks, labels, random_state=42, test_size=0.2)

# Convert to input compatible type: tensors
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels,dtype=torch.long)
validation_labels = torch.tensor(validation_labels,dtype=torch.long)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [13]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 64
# Create the DataLoader for our training set; batches are drawn through a RandomSampler
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for the validation set; batches are drawn through a SequentialSampler
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

**Initialize model**

In [None]:
from transformers import BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

# Load the pretrained 'bert-base-cased' weights into a sequence-classification model
# configured for our 5 labels; attentions and hidden states are not returned
model = BertForSequenceClassification.from_pretrained(
    'bert-base-cased', 
    num_labels = 5, 
    output_attentions = False,
    output_hidden_states = False, 
)
# Move the model to the selected device (GPU if one was found, otherwise CPU)
model.to(device)

In [None]:
# Define Parameters for training:
# Using AdamW as optimizer
optimizer = AdamW(model.parameters(),
                  # Fine-tuning a pretrained BERT needs a small step size (values
                  # around 2e-5 .. 5e-5 are customary); the previous 5e-3 is large
                  # enough to destroy the pretrained weights.
                  lr = 2e-5,
                  eps = 1e-8 # epsilon for stability, leave it at default
                  #correct_bias = False,
                )

# Greater than 5 causes huge training time 
epochs = 5
# Number of training steps is number of batches * number of epochs
training_steps = len(train_dataloader) * epochs
# Create the learning rate scheduler (linear schedule, no warmup steps).
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0,
                                            num_training_steps = training_steps)

**Begin Training**

In [18]:
from torch.utils.checkpoint import checkpoint

# get_accuracy compares the predicted classes with the true labels
def get_accuracy(preds,labs):
  """Return (accuracy, predicted_classes) for one batch.

  Args:
    preds: 2-D array of scores; the predicted class is the argmax along axis 1.
    labs: array of true labels; it is flattened before the comparison.

  Returns:
    The fraction of positions where prediction and label agree, and the
    flattened array of predicted classes.
  """
  predicted = np.argmax(preds,axis=1).flatten()
  expected = labs.flatten()
  hits = np.sum(predicted==expected)
  return hits/len(expected),predicted

# Function to clear cache
def clear_cache():
    """Release cached, unused accelerator memory on the available backend.

    Handles Apple MPS and CUDA (the two GPU backends this notebook can select);
    does nothing on a CPU-only machine.
    """
    if torch.backends.mps.is_available():
        torch.mps.empty_cache()
    elif torch.cuda.is_available():
        torch.cuda.empty_cache()
        

In [None]:
 # Store history of learning
# Per-epoch history: mean training loss and validation accuracy
loss_values = []
acc_values = []

for i in range(epochs):
    print(f'\n--------- Epoch {i+1}/{epochs} ---------')
    print('\nTraining...')

    # Running sum of the batch losses of this epoch
    epoch_loss = 0

    # Switch the model to training mode (the counterpart of model.eval() below)
    model.train()

    for batch in tqdm(train_dataloader,desc='Progress',ncols=100):
        # Each batch is (input ids, attention mask, labels); move it to the device
        i0 = batch[0].to(device)
        i1 = batch[1].to(device)
        i2 = batch[2].to(device)

        # Clear the gradients left over from the previous step
        model.zero_grad()

        # Passing the labels makes the model return the loss as first output
        outputs = model(i0, token_type_ids=None, attention_mask=i1, labels=i2)
        loss = outputs[0]
        epoch_loss += loss.item()

        # Backpropagate, clip the gradient norm to 1.0, then advance the
        # optimizer and the learning-rate schedule by one step
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    # Mean loss per batch
    training_loss = epoch_loss / len(train_dataloader)

    # Store value
    loss_values.append(training_loss)

    print('Training loss : {0:.2f}'.format(training_loss))


    print('Validation phase Epoch:{:}'.format(i+1))
    # Set the model to evaluation mode
    model.eval()

    # Count hits and samples over the whole validation set. Averaging the
    # per-batch accuracies instead would give a smaller last batch the same
    # weight as a full one.
    correct = 0
    total = 0

    # Evaluate data for one epoch
    for batch in tqdm(validation_dataloader,desc='Progress',ncols=100):

        e0 = batch[0].to(device)
        e1 = batch[1].to(device)
        e2 = batch[2].to(device)

        # No need for calculating gradients during evaluation
        with torch.no_grad():
            outputs = model(e0, token_type_ids=None, attention_mask=e1)

        # Without labels the first output holds the logits (one score per label);
        # move them to the CPU for the numpy-based accuracy function
        logits = outputs[0]
        pred_labels = logits.detach().cpu().numpy()
        real_labels = e2.to('cpu').numpy()

        # Predicted class of every sample in this batch
        _, val1 = get_accuracy(pred_labels, real_labels)

        correct += np.sum(val1 == real_labels.flatten())
        total += len(val1)

        # Free cached accelerator memory after every batch
        clear_cache()

    # Report final accuracy: fraction of correctly classified validation samples
    acc = correct/total
    acc_values.append(acc)

    print(' Accuracy for Validation : {0:.2f}'.format(acc))

print('~'* 42)

In [None]:
# plot acc_values & loss_values
import matplotlib.pyplot as plt
import seaborn as sns 

# Plot the history recorded by the training loop (one point per epoch) instead of
# hand-copied numbers. The numbers previously hard-coded here were
# loss [0.45,0.27,0.16,0.11] and accuracy [0.84,0.84,0.85,0.85]
# (presumably noted from an earlier run -- TODO confirm).
plt.plot(loss_values, label='train loss')
plt.plot(acc_values, label='validation accuracy')

plt.title('Training history')
plt.ylabel('Loss / accuracy')
plt.xlabel('Epoch')
plt.legend()
# No fixed upper limit: the loss is not bounded by 1
plt.ylim(bottom=0);

In [None]:
# Name the checkpoint file after the best validation accuracy, rounded to 3 decimals
best_score = round(max(acc_values),3)

model_save_name = f'berta_{best_score}.pt'
# Placeholder directory; saving itself is left commented out
path = f'./path_to_save_to/{model_save_name}'
#torch.save(model.state_dict(), path)

**Load Prediction Data**

In [None]:
# Read the test CSV with the same 'latin-1' encoding as the training file
df_test = pd.read_csv('../Data/Corona_NLP_test.csv',encoding='latin-1') 

In [None]:
# Preprocess the test data: integer labels and cleaned tweet text

df_test['Sentiment'] = df_test.Sentiment.apply(to_sentiment)

# Replace every '#' by the written word 'hashtag ', line breaks/tabs by a space
# and web links by the placeholder 'url'.
df_test.OriginalTweet = df_test.OriginalTweet.replace(to_replace='#',value='hashtag ',regex=True)
df_test.OriginalTweet = df_test.OriginalTweet.replace(to_replace=r'\n+|\t+',value=' ',regex=True)
# Match only the link itself (up to the next whitespace). The previous pattern
# 'https://.+' was greedy and also swallowed all text following the first link;
# it additionally missed plain 'http://' links.
df_test.OriginalTweet = df_test.OriginalTweet.replace(to_replace=r'https?://\S+',value='url',regex=True)

# Check success
df_test.head(10)

In [None]:
# Get Data
sentences = df_test.OriginalTweet.values
labels = df_test.Sentiment.values

# Tokenize every tweet into a fixed-length row of token ids plus its attention mask
input_ids = []
# One list per tweet: 1 for a real token, 0 for padding (list of lists)
attention_masks = []

for sample in sentences:
  coding = tokenizer.encode_plus(
      sample,
      max_length=length,
      add_special_tokens=True,
      return_token_type_ids=False,
      # `pad_to_max_length` is deprecated; pad every row to max_length explicitly
      padding='max_length',
      # Cut longer tweets to max_length explicitly so all rows have the same length
      truncation=True,
      return_attention_mask=True,
      return_tensors='np'
  )
  input_ids.append(coding['input_ids'][0])
  # Use the mask produced by the tokenizer instead of rebuilding it from the ids
  attention_masks.append(coding['attention_mask'][0].tolist())


input_ids = np.array(input_ids)
labels = np.array(labels)
# type(attention_mask) = list with list

**Prepare the dataloader**

In [None]:
# Convert to tensors
prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
prediction_labels = torch.tensor(labels)

batch_size = 32  
# Create DataLoader. Evaluation gains nothing from shuffling, and a sequential
# order keeps the collected predictions aligned with the rows of the test data.
prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

**Run model on prediction data**

In [None]:
def eval_score(preds,labs):
  """Score batched predictions against batched labels.

  Args:
    preds: sequence of 2-D score arrays, one per batch; the predicted class of
      a sample is the argmax along axis 1.
    labs: sequence of label arrays, one per batch, aligned with `preds`.

  Returns:
    (accuracy, absolute, list_labels): the fraction of correctly predicted
    samples (0.0 for empty input), the number of correct predictions and the
    list of predicted-class arrays, one per batch.
  """
  absolute = 0
  total = 0
  list_labels = []
  for batch_scores, batch_labels in zip(preds, labs):
    pval = np.argmax(batch_scores,axis=1)
    list_labels.append(pval)
    absolute += np.sum(pval==batch_labels)
    # Count the real number of samples rather than assuming 119 batches of 32
    total += len(pval)
  score = absolute/total if total else 0.0
  return score, absolute, list_labels

In [None]:
# Set model to evaluation mode
model.eval()

# For storing results: one array of logits and one array of labels per batch
predictions = []
real_labels = []

for batch in tqdm(prediction_dataloader,desc='Progress',ncols=100):
  p0 = batch[0].to(device)
  p1 = batch[1].to(device)
  p2 = batch[2].to(device)

  # Calculate no gradient for evaluation
  with torch.no_grad():
      outputs = model(p0, token_type_ids=None, attention_mask=p1)
  logits = outputs[0]
  # Move logits and labels to the CPU and convert them to numpy arrays for comparison
  logits = logits.detach().cpu().numpy()
  label_ids = p2.to('cpu').numpy()
  predictions.append(logits)
  real_labels.append(label_ids)

score, absolute, list_l = eval_score(predictions,real_labels)
# Report against the real sample count (instead of a hard-coded 3808) and convert
# the share of correct predictions to a genuine percentage
total = sum(len(batch_labels) for batch_labels in real_labels)
percent = 100 * absolute / total if total else 0.0
print(f'\nPrediction data scores with {absolute} of {total}: {round(percent,1)}%!')

**Visualize**

In [None]:
# Distribution of predictions
sns.set_style('dark')
# Flatten the per-batch arrays of predicted classes into one list
pl = [i for ind in list_l for i in ind]
# Pass the values as `x=`; a positional argument is interpreted as `data` by
# current seaborn releases and no longer counts the values
sns.countplot(x=pl)


In [None]:
# Distribution of real labels
# Flatten the per-batch label arrays into one list
rl = [i for ind in real_labels for i in ind]
# Pass the values as `x=`; a positional argument is interpreted as `data` by
# current seaborn releases and no longer counts the values
sns.countplot(x=rl)

In [None]:
# Create a confusion matrix for visualization
def confuse(predictions,real_labels):
  """Plot a heatmap of predicted versus actual class counts.

  Args:
    predictions: sequence of batches. A batch is either a 2-D array of scores
      (one row per sample, reduced to the argmax class here) or a 1-D array of
      class ids.
    real_labels: sequence of batches of true class ids.
  """
  def _class_ids(batch):
    # Reduce score rows to class ids; leave id vectors as they are
    arr = np.asarray(batch)
    return arr.argmax(axis=1) if arr.ndim == 2 else arr.ravel()

  # Turn the nested batches into flat lists of plain ints. Flattening raw score
  # batches directly would put whole score vectors into the 'predicted' column.
  v1 = [int(i) for sublist in predictions for i in _class_ids(sublist)]
  v2 = [int(i) for sublist in real_labels for i in np.asarray(sublist).ravel()]
  # Dataframe
  frame = pd.DataFrame({'predicted':v1,'actual':v2}, columns=['predicted','actual'])
  # confusion matrix
  cm = pd.crosstab(frame['predicted'],frame['actual'],rownames=['Predicted'],colnames=['Actual'])
  # fmt='d' prints the counts as integers instead of scientific notation
  sns.heatmap(cm,annot=True,fmt='d')
  plt.show()

# Plot the confusion matrix for the outputs collected in the prediction loop
confuse(predictions,real_labels)

In [None]:
def predict_tweet(tweet):
  """Classify a single tweet and print the logits, the class id and the class name.

  Args:
    tweet: the text to classify.

  Uses the module-level `tokenizer`, `model`, `device`, `length` and `classes`.
  """
  coding = tokenizer.encode_plus(
    tweet,
    max_length=length,#277
    add_special_tokens=True,
    return_token_type_ids=False,
    padding=True,
    # Cut over-long input to max_length explicitly
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
  )

  input_ids = coding['input_ids'].to(device)
  attention_mask = coding['attention_mask'].to(device)

  # Inference only: evaluation mode and no gradient tracking
  model.eval()
  with torch.no_grad():
    output = model(input_ids, token_type_ids=None, attention_mask=attention_mask)
  # The first output holds the logits, one row per input text. argmax has to run
  # on these logits, not on the whole output object.
  logits = output[0].detach().cpu().numpy()
  result = int(np.argmax(logits, axis=1)[0])
  print('Output values:',logits)
  print('Result:',result)

  print(f'Original Text : {tweet}')
  print(f'Sentiment : {classes[result]}')

# Hand-written example tweets; only `text` is passed to predict_tweet below
text = 'I am in a very bad mood since the lockdown came in nothing is open any more... the worst thing that could happen!'
text_n = 'i dont really care about corona. As long as i have my playstation it is not too bad.'
text_p = 'Awesome, I love the situation. We can stay at Home all day every day!'

predict_tweet(text)

In [None]:
# newly connected runtime
from transformers import BertConfig,BertForSequenceClassification,BertTokenizer
import torch

# Rebuild the model architecture with a 5-label classification head
config = BertConfig.from_pretrained('bert-base-cased', num_labels=5)
model = BertForSequenceClassification.from_pretrained('bert-base-cased', config=config)

# Path of the saved .pt file (fill in before running)
path = ''
# Load the saved state dict. `map_location` is an argument of torch.load, not of
# load_state_dict; fall back to the CPU when no MPS device is present.
map_location = torch.device('mps') if torch.backends.mps.is_available() else torch.device('cpu')
model.load_state_dict(torch.load(path, map_location=map_location))
# do not forget tokenizer for new data
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')