# Table of Contents

1. [Importing Libraries](#1.-Importing-Libraries)
2. [Importing Data](#2.-Importing-Data)  
3. [Modelling](#3.-Modelling)  
    3.1 [Model Training](#3.1-Model-Training) <br>
    3.2 [Model Evaluation](#3.2-Model-Evaluation)

## 1. Importing Libraries

In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import train_test_split

from transformers import BertForSequenceClassification, BertTokenizer

from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
import torch.nn as nn

from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score


In [38]:
import torch

# Check GPU availability
if torch.cuda.is_available():

    # Let PyTorch use GPU
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

## 2. Importing Data

In [39]:
dataset = pd.read_csv('/Users/antoniooliveira/MannheimWMProject/processed_data.csv')

In [40]:
dataset.head(2)

Unnamed: 0,Title,Name,Review Date,Airline,Verified,Reviews,Type of Traveller,Month Flown,Route,Class,...,Staff Service,Food & Beverages,Inflight Entertainment,Value For Money,Recommended,Sentiment,Year,review_length,Reviews_1,Reviews_2
0,Flight was amazing,Alison Soetantyo,2024-03-01,Singapore Airlines,True,Flight was amazing. The crew onboard this fl...,Solo Leisure,December 2023,Jakarta to Singapore,Business Class,...,4,4,4,4,yes,2,2024,467,Flight amazing. The crew onboard flight welcom...,flight amazing crew onboard flight welcoming g...
1,seats on this aircraft are dreadful,Robert Watson,2024-02-21,Singapore Airlines,True,Booking an emergency exit seat still meant h...,Solo Leisure,February 2024,Phuket to Singapore,Economy Class,...,3,4,4,1,no,0,2024,249,Booking emergency exit seat still meant huge d...,booking emergency exit seat still meant huge d...


## 3. Modelling

**Train Test Split**

In [28]:
Y = dataset["Sentiment"]
X_train, X_test, Y_train, Y_test = train_test_split(dataset['Reviews_2'], Y, test_size=0.2, random_state=20)

In [29]:
print(Y_train.value_counts())

Sentiment
2    3171
0    2703
1     605
Name: count, dtype: int64


**Reshape**

In [30]:
X_train_2d = np.array(X_train).reshape(-1, 1)
Y_train_2d = np.array(Y_train).reshape(-1, 1)

**Applying Bert Tokenizer**

In [31]:
bTokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bModel = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = 3,)

X_train_tok = []
X_test_tok = []

def encode(reviewSet, newList):
  for review in reviewSet:
    encodedReview = bTokenizer.encode_plus(
      text = review,
      add_special_tokens = True,
      max_length=512,
      truncation=True,
    )
    newList.append(encodedReview)
  return newList

X_train_tok = encode(X_train, X_train_tok)
X_test_tok = encode(X_test, X_test_tok)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [32]:
# print(X_train_tok[0])
# print(bTokenizer.convert_ids_to_tokens(X_train_tok[0]['input_ids']))

**Preparing the Dataset for Data Loader**

In [33]:
BATCH_SIZE = 10

# --- Preparing the train dataset for the DataLoader ---

# Convert the input sentences (tokenized) to tensors
X_TESTER_input = [torch.tensor(nSentence['input_ids']) for nSentence in X_train_tok[:]]
# Pad the sequences so that all tensors in the batch have the same length
X_TESTER_input_pad = pad_sequence(X_TESTER_input, batch_first=True)[:]
# Convert the attention masks to tensors
X_TESTER_mask = [torch.tensor(nSentence['attention_mask']) for nSentence in X_train_tok[:]]
# Pad the attention masks to match the input sequences
X_TESTER_mask_pad = pad_sequence(X_TESTER_mask, batch_first=True)[:]

# Convert the labels to tensors
Y_TESTER_input = torch.tensor(list(Y_train)[:])
# Create a TensorDataset with the padded input sequences, masks, and labels
TESTER_dataset = TensorDataset(X_TESTER_input_pad, X_TESTER_mask_pad, Y_TESTER_input)

# Create a DataLoader from the dataset, which will handle batching
TESTER_dataLoader = DataLoader(TESTER_dataset, batch_size=BATCH_SIZE)

# --- Preparing the test dataset for the DataLoader ---

# Convert the input sentences (tokenized) to tensors
X_TESTTESTER_input = [torch.tensor(nSentence['input_ids']) for nSentence in X_test_tok[:]]
# Pad the sequences so that all tensors in the batch have the same length
X_TESTTESTER_input_pad = pad_sequence(X_TESTTESTER_input, batch_first=True)[:]
# Convert the attention masks to tensors
X_TESTTESTER_mask = [torch.tensor(nSentence['attention_mask']) for nSentence in X_test_tok[:]]
# Pad the attention masks to match the input sequences
X_TESTTESTER_mask_pad = pad_sequence(X_TESTTESTER_mask, batch_first=True)[:]

# Convert the labels to tensors
Y_TESTTESTER_input = torch.tensor(list(Y_test)[:])
# Create a TensorDataset with the padded input sequences, masks, and labels
TESTTESTER_dataset = TensorDataset(X_TESTTESTER_input_pad, X_TESTTESTER_mask_pad, Y_TESTTESTER_input)

# Create a DataLoader from the dataset, which will handle batching
TESTTESTER_dataLoader = DataLoader(TESTTESTER_dataset, batch_size=BATCH_SIZE)


In [34]:
# print(X_TESTER_mask_pad)
# #print(X_TESTTESTER_mask_pad)
# print(Y_TESTER_input)

**Setting up AdmW Optimiser**

In [35]:
# Initializing the AdamW optimizer with the model's parameters and a learning rate
optimizer = AdamW(bModel.parameters(), lr=1e-6)

EPOCHS = 10

# Setting up the learning rate scheduler

scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0, 
    num_training_steps=len(TESTER_dataset) * EPOCHS 
)

# Defining the loss function
loss_function = nn.CrossEntropyLoss()


### 3.1 Model Training

In [36]:
outputs = 0
counter = 0
accuracyTrain_list = []
lossTrain_list = []
avg_accuracyTrain_list = []
prediction_list = []
actual_list = []

bModel.to(device)  

for i in range(EPOCHS):
    # Inform that model training is beginning
    bModel.train()  
    total_loss = 0
    total_accuracy = 0

    for item in TESTER_dataLoader:
        # Restart optimizer values
        optimizer.zero_grad()
        input_ids = item[0].to(device)  
        attention_masks = item[1].to(device)  
        labels = item[2].to(device)  

        # Forward pass through the model
        outputs = bModel(input_ids=input_ids,
                         attention_mask=attention_masks, labels=labels)
        loss = outputs.loss
        predictions = torch.argmax(outputs.logits, dim=1) 

        # Print the predictions, labels, and loss for debugging
        print(predictions)
        print(labels)
        print(loss)
        print((predictions == labels).sum().item() / predictions.size(0))

        # Calculate accuracy for the current batch
        accuracyTrain_list.append((predictions == labels).sum().item() / predictions.size(0))
        prediction_list.extend(predictions.tolist())  
        actual_list.extend(labels.tolist())  

        # Accumulate total loss and accuracy
        total_loss += loss.item()
        total_accuracy += (predictions == labels).sum().item() / predictions.size(0)

        # Backward pass and optimization step
        loss.backward()
        optimizer.step()
        scheduler.step()  
        counter += 1
        print("pass done" + str(counter)) 

    # Calculate and store average accuracy for the epoch
    average_accuracy_in_epoch = total_accuracy / len(TESTER_dataLoader)
    avg_accuracyTrain_list.append(average_accuracy_in_epoch)

    # Print average train loss for the epoch
    print("Average Train loss is: " + str(total_loss / len(TESTER_dataLoader)))

    # Generate and plot confusion matrix
    cm = confusion_matrix(actual_list, prediction_list)
    plt.figure(figsize=(6, 6))
    sns.heatmap(cm, annot=True, cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix of Sentiment (Count)')
    plt.show()

    plt.figure(figsize=(6, 6))
    sns.heatmap(cm / np.sum(cm), annot=True, cmap='Blues')  
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix of Sentiment (Percentage)')
    plt.show()

    # Reset lists for the next epoch
    prediction_list = []
    actual_list = []


tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
tensor([0, 2, 2, 0, 0, 0, 0, 1, 2, 2])
tensor(1.0854, grad_fn=<NllLossBackward0>)
0.4
pass done1
tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
tensor([1, 0, 0, 0, 0, 2, 0, 2, 0, 2])
tensor(1.0815, grad_fn=<NllLossBackward0>)
0.3
pass done2
tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
tensor([2, 0, 2, 2, 0, 0, 0, 0, 0, 2])
tensor(1.0401, grad_fn=<NllLossBackward0>)
0.4
pass done3
tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
tensor([1, 2, 2, 0, 0, 2, 0, 2, 0, 2])
tensor(1.0399, grad_fn=<NllLossBackward0>)
0.5
pass done4
tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
tensor([2, 1, 0, 0, 2, 0, 2, 1, 2, 2])
tensor(1.0014, grad_fn=<NllLossBackward0>)
0.5
pass done5
tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
tensor([0, 2, 2, 2, 0, 2, 0, 2, 0, 0])
tensor(0.9983, grad_fn=<NllLossBackward0>)
0.5
pass done6
tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
tensor([0, 0, 2, 0, 2, 0, 2, 0, 2, 1])
tensor(1.0565, grad_fn=<NllLossBackward0>)
0.4
pass done7
tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
tensor([0


KeyboardInterrupt



### 3.2 Model Evaluation

In [None]:
# Set the model to evaluation mode
bModel.eval()

# Initialize variables to track evaluation metrics
total_correct = 0
total_data_counter = 0
prediction_list = []
actual_list = []
accuracyTest_list = []

# Disable gradient calculation during evaluation
with torch.no_grad():
    for item in TESTTESTER_dataLoader:
        # Move inputs, attention masks, and labels to the device
        input_ids = item[0].to(device)
        attention_masks = item[1].to(device)
        labels = item[2].to(device)

        # Perform forward pass through the model to get predictions
        outputs = bModel(input_ids=input_ids,
                         attention_mask=attention_masks)
        
        # Get predicted labels by selecting the index of the maximum logit value
        predictions = torch.argmax(outputs.logits, dim=1)

        # Update total correct predictions and total data count
        total_correct += (predictions == labels).sum().item()
        total_data_counter += labels.size(dim=0)

        # Store predictions and actual labels 
        prediction_list.extend(predictions.tolist())
        actual_list.extend(labels.tolist())

        # Calculate accuracy for the current batch and store it
        accuracyTest_list.append((predictions == labels).sum().item() / predictions.size(0))

    # Calculate F1 score using sklearn's f1_score function
    f1 = f1_score(actual_list, prediction_list, average="weighted")

    # Print accuracy, F1 score, and confusion matrix
    print("Accuracy is: " + str(total_correct / total_data_counter))
    print("F1 is: " + str(f1))

    # Generate and plot confusion matrix
    cm = confusion_matrix(actual_list, prediction_list)
    plt.figure(figsize=(6, 6))
    sns.heatmap(cm, annot=True, cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix of Sentiment (Count)')
    plt.show()   
    
    plt.figure(figsize=(6, 6))
    sns.heatmap(cm/np.sum(cm), annot=True, cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix of Sentiment (Percentage)')
    plt.show()


In [None]:
conf_mat = np.array([[660, 24, 24],
                     [70, 31, 60],
                     [31, 29, 690]])

#print(np.sum(conf_mat, axis=1))

precision = np.diag(conf_mat) / np.sum(conf_mat, axis=0)
recall = np.diag(conf_mat) / np.sum(conf_mat, axis=1)
f1_score = (2 * precision * recall) / (precision + recall)

print(f1_score)

In [None]:
#Remove Punctuation
def remove_punc(review):
    ascii_to_translate = str.maketrans("", "", string.punctuation)
    review = review.translate(ascii_to_translate)
    return review

#print(reviews_list_deemojize[4708])
reviews_list_noPunc = [remove_punc(review) for review in reviews_list_deemojize]
#print(reviews_list_noPunc[4708])

In [None]:
#Since emoji is present only in review 4708, de-emojize review 4708.
reviews_list_deemojize = reviews_list.copy()
#reviews_list_deemojize[4708] = emoji.demojize(reviews_list_deemojize[4708], language='en')
#print(reviews_list[4708])
#print(reviews_list_deemojize[4708])

In [None]:
#check for emojis
def contain_emoji(review):
    emoList = emoji.emoji_list(review)

    if emoList:
        return True

    return False


emoji_check = [contain_emoji(review) for review in reviews_list]

for i in range(len(emoji_check)):
    if emoji_check[i] == True:
        print("This is Review: " + str(i))
        print(reviews_list[i])