# Table of Contents

1. [Importing Libraries](#1.-Importing-Libraries)
2. [Importing Data](#2.-Importing-Data)    

## 1. Importing Libraries

In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from transformers import BertForSequenceClassification

pd.set_option('display.max_columns', None)

**Trying to use GPU if available**

In [19]:
import torch

# Check GPU availability
if torch.cuda.is_available():

    # Let PyTorch use GPU
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

## 2. Importing Data

**BERT**

In [9]:
dataset = pd.read_csv('/Users/antoniooliveira/MannheimWMProject/processed_data_v3.csv')
dataset.drop('Overall Rating', axis = 1)

Unnamed: 0,Title,Name,Review Date,Airline,Verified,Reviews,Type of Traveller,Month Flown,Route,Class,Seat Comfort,Staff Service,Food & Beverages,Inflight Entertainment,Value For Money,Recommended,Sentiment,Year,review_length,Reviews_2
0,Flight was amazing,Alison Soetantyo,2024-03-01,Singapore Airlines,True,Flight was amazing. The crew onboard this fl...,Solo Leisure,December 2023,Jakarta to Singapore,Business Class,4,4,4,4,4,yes,2,2024,467,flight amazing crew onboard flight welcoming g...
1,seats on this aircraft are dreadful,Robert Watson,2024-02-21,Singapore Airlines,True,Booking an emergency exit seat still meant h...,Solo Leisure,February 2024,Phuket to Singapore,Economy Class,5,3,4,4,1,no,0,2024,249,booking emergency exit seat still meant huge d...
2,Food was plentiful and tasty,S Han,2024-02-20,Singapore Airlines,True,Excellent performance on all fronts. I would...,Family Leisure,February 2024,Siem Reap to Singapore,Economy Class,1,5,2,1,5,yes,2,2024,196,excellent performance fronts would definitely ...
3,“how much food was available,D Laynes,2024-02-19,Singapore Airlines,True,Pretty comfortable flight considering I was f...,Solo Leisure,February 2024,Singapore to London Heathrow,Economy Class,5,5,5,5,5,yes,2,2024,991,pretty comfortable flight considering flying e...
4,“service was consistently good”,A Othman,2024-02-19,Singapore Airlines,True,The service was consistently good from start ...,Family Leisure,February 2024,Singapore to Phnom Penh,Economy Class,5,5,5,5,5,yes,2,2024,310,service consistently good start finish cabin c...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8094,an uneventful flight,N Vickers,2016-06-20,Korean Air,True,"KE124, Brisbane to Incheon (A330) and KE867,...",Business,June 2016,BNE to ULN via ICN,Economy Class,5,4,5,3,4,yes,2,2016,751,ke124 brisbane incheon a330 ke867 incheon ulaa...
8095,Korean Air always impresses,Kim Holloway,2016-06-12,Korean Air,False,Our recent flight was our fourth trip to the...,Couple Leisure,June 2016,SYD to LHR via ICN,Economy Class,3,5,5,4,5,yes,2,2016,1127,recent flight fourth trip uk 6 years sydney lo...
8096,didn’t offer anything,C Clark,2016-06-06,Korean Air,True,I flew Korean Air from Bali to Seoul in Pres...,Business,April 2016,DPS to ICN,Business Class,4,5,5,5,1,no,0,2016,1075,flew korean air bali seoul prestige class busi...
8097,appreciated the service onboard,E Petan,2016-04-21,Korean Air,False,Seoul to Paris with Korean Air. I am traveli...,Business,April 2016,ICN to CDG,Business Class,5,1,3,4,5,yes,2,2016,1298,seoul paris korean air traveling triathlon bik...


**LSTM**

In [27]:
dataset = pd.read_csv('/Users/antoniooliveira/MannheimWMProject/processed_data_v4.csv')
dataset.drop('Overall Rating', axis = 1)

Unnamed: 0,Title,Name,Review Date,Airline,Verified,Reviews,Type of Traveller,Month Flown,Route,Class,Seat Comfort,Staff Service,Food & Beverages,Inflight Entertainment,Value For Money,Recommended,Sentiment,Year,review_length,Tokenized_Reviews
0,Flight was amazing,Alison Soetantyo,2024-03-01,Singapore Airlines,True,Flight was amazing. The crew onboard this fl...,Solo Leisure,December 2023,Jakarta to Singapore,Business Class,4,4,4,4,4,yes,2,2024,467,"['flight', 'was', 'amazing', 'the', 'crew', 'o..."
1,seats on this aircraft are dreadful,Robert Watson,2024-02-21,Singapore Airlines,True,Booking an emergency exit seat still meant h...,Solo Leisure,February 2024,Phuket to Singapore,Economy Class,5,3,4,4,1,no,0,2024,249,"['booking', 'an', 'emergency', 'exit', 'seat',..."
2,Food was plentiful and tasty,S Han,2024-02-20,Singapore Airlines,True,Excellent performance on all fronts. I would...,Family Leisure,February 2024,Siem Reap to Singapore,Economy Class,1,5,2,1,5,yes,2,2024,196,"['excellent', 'performance', 'on', 'all', 'fro..."
3,“how much food was available,D Laynes,2024-02-19,Singapore Airlines,True,Pretty comfortable flight considering I was f...,Solo Leisure,February 2024,Singapore to London Heathrow,Economy Class,5,5,5,5,5,yes,2,2024,991,"['pretty', 'comfortable', 'flight', 'consideri..."
4,“service was consistently good”,A Othman,2024-02-19,Singapore Airlines,True,The service was consistently good from start ...,Family Leisure,February 2024,Singapore to Phnom Penh,Economy Class,5,5,5,5,5,yes,2,2024,310,"['the', 'service', 'was', 'consistently', 'goo..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8094,an uneventful flight,N Vickers,2016-06-20,Korean Air,True,"KE124, Brisbane to Incheon (A330) and KE867,...",Business,June 2016,BNE to ULN via ICN,Economy Class,5,4,5,3,4,yes,2,2016,751,"['ke124', 'brisbane', 'to', 'incheon', 'a330',..."
8095,Korean Air always impresses,Kim Holloway,2016-06-12,Korean Air,False,Our recent flight was our fourth trip to the...,Couple Leisure,June 2016,SYD to LHR via ICN,Economy Class,3,5,5,4,5,yes,2,2016,1127,"['our', 'recent', 'flight', 'was', 'our', 'fou..."
8096,didn’t offer anything,C Clark,2016-06-06,Korean Air,True,I flew Korean Air from Bali to Seoul in Pres...,Business,April 2016,DPS to ICN,Business Class,4,5,5,5,1,no,0,2016,1075,"['i', 'flew', 'korean', 'air', 'from', 'bali',..."
8097,appreciated the service onboard,E Petan,2016-04-21,Korean Air,False,Seoul to Paris with Korean Air. I am traveli...,Business,April 2016,ICN to CDG,Business Class,5,1,3,4,5,yes,2,2016,1298,"['seoul', 'to', 'paris', 'with', 'korean', 'ai..."


## Train-Test Split

In [28]:
Y = dataset["Sentiment"]
X_train, X_test, Y_train, Y_test = train_test_split(dataset,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=20)

## 3. Machine Learning Models

## 4. BERT

In [15]:
X_train_2d = np.array(X_train).reshape(-1, 1)
Y_train_2d = np.array(Y_train).reshape(-1, 1)

In [16]:
X_train_tok = []
#X_val_tok = []
X_test_tok = []

def encode(reviewSet, newList):
  for review in reviewSet:
    encodedReview = bTokenizer.encode_plus(
      text = review,
      add_special_tokens = True,
      max_length=512,
      truncation=True,
    )
    newList.append(encodedReview)
  return newList

X_train_tok = encode(X_train, X_train_tok)
#X_val_tok = encode(X_val, X_val_tok)
X_test_tok = encode(X_test, X_test_tok)

In [20]:
print(X_train_tok[0])
print(bTokenizer.convert_ids_to_tokens(X_train_tok[0]['input_ids']))

{'input_ids': [101, 2516, 102], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}
['[CLS]', 'title', '[SEP]']


In [22]:
bModel = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = 3,)

ImportError: 
BertForSequenceClassification requires the PyTorch library but it was not found in your environment.
However, we were able to find a TensorFlow installation. TensorFlow classes begin
with "TF", but are otherwise identically named to our PyTorch classes. This
means that the TF equivalent of the class you tried to import would be "TFBertForSequenceClassification".
If you want to use TensorFlow, please use TF classes instead!

If you really do want to use PyTorch please go to
https://pytorch.org/get-started/locally/ and follow the instructions that
match your environment.


In [None]:
##Create Dataloaders
BATCH_SIZE = 10

#TESTER
#Repeat for test set
X_TESTER_input = [torch.tensor(nSentence['input_ids']) for nSentence in X_train_tok[:]]
X_TESTER_input_pad = pad_sequence(X_TESTER_input, batch_first=True) [:]
X_TESTER_mask = [torch.tensor(nSentence['attention_mask']) for nSentence in X_train_tok[:]]
X_TESTER_mask_pad = pad_sequence(X_TESTER_mask, batch_first=True) [:]

Y_TESTER_input = torch.tensor(list(Y_train)[:])
TESTER_dataset = TensorDataset(X_TESTER_input_pad, X_TESTER_mask_pad, Y_TESTER_input)

#Create dataloaders
TESTER_dataLoader = DataLoader(TESTER_dataset, batch_size=BATCH_SIZE)

#Repeat for test set
X_TESTTESTER_input = [torch.tensor(nSentence['input_ids']) for nSentence in X_test_tok[:]]
X_TESTTESTER_input_pad = pad_sequence(X_TESTTESTER_input, batch_first=True) [:] #ERROR HERE, proper terms
X_TESTTESTER_mask = [torch.tensor(nSentence['attention_mask']) for nSentence in X_test_tok[:]]
X_TESTTESTER_mask_pad = pad_sequence(X_TESTTESTER_mask, batch_first=True) [:]

Y_TESTTESTER_input = torch.tensor(list(Y_test)[:])
TESTTESTER_dataset = TensorDataset(X_TESTTESTER_input_pad, X_TESTTESTER_mask_pad, Y_TESTTESTER_input)

#Create dataloaders
TESTTESTER_dataLoader = DataLoader(TESTTESTER_dataset, batch_size=BATCH_SIZE)

In [None]:
print(X_TESTER_mask_pad)
#print(X_TESTTESTER_mask_pad)
print(Y_TESTER_input)

In [None]:
#optimizer method
optimizer = torch.optim.AdamW(bModel.parameters(), lr = 1e-6,)

#epochs
EPOCHS = 10

#scheduler
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(TESTER_dataset) * EPOCHS)

#loss function
loss_function = nn.CrossEntropyLoss()

**Model Training**

In [None]:
#Train the BERT model
outputs = 0
counter = 0
accuracyTrain_list = []
lossTrain_list = []
avg_accuracyTrain_list = []
prediction_list = []
actual_list = []

bModel.to(device)
for i in range(EPOCHS):
  #inform model training beginning
  bModel.train()
  total_loss = 0
  total_accuracy = 0

  for item in TESTER_dataLoader:
    #restart optimizer values
    optimizer.zero_grad()
    input_ids = item[0].to(device)
    attention_masks = item[1].to(device)
    labels = item[2].to(device)

    outputs = bModel(input_ids=input_ids,
                     attention_mask=attention_masks, labels=labels)
    loss = outputs.loss
    predictions = torch.argmax(outputs.logits, dim=1)
    print(predictions)
    print(labels)
    print(loss)
    print((predictions == labels).sum().item() / predictions.size(0))
    accuracyTrain_list.append((predictions == labels).sum().item() / predictions.size(0))
    prediction_list.extend(predictions.tolist())
    actual_list.extend(labels.tolist())


    total_loss = total_loss + loss.item()
    total_accuracy += (predictions == labels).sum().item() / predictions.size(0)

    loss.backward()
    optimizer.step()
    scheduler.step()
    counter += 1
    print("pass done" + str(counter))
  average_accuracy_in_epoch = total_accuracy / len(TESTER_dataLoader)
  avg_accuracyTrain_list.append(average_accuracy_in_epoch)

  total_accuracy = 0
  print("Average Train loss is: " + str(total_loss / len(TESTER_dataLoader)))
  cm = confusion_matrix(actual_list, prediction_list)
  plt.figure(figsize=(6, 6))
  sns.heatmap(cm, annot=True, cmap='Blues')
  plt.xlabel('Predicted')
  plt.ylabel('Actual')
  plt.title('Confusion Matrix of Sentiment (Count)')
  plt.show()   
    
  plt.figure(figsize=(6, 6))
  sns.heatmap(cm/np.sum(cm), annot=True, cmap='Blues')
  plt.xlabel('Predicted')
  plt.ylabel('Actual')
  plt.title('Confusion Matrix of Sentiment (Percentage)')
  plt.show()

  #reset lists
  prediction_list = []
  actual_list = []

**Model Evaluation**

In [None]:
#Evaluation of Model
bModel.eval()
#total_eval_loss = 0
total_correct = 0
total_data_counter = 0
prediction_list = []
actual_list = []
accuracyTest_list = []

with torch.no_grad():
  for item in TESTTESTER_dataLoader:
    input_ids = item[0].to(device)
    attention_masks = item[1].to(device)
    labels = item[2].to(device)

    outputs = bModel(input_ids=input_ids,
                     attention_mask=attention_masks)
    #loss = outputs.loss

    #get maximum value of each column, set prediction to that column
    predictions = torch.argmax(outputs.logits, dim=1)

    total_correct += (predictions == labels).sum().item()
    total_data_counter += labels.size(dim=0)
    #prediction_list.extend(predictions.cpu().numpy())
    #actual_list.extend(labels.cpu().numpy())
    prediction_list.extend(predictions.tolist())
    actual_list.extend(labels.tolist())
    #print(prediction_list)
    #print(actual_list)

    #print(outputs.logits)
    print("predictions: " + str(predictions))
    print("labels: " + str(labels))
    accuracyTest_list.append((predictions == labels).sum().item() / predictions.size(0))
    #total_eval_loss = total_eval_loss + loss.item()
  #actual_list = np.array(actual_list)
  #prediction_list = np.array(prediction_list)
  #actual_list = np.argmax(actual_list, axis=1)
  #print(actual_list)
  f1 = f1_score(actual_list, prediction_list, average="weighted")

  #print("Average Train loss is: " + str(total_eval_loss / len(TESTTESTER_dataLoader)))
  print("Accuracy is: " + str(total_correct / total_data_counter))
  print("F1 is: " + str(f1))

  cm = confusion_matrix(actual_list, prediction_list)
  plt.figure(figsize=(6, 6))
  sns.heatmap(cm, annot=True, cmap='Blues')
  plt.xlabel('Predicted')
  plt.ylabel('Actual')
  plt.title('Confusion Matrix of Sentiment (Count)')
  plt.show()   
    
  plt.figure(figsize=(6, 6))
  sns.heatmap(cm/np.sum(cm), annot=True, cmap='Blues')
  plt.xlabel('Predicted')
  plt.ylabel('Actual')
  plt.title('Confusion Matrix of Sentiment (Percentage)')
  plt.show()

In [None]:
conf_mat = np.array([[660, 24, 24],
                     [70, 31, 60],
                     [31, 29, 690]])

#print(np.sum(conf_mat, axis=1))

precision = np.diag(conf_mat) / np.sum(conf_mat, axis=0)
recall = np.diag(conf_mat) / np.sum(conf_mat, axis=1)
f1_score = (2 * precision * recall) / (precision + recall)

print(f1_score)

## 5. LSTM