In [40]:
## Imports and constants
import os
import sys

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import gensim.downloader as api

import matplotlib.pyplot as plt
import seaborn as sns

import datetime
import warnings

from preprocessing import lstm_preprocessing
from zm_lstm_helper import *
from zm_lstm_model import SimpleLSTM

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(20) 
np.random.seed(20)

MAX_SEQ_LEN = 200
BATCH_SIZE = 10

warnings.filterwarnings('ignore')

In [41]:
dataset = pd.read_csv(os.getcwd() + '/processed_data.csv')
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
dataset = dataset.head(20)

In [42]:
import pandas as pd
import numpy as np
import random
import string
from nltk.tokenize import word_tokenize
import gensim.downloader as api

# Function for removing punctuations
def _remove_punc(review):
    ascii_to_translate = str.maketrans("", "", string.punctuation)
    review = review.translate(ascii_to_translate)
    return review

# Function for GloVe vectorization
def _glove_embed(tokenized_reviews):
    glove_model = api.load("glove-wiki-gigaword-50")

    rev_tokenized_embedded = []
    unidentified_tokens = []  # Tokens not in GloVe model

    for review in tokenized_reviews:
        curr_embedded_review = []
        for token in review:
            if token in glove_model:
                curr_embedded_review.append(glove_model[token])
            else:
                unidentified_tokens.append(token)
        rev_tokenized_embedded.append(curr_embedded_review)
    print(f'{len(unidentified_tokens)} total tokens not in GloVe model.')

    return rev_tokenized_embedded

# Function to preprocess reviews for LSTM
def lstm_preprocessing(reviews: pd.Series, tokenizer=word_tokenize):
    random.seed(20)
    np.random.seed(20)

    # Ensure the reviews are of type string
    reviews = reviews.astype(str)

    # Remove Punctuation
    reviews_list_noPunc = [_remove_punc(review) for review in reviews]

    # Make text all lowercase
    reviews_list_lower = [review.lower() for review in reviews_list_noPunc]

    # Tokenization
    rev_tokenized = [tokenizer(review) for review in reviews_list_lower]

    # GloVe Vectorization
    rev_tokenized_embedded = _glove_embed(rev_tokenized)

    # Return a new Series with preprocessed reviews
    preprocessed_reviews = pd.Series(rev_tokenized_embedded, index=reviews.index)

    return preprocessed_reviews


In [43]:
## Preprocessing using common preprocessing function
dataset['Reviews_5'] = lstm_preprocessing(dataset['Reviews'])

19 total tokens not in GloVe model.


In [44]:
dataset

Unnamed: 0,Title,Name,Review Date,Airline,Verified,Reviews,Type of Traveller,Month Flown,Route,Class,Seat Comfort,Staff Service,Food & Beverages,Inflight Entertainment,Value For Money,Recommended,Sentiment,Year,review_length,Reviews_1,Reviews_4,Reviews_2,Reviews_5
0,Flight was amazing,Alison Soetantyo,2024-03-01,Singapore Airlines,True,Flight was amazing. The crew onboard this fl...,Solo Leisure,December 2023,Jakarta to Singapore,Business Class,4,4,4,4,4,yes,2,2024,467,Flight amazing. The crew onboard flight welcom...,flight amaz crew onboard flight welcom gave go...,flight amazing crew onboard flight welcoming g...,"[[1.7306, 0.284, -0.040613, -0.087372, -0.4819..."
1,seats on this aircraft are dreadful,Robert Watson,2024-02-21,Singapore Airlines,True,Booking an emergency exit seat still meant h...,Solo Leisure,February 2024,Phuket to Singapore,Economy Class,5,3,4,4,1,no,0,2024,249,Booking emergency exit seat still meant huge d...,book emerg exit seat still meant huge discomfo...,booking emergency exit seat still meant huge d...,"[[0.75874, -0.22013, -0.12103, 0.021208, -0.65..."
2,Food was plentiful and tasty,S Han,2024-02-20,Singapore Airlines,True,Excellent performance on all fronts. I would...,Family Leisure,February 2024,Siem Reap to Singapore,Economy Class,1,5,2,1,5,yes,2,2024,196,Excellent performance fronts. I would definite...,excel perform front would definit choos use ai...,excellent performance fronts would definitely ...,"[[-0.40431, 0.78002, -0.67538, -0.097149, 0.54..."
3,“how much food was available,D Laynes,2024-02-19,Singapore Airlines,True,Pretty comfortable flight considering I was f...,Solo Leisure,February 2024,Singapore to London Heathrow,Economy Class,5,5,5,5,5,yes,2,2024,991,Pretty comfortable flight considering I flying...,pretti comfort flight consid fli economi class...,pretty comfortable flight considering flying e...,"[[-0.24922, -0.39835, -0.45851, -0.34846, 0.74..."
4,“service was consistently good”,A Othman,2024-02-19,Singapore Airlines,True,The service was consistently good from start ...,Family Leisure,February 2024,Singapore to Phnom Penh,Economy Class,5,5,5,5,5,yes,2,2024,310,The service consistently good start finish. Th...,servic consist good start finish cabin crew sh...,service consistently good start finish cabin c...,"[[0.418, 0.24968, -0.41242, 0.1217, 0.34527, -..."
5,“seat is absolutely dreadful”,Robert Watson,2024-02-19,Singapore Airlines,True,This flight was over six hours long on a B737...,Solo Leisure,February 2024,Singapore to Cairns,Economy Class,5,1,5,5,1,no,0,2024,812,This flight six hours long B737 MAX8. I booked...,flight six hour long b max book emerg exit row...,flight six hours long b737 max8 booked emergen...,"[[0.53074, 0.40117, -0.40785, 0.15444, 0.47782..."
6,Very ordinary service,S Holger,2024-02-19,Singapore Airlines,True,"Boarding process went smoothly, and plane le...",Solo Leisure,February 2024,Singapore to Frankfurt,Economy Class,1,3,1,2,2,no,1,2024,866,"Boarding process went smoothly, plane left tim...",board process went smoothli plane left time wi...,boarding process went smoothly plane left time...,"[[0.47061, 0.38608, -0.38143, -1.628, -0.16672..."
7,I like flying this airline,A Jabil,2024-02-19,Singapore Airlines,True,Pleasant flight which operated on time. Chec...,Family Leisure,February 2024,Singapore to Bandar Seri Begawan,Economy Class,2,5,3,4,5,yes,2,2024,284,Pleasant flight operated time. Check-in Changi...,pleasant flight oper time checkin changi smoot...,pleasant flight operated time check changi smo...,"[[0.4512, 1.315, -1.0892, -0.33892, 0.43551, -..."
8,left me deeply disappointed,A Kalarsan,2024-02-18,Singapore Airlines,True,I embarked on a journey with high hopes and ...,Business,February 2024,Singapore to Jakarta,Economy Class,5,4,5,5,3,no,0,2024,1933,"I embarked journey high hopes anticipation, se...",embark journey high hope anticip secur connect...,embarked journey high hopes anticipation secur...,"[[0.11891, 0.15255, -0.082073, -0.74144, 0.759..."
9,can’t fault a single aspect,S Dayle,2024-02-17,Singapore Airlines,True,"No queue at check-in, and boarding at the ga...",Family Leisure,February 2024,Siem Reap to Singapore,Economy Class,4,5,4,3,5,yes,2,2024,502,"No queue check-in, boarding gate done orderly ...",queue checkin board gate done orderli fashion ...,queue check boarding gate done orderly fashion...,"[[0.34957, 0.40147, -0.012561, 0.13743, 0.4008..."


In [45]:
## Using Dataset wrapper
tokenized_embedded_reviews = dataset['Reviews_5']
reviews_labels = dataset['Sentiment']
reviews_dataset = ReviewsDataset(reviews=tokenized_embedded_reviews, labels=reviews_labels)

In [46]:
## Split dataset
train_set, val_set, test_set = torch.utils.data.random_split(
    reviews_dataset, [0.8, 0.1, 0.1], generator=torch.Generator().manual_seed(20)
)

In [47]:
## Create DataLoaders
train_loader = DataLoader(dataset=train_set, batch_size=BATCH_SIZE, shuffle=True, collate_fn=review_collate_fn)
val_loader = DataLoader(dataset=val_set, batch_size=BATCH_SIZE, shuffle=True, collate_fn=review_collate_fn)
test_loader = DataLoader(dataset=test_set, batch_size=BATCH_SIZE, shuffle=True, collate_fn=review_collate_fn)

In [49]:
## Check batches
example_features, example_label = next(iter(train_loader))
print(f'Sample feature: \n{example_features}, \nFeature size: {example_features.shape}')
print(f'Sample label: \n{example_label}')

## Assert that feature size is (batch_size, sequence_length ie review_length, feature_size ie word_vec_size)
assert example_features.shape == torch.Size([10, 200, 50]), 'Batch provided by DataLoader is of wrong size'

Sample feature: 
tensor([[[ 0.1527,  0.3618, -0.2217,  ...,  0.4338, -0.0846,  0.1214],
         [-0.2728,  0.7752, -0.1018,  ..., -0.7337,  0.0404,  0.2666],
         [-0.4124,  0.6493, -0.5585,  ...,  0.2621,  0.1045, -0.4430],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.5307,  0.4012, -0.4078,  ...,  0.2876,  0.1444,  0.2361],
         [ 1.7306,  0.2840, -0.0406,  ...,  0.3401, -0.0973,  0.2801],
         [ 0.0869, -0.1942, -0.2427,  ..., -0.7700,  0.3945, -0.1694],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.1189,  0.1525, -0.0821,  ..., -0.5751, -0.2667,  0.9212],
         [ 0.9068, -0.0447, 

In [50]:
## Initialize an LSTM model
    ## Hyperparameters
embedding_dim = 50
hidden_dim = 64
num_lstm_layers = 1
cell_dropout = 0.1

model = SimpleLSTM(embedding_dim, hidden_dim, num_lstm_layers, cell_dropout)
print(model)

SimpleLSTM(
  (model): ModuleDict(
    (lstm): LSTM(50, 64, batch_first=True, dropout=0.1)
    (linear1): Linear(in_features=64, out_features=3, bias=True)
    (sigmoid): Sigmoid()
  )
)


In [51]:
## Test forward pass
example_output = model(example_features.to(DEVICE))
print(f'Sample output: \n{example_output}')

Sample output: 
(tensor([[0.4997, 0.5243, 0.5028],
        [0.4997, 0.5243, 0.5028],
        [0.4997, 0.5243, 0.5028],
        [0.4997, 0.5243, 0.5028],
        [0.4997, 0.5243, 0.5028],
        [0.4997, 0.5243, 0.5028],
        [0.4997, 0.5243, 0.5028],
        [0.4997, 0.5243, 0.5028],
        [0.4997, 0.5243, 0.5028],
        [0.4997, 0.5243, 0.5028]], grad_fn=<SigmoidBackward0>), tensor([[-0.0282, -0.0089, -0.0667,  0.0447, -0.0328,  0.0650, -0.0334,  0.0322,
         -0.0363, -0.0422, -0.0951, -0.0023,  0.0646,  0.0734, -0.0823, -0.0244,
         -0.1229,  0.0260, -0.0238,  0.0861, -0.0334,  0.0769,  0.0700, -0.0005,
         -0.0619,  0.0084,  0.0494,  0.0742,  0.0528, -0.0441,  0.1016,  0.0556,
         -0.0189, -0.0870, -0.0028,  0.0984, -0.0087,  0.0847,  0.0134,  0.0034,
         -0.0563,  0.0026,  0.0033, -0.0601, -0.0309, -0.0152,  0.0460,  0.0268,
          0.0625,  0.0591, -0.0123,  0.0376, -0.0714,  0.0368, -0.0112,  0.0315,
         -0.0720, -0.0727, -0.0908, -0.0060,  

In [52]:
example_loss, example_f1 = evaluation(model, val_loader)
print(f'Loss: {example_loss}')
print(f'F1: {example_f1}')

Loss: 1.1048007011413574
F1: 0.0


In [53]:
## Training the model
embedding_dim = 50
hidden_dim = 64
num_lstm_layers = 2
cell_dropout = 0.1

model = SimpleLSTM(embedding_dim, hidden_dim, num_lstm_layers, cell_dropout)

trained_model = train_model(model, train_loader, val_loader, lr=0.001, epochs=2)

## Save model
curr_datetime = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M')
file_dir = '.experiments/'
file_name = f'model_{curr_datetime}.pt'
torch.save(model.state_dict(), file_dir + file_name)

Training loss: 1.0861	 Training F1: 0.1
Validation loss: 1.0794	 Validation F1: 0.0
Training loss: 1.0798	 Training F1: 0.39495798319327735
Validation loss: 1.0690	 Validation F1: 1.0


In [54]:
## Hyperparameter tuning
param_grid = {
    'hidden_dim': [32, 64],
    'lstm_layers': [2, 3],
    'dropout': [0.1, 0.8]
}
embedding_dim = 50

models = []

for curr_hidden_dim in param_grid['hidden_dim']:
    for curr_num_lstm_layers in param_grid['lstm_layers']:
        for curr_dropout in param_grid['dropout']:
            text = f'hidden_dim: {curr_hidden_dim}, num_lstm_layers: {curr_num_lstm_layers}, dropout: {curr_dropout}'
            print(text)

            curr_model = SimpleLSTM(embedding_dim, curr_hidden_dim, curr_num_lstm_layers, curr_dropout)
            curr_trained_model = train_model(curr_model, train_loader, val_loader, lr=0.001, epochs=10)
            models.append((text, curr_trained_model))
    

hidden_dim: 32, num_lstm_layers: 2, dropout: 0.1
Training loss: 1.0884	 Training F1: 0.5601851851851851
Validation loss: 1.0733	 Validation F1: 1.0
Training loss: 1.0837	 Training F1: 0.5601851851851851
Validation loss: 1.0666	 Validation F1: 1.0
Training loss: 1.0792	 Training F1: 0.5601851851851851
Validation loss: 1.0599	 Validation F1: 1.0
Training loss: 1.0751	 Training F1: 0.5601851851851851
Validation loss: 1.0529	 Validation F1: 1.0
Training loss: 1.0705	 Training F1: 0.5601851851851851
Validation loss: 1.0455	 Validation F1: 1.0
Training loss: 1.0674	 Training F1: 0.5601851851851851
Validation loss: 1.0376	 Validation F1: 1.0
Training loss: 1.0600	 Training F1: 0.5601851851851851
Validation loss: 1.0291	 Validation F1: 1.0
Training loss: 1.0505	 Training F1: 0.5601851851851851
Validation loss: 1.0199	 Validation F1: 1.0
Training loss: 1.0533	 Training F1: 0.5601851851851851
Validation loss: 1.0095	 Validation F1: 1.0
Training loss: 1.0438	 Training F1: 0.5601851851851851
Valid

Training loss: 1.0830	 Training F1: 0.5601851851851851
Validation loss: 1.0479	 Validation F1: 1.0
Training loss: 1.0729	 Training F1: 0.5601851851851851
Validation loss: 1.0133	 Validation F1: 1.0
Training loss: 1.0461	 Training F1: 0.5601851851851851
Validation loss: 0.9600	 Validation F1: 1.0
Training loss: 1.0046	 Training F1: 0.5601851851851851
Validation loss: 0.8829	 Validation F1: 1.0
Training loss: 0.9704	 Training F1: 0.5601851851851851
Validation loss: 0.8007	 Validation F1: 1.0
Training loss: 0.9362	 Training F1: 0.5601851851851851
Validation loss: 0.7410	 Validation F1: 1.0
hidden_dim: 64, num_lstm_layers: 3, dropout: 0.8
Training loss: 1.0789	 Training F1: 0.5601851851851851
Validation loss: 1.0645	 Validation F1: 1.0
Training loss: 1.0769	 Training F1: 0.5601851851851851
Validation loss: 1.0533	 Validation F1: 1.0
Training loss: 1.0683	 Training F1: 0.5601851851851851
Validation loss: 1.0413	 Validation F1: 1.0
Training loss: 1.0585	 Training F1: 0.5601851851851851
Valid

In [55]:
## Train model again with best hyperparameters
embedding_dim = 50
hidden_dim = 64
num_lstm_layers = 2
cell_dropout = 0.8

best_model = SimpleLSTM(embedding_dim, hidden_dim, num_lstm_layers, cell_dropout)

## Initialize early stopper
curr_datetime = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M')
file_dir = '.experiments/tuned/'
file_name = f'model_{curr_datetime}.pt'

early_stopper = EarlyStopper(patience=3, delta=0.005, path=file_dir+file_name)

## Train model, stopping when no further improvements
best_model = train_model(best_model, train_loader, val_loader, lr=0.001, epochs=100, early_stopper=early_stopper)

Training loss: 1.1085	 Training F1: 0.008333333333333333
Validation loss: 1.1109	 Validation F1: 0.0


RuntimeError: Parent directory .experiments/tuned does not exist.

In [56]:
## Evaluate model performance on test set
best_model = SimpleLSTM(embedding_dim=50, hidden_dim=64, num_lstm_layers=2, cell_dropout=0.1).to(DEVICE)
best_model.load_state_dict(torch.load('.experiments\lstm_trained_model.pt'))

test_loss, test_f1 = evaluation(best_model, test_loader)
print(f'Test loss: {test_loss / len(test_loader):.4f}\t', f'Test F1: {test_f1}')

FileNotFoundError: [Errno 2] No such file or directory: '.experiments\\lstm_trained_model.pt'

In [None]:
## Retrieving the performance scores for each class
all_pred_class = []
all_labels = []

for reviews, labels in test_loader:
    reviews, labels = reviews.to(DEVICE), labels.to(DEVICE)

    ## Forward pass
    pred, thought_vector = best_model(reviews)
    pred_class = torch.argmax(pred, dim=1).to('cpu')

    all_pred_class.extend(pred_class.numpy())
    all_labels.extend(labels.to('cpu').numpy())

assert len(all_pred_class) == len(all_labels)