In [1]:
## Imports and constants
import os
import sys

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import f1_score
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim import matutils
import gensim.downloader as api

from collections import OrderedDict

from lstm_preprocessing import lstm_preprocessing

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(42) 
np.random.seed(42)


MAX_SEQ_LEN = 200
BATCH_SIZE = 10

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
raw_dataset = pd.read_csv(os.getcwd() + '\\airlines_reviews.csv')
raw_dataset.head()

Unnamed: 0,Title,Name,Review Date,Airline,Verified,Reviews,Type of Traveller,Month Flown,Route,Class,Seat Comfort,Staff Service,Food & Beverages,Inflight Entertainment,Value For Money,Overall Rating,Recommended
0,Flight was amazing,Alison Soetantyo,2024-03-01,Singapore Airlines,True,Flight was amazing. The crew onboard this fl...,Solo Leisure,December 2023,Jakarta to Singapore,Business Class,4,4,4,4,4,9,yes
1,seats on this aircraft are dreadful,Robert Watson,2024-02-21,Singapore Airlines,True,Booking an emergency exit seat still meant h...,Solo Leisure,February 2024,Phuket to Singapore,Economy Class,5,3,4,4,1,3,no
2,Food was plentiful and tasty,S Han,2024-02-20,Singapore Airlines,True,Excellent performance on all fronts. I would...,Family Leisure,February 2024,Siem Reap to Singapore,Economy Class,1,5,2,1,5,10,yes
3,“how much food was available,D Laynes,2024-02-19,Singapore Airlines,True,Pretty comfortable flight considering I was f...,Solo Leisure,February 2024,Singapore to London Heathrow,Economy Class,5,5,5,5,5,10,yes
4,“service was consistently good”,A Othman,2024-02-19,Singapore Airlines,True,The service was consistently good from start ...,Family Leisure,February 2024,Singapore to Phnom Penh,Economy Class,5,5,5,5,5,10,yes


In [3]:
## Preprocessing
tokenized_dataset = lstm_preprocessing(dataset=raw_dataset.sample(1000, ignore_index=True)) ##TODO: Remove sampling
tokenized_reviews = tokenized_dataset['Tokenized_Reviews']
tokenized_dataset.head()

Unnamed: 0,Title,Name,Review Date,Airline,Verified,Reviews,Type of Traveller,Month Flown,Route,Class,Seat Comfort,Staff Service,Food & Beverages,Inflight Entertainment,Value For Money,Overall Rating,Recommended,Sentiment,Tokenized_Reviews
0,staff care about their clients,H Lamesson,2017-01-31,All Nippon Airways,True,Tokyo Narita to Singapore. I have flown ANA ...,Business,January 2017,Tokyo to Singapore,Economy Class,5,5,5,4,4,9,yes,2,"[tokyo, narita, to, singapore, i, have, flown,..."
1,seat width doesn't feel too bad,T Chan,2023-08-09,Cathay Pacific Airways,True,"Despite the age of the plane, the refreshed ...",Solo Leisure,August 2023,Hong Kong to Bangkok,Economy Class,5,5,3,3,4,10,yes,2,"[despite, the, age, of, the, plane, the, refre..."
2,couldn't wait to get off,M Kronz,2016-11-16,Turkish Airlines,True,Munich to Istanbul. The plane took off almo...,Family Leisure,November 2016,IST to BJV,Economy Class,4,4,4,4,2,2,no,0,"[munich, to, istanbul, the, plane, took, off, ..."
3,best entertainment system on board for sure,Anton Gots,2020-01-22,Emirates,True,Hong Kong to Paris via Dubai with Emirates. H...,Solo Leisure,November 2019,Lisbon to Sydney via Dubai,Economy Class,1,1,5,3,4,7,yes,1,"[hong, kong, to, paris, via, dubai, with, emir..."
4,regret the choice,O Ellis,2019-08-30,Cathay Pacific Airways,True,Gold member on EK but decided to fly with Cat...,Family Leisure,August 2019,Singapore to Vancouver via Hong Kong,Economy Class,2,3,1,2,1,2,no,0,"[gold, member, on, ek, but, decided, to, fly, ..."


In [4]:
## Embedding / Vectorization using TF-IDF (each review = 1 document)
dct = Dictionary(tokenized_reviews)  # fit dictionary
corpus = [dct.doc2bow(line) for line in tokenized_reviews]
tfidf_model = TfidfModel(corpus)

print('Number of features: {}'.format(len(dct)))


## Inspect some TF-IDF scores of first review, sorted by the similarity score
print('Text 0:\n{}\n'.format(tokenized_reviews[0]))
for w, s in sorted(tfidf_model[corpus[0]], reverse=True, key=lambda x: x[1]):
    print('{}:{}'.format(dct[w], s))

Number of features: 8241
Text 0:
['tokyo', 'narita', 'to', 'singapore', 'i', 'have', 'flown', 'ana', 'several', 'times', 'on', 'both', 'economy', 'and', 'premium', 'economy', 'cabin', 'and', 'it', 'is', 'a', 'seemless', 'ride', 'wonderful', 'service', 'from', 'both', 'ground', 'and', 'inflight', 'crews', 'you', 'really', 'get', 'the', 'feeling', 'that', 'ana', 'staff', 'care', 'about', 'their', 'clients', 'food', 'is', 'delicious', 'great', 'quality', 'of', 'wines', 'ife', 'with', 'plenty', 'of', 'choices', 'the', '2x4x3', 'configuration', 'is', 'the', 'best', 'layout', 'i', 'ever', 'saw', 'on', 'a', 'boeing', '777', 'only', 'downside', 'is', 'how', 'the', 'y', 'seat', 'reclines', 'instead', 'of', 'your', 'back', 'reclining', 'it', 'is', 'your', 'hips', 'that', 'slide', 'forwards', 'hence', 'reducing', 'your', 'leg', 'room', 'hard', 'to', 'describe', 'but', 'the', 'economy', 'seat', 'isnt', 'reclining', 'the', 'traditional', 'way', 'thus', 'making', 'it', 'a', 'biy', 'uncomfortable', '

In [5]:
## Vectorize all reviews with TF-IDF
tfidf_vectorization_csr = matutils.corpus2csc(tfidf_model[corpus], num_terms=len(dct))
tfidf_reviews = tfidf_vectorization_csr.T.toarray()
print(f'TF-IDF matrix has shape: {tfidf_reviews.shape}')

TF-IDF matrix has shape: (1000, 8241)


In [6]:
## Embedding / Vectorization using GloVe
    ## Note: LSTM processes the sentence sequentially, hence vectorization should be done word-by-word
glove_model = api.load("glove-wiki-gigaword-50")


tokenized_embedded_reviews = []
unidentified_tokens = [] ## Tokens not in GloVe model


for review in tokenized_reviews:
    curr_embedded_review = []
    
    for token in review:
        if token in glove_model:
            curr_embedded_review.append(glove_model[token])
        else:
            unidentified_tokens.append(token)

    tokenized_embedded_reviews.append(curr_embedded_review)


print(f'Sample of embedded reviews:')
print(f'Text: {tokenized_reviews[0]}')
print(f'Vector: {tokenized_embedded_reviews[0]}')
print(f'{len(unidentified_tokens)} total tokens not in GloVe model: \n{unidentified_tokens}')

Sample of embedded reviews:
Text: ['tokyo', 'narita', 'to', 'singapore', 'i', 'have', 'flown', 'ana', 'several', 'times', 'on', 'both', 'economy', 'and', 'premium', 'economy', 'cabin', 'and', 'it', 'is', 'a', 'seemless', 'ride', 'wonderful', 'service', 'from', 'both', 'ground', 'and', 'inflight', 'crews', 'you', 'really', 'get', 'the', 'feeling', 'that', 'ana', 'staff', 'care', 'about', 'their', 'clients', 'food', 'is', 'delicious', 'great', 'quality', 'of', 'wines', 'ife', 'with', 'plenty', 'of', 'choices', 'the', '2x4x3', 'configuration', 'is', 'the', 'best', 'layout', 'i', 'ever', 'saw', 'on', 'a', 'boeing', '777', 'only', 'downside', 'is', 'how', 'the', 'y', 'seat', 'reclines', 'instead', 'of', 'your', 'back', 'reclining', 'it', 'is', 'your', 'hips', 'that', 'slide', 'forwards', 'hence', 'reducing', 'your', 'leg', 'room', 'hard', 'to', 'describe', 'but', 'the', 'economy', 'seat', 'isnt', 'reclining', 'the', 'traditional', 'way', 'thus', 'making', 'it', 'a', 'biy', 'uncomfortable', 

In [7]:
## Dataset class
    ## Uses preprocessing method above
    ## __getitem__ returns (preprocessed) text and its corresponding label

##TODO: Augment such that it can return both text and vector forms of reviews
class ReviewsDataset(Dataset):
    def __init__(self, reviews, labels):
        self.reviews = reviews
        self.labels = labels

    
    def __len__(self):
        return len(self.labels)


    def __getitem__(self, idx):
        ret_review = self.reviews[idx]
        ret_label = self.labels[idx]
        
        return ret_review, ret_label

In [8]:
## Using Dataset wrapper
reviews_dataset = ReviewsDataset(reviews=tokenized_embedded_reviews, labels=tokenized_dataset['Sentiment'])
print(f'First review: {reviews_dataset[0][0]}, \nCorresponding label: {reviews_dataset[0][1]}')

First review: [array([-0.31168  ,  0.19471  ,  0.19075  ,  0.68413  ,  0.29163  ,
       -0.8988   ,  0.22633  ,  0.17832  , -1.4774   , -0.091882 ,
        0.089789 , -0.94473  , -0.19385  ,  0.58078  ,  0.20208  ,
        0.9924   , -1.0311   ,  0.42467  , -1.142    ,  0.71974  ,
        2.1561   , -0.14197  , -0.92983  , -0.28101  , -0.011046 ,
       -1.6787   ,  0.44449  ,  0.54703  , -0.71357  , -0.67743  ,
        2.3393   ,  0.28577  ,  1.4062   , -0.0078203, -0.15283  ,
       -1.1147   ,  0.2415   , -0.65908  , -0.044945 ,  0.046839 ,
       -1.1396   , -0.44836  ,  0.91807  , -0.74048  ,  1.0508   ,
        0.052699 ,  0.13431  ,  0.62261  ,  0.61384  , -0.097283 ],
      dtype=float32), array([ 0.85192 ,  0.39179 , -0.10823 ,  0.43999 , -0.15208 , -0.47381 ,
        0.33514 ,  0.34507 , -0.015091, -0.34836 ,  0.72096 , -0.14913 ,
       -0.32007 ,  0.81045 , -0.11981 ,  0.69412 , -0.75372 ,  0.81457 ,
       -1.5082  ,  0.72005 ,  1.4265  ,  0.51806 , -1.3069  ,  0.5828  ,


In [9]:
## Split dataset
train_set, val_set, test_set = torch.utils.data.random_split(
    reviews_dataset, [0.8, 0.1, 0.1], generator=torch.Generator()
)

In [10]:
## Create DataLoaders

## Collate function to pad or trim reviews to same number of tokens
def review_collate_fn(raw_batch):
    ## Input: Collection of (review, label) tuples from ReviewDataset

    padded_reviews = []
    labels = []
    pad_tensor = torch.zeros(len(raw_batch[0][0][0])) ## Pad with zero tensor of size equal to word embeddings

    for (review, label) in raw_batch:
        padded_review = review
        if len(review) < MAX_SEQ_LEN:
            padded_review = padded_review + [pad_tensor for i in range(MAX_SEQ_LEN - len(review))]
        elif len(review) > MAX_SEQ_LEN:
            padded_review = padded_review[:MAX_SEQ_LEN]
        padded_reviews.append(padded_review)
        labels.append(label)
    
    # print(torch.Tensor(padded_reviews).shape)

    ## Returns: a tuple (review tensor, label tensor) of sizes batch_size*MAX_SEQ_LEN and batch_size, respectively.
    return torch.Tensor(padded_reviews), torch.Tensor(labels)

train_loader = DataLoader(dataset=train_set, batch_size=BATCH_SIZE, shuffle=True, collate_fn=review_collate_fn)
val_loader = DataLoader(dataset=val_set, batch_size=BATCH_SIZE, shuffle=True, collate_fn=review_collate_fn)
test_loader = DataLoader(dataset=test_set, batch_size=BATCH_SIZE, shuffle=True, collate_fn=review_collate_fn)

In [11]:
## Check batches
example_features, example_label = next(iter(train_loader))
print(f'Sample feature: \n{example_features}, \nFeature size: {example_features.shape}')
print(f'Sample label: \n{example_label}')

## Assert that feature size is (batch_size, sequence_length ie review_length, feature_size ie word_vec_size)
assert example_features.shape == torch.Size([10, 200, 50]), 'Batch provided by DataLoader is of wrong size'

Sample feature: 
tensor([[[-0.2728,  0.7752, -0.1018,  ..., -0.7337,  0.0404,  0.2666],
         [ 0.6589, -0.5696, -0.4147,  ...,  0.6028, -0.4070,  0.8622],
         [ 0.1519,  0.3633, -0.1575,  ..., -0.5220, -0.2840,  1.1457],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.7515,  0.6458, -0.7527,  ...,  0.7348,  0.3114, -0.4673],
         [ 0.6805, -0.0393,  0.3019,  ..., -0.0733, -0.0647, -0.2604],
         [ 0.5639, -0.6709, -0.1637,  ...,  0.1448,  1.5258, -1.0116],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.4180,  0.2497, -0.4124,  ..., -0.1841, -0.1151, -0.7858],
         [-0.4759,  0.3783, 

  return torch.Tensor(padded_reviews), torch.Tensor(labels)


In [27]:
## LSTM model

class SimpleLSTM(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, num_lstm_layers=1, cell_dropout=0.0):
        ## vocab_size = no. of unique words in reviews
        ## embedding_dim = size of embeddings / vectors
        ## hidden_dim = dimension of LSTM output
        ## num_lstm_layers = no. of LSTM layers
        ## cell_dropout = dropout applied between LSTM layers

        super().__init__()

        self.num_lstm_layers = num_lstm_layers
        self.hidden_dim = hidden_dim

        ## Model layers
            ## Embedding layer TODO: Should this be implemented?
            ## LSTM (for thought vector)
            ## Linear layer (for logit score)
            ## Activation (for P of +ve sentiment)

        self.model = nn.ModuleDict({
            'lstm': nn.LSTM(
                input_size=embedding_dim, 
                hidden_size=self.hidden_dim, 
                num_layers=self.num_lstm_layers, 
                batch_first=True, 
                dropout=cell_dropout),
            'linear1': nn.Linear(
                in_features=self.hidden_dim, 
                out_features=3 ## 3 units for predicting 3 sentiments
            ),
            'sigmoid': nn.Sigmoid()
        })

    
    def forward(self, x):
        ## Input is a (batch_size, sequence_length, feature_size) tensor
        ##TODO: Implement forward pass, with cell and hidden states

        ## LSTM outputs
            ## h_t = Tensor of shape (batch_size, sequence_length, direction*hidden_size) representing hidden state at each t
            ## h_n = Hidden state at last time step
            ## c_n = Cell state at last time step
        h_t, (h_n, c_n) = self.model['lstm'](x)
        # print(f'LSTM hidden states: {h_t.shape}')
        # print(f'LSTM final state: {h_n.shape}')

        output = self.model['linear1'](h_n[-1])
        # print(f'Linear output: {output.shape}')

        output = self.model['sigmoid'](output)
        # print(f'Sigmoid output: {output.shape}')

        return output


    ## Initialize initial cell and hidden states
    def init_hidden(self, batch_size):
        ##TODO: Return tuple of two num_layers * batch_size * hidden_dim tensors
        pass

    

In [28]:
## Initialize an LSTM model
    ## Hyperparameters
embedding_dim = 50
hidden_dim = 64
num_lstm_layers = 1
cell_dropout = 0.1

model = SimpleLSTM(embedding_dim, hidden_dim, num_lstm_layers, cell_dropout)
print(model)

SimpleLSTM(
  (model): ModuleDict(
    (lstm): LSTM(50, 64, batch_first=True, dropout=0.1)
    (linear1): Linear(in_features=64, out_features=3, bias=True)
    (sigmoid): Sigmoid()
  )
)




In [29]:
## Test forward pass
example_output = model(example_features)
print(f'Sample output: \n{example_output}')

Sample output: 
tensor([[0.4767, 0.5195, 0.5024],
        [0.4767, 0.5195, 0.5024],
        [0.4767, 0.5195, 0.5024],
        [0.4767, 0.5195, 0.5024],
        [0.4666, 0.5148, 0.4900],
        [0.4767, 0.5195, 0.5024],
        [0.4951, 0.5343, 0.4533],
        [0.4767, 0.5195, 0.5024],
        [0.4767, 0.5195, 0.5024],
        [0.4767, 0.5195, 0.5024]], grad_fn=<SigmoidBackward0>)


In [30]:
## Evaluation function
def evaluation(model:nn.Module, data_loader:DataLoader, title='val', loss_fn=torch.nn.CrossEntropyLoss()):
    with torch.no_grad():
        model.eval()

        total_correct = 0
        total_loss = 0
        all_pred = []
        all_labels = []

        ## Process batch by batch
        for reviews, labels in data_loader:
            reviews, labels = reviews.to(DEVICE), labels.to(DEVICE)

            ## Forward pass
            pred = model(reviews)
            pred_class = torch.argmax(pred, dim=1) ## Get indices of largest value in prediction tensor, ie predicted class

            ## Calculate loss
            loss = loss_fn(pred, labels.long())
            total_loss += abs(loss)

            all_pred.extend(pred_class.numpy())
            all_labels.extend(labels.numpy())
        
        ## Calculate metrics
        correct_preds = sum([1 for pred, label in zip(all_pred, all_labels) if pred == label])
        f1 = f1_score(all_labels, all_pred, average='micro')

        print(f'Loss: {total_loss}')
        print(f'F1: \n{f1}')

    return total_loss, f1

In [31]:
## Test evaluation function
evaluation(model, val_loader)

Loss: 11.027554512023926
F1: 
0.19


(tensor(11.0276), 0.19)

In [None]:
## Training function / Optimization loop

In [None]:
## Hyperparameter tuning
    ## Embedding (size, method)
    ## Hidden dimension
    ## LSTM layers
    ## Bidirectional
    ## Dropout probability
    ## Learning rate