In [1]:
## Imports and constants
import os
import sys

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim import matutils
import gensim.downloader as api
from sklearn.metrics import f1_score

import datetime

from lstm_preprocessing import lstm_preprocessing

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.manual_seed(42) 
np.random.seed(42)


MAX_SEQ_LEN = 200
BATCH_SIZE = 10

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
raw_dataset = pd.read_csv(os.getcwd() + '\\airlines_reviews.csv')
raw_dataset.head()

Unnamed: 0,Title,Name,Review Date,Airline,Verified,Reviews,Type of Traveller,Month Flown,Route,Class,Seat Comfort,Staff Service,Food & Beverages,Inflight Entertainment,Value For Money,Overall Rating,Recommended
0,Flight was amazing,Alison Soetantyo,2024-03-01,Singapore Airlines,True,Flight was amazing. The crew onboard this fl...,Solo Leisure,December 2023,Jakarta to Singapore,Business Class,4,4,4,4,4,9,yes
1,seats on this aircraft are dreadful,Robert Watson,2024-02-21,Singapore Airlines,True,Booking an emergency exit seat still meant h...,Solo Leisure,February 2024,Phuket to Singapore,Economy Class,5,3,4,4,1,3,no
2,Food was plentiful and tasty,S Han,2024-02-20,Singapore Airlines,True,Excellent performance on all fronts. I would...,Family Leisure,February 2024,Siem Reap to Singapore,Economy Class,1,5,2,1,5,10,yes
3,“how much food was available,D Laynes,2024-02-19,Singapore Airlines,True,Pretty comfortable flight considering I was f...,Solo Leisure,February 2024,Singapore to London Heathrow,Economy Class,5,5,5,5,5,10,yes
4,“service was consistently good”,A Othman,2024-02-19,Singapore Airlines,True,The service was consistently good from start ...,Family Leisure,February 2024,Singapore to Phnom Penh,Economy Class,5,5,5,5,5,10,yes


In [3]:
## Preprocessing
tokenized_dataset = lstm_preprocessing(dataset=raw_dataset)
tokenized_reviews = tokenized_dataset['Tokenized_Reviews']
tokenized_dataset.head()

Unnamed: 0,Title,Name,Review Date,Airline,Verified,Reviews,Type of Traveller,Month Flown,Route,Class,Seat Comfort,Staff Service,Food & Beverages,Inflight Entertainment,Value For Money,Overall Rating,Recommended,Sentiment,Tokenized_Reviews
0,Flight was amazing,Alison Soetantyo,2024-03-01,Singapore Airlines,True,Flight was amazing. The crew onboard this fl...,Solo Leisure,December 2023,Jakarta to Singapore,Business Class,4,4,4,4,4,9,yes,2,"[flight, was, amazing, the, crew, onboard, thi..."
1,seats on this aircraft are dreadful,Robert Watson,2024-02-21,Singapore Airlines,True,Booking an emergency exit seat still meant h...,Solo Leisure,February 2024,Phuket to Singapore,Economy Class,5,3,4,4,1,3,no,0,"[booking, an, emergency, exit, seat, still, me..."
2,Food was plentiful and tasty,S Han,2024-02-20,Singapore Airlines,True,Excellent performance on all fronts. I would...,Family Leisure,February 2024,Siem Reap to Singapore,Economy Class,1,5,2,1,5,10,yes,2,"[excellent, performance, on, all, fronts, i, w..."
3,“how much food was available,D Laynes,2024-02-19,Singapore Airlines,True,Pretty comfortable flight considering I was f...,Solo Leisure,February 2024,Singapore to London Heathrow,Economy Class,5,5,5,5,5,10,yes,2,"[pretty, comfortable, flight, considering, i, ..."
4,“service was consistently good”,A Othman,2024-02-19,Singapore Airlines,True,The service was consistently good from start ...,Family Leisure,February 2024,Singapore to Phnom Penh,Economy Class,5,5,5,5,5,10,yes,2,"[the, service, was, consistently, good, from, ..."


In [4]:
## Embedding / Vectorization using TF-IDF (each review = 1 document)
dct = Dictionary(tokenized_reviews)  # fit dictionary
corpus = [dct.doc2bow(line) for line in tokenized_reviews]
tfidf_model = TfidfModel(corpus)

print('Number of features: {}'.format(len(dct)))


## Inspect some TF-IDF scores of first review, sorted by the similarity score
print('Text 0:\n{}\n'.format(tokenized_reviews[0]))
for w, s in sorted(tfidf_model[corpus[0]], reverse=True, key=lambda x: x[1]):
    print('{}:{}'.format(dct[w], s))

Number of features: 23183
Text 0:
['flight', 'was', 'amazing', 'the', 'crew', 'onboard', 'this', 'flight', 'were', 'very', 'welcoming', 'and', 'gave', 'a', 'good', 'atmosphere', 'the', 'crew', 'serving', 'my', 'aisle', 'goes', 'by', 'the', 'initial', '“', 'g', '”', 'she', 'was', 'very', 'kind', 'helpful', 'gave', 'my', 'mom', 'a', 'bday', 'cake', 'for', 'a', 'late', 'celebration', 'even', 'though', 'it', 'was', 'just', 'a', '1hr', '45min', 'flight', 'seat', 'is', 'well', 'sanitized', 'legroom', 'is', 'spacious', 'ife', 'onboard', 'has', 'many', 'variety', 'of', 'shows', 'music', 'etc', 'bathroom', 'always', 'kept', 'clean', 'by', 'crew', 'at', 'all', 'times', 'food', 'was', 'delicious', 'overall', 'this', 'flight', 'is', 'a', '910']

bday:0.29370024702302083
celebration:0.2710795682534202
sanitized:0.2710795682534202
45min:0.24845888948381967
910:0.24845888948381967
g:0.2258382107142191
1hr:0.19137420512066913
gave:0.18807545809541834
atmosphere:0.18495461559969756
mom:0.18495461559969

In [5]:
## Vectorize all reviews with TF-IDF
tfidf_vectorization_csr = matutils.corpus2csc(tfidf_model[corpus], num_terms=len(dct))
tfidf_reviews = tfidf_vectorization_csr.T.toarray()
print(f'TF-IDF matrix has shape: {tfidf_reviews.shape}')

TF-IDF matrix has shape: (8100, 23183)


In [6]:
## Embedding / Vectorization using GloVe
    ## Note: LSTM processes the sentence sequentially, hence vectorization should be done word-by-word
glove_model = api.load("glove-wiki-gigaword-50")


tokenized_embedded_reviews = []
unidentified_tokens = [] ## Tokens not in GloVe model


for review in tokenized_reviews:
    curr_embedded_review = []
    
    for token in review:
        if token in glove_model:
            curr_embedded_review.append(glove_model[token])
        else:
            unidentified_tokens.append(token)

    tokenized_embedded_reviews.append(curr_embedded_review)


print(f'Sample of embedded reviews:')
print(f'Text: {tokenized_reviews[0]}')
print(f'Vector: {tokenized_embedded_reviews[0]}')
print(f'{len(unidentified_tokens)} total tokens not in GloVe model: \n{unidentified_tokens}')

Sample of embedded reviews:
Text: ['flight', 'was', 'amazing', 'the', 'crew', 'onboard', 'this', 'flight', 'were', 'very', 'welcoming', 'and', 'gave', 'a', 'good', 'atmosphere', 'the', 'crew', 'serving', 'my', 'aisle', 'goes', 'by', 'the', 'initial', '“', 'g', '”', 'she', 'was', 'very', 'kind', 'helpful', 'gave', 'my', 'mom', 'a', 'bday', 'cake', 'for', 'a', 'late', 'celebration', 'even', 'though', 'it', 'was', 'just', 'a', '1hr', '45min', 'flight', 'seat', 'is', 'well', 'sanitized', 'legroom', 'is', 'spacious', 'ife', 'onboard', 'has', 'many', 'variety', 'of', 'shows', 'music', 'etc', 'bathroom', 'always', 'kept', 'clean', 'by', 'crew', 'at', 'all', 'times', 'food', 'was', 'delicious', 'overall', 'this', 'flight', 'is', 'a', '910']
Vector: [array([ 1.7306   ,  0.284    , -0.040613 , -0.087372 , -0.4819   ,
       -0.4278   , -0.65733  ,  0.31632  ,  1.0554   , -0.70909  ,
        0.57747  ,  0.3628   , -0.46717  ,  0.97844  ,  0.12695  ,
       -0.39876  , -1.6432   ,  0.34616  , -1.9

In [7]:
## Dataset class
    ## Uses preprocessing method above
    ## __getitem__ returns (preprocessed) text and its corresponding label

##TODO: Augment such that it can return both text and vector forms of reviews
class ReviewsDataset(Dataset):
    def __init__(self, reviews, labels):
        self.reviews = reviews
        self.labels = labels

    
    def __len__(self):
        return len(self.labels)


    def __getitem__(self, idx):
        ret_review = self.reviews[idx]
        ret_label = self.labels[idx]
        
        return ret_review, ret_label

In [8]:
## Using Dataset wrapper
reviews_dataset = ReviewsDataset(reviews=tokenized_embedded_reviews, labels=tokenized_dataset['Sentiment'])
print(f'First review: {reviews_dataset[0][0]}, \nCorresponding label: {reviews_dataset[0][1]}')

First review: [array([ 1.7306   ,  0.284    , -0.040613 , -0.087372 , -0.4819   ,
       -0.4278   , -0.65733  ,  0.31632  ,  1.0554   , -0.70909  ,
        0.57747  ,  0.3628   , -0.46717  ,  0.97844  ,  0.12695  ,
       -0.39876  , -1.6432   ,  0.34616  , -1.9699   , -0.40326  ,
        0.27543  ,  0.7005   , -0.24267  , -0.0042508,  0.34004  ,
       -1.6021   ,  0.06427  ,  0.035409 ,  0.33291  ,  0.25677  ,
        2.2611   ,  0.95595  , -0.55761  , -0.12359  ,  0.74087  ,
       -0.47427  ,  0.89383  , -0.022107 , -0.38237  ,  0.87486  ,
       -0.13263  ,  0.069306 ,  1.1166   ,  0.44355  , -0.84921  ,
        0.039591 ,  0.34486  ,  0.34012  , -0.097281 ,  0.28005  ],
      dtype=float32), array([ 0.086888, -0.19416 , -0.24267 , -0.33391 ,  0.56731 ,  0.39783 ,
       -0.97809 ,  0.03159 , -0.61469 , -0.31406 ,  0.56145 ,  0.12886 ,
       -0.84193 , -0.46992 ,  0.47097 ,  0.023012, -0.59609 ,  0.22291 ,
       -1.1614  ,  0.3865  ,  0.067412,  0.44883 ,  0.17394 , -0.53574 ,


In [9]:
## Split dataset
train_set, val_set, test_set = torch.utils.data.random_split(
    reviews_dataset, [0.8, 0.1, 0.1], generator=torch.Generator()
)

In [10]:
## Create DataLoaders

## Collate function to pad or trim reviews to same number of tokens
def review_collate_fn(raw_batch):
    ## Input: Collection of (review, label) tuples from ReviewDataset

    padded_reviews = []
    labels = []
    pad_tensor = torch.zeros(len(raw_batch[0][0][0])) ## Pad with zero tensor of size equal to word embeddings

    for (review, label) in raw_batch:
        padded_review = review
        if len(review) < MAX_SEQ_LEN:
            padded_review = padded_review + [pad_tensor for i in range(MAX_SEQ_LEN - len(review))]
        elif len(review) > MAX_SEQ_LEN:
            padded_review = padded_review[:MAX_SEQ_LEN]
        padded_reviews.append(padded_review)
        labels.append(label)
    
    # print(torch.Tensor(padded_reviews).shape)

    ## Returns: a tuple (review tensor, label tensor) of sizes batch_size*MAX_SEQ_LEN and batch_size, respectively.
    return torch.Tensor(padded_reviews), torch.Tensor(labels)

train_loader = DataLoader(dataset=train_set, batch_size=BATCH_SIZE, shuffle=True, collate_fn=review_collate_fn)
val_loader = DataLoader(dataset=val_set, batch_size=BATCH_SIZE, shuffle=True, collate_fn=review_collate_fn)
test_loader = DataLoader(dataset=test_set, batch_size=BATCH_SIZE, shuffle=True, collate_fn=review_collate_fn)

In [11]:
## Check batches
example_features, example_label = next(iter(train_loader))
print(f'Sample feature: \n{example_features}, \nFeature size: {example_features.shape}')
print(f'Sample label: \n{example_label}')

## Assert that feature size is (batch_size, sequence_length ie review_length, feature_size ie word_vec_size)
assert example_features.shape == torch.Size([10, 200, 50]), 'Batch provided by DataLoader is of wrong size'

Sample feature: 
tensor([[[ 4.7061e-01,  3.8608e-01, -3.8143e-01,  ..., -3.6983e-01,
           6.8705e-01, -2.1267e-01],
         [ 8.6888e-02, -1.9416e-01, -2.4267e-01,  ..., -7.7000e-01,
           3.9450e-01, -1.6937e-01],
         [ 2.1705e-01,  4.6515e-01, -4.6757e-01,  ..., -4.3782e-02,
           4.1013e-01,  1.7960e-01],
         ...,
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00],
         [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
           0.0000e+00,  0.0000e+00]],

        [[-9.7114e-03,  1.0479e+00, -1.5266e-01,  ..., -7.1828e-01,
           6.9078e-02,  1.7890e+00],
         [-1.7063e-01,  8.2230e-01, -5.8367e-02,  ..., -5.9622e-01,
          -2.0733e-01,  1.3045e+00],
         [ 6.8047e-01, -3.9263e-02,  3.0186e-01,  ..., -7.3297e-02,
          -6.4699e-02, -2.6044e-01],
         ...,
        

  return torch.Tensor(padded_reviews), torch.Tensor(labels)


In [12]:
## LSTM model

class SimpleLSTM(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, num_lstm_layers=1, cell_dropout=0.0):
        ## vocab_size = no. of unique words in reviews
        ## embedding_dim = size of embeddings / vectors
        ## hidden_dim = dimension of LSTM output
        ## num_lstm_layers = no. of LSTM layers
        ## cell_dropout = dropout applied between LSTM layers

        super().__init__()

        self.num_lstm_layers = num_lstm_layers
        self.hidden_dim = hidden_dim

        ## Model layers
            ## Embedding layer TODO: Should this be implemented?
            ## LSTM (for thought vector)
            ## Linear layer (for logit score)
            ## Activation (for P of +ve sentiment)

        self.model = nn.ModuleDict({
            'lstm': nn.LSTM(
                input_size=embedding_dim, 
                hidden_size=self.hidden_dim, 
                num_layers=self.num_lstm_layers, 
                batch_first=True, 
                dropout=cell_dropout),
            'linear1': nn.Linear(
                in_features=self.hidden_dim, 
                out_features=3 ## 3 units for predicting 3 sentiments
            ),
            'sigmoid': nn.Sigmoid()
        })

    
    def forward(self, x):
        ## Input is a (batch_size, sequence_length, feature_size) tensor
        ##TODO: Implement forward pass, with cell and hidden states

        ## LSTM outputs
            ## h_t = Tensor of shape (batch_size, sequence_length, direction*hidden_size) representing hidden state at each t
            ## h_n = Hidden state at last time step
            ## c_n = Cell state at last time step
        h_t, (h_n, c_n) = self.model['lstm'](x)
        # print(f'LSTM hidden states: {h_t.shape}')
        # print(f'LSTM final state: {h_n.shape}')

        output = self.model['linear1'](h_n[-1])
        # print(f'Linear output: {output.shape}')

        output = self.model['sigmoid'](output)
        # print(f'Sigmoid output: {output.shape}')

        return output


    ## Initialize initial cell and hidden states
    def init_hidden(self, batch_size):
        ##TODO: Return tuple of two num_layers * batch_size * hidden_dim tensors
        pass

    

In [13]:
## Initialize an LSTM model
    ## Hyperparameters
embedding_dim = 50
hidden_dim = 64
num_lstm_layers = 1
cell_dropout = 0.1

model = SimpleLSTM(embedding_dim, hidden_dim, num_lstm_layers, cell_dropout)
print(model)

SimpleLSTM(
  (model): ModuleDict(
    (lstm): LSTM(50, 64, batch_first=True, dropout=0.1)
    (linear1): Linear(in_features=64, out_features=3, bias=True)
    (sigmoid): Sigmoid()
  )
)




In [14]:
## Test forward pass
example_output = model(example_features)
print(f'Sample output: \n{example_output}')

Sample output: 
tensor([[0.5060, 0.5247, 0.5220],
        [0.5048, 0.5078, 0.5030],
        [0.5055, 0.5245, 0.5203],
        [0.5060, 0.5247, 0.5220],
        [0.5051, 0.5245, 0.5196],
        [0.5060, 0.5247, 0.5220],
        [0.5060, 0.5247, 0.5220],
        [0.5060, 0.5247, 0.5220],
        [0.5202, 0.5096, 0.4976],
        [0.5292, 0.5247, 0.5068]], grad_fn=<SigmoidBackward0>)


In [15]:
## Evaluation function
def evaluation(model:nn.Module, data_loader:DataLoader, loss_fn=nn.CrossEntropyLoss()):
    with torch.no_grad():
        model.eval()

        total_correct = 0
        total_loss = 0
        all_pred = []
        all_labels = []

        ## Process batch by batch
        for reviews, labels in data_loader:
            reviews, labels = reviews.to(DEVICE), labels.to(DEVICE)

            ## Forward pass
            pred = model(reviews)
            pred_class = torch.argmax(pred, dim=1) ## Get indices of largest value in prediction tensor, ie predicted class

            ## Calculate loss
            loss = loss_fn(pred, labels.long())
            total_loss += abs(loss)

            all_pred.extend(pred_class.numpy())
            all_labels.extend(labels.numpy())
        
        ## Calculate metrics
        correct_preds = sum([1 for pred, label in zip(all_pred, all_labels) if pred == label])
        f1 = f1_score(all_labels, all_pred, average='micro')

    return total_loss, f1

In [16]:
## Test evaluation function
example_loss, example_f1 = evaluation(model, val_loader)
print(f'Loss: {example_loss}')
print(f'F1: {example_f1}')

Loss: 89.05116271972656
F1: 0.18024691358024691


In [17]:
## Training function / Optimization loop
##TODO: Add logging
def train_model(
    model:nn.Module, 
    train_loader:DataLoader, 
    val_loader:DataLoader, 
    lr=0.01, 
    epochs=3, 
    loss_fn=nn.CrossEntropyLoss(),
):
    model = model.to(DEVICE)

    ## Initialize optimizer for params that require gradients
    optimizer = torch.optim.Adam([param for param in model.parameters() if param.requires_grad], lr=lr)

    ## Training loop
    for epoch in range(epochs):
        train_loss = 0
        epoch_all_pred = []
        epoch_all_labels = []

        ## Iterate through batches
        for reviews, labels in train_loader:
            model.train()
            reviews, labels = reviews.to(DEVICE), labels.to(DEVICE)

            ## Forward pass
            pred = model(reviews)
            pred_class = torch.argmax(pred, dim=1) ## Get indices of largest value in prediction tensor, ie predicted class

            ## Calculate loss
            loss = loss_fn(pred, labels.long())
            train_loss += abs(loss)

            ## Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            ## Tabulate epoch outputs
            epoch_all_pred.extend(pred_class.numpy())
            epoch_all_labels.extend(labels.numpy())

        ## Epoch statistics
        train_f1 = f1_score(epoch_all_labels, epoch_all_pred, average='micro')
        if epoch == 0 or (epoch + 1) % 100 == 0:
            print(f'======== Epoch {epoch + 1} ========')
            print(f'Training loss: {train_loss}')
            print(f'Training F1: {train_f1}')

            ## Validation
            val_loss, val_f1 = evaluation(model, val_loader, loss_fn=loss_fn)
            print(f'Validation loss: {val_loss}')
            print(f'Validation F1: {val_f1}')
        
    return model

In [18]:
## Testing training loop
embedding_dim = 50
hidden_dim = 64
num_lstm_layers = 2
cell_dropout = 0.1

model = SimpleLSTM(embedding_dim, hidden_dim, num_lstm_layers, cell_dropout)

trained_model = train_model(model, train_loader, val_loader, lr=0.001, epochs=100)

Training loss: 623.5164794921875
Training F1: 0.4816358024691358
Validation loss: 79.01199340820312
Validation F1: 0.4765432098765432
Training loss: 474.2033996582031
Training F1: 0.8145061728395062
Validation loss: 59.45176696777344
Validation F1: 0.8185185185185185


In [22]:
## Save model
curr_datetime = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M')
file_dir = '.experiments/'
file_name = f'model_{curr_datetime}.pt'
torch.save(model.state_dict(), file_dir + file_name)

In [None]:
## Error analysis

In [20]:
## Hyperparameter tuning
    ## Embedding (size, method)
    ## Hidden dimension
    ## LSTM layers
    ## Bidirectional
    ## Dropout probability
    ## Learning rate