In [None]:
## Imports and constants
import os
import sys

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim import matutils
import gensim.downloader as api
from sklearn.metrics import f1_score

import datetime

from lstm_preprocessing import lstm_preprocessing

## Fix the global numpy and torch seeds
np.random.seed(42)
torch.manual_seed(42)

## Compute device: CUDA when torch reports it as available, CPU otherwise
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")


## Number of tokens every review is padded / trimmed to by the collate function
MAX_SEQ_LEN = 200
## Number of reviews per DataLoader batch
BATCH_SIZE = 10

In [None]:
## Load the raw reviews CSV from the current working directory
    ## os.path.join picks the right separator for the running OS; the previous
    ## hard-coded '\\' separator only resolved to the file on Windows
raw_dataset = pd.read_csv(os.path.join(os.getcwd(), 'airlines_reviews.csv'))
raw_dataset.head()

In [None]:
## Preprocessing
    ## A 2000-row random sample (index renumbered from 0) is preprocessed instead of the full dataset
    ##TODO: Remove sampling
tokenized_dataset = lstm_preprocessing(
    dataset=raw_dataset.sample(n=2000, ignore_index=True)
)

## Tokenized form of every sampled review, used by all vectorization cells below
tokenized_reviews = tokenized_dataset['Tokenized_Reviews']
tokenized_dataset.head()

In [None]:
## Embedding / Vectorization using TF-IDF (each review = 1 document)
dct = Dictionary(tokenized_reviews)  ## Dictionary fitted on all tokenized reviews
corpus = list(map(dct.doc2bow, tokenized_reviews))  ## Bag-of-words form of every review
tfidf_model = TfidfModel(corpus)

print('Number of features: {}'.format(len(dct)))


## Inspect the TF-IDF weights of the first review, highest weight first
print('Text 0:\n{}\n'.format(tokenized_reviews[0]))
first_review_weights = sorted(tfidf_model[corpus[0]], key=lambda pair: pair[1], reverse=True)
for token_id, weight in first_review_weights:
    print('{}:{}'.format(dct[token_id], weight))

In [None]:
## Vectorize all reviews with TF-IDF
    ## The sparse matrix from corpus2csc is transposed before being densified
    ## NOTE(review): the variable is named *_csr although the helper called is corpus2csc -- confirm the intended format
tfidf_vectorization_csr = matutils.corpus2csc(
    corpus=tfidf_model[corpus],
    num_terms=len(dct),
)
tfidf_reviews = tfidf_vectorization_csr.transpose().toarray()
print(f'TF-IDF matrix has shape: {tfidf_reviews.shape}')

In [None]:
## Embedding / Vectorization using GloVe
    ## Note: LSTM processes the sentence sequentially, hence vectorization should be done word-by-word
glove_model = api.load("glove-wiki-gigaword-50")


## One list of word vectors per review; tokens without a GloVe entry are skipped,
## so a review may end up with fewer vectors than tokens
tokenized_embedded_reviews = [
    [glove_model[token] for token in review if token in glove_model]
    for review in tokenized_reviews
]

## Tokens not in GloVe model, in order of appearance (repeats are kept)
unidentified_tokens = [
    token
    for review in tokenized_reviews
    for token in review
    if token not in glove_model
]


print(f'Sample of embedded reviews:')
print(f'Text: {tokenized_reviews[0]}')
print(f'Vector: {tokenized_embedded_reviews[0]}')
print(f'{len(unidentified_tokens)} total tokens not in GloVe model: \n{unidentified_tokens}')

In [None]:
## Dataset class
    ## Uses preprocessing method above
    ## __getitem__ returns (preprocessed) text and its corresponding label

##TODO: Augment such that it can return both text and vector forms of reviews
class ReviewsDataset(Dataset):
    """Pairs each review with the label stored under the same index."""

    def __init__(self, reviews, labels):
        ## Both collections are stored as given and later indexed with the same key
        self.reviews, self.labels = reviews, labels

    def __len__(self):
        ## Dataset size is taken from the labels collection
        return len(self.labels)

    def __getitem__(self, idx):
        ## Returns the (review, label) tuple found at idx
        return self.reviews[idx], self.labels[idx]

In [None]:
## Using Dataset wrapper
    ## Reviews are the GloVe-embedded token lists; labels come from the 'Sentiment' column
reviews_dataset = ReviewsDataset(
    tokenized_embedded_reviews,
    tokenized_dataset['Sentiment'],
)
print(f'First review: {reviews_dataset[0][0]}, \nCorresponding label: {reviews_dataset[0][1]}')

In [None]:
## Split dataset
    ## 80% train / 10% validation / 10% test, drawn with a dedicated torch Generator
split_generator = torch.Generator()
train_set, val_set, test_set = torch.utils.data.random_split(
    dataset=reviews_dataset,
    lengths=[0.8, 0.1, 0.1],
    generator=split_generator,
)

In [None]:
## Create DataLoaders

## Collate function to pad or trim reviews to same number of tokens
def review_collate_fn(raw_batch, max_seq_len=None, embedding_dim=None):
    """Pad or trim every review in a batch to the same number of tokens.

    Args:
        raw_batch: Collection of (review, label) tuples from ReviewsDataset, where
            each review is a sequence of equally sized word vectors. A review may
            be empty (e.g. none of its tokens had an embedding).
        max_seq_len: Number of tokens per review in the output. Defaults to the
            notebook-level MAX_SEQ_LEN.
        embedding_dim: Size of one word vector. When omitted it is read from the
            first non-empty review of the batch.

    Returns:
        A tuple (review tensor, label tensor) of sizes
        batch_size * max_seq_len * embedding_dim and batch_size, respectively.
        Positions past the end of a short review are filled with zeros.

    Raises:
        ValueError: If every review in the batch is empty and embedding_dim was
            not given, so the vector size cannot be determined.
    """
    if max_seq_len is None:
        max_seq_len = MAX_SEQ_LEN

    if embedding_dim is None:
        ## Look past empty reviews instead of assuming the first one has a token
        embedding_dim = next(
            (len(review[0]) for review, _ in raw_batch if len(review) > 0),
            None,
        )
        if embedding_dim is None:
            raise ValueError(
                'Cannot infer the embedding size: every review in the batch is empty'
            )

    ## Start from an all-zero (padding) array and copy each review's vectors in;
    ## this avoids building a tensor from a nested list of mixed array types
    batch_reviews = np.zeros((len(raw_batch), max_seq_len, embedding_dim), dtype=np.float32)
    labels = []

    for row, (review, label) in enumerate(raw_batch):
        kept_vectors = review[:max_seq_len]  ## Trim reviews longer than max_seq_len
        if len(kept_vectors) > 0:
            batch_reviews[row, :len(kept_vectors)] = np.asarray(kept_vectors, dtype=np.float32)
        labels.append(label)

    return torch.from_numpy(batch_reviews), torch.Tensor(labels)

## Only the training batches are reshuffled every epoch; validation and test
## batches are served in a fixed order since their metrics are aggregated over
## the whole set and gain nothing from shuffling
train_loader = DataLoader(dataset=train_set, batch_size=BATCH_SIZE, shuffle=True, collate_fn=review_collate_fn)
val_loader = DataLoader(dataset=val_set, batch_size=BATCH_SIZE, shuffle=False, collate_fn=review_collate_fn)
test_loader = DataLoader(dataset=test_set, batch_size=BATCH_SIZE, shuffle=False, collate_fn=review_collate_fn)

In [None]:
## Check batches
example_features, example_label = next(iter(train_loader))
print(f'Sample feature: \n{example_features}, \nFeature size: {example_features.shape}')
print(f'Sample label: \n{example_label}')

## Assert that feature size is (batch_size, sequence_length ie review_length, feature_size ie word_vec_size)
    ## The expected size is built from the notebook constants and the loaded GloVe
    ## model, so the check keeps working when any of them is changed
expected_feature_size = torch.Size([BATCH_SIZE, MAX_SEQ_LEN, glove_model.vector_size])
assert example_features.shape == expected_feature_size, 'Batch provided by DataLoader is of wrong size'

In [None]:
## LSTM model

class SimpleLSTM(nn.Module):
    """LSTM classifier that outputs one raw score (logit) per sentiment class.

    The final hidden state of the last LSTM layer is passed through a linear
    layer with 3 output units. No activation is applied to the result: the
    notebook trains with nn.CrossEntropyLoss, which expects unnormalized logits.
    """

    def __init__(self, embedding_dim, hidden_dim, num_lstm_layers=1, cell_dropout=0.0):
        ## embedding_dim = size of embeddings / vectors
        ## hidden_dim = dimension of LSTM output
        ## num_lstm_layers = no. of LSTM layers
        ## cell_dropout = dropout applied between LSTM layers

        super().__init__()

        self.num_lstm_layers = num_lstm_layers
        self.hidden_dim = hidden_dim

        ## nn.LSTM applies dropout only between stacked layers and warns when a
        ## non-zero value is combined with a single layer, so it is disabled there
        lstm_dropout = cell_dropout if num_lstm_layers > 1 else 0.0

        ## Model layers
            ## Embedding layer TODO: Should this be implemented?
            ## LSTM (for thought vector)
            ## Linear layer (for logit score)

        self.model = nn.ModuleDict({
            'lstm': nn.LSTM(
                input_size=embedding_dim,
                hidden_size=self.hidden_dim,
                num_layers=self.num_lstm_layers,
                batch_first=True,
                dropout=lstm_dropout),
            'linear1': nn.Linear(
                in_features=self.hidden_dim,
                out_features=3 ## 3 units for predicting 3 sentiments
            ),
        })


    def forward(self, x, hidden=None):
        ## Input is a (batch_size, sequence_length, feature_size) tensor
        ## hidden is an optional (h_0, c_0) tuple such as the one returned by
        ## init_hidden; when omitted nn.LSTM starts from all-zero states
        ## Returns a (batch_size, 3) tensor of logits

        ## LSTM outputs
            ## 1st value = hidden state at each t (not used here)
            ## h_n = Hidden state at last time step, one entry per layer
            ## 2nd value of the tuple = cell state at last time step (not used here)
        _, (h_n, _) = self.model['lstm'](x, hidden)

        ## h_n[-1] is the final hidden state of the last LSTM layer
        return self.model['linear1'](h_n[-1])


    ## Initialize initial cell and hidden states
    def init_hidden(self, batch_size):
        ## Returns a tuple (h_0, c_0) of two all-zero tensors of size
        ## num_layers * batch_size * hidden_dim, created with the same dtype and
        ## on the same device as the model parameters
        reference = next(self.parameters())
        state_size = (self.num_lstm_layers, batch_size, self.hidden_dim)
        return reference.new_zeros(state_size), reference.new_zeros(state_size)

    

In [None]:
## Initialize an LSTM model
    ## Hyperparameters
embedding_dim, hidden_dim = 50, 64
num_lstm_layers, cell_dropout = 1, 0.1

model = SimpleLSTM(
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    num_lstm_layers=num_lstm_layers,
    cell_dropout=cell_dropout,
)
print(model)

In [None]:
## Test forward pass
    ## Feeds the batch drawn in the "Check batches" cell through the model initialized above
example_output = model(example_features)
print(f'Sample output: \n{example_output}')

In [None]:
## Evaluation function
def evaluation(model:nn.Module, data_loader:DataLoader, loss_fn=nn.CrossEntropyLoss()):
    with torch.no_grad():
        model.eval()

        total_correct = 0
        total_loss = 0
        all_pred = []
        all_labels = []

        ## Process batch by batch
        for reviews, labels in data_loader:
            reviews, labels = reviews.to(DEVICE), labels.to(DEVICE)

            ## Forward pass
            pred = model(reviews)
            pred_class = torch.argmax(pred, dim=1) ## Get indices of largest value in prediction tensor, ie predicted class

            ## Calculate loss
            loss = loss_fn(pred, labels.long())
            total_loss += abs(loss)

            all_pred.extend(pred_class.numpy())
            all_labels.extend(labels.numpy())
        
        ## Calculate metrics
        correct_preds = sum([1 for pred, label in zip(all_pred, all_labels) if pred == label])
        f1 = f1_score(all_labels, all_pred, average='micro')

    return total_loss, f1

In [None]:
## Test evaluation function
    ## Scores the current model on the validation loader; evaluation returns (loss, F1) in that order
example_loss, example_f1 = evaluation(model, val_loader)
print(f'Loss: {example_loss}')
print(f'F1: {example_f1}')

In [None]:
## Training function / Optimization loop
def train_model(
    model:nn.Module, 
    train_loader:DataLoader, 
    val_loader:DataLoader, 
    lr=0.01, 
    epochs=3, 
    loss_fn=nn.CrossEntropyLoss(),
    log_every=100,
):
    """Train a model with Adam and periodically report train / validation metrics.

    Args:
        model: Model to optimize; it is moved to DEVICE before training.
        train_loader: Loader yielding (reviews, labels) training batches.
        val_loader: Loader passed to evaluation() whenever metrics are reported.
        lr: Learning rate given to the Adam optimizer.
        epochs: Number of passes over train_loader.
        loss_fn: Callable taking (predictions, integer labels) and returning a
            scalar loss tensor.
        log_every: Metrics are printed on the first epoch and then on every
            epoch whose 1-based number is a multiple of this value. A value
            <= 0 limits reporting to the first epoch.

    Returns:
        The trained model (the object returned by model.to(DEVICE)).
    """
    model = model.to(DEVICE)

    ## Initialize optimizer for params that require gradients
    optimizer = torch.optim.Adam([param for param in model.parameters() if param.requires_grad], lr=lr)

    ## Training loop
    for epoch in range(epochs):
        ## evaluation() below switches the model to eval mode, so training mode
        ## is re-enabled once at the start of every epoch
        model.train()

        train_loss = 0.0
        epoch_all_pred = []
        epoch_all_labels = []

        ## Iterate through batches
        for reviews, labels in train_loader:
            reviews, labels = reviews.to(DEVICE), labels.to(DEVICE).long()

            ## Forward pass
            pred = model(reviews)

            ## Calculate loss
            loss = loss_fn(pred, labels)

            ## Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            ## Tabulate epoch outputs
                ## .item() stores a plain float instead of a tensor tied to the autograd graph
                ## .cpu() is required before .numpy() when the batch is on the GPU
            train_loss += abs(loss.item())
            pred_class = torch.argmax(pred.detach(), dim=1) ## Get indices of largest value in prediction tensor, ie predicted class
            epoch_all_pred.extend(pred_class.cpu().numpy())
            epoch_all_labels.extend(labels.cpu().numpy())

        ## Epoch statistics
        train_f1 = f1_score(epoch_all_labels, epoch_all_pred, average='micro')
        if epoch == 0 or (log_every > 0 and (epoch + 1) % log_every == 0):
            print(f'======== Epoch {epoch + 1} ========')
            print(f'Training loss: {train_loss}')
            print(f'Training F1: {train_f1}')

            ## Validation
            val_loss, val_f1 = evaluation(model, val_loader, loss_fn=loss_fn)
            print(f'Validation loss: {val_loss}')
            print(f'Validation F1: {val_f1}')
        
    return model

In [None]:
## Testing training loop
embedding_dim, hidden_dim = 50, 64
num_lstm_layers, cell_dropout = 2, 0.1

model = SimpleLSTM(
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    num_lstm_layers=num_lstm_layers,
    cell_dropout=cell_dropout,
)

trained_model = train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=1000,
)

In [None]:
## Save model
    ## The weights are written to a file named after the current date and time (minute resolution)
save_time = datetime.datetime.now()
curr_datetime = save_time.strftime("%Y-%m-%d_%H-%M")
file_name = f"model_{curr_datetime}.pt"
torch.save(obj=model.state_dict(), f=file_name)

In [None]:
## Hyperparameter tuning
    ## Embedding (size, method)
    ## Hidden dimension
    ## LSTM layers
    ## Bidirectional
    ## Dropout probability
    ## Learning rate