In [None]:
## Imports and constants
import os
import sys

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim import matutils
import gensim.downloader as api

from collections import OrderedDict

from lstm_preprocessing import lstm_preprocessing

## Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

## Seed the global torch and numpy random generators with the same value
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)


MAX_SEQ_LEN = 200  ## presumably the no. of tokens each review is padded / trimmed to -- TODO confirm
BATCH_SIZE = 10    ## no. of reviews per DataLoader batch

In [None]:
## Load the raw reviews from the current working directory
    ## os.path.join picks the separator of the host OS (a hard-coded '\\' only resolves on Windows)
raw_dataset = pd.read_csv(os.path.join(os.getcwd(), 'airlines_reviews.csv'))
raw_dataset.head()

In [None]:
## Preprocessing
    ## Work on a random subset of 1000 reviews for now
##TODO: Remove sampling
sampled_dataset = raw_dataset.sample(1000, ignore_index=True)
tokenized_dataset = lstm_preprocessing(dataset=sampled_dataset)
tokenized_reviews = tokenized_dataset['Tokenized_Reviews']
tokenized_dataset.head()

In [None]:
## Embedding / Vectorization using TF-IDF (each review = 1 document)
dct = Dictionary(tokenized_reviews)  # token <-> integer id mapping fitted on all reviews
corpus = list(map(dct.doc2bow, tokenized_reviews))  # one bag-of-words per review
tfidf_model = TfidfModel(corpus)

print('Number of features: {}'.format(len(dct)))


## Inspect the TF-IDF weights of the first review, highest weight first
print('Text 0:\n{}\n'.format(tokenized_reviews[0]))
first_review_weights = sorted(tfidf_model[corpus[0]], key=lambda pair: pair[1], reverse=True)
for token_id, weight in first_review_weights:
    print('{}:{}'.format(dct[token_id], weight))

In [None]:
## Vectorize all reviews with TF-IDF
    ## Note: corpus2csc produces a sparse CSC matrix (despite the `_csr` suffix of the name)
    ## with one column per review, so it is transposed to get one row per review
tfidf_weighted_corpus = tfidf_model[corpus]
tfidf_vectorization_csr = matutils.corpus2csc(tfidf_weighted_corpus, num_terms=len(dct))
tfidf_reviews = tfidf_vectorization_csr.transpose().toarray()
print(f'TF-IDF matrix has shape: {tfidf_reviews.shape}')

In [None]:
## Embedding / Vectorization using GloVe
    ## Note: LSTM processes the sentence sequentially, hence vectorization should be done word-by-word
glove_model = api.load("glove-wiki-gigaword-50")


tokenized_embedded_reviews = []  ## One list of word vectors per review
unidentified_tokens = []  ## Tokens not in GloVe model (repeats are kept)


for review in tokenized_reviews:
    review_vectors = []

    for token in review:
        ## Tokens unknown to GloVe are recorded and left out of the embedded review
        if token not in glove_model:
            unidentified_tokens.append(token)
            continue

        review_vectors.append(glove_model[token])

    tokenized_embedded_reviews.append(review_vectors)


print(f'Sample of embedded reviews:')
print(f'Text: {tokenized_reviews[0]}')
print(f'Vector: {tokenized_embedded_reviews[0]}')
print(f'{len(unidentified_tokens)} total tokens not in GloVe model: \n{unidentified_tokens}')

In [None]:
## Dataset class
    ## Uses preprocessing method above
    ## __getitem__ returns (preprocessed) text and its corresponding label

##TODO: Augment such that it can return both text and vector forms of reviews
class ReviewsDataset(Dataset):
    """Pairs every review with its label so the data can be fed to a DataLoader.

    Args:
        reviews: Sized, indexable collection of (preprocessed) reviews.
        labels: Sized, indexable collection with one label per review. May be a
            list, an array or a pandas Series.

    Raises:
        ValueError: If `reviews` and `labels` do not have the same length.
    """
    def __init__(self, reviews, labels):
        if len(reviews) != len(labels):
            raise ValueError(
                f'Got {len(reviews)} reviews but {len(labels)} labels; expected one label per review'
            )

        self.reviews = reviews
        self.labels = labels


    def __len__(self):
        """Return the number of (review, label) pairs."""
        return len(self.labels)


    def __getitem__(self, idx):
        """Return the (review, label) pair stored at position `idx`."""
        ret_review = self._item_at(self.reviews, idx)
        ret_label = self._item_at(self.labels, idx)

        return ret_review, ret_label


    @staticmethod
    def _item_at(container, idx):
        """Return the element at position `idx` of `container`."""
        ## Plain `series[idx]` on a pandas object is a lookup by index label and raises
        ## KeyError when the index is not 0..n-1 (e.g. after rows were filtered out).
        ## Samplers and random_split hand out positions, so use positional access.
        if hasattr(container, 'iloc'):
            return container.iloc[idx]

        return container[idx]

In [None]:
## Using Dataset wrapper
    ## Reviews are the GloVe-embedded reviews, labels come from the 'Sentiment' column
sentiment_labels = tokenized_dataset['Sentiment']
reviews_dataset = ReviewsDataset(
    reviews=tokenized_embedded_reviews,
    labels=sentiment_labels,
)
print(f'First review: {reviews_dataset[0][0]}, \nCorresponding label: {reviews_dataset[0][1]}')

In [None]:
## Split dataset
    ## 80% training, 10% validation, 10% test
split_fractions = [0.8, 0.1, 0.1]
split_generator = torch.Generator()
train_set, val_set, test_set = torch.utils.data.random_split(
    reviews_dataset, split_fractions, generator=split_generator
)

In [None]:
## Create DataLoaders

## Collate function to pad or trim reviews to same number of tokens
def review_collate_fn(raw_tokens, max_seq_len=None):
    """Collate (review, label) pairs into one fixed-length batch.

    Reviews hold a different number of word vectors each, which the default
    collation cannot stack. Every review is therefore trimmed to its first
    `max_seq_len` word vectors, or padded at the end with all-zero vectors.

    Args:
        raw_tokens: List of (review, label) pairs as returned by the dataset,
            where a review is a (possibly empty) list of equally sized 1-D
            word vectors.
        max_seq_len: No. of word vectors per review after padding / trimming.
            Defaults to the notebook constant MAX_SEQ_LEN.

    Returns:
        Tuple (features, labels): features is a float32 tensor of shape
        (batch_size, max_seq_len, word_vec_size); labels are collated with
        PyTorch's default collation.

    Raises:
        ValueError: If every review of the batch is empty, as the word vector
            size cannot be inferred then.
    """
    if max_seq_len is None:
        max_seq_len = MAX_SEQ_LEN

    reviews, labels = zip(*raw_tokens)

    ## Word vector size is taken from the first review that has at least one vector
    word_vec_size = next((len(review[0]) for review in reviews if len(review) > 0), None)
    if word_vec_size is None:
        raise ValueError('Cannot infer the word vector size: every review in the batch is empty')

    ## Start from all zeros so that positions past the end of a review are the padding
    features = torch.zeros(len(reviews), max_seq_len, word_vec_size, dtype=torch.float32)
    for row, review in enumerate(reviews):
        kept_vectors = review[:max_seq_len]
        if len(kept_vectors) > 0:
            features[row, :len(kept_vectors)] = torch.as_tensor(np.stack(kept_vectors), dtype=torch.float32)

    return features, torch.utils.data.default_collate(list(labels))


train_loader = DataLoader(dataset=train_set, batch_size=BATCH_SIZE, shuffle=True, collate_fn=review_collate_fn)
val_loader = DataLoader(dataset=val_set, batch_size=BATCH_SIZE, shuffle=True, collate_fn=review_collate_fn)
test_loader = DataLoader(dataset=test_set, batch_size=BATCH_SIZE, shuffle=True, collate_fn=review_collate_fn)

In [None]:
## Check batches
    ## Pull a single batch from the training loader and show its contents
first_batch = next(iter(train_loader))
example_features, example_label = first_batch
print(f'Sample feature: \n{example_features}, \nFeature size: {example_features.shape}')
print(f'Sample label: \n{example_label}')

##TODO: Assert that feature size is (batch_size, sequence_length ie review_length, feature_size ie word_vec_size)

In [None]:
## LSTM model

class SimpleLSTM(nn.Module):
    """LSTM classifier: LSTM -> linear layer -> sigmoid.

    The model has no embedding layer, so its input must already consist of
    word vectors.

    Args:
        vocab_size: No. of unique words in reviews. Kept in the signature for
            callers; not used, because there is no embedding layer.
        embedding_dim: Size of embeddings / vectors fed to the LSTM.
        hidden_dim: Dimension of LSTM output.
        num_lstm_layers: No. of LSTM layers.
        cell_dropout: Dropout applied between LSTM layers.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_lstm_layers=1, cell_dropout=0.0):
        super().__init__()

        self.num_lstm_layers = num_lstm_layers
        self.hidden_dim = hidden_dim

        ## nn.LSTM applies dropout only between stacked layers and warns when a
        ## non-zero value is passed for a single layer, where it has no effect
        lstm_dropout = cell_dropout if num_lstm_layers > 1 else 0.0

        ## Model layers
            ## Embedding layer TODO: Should this be implemented?
            ## LSTM (for thought vector)
            ## Linear layer (for logit score)
            ## Activation (for P of +ve sentiment)
        ## Note: nn.LSTM returns a tuple, so this Sequential cannot be called end-to-end;
        ## it only serves as a named container and forward() calls the layers one by one
        self.model = nn.Sequential(OrderedDict([
            ('lstm', nn.LSTM(embedding_dim, self.hidden_dim, self.num_lstm_layers, batch_first=True, dropout=lstm_dropout)),
            ('fcl1', nn.Linear(self.hidden_dim, 1)),
            ('sigmoid1', nn.Sigmoid())
        ]))


    def forward(self, x, hidden=None):
        """Run the forward pass.

        Args:
            x: (batch_size, sequence_length, feature_size) tensor.
            hidden: Optional (hidden state, cell state) tuple to start from.
                Defaults to the all-zero states given by init_hidden.

        Returns:
            (batch_size, 1) tensor with the sigmoid of the logit score.
        """
        if hidden is None:
            hidden = self.init_hidden(x.size(0))

        ## Only the final hidden state is needed; the per-step outputs and cell state are dropped
        _, (final_hidden, _) = self.model.lstm(x, hidden)

        ## Final hidden state of the last LSTM layer = thought vector of each sequence
        thought_vector = final_hidden[-1]
        logit = self.model.fcl1(thought_vector)

        return self.model.sigmoid1(logit)


    ## Initialize initial cell and hidden states
    def init_hidden(self, batch_size):
        """Return all-zero (hidden state, cell state) tensors.

        Each tensor has shape (num_lstm_layers, batch_size, hidden_dim) and
        shares device and dtype with the model parameters.
        """
        reference = next(self.parameters())
        state_shape = (self.num_lstm_layers, batch_size, self.hidden_dim)

        return reference.new_zeros(state_shape), reference.new_zeros(state_shape)

    

In [None]:
## Initialize an LSTM model
    ## Hyperparameters
vocab_size = len(dct)
    ## The LSTM input is one GloVe vector per token, so its feature size is the
    ## GloVe vector size and not the size of the vocabulary
embedding_dim = glove_model.vector_size
hidden_dim = 32
num_lstm_layers = 2
cell_dropout = 0.1

model = SimpleLSTM(vocab_size, embedding_dim, hidden_dim, num_lstm_layers, cell_dropout)
print(model)

In [None]:
## Test forward pass

In [None]:
## Evaluation function

In [None]:
## Training function / Optimization loop

In [None]:
## Hyperparameter tuning
    ## Embedding (size, method)
    ## Hidden dimension
    ## LSTM layers
    ## Bidirectional
    ## Dropout probability
    ## Learning rate