# Importing libraries

In [81]:
## Importing all libraries
import gc
gc.collect()


189

In [82]:
import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
import torch.nn as nn
import re
import json
from sklearn.model_selection import train_test_split
import numpy as np
import gc
from torch.utils.data import Dataset, DataLoader , TensorDataset
import pandas as pd
import tensorflow as tf
import time
import tqdm

# Initialize variables

In [83]:
# Constants
input_file_path = "sample_data/train"
dev_file_path = "sample_data/dev_niu"
test_file_path = "sample_data/test_niu"

## Custom params
unk_token='__UNK__'
dataLoader_batch_size = 5 
tag_embedding_size = 1

## Hyper-params as per HW4 pdf
word_embedding_size = 100 


# Device variable

In [84]:
is_cuda = torch.cuda.is_available()

# #If we have a GPU available, we'll set our device to GPU. We'll use this device variable later in our code.
# if is_cuda:
#     device = torch.device("cuda")
#     print("GPU is available")
# else:
#     device = torch.device("cpu")
#     print("GPU not available, CPU used")

device = torch.device("cuda")


# Functions

In [85]:
# Function to sort dictionary in desc order
def orderDictionary(d, reverse = False):
  return dict(sorted(d.items(), key = lambda x: x[1], reverse = reverse))

In [86]:
# Creating dictionary
def create_vocabulary(file_path):
    dictionary = dict()
    # We removed punctuations to increase accuracy
    train_file = open(file_path, "r")
    for line in train_file:
        # If line is not a blank line, do further processing  
        if(line.strip()):
            line = line.strip()
            word = line.split(" ")[2]
            if word in dictionary:
                dictionary[word] = dictionary[word] + 1
            else:
                dictionary[word] = 1
            

    # Order the dictionary in descending order
    sorted_dict = orderDictionary(dictionary, True)

    # @TODO - Check if we need unknown counts at all? - Might not need, we are explicitly considering UNK tag

    return sorted_dict 

In [87]:
def create_sentence_lists(file_path):
  file = open(file_path, "r")
  master_data = []
  master_tags = []
  sentence = []
  tags = []
  for line in file:
      # If line is not a blank line, do further processing  
      if(line.strip()):
        line = line.strip()
        word = line.split(" ")[1]
        sentence.append(word)
        t = line.split(" ")[2]
        tags.append(t)
      else:
        # We know a line is done 
        master_data.append(sentence)
        master_tags.append(tags)
        sentence = []
        tags = []

  return master_data, master_tags    

In [88]:
def convertWordsToVectors(vocab,text):
    
    vectors = []
    unk_ID = vocab[unk_token]
    for word in str(text).split():

    # if the word is not in vocab_dict, then assign UNK
      word_ID = vocab.get(word, unk_ID)
      vec = np.zeros((word_embedding_size,), dtype=np.float32)
      vec[0] = word_ID
      vectors.append(vec)

    return vectors  

In [89]:
def convertNERTagsToVectors(ner_vocab, text):
    
    vectors = []
    for i in range(0,len(text)):
      tag = text[i]
      # if the tag is not in ner_tag_list, then assign UNK
      tag_ID = ner_vocab.get(tag)
      vec = np.zeros((tag_embedding_size,), dtype=np.float32)
      vec[0] = tag_ID
      vectors.append(vec)

    return vectors 


In [90]:
def dStackAndMove(arr):
  arr = np.dstack(arr)  
  arr = np.moveaxis(arr, -1, 0)
  return arr

# Creating a dictionary of all unique NER tags from Training data

In [91]:
# Task1: Create dictionary of all unique NER tags
ner_tag_dict = create_vocabulary(input_file_path)

# Make data list from Train, Dev and Test data

In [92]:
master_train_data, master_train_tags = create_sentence_lists(input_file_path)
master_dev_data, master_dev_tags = create_sentence_lists(dev_file_path)

# Creating Vocabulary from all words in Training data file

In [93]:
## Step 1:
vocab = {'__PAD__': 0, '__</e>__': 1, '__UNK__': 2} 
ner_vocab = {}
for i in range(0,len(master_train_data)):
    # If line is not a blank line, do further processing  
    sentence = master_train_data[i]
    tag = master_train_tags[i]
    for word in sentence:
      # We send the line to training model
      if word not in vocab:
        vocab[word] = len(vocab)
    for t in tag:
      if t not in ner_vocab:
        ner_vocab[t] = float(len(ner_vocab))

print(len(vocab))

23627


# Convert individual word in each sentence to an embedding of size 1 x k=100

In [94]:
## Convert lists into dataframe for ease in further processing

text_labels_df = pd.DataFrame({'Text': master_train_data, 'Labels': master_train_tags})
text_labels_df['Text'] = text_labels_df['Text'].apply(lambda x: convertWordsToVectors(vocab,x))   
text_labels_df['Labels'] = text_labels_df['Labels'].apply(lambda x: convertNERTagsToVectors(ner_vocab,x))   
print(text_labels_df)


                                                    Text  \
0      [[2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...   
1      [[2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...   
2      [[2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...   
3      [[2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...   
4      [[2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...   
...                                                  ...   
14981  [[2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...   
14982  [[2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...   
14983  [[2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...   
14984  [[2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...   
14985  [[2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...   

                                                  Labels  
0      [[0.0], [1.0], [2.0], [1.0], [1.0], [1.0], [2....  
1                                         [[3.0], [4.0]]  
2                                         [[5.0], [1.0]]  
3      [[1.0], [0.0], [6.0], [1.0], [1.0], 

# Converting numeric sentences to tensors

In [95]:
#print(text_labels_df['Text'][0]) 

In [96]:
text_labels_df['Text'] = text_labels_df['Text'].apply(lambda x: torch.tensor(x))
text_labels_df['Labels'] = text_labels_df['Labels'].apply(lambda x: torch.tensor(x))
print(text_labels_df['Labels'][0].shape) 

torch.Size([9, 1])


# Creating a dataset object for using dataloader

In [97]:
class BiLSTMDataset(Dataset):
    def __init__(self, text, labels):
        self.labels = labels
        self.text = text

        ## SENTENCE PADDING
        self.text = pad_sequence(self.text, batch_first=True)
        self.labels = pad_sequence(self.labels, batch_first=True, padding_value= -1 )
        
        
        
    def __len__(self):
            return len(self.text)

    def __getitem__(self, idx):
            label = self.labels[idx]
            text = self.text[idx]
            #sample = {"Text": text, "Class": label}
            return text,label

# Initializing a DataLoader

In [98]:
trainDataLoader = DataLoader(BiLSTMDataset(text_labels_df['Text'],text_labels_df['Labels']), batch_size=dataLoader_batch_size, shuffle=True) 


# Create a BiLSTM class

In [99]:
class BiLSTM(nn.Module):
    # target_size is no_of_tags. So, our output from linear layer must be probabilty of tag being any one of the no_of_tags (here: 9) tags
    def __init__(self, vocab_len, input_size, hidden_size, num_layers, linear_output_dim, batch_size): # WE have to tune batch-size as hyper-param
        super(BiLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.linear_output_dim = linear_output_dim

        ## Individual layer structures
        # 1. Embedding layer
        self.embedding = nn.Embedding(vocab_len, input_size, padding_idx = 0)

        # 2. Single LSTM layer that is bi-directional
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)

        # 3. Linear layer
        self.fc = nn.Linear(hidden_size*2, linear_output_dim)
        
        # 4. ELU (we add an activation function in forward pass)
        self.elu = nn.ELU()

        # 5. We define the DropOUT layer here 
        self.dropout = nn.Dropout(0.33)

        ## 6. - Not sure if we have to include this or not
        self.sigmoid = nn.Sigmoid()
        
    #########################################################################################
    def forward(self, x):
        # Embedding layer - convert input to sequence of dense vectors

        #print("PRE Embedding is: ",x.shape)
        embedded = self.embedding(x)
        embedded = embedded[:, :, 0, :]
        #print("Embedding is: ",embedded.shape)

        # Post-padding is needed as all sentences are not of same length, we consider a batch
        # a batch has some list of sentences, we find the longest sentence 
        # We post-pad rest of the sentences until we reach length of the longest one
        
        seq_lengths = []
        #print("Our batch ssize: ",len(x))
        for i in range(0,len(x)):
          seq_lengths.append(input_size)

        #print("Seq lengths: ",seq_lengths)
        seq_lens = torch.Tensor(seq_lengths)

        ### SHOUMIK ::: REMMOVE THE ENFORE-SORTED VARIABLE FROM HERE
        packed_embedded = pack_padded_sequence(embedded, seq_lens ,batch_first= True)
        #packed_embedded = pack_padded_sequence(embedded, x.to('cpu'), enforce_sorted=False, batch_first= True) # we have to pass tensor also 'x'
        #print("Packed Embedding is: ",len(packed_embedded))
        
        # BiLSTM layer
        # num_layers*2 => here as we have a BiLSTM we multiple by 2 else 1
        # h0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device)
        # c0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device)
        # out, _ = self.lstm(embedded, (h0, c0))

        # (2, 7 ,256)
        # co: (2,7,256)
        h0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device)
        c0 = torch.zeros(self.num_layers*2, x.size(0), self.hidden_size).to(device)

        # print("Hidden shape h0: ",h0.shape)
        # print("Hidden shape c0: ",c0.shape)  
        
        out, _ = self.lstm(packed_embedded, (h0, c0))
        #print("OUT from self.lstm: ",out)

        # This is needed for the embeddings, we have to reduce the train time too!
        output_unpacked, output_lengths = pad_packed_sequence(out, batch_first=True)
        #print("UNPACKED OUT from self.lstm: ",output_unpacked.shape)

        # Concatenate the final output of the forward and backward LSTM
        out1 = torch.cat((output_unpacked[:, -1, :self.hidden_size], output_unpacked[:, 0, self.hidden_size:]), dim=1)
        
        #print("INPUT to Linear:: ",out1.shape)
        # Output layer - Linear
        out2 = self.fc(out1)

        # print("Dimesnions from linear layer output: ",out2.shape)

        ## See if adding dropout here works or not
        dropout_op = self.dropout(out2)

        # Then we apply ELU activation
        elu_act = self.elu(dropout_op) 

        ## WHEN WE USE CROSS ENTROPY LOSS, WE MIGHT NOT NEED THIS SOFTMAX LAYER
        #target_output = elu_act
        #target_output = self.sm(elu_act)
        #print("Elu activation step completed")


        #print("Probability target output: ",target_output)
        #print("Probability target output size: ",target_output.shape)

        #sigmoid function
        sig_out = self.sigmoid(elu_act)
        
        #print("sig_out dimensions: ",sig_out.shape)
        # reshape to be batch size first
        #sig_out = sig_out.view(num_classes, -1)
        #sig_out = sig_out[:, -1] # get last batch of labels

        result = sig_out
        #print("Probability target output size AFTER SIGMOID: ",sig_out.shape)

        return result


# Model Parameters

In [100]:
## Model variables
input_size = 100 ## Given HP - embedding dim = 100
hidden_size = 256 # Given HP - LSTM hidden dim
num_layers = 1 # Number of LSTM layers
num_classes = len(ner_tag_dict) # ASSUMING IT IS THE SIZE OF NER VOCAB WE CREATED,i,e 9
batch_size = 5 # @TODO - Hyperparam - we find the best
linear_output_dim = 128
max_epochs = 1 #@ TODO - Hyper-param tuning needed
lrate = 0.01 #@ TODO - Hyper-param tuning needed

# Initialize the model

In [101]:
bilstm = BiLSTM(len(vocab), input_size, hidden_size, num_layers, linear_output_dim, batch_size) 

# Optimiser and Loss function

In [102]:
# Now we include optimizer
optimizer = torch.optim.SGD(bilstm.parameters(), lr=lrate)

# @TODO - Change this value Define the loss function
criterion = nn.CrossEntropyLoss(ignore_index = -1)


# Training model for Task 1

In [103]:

for epoch in range(0,max_epochs):
    print("Starting Epoch: ",epoch)
    bilstm.train()
    sum_loss = 0.0
    total = 0
    for input,label in trainDataLoader:
      input = input.squeeze(0)
      #print("Input shape after squeeze: ",input.shape)

      
      optimizer.zero_grad()
      y_pred = bilstm(input.to(device).long())
      reshaped_ypred = y_pred.sum(dim=1)
      label = torch.squeeze(label).squeeze().squeeze()
      label = label[:,0]#[:, :, 0, :]
      #print("Reshaped ypred shape: ",reshaped_ypred.shape)
      #print("lable shape :: ",label.shape)
      loss = criterion(reshaped_ypred.to(device).long(), label.to(device).long()) 
      loss.backward()
      optimizer.step()
      sum_loss += loss.item() 
      total = total + 1   

    print("Printing Sum loss: ",sum_loss," total: ",total," and train_loss: ",sum_loss/total)    
    


Starting Epoch:  0


RuntimeError: ignored