In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

import spacy
import numpy as np
import pandas as pd

import random
import math
import time

In [2]:
SEED = 25

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [3]:
from BERT_WordEmbeddingsPipeline import embeddingsPipeline
embedding_length = 768
zero_embedding = [0 for i in range(embedding_length)]

print(len(zero_embedding))

768


In [4]:
data = pd.read_csv('hpl.csv')
print(data.head())
print(data.shape)

                                     tokenized_sents
0  It never once occurred to me that the fumbling...
1  Finding nothing else not even gold the Superin...
2  Herbert West needed fresh bodies because his l...
3  The farm like grounds extended back very deepl...
4  His facial aspect too was remarkable for its m...
(5635, 1)


In [5]:
max_words = 200

emb = []
for i in data['tokenized_sents']:
    e = embeddingsPipeline(i)
    while(len(e) < max_words):
        e.append(zero_embedding)
    e = e[:200]
    emb.append(e)
    
data['embeddings'] = emb

In [6]:
data.head()

Unnamed: 0,tokenized_sents,embeddings
0,It never once occurred to me that the fumbling...,"[[1.0952692, -1.5751679, -2.1890771, -1.346773..."
1,Finding nothing else not even gold the Superin...,"[[-2.8680723, 1.028281, 0.8684244, -0.20770997..."
2,Herbert West needed fresh bodies because his l...,"[[1.1890708, 3.4342752, -1.1254256, 1.5733889,..."
3,The farm like grounds extended back very deepl...,"[[-0.47942826, -1.4710225, -0.8196672, -0.9625..."
4,His facial aspect too was remarkable for its m...,"[[-2.287135, -0.24020833, -1.8980496, -0.57004..."


In [7]:
from sklearn.model_selection import train_test_split
train_data, remaining_data = train_test_split(data, test_size=0.2, random_state=SEED)
test_data, valid_data = train_test_split(remaining_data, test_size=0.5, random_state=SEED)

print(train_data.shape, test_data.shape, valid_data.shape)

(4508, 2) (563, 2) (564, 2)


In [8]:
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, isCuda):
        super(Encoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.isCuda = isCuda
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.relu = nn.ReLU()
        
        # initializing weights
        nn.init.xavier_uniform(self.lstm.weight_ih_l0, gain=np.sqrt(2))
        nn.init.xavier_uniform(self.lstm.weight_hh_l0, gain=np.sqrt(2))
         
    def forward(self, input):
        encoded_input, hidden = self.lstm(input)
        encoded_input = self.relu(encoded_input)
        return encoded_input
    
class Decoder(nn.Module):
    def __init__(self, hidden_size, output_size, num_layers, isCuda):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        
        self.isCuda = isCuda
        self.lstm = nn.LSTM(hidden_size, output_size, num_layers, batch_first=True)
        self.sigmoid = nn.Sigmoid()
        
        nn.init.xavier_uniform(self.lstm.weight_ih_l0, gain=np.sqrt(2))
        nn.init.xavier_uniform(self.lstm.weight_hh_l0, gain=np.sqrt(2))
       
    def forward(self, encoded_input):
        decoded_output, hidden = self.lstm(encoded_input)
        decoded_output = self.sigmoid(decoded_output)
        return decoded_output 

In [9]:
class LSTM_AE(nn.Module):
    def __init__(self, encoder, decoder, isCuda):
        super(LSTM_AE, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.isCuda = isCuda
        
    def forward(self, input):
        encoded_input = self.encoder(input)
        decoded_output = self.decoder(encoded_input)
        return decoded_output

In [10]:
INPUT_SIZE = embedding_length
HIDDEN_SIZE = 512
NUM_LAYERS = 2
OUTPUT_SIZE = embedding_length
IS_CUDA = torch.cuda.is_available()

enc = Encoder(INPUT_SIZE, HIDDEN_SIZE, NUM_LAYERS, IS_CUDA)
dec = Decoder(HIDDEN_SIZE, OUTPUT_SIZE, NUM_LAYERS, IS_CUDA)
model = LSTM_AE(enc, dec, IS_CUDA)

  del sys.path[0]
  


In [11]:
train_data['embeddings'].head()

4693    [[-4.7064943, -1.5040853, 1.2175621, 1.5480866...
4226    [[3.3086557, 0.09898888, -3.8257747, 0.6116794...
1723    [[2.2980692, 1.2661281, -0.25308305, 1.8103327...
4944    [[1.086747, -0.052063126, -0.60388684, 0.19703...
3783    [[-2.4465256, 3.5449834, -1.2721826, -1.445640...
Name: embeddings, dtype: object

In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

LSTM_AE(
  (encoder): Encoder(
    (lstm): LSTM(768, 512, num_layers=2, batch_first=True)
    (relu): ReLU()
  )
  (decoder): Decoder(
    (lstm): LSTM(512, 768, num_layers=2, batch_first=True)
    (sigmoid): Sigmoid()
  )
)

In [13]:
def train(train_data, test_data, model, epochs):
    
    model.train()
    optimizer = torch.optim.Adam(model.parameters())
    criterion = nn.MSELoss()
    
    print("Started Training ...")
    for epoch in range(epochs):
        start_time = time.time()
        
        train_epoch_loss = 0
        for text in train_data:
            text = torch.Tensor(text)
            
            text = text.to(device)
            text = text.view(max_words, 1, embedding_length)
            output = model(text)
            loss = criterion(text, output)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            train_epoch_loss += loss.item()
            
        test_epoch_loss = 0
        for text in train_data:
            text = torch.Tensor(text)
            
            text = text.to(device)
            text = text.view(max_words, 1, embedding_length)
            output = model(text)
            loss = criterion(text, output)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            test_epoch_loss += loss.item()
        
        finish_time = time.time()
        print("Epoch {}: Loss from Training data: {} and Loss from Test data: {} Time taken: {}" 
              .format(epoch, train_epoch_loss/len(train_data), test_epoch_loss/len(test_data), finish_time-start_time))

In [15]:
train(train_data['embeddings'], test_data['embeddings'], model, 15)

Started Training ...
Epoch 0: Loss from Training data: 1.0399610326784896 and Loss from Test data: 8.281609604985421 Time taken: 210.87269473075867
Epoch 1: Loss from Training data: 1.0312922497171688 and Loss from Test data: 8.240919174590077 Time taken: 215.92697954177856
Epoch 2: Loss from Training data: 1.0278757249386097 and Loss from Test data: 8.222859383873363 Time taken: 219.4379162788391
Epoch 3: Loss from Training data: 1.0261627643665807 and Loss from Test data: 8.211620027335034 Time taken: 221.91909527778625
Epoch 4: Loss from Training data: 1.0250699079488903 and Loss from Test data: 8.204774286538415 Time taken: 222.91722321510315
Epoch 5: Loss from Training data: 1.0243461487390029 and Loss from Test data: 8.199733608728192 Time taken: 225.33042907714844
Epoch 6: Loss from Training data: 1.0238147356548242 and Loss from Test data: 8.19615974630598 Time taken: 223.68233108520508
Epoch 7: Loss from Training data: 1.0234338087463601 and Loss from Test data: 8.193407905832