In [1]:
import re
import torch.nn as nn
import numpy as np
from torch.nn.utils import clip_grad_norm_
import torch
import os
import torch
from matplotlib import pyplot as plt
import pandas as pd

In [32]:
# Hyper-parameters shared by the cells below.
para = {
    'seq': 5,         # tokens per training window
    'batch': 80,      # number of rows the token stream is folded into
    'lr': 0.005,      # optimiser learning rate
    'hidden': 1024,   # LSTM hidden size
    'layers': 1,      # stacked LSTM layers
    'embed': 256,     # embedding dimension
    'epoch': 30,      # intended number of training epochs
}

In [3]:
def text_cleaner(stmts):
    """Normalise raw text lines into lower-case, letters-only, single-spaced strings.

    Args:
        stmts: iterable of strings.

    Returns:
        A list with one cleaned string per input string. A string that
        contains no letters becomes ''.
    """
    final = []

    for text in stmts:
        lowered = text.lower()

        # Drop the possessive "'s" first so "dog's" becomes "dog" rather
        # than "dog s" after the punctuation pass below.
        without_possessive = re.sub(r"'s\b", "", lowered)

        # Everything that is not an ASCII letter becomes a separator.
        letters_only = re.sub("[^a-zA-Z]", " ", without_possessive)

        # split()/join collapses whitespace runs and trims both ends. The
        # previous `len(word) >= 0` filter was always true, so every word
        # is kept, exactly as before.
        final.append(" ".join(letters_only.split()))

    return final

In [5]:
def get_data(path):
    """Read a text file and merge each run of non-empty lines into one string.

    Consecutive non-empty lines are joined with single spaces (each merged
    string starts with a leading space because joining begins from '').
    Every empty line closes the current run, so two empty lines in a row
    produce an '' entry.

    Args:
        path: path of the text file to read.

    Returns:
        List of merged strings in file order.
    """
    final_lines = []
    temp = ''

    # Use the caller-supplied path (it was previously ignored in favour of a
    # hard-coded 'brown.txt') and make sure the handle is closed.
    with open(path) as f:
        for raw in f:
            line = raw.split('\n')[0]

            if len(line) != 0:
                temp = ' '.join([temp, line])
            else:
                final_lines.append(temp)
                temp = ''

    # A file that does not end with an empty line would otherwise lose its
    # last paragraph.
    if temp:
        final_lines.append(temp)

    return final_lines

# Read the corpus paragraphs and normalise them in a single expression.
sentences = text_cleaner(get_data('brown.txt'))

In [39]:
# Sanity check: number of cleaned paragraphs loaded from the corpus.
print( len(sentences))

5460


In [6]:
def get_dict(data):
    """Build a word -> integer-id vocabulary from whitespace-tokenised lines.

    Each line is wrapped in '<s>' ... '</s>' markers before counting. Ids are
    handed out in order of first appearance, and '<UNK>' receives the next
    free id after all lines have been processed.

    Args:
        data: iterable of strings.

    Returns:
        (vocabulary dict, total number of tokens including the markers).
    """
    vocab = {}
    token_count = 0

    for sentence in data:
        tokens = ['<s>', *sentence.split(), '</s>']
        token_count += len(tokens)

        for tok in tokens:
            # len(vocab) is always the next unused id; existing words keep theirs.
            vocab.setdefault(tok, len(vocab))

    vocab['<UNK>'] = len(vocab)
    return vocab, token_count

# Vocabulary and token count are computed over ALL sentences, i.e. before
# the train/validation/test split performed further down.
word_dict , total_tokens = get_dict(sentences)

In [7]:
total_stmt = len(sentences)

# Split sizes: 7/10 train, 2/10 validation, 1/10 test, each truncated to an int.
train_len, valid_len, test_len = [int((tenths * total_stmt) / 10) for tenths in (7, 2, 1)]

# Contiguous, non-overlapping slices taken in corpus order.
train_data = sentences[:train_len]
val_data = sentences[train_len:][:valid_len]
test_data = sentences[train_len + valid_len:][:test_len]

In [36]:
def get_int_data(data ,total_token):
    """Encode sentences as one id stream and fold it into `para['batch']` rows.

    Each line is wrapped in '<s>' ... '</s>', mapped through the module-level
    `word_dict`, and the resulting stream is trimmed to a multiple of
    `para['batch']` before being reshaped.

    Args:
        data: iterable of whitespace-tokenised strings.
        total_token: legacy capacity hint. It is no longer used: the tensor
            is sized from `data` itself. Previously an uninitialised tensor
            of this size was allocated, so any slots beyond the tokens in
            `data` held arbitrary values that ended up in the result. The
            parameter is kept so existing calls keep working.

    Returns:
        LongTensor of shape (para['batch'], n_tokens // para['batch']).
    """
    ids = []

    for line in data:
        words = ['<s>'] + line.split() + ['</s>']

        for w in words:
            # Fall back to <UNK> for out-of-vocabulary words, mirroring convert_int.
            ids.append(word_dict[w] if w in word_dict else word_dict['<UNK>'])

    ids = torch.tensor(ids, dtype=torch.long)

    num_batches = ids.size(0) // para['batch']
    ids = ids[:num_batches * para['batch']]

    # An explicit column count (instead of -1) keeps the reshape valid when
    # there are fewer tokens than rows.
    return ids.view(para['batch'], num_batches)

# Vocabulary size; passed to define_model to size the embedding and output layers.
vocab_size = len(word_dict)

# Training sentences encoded as a (para['batch'], tokens-per-row) id matrix.
train_int = get_int_data(train_data, total_tokens)

# Number of full para['seq']-token windows per row (used in the progress log).
num_batches = train_int.shape[1] // para['seq']


In [38]:
def resize_outputs(arr):
    """Merge the first two axes of a 3-D tensor: (d0, d1, d2) -> (d0 * d1, d2)."""
    d0, d1, d2 = arr.size(0), arr.size(1), arr.size(2)
    return arr.reshape(d0 * d1, d2)
    
def define_model(embed , hidden , layer , vocab_size ,opt , lrate):
    """Create the language model together with its loss and optimiser.

    Args:
        embed: embedding dimension.
        hidden: LSTM hidden size.
        layer: number of LSTM layers.
        vocab_size: vocabulary size (embedding rows / output classes).
        opt: optimiser key, one of 'adam', 'adelta' or 'agrad'.
        lrate: learning rate handed to the optimiser.

    Returns:
        (model, device, criterion, optimizer), where `device` is 'cuda' when
        a GPU is available and 'cpu' otherwise.

    Raises:
        ValueError: if `opt` is not a supported key. Previously this left
            `optimizer` unbound and failed at the return statement.
    """
    optimizer_classes = {
        'adam': torch.optim.Adam,
        'adelta': torch.optim.Adadelta,
        'agrad': torch.optim.Adagrad,
    }
    # Validate before building the model so a typo fails fast.
    if opt not in optimizer_classes:
        raise ValueError(
            "unknown optimizer %r; expected one of %s" % (opt, sorted(optimizer_classes))
        )

    # Fall back to the CPU instead of crashing on machines without a GPU.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = Neural_LM(embed, hidden, layer, vocab_size).to(device)

    criterion = nn.CrossEntropyLoss()

    # Every branch needs model.parameters() *called*; the Adagrad branch used
    # to pass the bound method itself.
    optimizer = optimizer_classes[opt](model.parameters(), lr=lrate)

    return model, device, criterion, optimizer
        
class Neural_LM(nn.Module):
    """Embedding -> LSTM -> linear language model over a fixed vocabulary."""

    def __init__(self,embed , hid , layers ,vocab):
        """
        Args:
            embed: embedding dimension.
            hid: LSTM hidden size.
            layers: number of stacked LSTM layers.
            vocab: vocabulary size (embedding rows and output classes).
        """
        super(Neural_LM, self).__init__()

        self.embed = nn.Embedding(vocab, embed)
        self.lstm = nn.LSTM(embed, hid , layers, batch_first=True)
        self.linear = nn.Linear(hid, vocab )

    def forward(self, inputs , hidden_var):
        """Run one window of token ids through the network.

        Args:
            inputs: integer id tensor, batch-first (the LSTM is built with
                batch_first=True).
            hidden_var: (h, c) LSTM state pair.

        Returns:
            (logits, (h, c)) where logits has the batch and time axes merged
            by resize_outputs, giving one row per input token.
        """
        op = self.embed(inputs)

        op, (hidden_var, cell_var) = self.lstm(op, hidden_var)

        op = resize_outputs(op)

        # Project the flattened LSTM outputs. This previously referenced an
        # undefined name instead of `op`.
        output = self.linear(op)

        return output, (hidden_var, cell_var)

    def get_init_stat(self, layer , batch ,hid):
        """Return an all-zero (h, c) state pair on the same device as the model.

        Args:
            layer: number of LSTM layers.
            batch: batch size.
            hid: hidden size.

        Returns:
            Tuple of two zero tensors, each of shape (layer, batch, hid).
        """
        # Use the model's own device rather than a module-level global.
        device = next(self.parameters()).device

        stat = (torch.zeros(layer, batch, hid, device=device),
                torch.zeros(layer, batch, hid, device=device))

        return stat
    
# Build the network, loss and Adam optimiser from the shared hyper-parameters.
model, device, criterion, optimizer = define_model(
    para['embed'],
    para['hidden'],
    para['layers'],
    vocab_size,
    'adam',
    para['lr'],
)
model

Neural_LM(
  (embed): Embedding(41505, 256)
  (lstm): LSTM(256, 1024, batch_first=True)
  (linear): Linear(in_features=1024, out_features=41505, bias=True)
)

In [17]:
def detach_hidden(h_stat):
    """Return a list with a detached version of every tensor in `h_stat`.

    Detaching cuts the autograd history, so back-propagation stops at the
    window boundary.
    """
    return [state.detach() for state in h_stat]


# Training loop over fixed-length windows of the training id matrix.
# Fixes undefined names: the hyper-parameters live in `para` (not
# `model_para` / `spara`), the step counter is `fwd_pass` (not `step`) and
# the loss tensor is `model_loss` (not `loss`).
num_epochs = para['epoch']
for epoch in range(num_epochs):

    # Start every epoch from an all-zero LSTM state.
    h_stat = model.get_init_stat(para['layers'] , para['batch'] ,para['hidden'])

    # Slide a non-overlapping window of para['seq'] tokens along each row;
    # the stop value leaves room for the one-token-shifted targets.
    for i in range(0, train_int.size(1) - para['seq'], para['seq']):

        ip = train_int[:, i: i+para['seq']]
        ip = ip.to(device)

        # Targets are the inputs shifted one token to the right.
        op = train_int[:, (i+1):(i+1)+para['seq']]
        op = op.to(device)

        # Carry the state values forward but cut the autograd history.
        h_stat = detach_hidden(h_stat)

        out, h_stat = model(ip, h_stat)
        model_loss = criterion(out, op.reshape(-1))

        model.zero_grad()
        model_loss.backward()

        # Clip the gradient norm before the optimiser step.
        clip_grad_norm_(model.parameters(), 0.4)
        optimizer.step()

        fwd_pass = (i+1) // para['seq']

        if fwd_pass % 500 == 0:
            print ('Epoch [{}/{}], Step[{}/{}], Loss: {}, Perplexity: {}'.format(epoch+1, num_epochs, fwd_pass, num_batches, model_loss.item(), np.exp(model_loss.item())))

Epoch [1/30], Step[0/2058], Loss: 10.6228, Perplexity: 41061.87
Epoch [1/30], Step[500/2058], Loss: 4.5481, Perplexity: 94.45
Epoch [1/30], Step[1000/2058], Loss: 4.3463, Perplexity: 77.19
Epoch [1/30], Step[1500/2058], Loss: 4.0456, Perplexity: 57.15
Epoch [1/30], Step[2000/2058], Loss: 4.0858, Perplexity: 59.49
Epoch [2/30], Step[0/2058], Loss: 4.2854, Perplexity: 72.63
Epoch [2/30], Step[500/2058], Loss: 3.6717, Perplexity: 39.32
Epoch [2/30], Step[1000/2058], Loss: 3.6281, Perplexity: 37.64
Epoch [2/30], Step[1500/2058], Loss: 3.4806, Perplexity: 32.48
Epoch [2/30], Step[2000/2058], Loss: 3.3482, Perplexity: 28.45
Epoch [3/30], Step[0/2058], Loss: 3.6578, Perplexity: 38.78
Epoch [3/30], Step[500/2058], Loss: 2.8998, Perplexity: 18.17
Epoch [3/30], Step[1000/2058], Loss: 2.8596, Perplexity: 17.45
Epoch [3/30], Step[1500/2058], Loss: 2.8442, Perplexity: 17.19
Epoch [3/30], Step[2000/2058], Loss: 2.5870, Perplexity: 13.29
Epoch [4/30], Step[0/2058], Loss: 2.8493, Perplexity: 17.28
Epo

In [19]:
def make_test_data(test_data):
    """Tokenise each line, wrap it in '<s>' ... '</s>', and keep only the
    sequences that have at least 4 tokens including the two markers.

    Args:
        test_data: iterable of whitespace-tokenised strings.

    Returns:
        List of token lists, in input order.
    """
    wrapped = (['<s>', *line.split(), '</s>'] for line in test_data)
    return [tokens for tokens in wrapped if len(tokens) >= 4]

# Tokenised test sentences with <s>/</s> markers; make_test_data drops those
# with fewer than 4 tokens.
test_refine = make_test_data(test_data)

In [20]:
def convert_int(data, word_2_int):
    """Map every token of every line to its integer id.

    Tokens missing from `word_2_int` are replaced by the id stored under
    '<UNK>'. That key is looked up only when an unknown token is actually
    encountered.

    Args:
        data: iterable of token sequences.
        word_2_int: dict mapping token -> id.

    Returns:
        List of id lists, parallel to `data`.
    """
    return [
        [word_2_int[w] if w in word_2_int else word_2_int['<UNK>'] for w in line]
        for line in data
    ]

# Encode the test tokens as vocabulary ids; unseen words map to the <UNK> id.
test_int = convert_int(test_refine , word_dict)

In [22]:
def test_seq( data ,seq_len):

    final_input = []
    final_output = []

    for line in data:

        seq_list_ip = []
        seq_list_op = []

        for i in range(seq_len, len(line) ,seq_len):
            t = line[i-seq_len: i]
            
            seq_list_ip.append( t[:-1] )
            seq_list_op.append( t[1:]) 
        
        final_input.append(seq_list_ip)
        final_output.append(seq_list_op)

    return final_input , final_output 

# Per-sentence lists of input / target id windows, cut with the same window
# length (para['seq']) that training uses.
test_input , test_output = test_seq( test_int , para['seq'])

In [25]:
def predict(net , ip ,op , h):
    """Score one input/target window and return its mean cross-entropy loss.

    Args:
        net: model called as `net(inputs, h)` and returning (logits, state),
            with one logits row per target token.
        ip: sequence of input token ids.
        op: sequence of target token ids, same length as `ip`.
        h: iterable of state tensors handed to the model.

    Returns:
        The loss as a Python float. The window's perplexity (exp of the loss)
        is also printed.
    """
    # Run on whatever device the model lives on instead of requiring CUDA.
    device = next(net.parameters()).device

    criterion = nn.CrossEntropyLoss()

    # Explicit int64 ids with a leading batch axis of size 1; np.array's
    # default integer width is platform dependent.
    inputs = torch.tensor([ip], dtype=torch.long, device=device)
    output = torch.tensor([op], dtype=torch.long, device=device)

    h = tuple(each.detach().to(device) for each in h)

    # Evaluation only: no autograd graph is needed.
    with torch.no_grad():
        out, h = net(inputs, h)
        loss = criterion(out, output.view(-1))

    print( np.exp(loss.item()))
    return loss.item()



In [None]:
def get_perplexity( net , data_ip ,data_op ):
    """Perplexity of one sentence given as lists of input / target windows.

    Perplexity is exp(mean negative log-likelihood per token). `predict`
    returns the mean loss of a single window, so each window's loss is
    weighted by its number of target tokens before averaging. The previous
    version returned exp of the plain SUM of window losses, which grows with
    the number of windows and is not a perplexity.

    Args:
        net: the language model; switched to eval mode and, when a GPU is
            available, moved to it (as before).
        data_ip: list of input id windows.
        data_op: list of target id windows, parallel to `data_ip`.

    Returns:
        The perplexity, or 1.0 when there are no target tokens (the value
        the function has always returned for empty input).
    """
    net.eval()
    # Keep the historical move-to-GPU side effect, but do not crash on
    # CPU-only machines.
    if torch.cuda.is_available():
        net.cuda()
    device = next(net.parameters()).device

    # The same zero state (batch size 1) is handed to every window.
    h = (torch.zeros(para['layers'], 1, para['hidden'], device=device),
         torch.zeros(para['layers'], 1, para['hidden'], device=device))

    total_nll = 0.0
    token_count = 0
    for i in range( len(data_ip) ):
        n_tok = len(data_op[i])
        total_nll += predict(net, data_ip[i] , data_op[i] , h) * n_tok
        token_count += n_tok

    if token_count == 0:
        return np.exp(0.0)

    return np.exp(total_nll / token_count)

# One perplexity score per entry of test_input, in the same order.
preps = [
    get_perplexity(model, test_input[stmt], test_output[stmt])
    for stmt in range(len(test_input))
]

In [19]:
# Mean sentence perplexity; NaN instead of a ZeroDivisionError when nothing was scored.
avg = sum(preps) / len(preps) if preps else float('nan')

# `preps` holds one score per sentence that survived make_test_data, which
# keeps only sentences with at least 4 tokens including <s> and </s>.
# Indexing test_data by position therefore paired scores with the wrong
# sentences (or ran off the end of `preps`) as soon as one short sentence was
# dropped. Apply the same filter so sentences and scores line up.
scored_sentences = [line for line in test_data if len(line.split()) + 2 >= 4]

# The context manager guarantees the results are flushed and the file is closed.
with open('lstm.txt', 'a') as f:
    for sentence, prp in zip(scored_sentences, preps):
        f.write( sentence +'\t' + str(prp) + '\n')

    f.write(str(avg))

18