In [None]:
#load modules
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import random
import numpy as np
import os
import re
import unicodedata
import torch.nn.utils.rnn as rnn
# Boolean flag: True when torch reports an available CUDA device.
# The rest of the notebook reads this flag to choose between CUDA and CPU tensors.
use_gpu = torch.cuda.is_available()

In [None]:
# Load the raw corpus as a list of lines (each line keeps its line terminator).
# The context manager closes the file handle once the lines have been read.
with open('news-headline.txt','r',encoding='utf-8') as corpus_file:
    data = corpus_file.readlines()

In [None]:
# Lower-case a string and reduce it to letters plus the punctuation , . ! ?
def normalize(string):
    """Return ``string`` lower-cased, tokenisable and stripped of noise.

    The steps, in order: lower-case and trim; put spaces around each of
    ``, . ! ?`` so it splits off as its own token; replace every run of
    characters other than ASCII letters and those four marks with one space;
    collapse whitespace runs and trim again.
    """
    cleaned = string.lower().strip()
    substitutions = (
        (r"([,.!?])", r" \1 "),
        (r"[^a-zA-Z,.!?]+", r" "),
        (r"\s+", r" "),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [None]:
# Split every corpus line into a (target, input) pair and tokenise both sides.
# A usable line has the form "<target>\t<input>"; anything else is skipped.
input_text=[]
output_text=[]
for pairs in data:
    try:
        # Remove only the line terminator. Slicing off the last character
        # would eat a real character on a final line with no trailing newline.
        target_r,input_r = pairs.rstrip('\n').split('\t')
    except ValueError:
        # Not exactly two tab-separated fields: not a usable pair.
        continue

    if input_r.strip()=="" or target_r.strip()=="":
        continue

    # Inputs are truncated to their first 120 tokens; targets are kept whole.
    input_n = normalize(input_r).split()[:120]
    target_n = normalize(target_r).split()
    # Normalisation can remove every character, so re-check for emptiness.
    if input_n and target_n:
        input_text.append(input_n)
        output_text.append(target_n)

In [None]:
# Collect every token of the input and output sides, then build the
# word <-> index dictionaries for each side.
flatten = lambda l: [i for s in l for i in s]

X_all = flatten(input_text)
Y_all = flatten(output_text)

# The vocabularies are sorted so that every run assigns the same index to the
# same word. Plain set iteration order can differ between interpreter runs,
# which would make saved checkpoints and embedding matrices inconsistent with
# freshly built dictionaries.
input_vocabs = sorted(set(X_all))
target_vocabs = sorted(set(Y_all))

# Indices 0-3 are reserved for the special tokens on both sides.
input2index = {'<PAD>':0,'<UNK>':1,'<SOS>':2,'<EOS>':3}
for i in input_vocabs:
    if i not in input2index:
        input2index[i]=len(input2index)

index2input = {v:k for k,v in input2index.items()}

target2index = {'<PAD>':0,'<UNK>':1,'<SOS>':2,'<EOS>':3}
for i in target_vocabs:
    if i not in target2index:
        target2index[i]=len(target2index)
index2target = {v:k for k,v in target2index.items()}



In [None]:
# Convert a token sequence into a 1-D tensor of vocabulary indices.
def to_variable(seq, to_index):
    """Map each token of ``seq`` to its index and wrap the result in a Variable.

    Args:
        seq: iterable of tokens.
        to_index: dict from token to integer index. Tokens missing from it
            are replaced by ``to_index["<UNK>"]``.

    Returns:
        A ``Variable`` holding a 1-D LongTensor with one index per token.
        It is a CUDA tensor when the module-level ``use_gpu`` flag is true.
    """
    # "<UNK>" is looked up only for unknown tokens, as in the original code.
    idxs = [to_index[w] if w in to_index else to_index["<UNK>"] for w in seq]
    tensor_type = torch.cuda.LongTensor if use_gpu else torch.LongTensor
    return Variable(tensor_type(idxs))

In [None]:
# Turn every tokenised example into a (1, length) index tensor ending in <EOS>,
# then pair each input tensor with its target tensor.
x_prep = [
    to_variable(source_tokens + ['<EOS>'], input2index).view(1, -1)
    for source_tokens in input_text
]
y_prep = [
    to_variable(target_tokens + ['<EOS>'], target2index).view(1, -1)
    for target_tokens in output_text
]

train_data = list(zip(x_prep, y_prep))

In [None]:
# Read the Numberbatch embedding file as a list of text lines.
# The context manager closes the file handle once the lines have been read.
with open('numberbatch-en.txt','r',encoding='utf-8') as numberbatch_file:
    numberbatch = numberbatch_file.readlines()

In [None]:
# Build the word -> vector lookup from the Numberbatch lines.
# Each data line is parsed as "<word> <v1> <v2> ..." separated by single spaces.
embeddings_index = {}
for line in numberbatch:
    # Strip the line terminator (and any trailing spaces) so the last field
    # is a clean number.
    vector = line.rstrip().split(' ')
    # Lines with at most two fields carry no usable embedding: blank lines,
    # or a "<count> <dim>" header if the file starts with one. Storing such a
    # short vector would later broadcast a single value across a matrix row.
    if len(vector) <= 2:
        continue
    word = vector[0]
    embedding = np.asarray(vector[1:], dtype='float32')
    embeddings_index[word] = embedding

In [None]:
# Width of every embedding row. It is set to 300 because the Numberbatch
# vectors are 300 values long.
embedding_dim=300

In [None]:
# Build the embedding matrix for the input vocabulary: row i holds the vector
# of the word whose index is i.
input_embedding_matrix = np.zeros((len(input2index), embedding_dim))
for word, i in input2index.items():
    row_vector = embeddings_index.get(word)
    if row_vector is None:
        # No stored vector for this word: draw a uniform random one and record
        # it in embeddings_index so later lookups of the same word reuse it.
        row_vector = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
        embeddings_index[word] = row_vector
    input_embedding_matrix[i] = row_vector

# Save if needed:
#np.save('input_embedding_matrix.npy',input_embedding_matrix)

In [None]:
#input_embedding_matrix=np.load('batch/final_input_embedding_matrix.npy')

In [None]:
# Build the embedding matrix for the output vocabulary: row i holds the vector
# of the word whose index is i.
output_embedding_matrix = np.zeros((len(target2index), embedding_dim))
for word, i in target2index.items():
    row_vector = embeddings_index.get(word)
    if row_vector is None:
        # No stored vector for this word: draw a uniform random one and record
        # it in embeddings_index for any later lookup of the same word.
        row_vector = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
        embeddings_index[word] = row_vector
    output_embedding_matrix[i] = row_vector

# Save if needed:
#np.save('output_embedding_matrix.npy',output_embedding_matrix)

In [None]:
#output_embedding_matrix=np.load('batch/final_output_embedding_matrix.npy')

In [None]:
#define encoder
class Encoder(nn.Module):
    """Bidirectional, multi-layer LSTM encoder over a pretrained embedding table.

    Args:
        input_size: number of rows in the embedding table (input vocabulary size).
        embedding_size: width of each embedding vector.
        hidden_size: hidden width of each LSTM direction.
        layers: number of stacked LSTM layers.

    The embedding table is filled from the module-level
    ``input_embedding_matrix``, which must therefore have shape
    ``(input_size, embedding_size)``.
    """

    def __init__(self, input_size, embedding_size,hidden_size, layers):
        super(Encoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.layers = layers
        self.embedding = nn.Embedding(input_size, embedding_size)
        # Load the pretrained vectors into the embedding table.
        self.embedding.weight.data.copy_(torch.from_numpy(input_embedding_matrix))
        self.lstm = nn.LSTM(embedding_size, hidden_size, layers, batch_first=True,bidirectional=True)

    def init_hidden(self,inputs):
        """Return zero-filled (h0, c0) sized for the batch in ``inputs``."""
        # Two directions per layer, hence layers*2 along the first axis.
        shape = (self.layers*2, inputs.size(0), self.hidden_size)
        hidden = (Variable(torch.zeros(*shape)), Variable(torch.zeros(*shape)))
        if use_gpu:
            return (hidden[0].cuda(), hidden[1].cuda())
        return hidden

    def init_weight(self):
        """Xavier-initialise the first-layer forward LSTM weight matrices.

        The embedding table is deliberately left alone. Re-initialising it
        here would overwrite the pretrained vectors copied in ``__init__``.
        Only ``weight_hh_l0`` and ``weight_ih_l0`` are touched, so deeper
        layers and the reverse direction keep the nn.LSTM default
        initialisation.
        """
        # Use the in-place-named initialiser where this torch version has it,
        # and fall back to the older name otherwise.
        xavier = getattr(nn.init, 'xavier_uniform_', None) or nn.init.xavier_uniform
        xavier(self.lstm.weight_hh_l0)
        xavier(self.lstm.weight_ih_l0)

    def forward(self, inputs, input_lengths):
        """Encode a padded batch.

        Args:
            inputs: LongTensor of token indices, shape (batch, seq).
            input_lengths: true length of each row, in decreasing order, as
                required by ``pack_padded_sequence``.

        Returns:
            ``(outputs, context)``. ``outputs`` is the padded LSTM output of
            shape (batch, seq, 2*hidden_size). ``context`` has shape
            (batch, 1, 2*hidden_size) and concatenates the last layer's final
            forward and backward hidden states.
        """
        hidden = self.init_hidden(inputs)
        embedded = self.embedding(inputs)
        packed = rnn.pack_padded_sequence(embedded, input_lengths,batch_first=True)
        outputs, (h_n, _) = self.lstm(packed, hidden)
        outputs, _ = rnn.pad_packed_sequence(outputs,batch_first=True)
        # h_n is (layers*2, batch, hidden_size); the last two slices belong to
        # the top layer (forward, then backward). Passing an explicit tuple to
        # torch.cat avoids relying on a bare tensor being accepted as a sequence.
        final_state = torch.cat((h_n[-2], h_n[-1]), 1)
        return outputs, final_state.unsqueeze(1)


In [None]:
#define decoder
class Decoder(nn.Module):
    """LSTM decoder with attention over the encoder outputs.

    Args:
        input_size: number of rows in the embedding table (target vocabulary
            size); also the width of the output scores.
        embedding_size: width of each embedding vector.
        hidden_size: decoder hidden width. The context vectors and encoder
            outputs passed in must have this same width.
        layers: number of LSTM layers. NOTE(review): each step concatenates
            the LSTM hidden state with a (1, batch, hidden_size) context along
            the last axis, which only lines up when ``layers == 1``.
        dropout: dropout probability applied to embeddings while training.

    The embedding table is filled from the module-level
    ``output_embedding_matrix``, which must therefore have shape
    ``(input_size, embedding_size)``.
    """

    def __init__(self, input_size, embedding_size, hidden_size, layers, dropout):
        super(Decoder, self).__init__()

        self.hidden_size = hidden_size
        self.layers = layers
        # Load the pretrained vectors into the embedding table.
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.embedding.weight.data.copy_(torch.from_numpy(output_embedding_matrix))
        self.dropout = nn.Dropout(dropout)

        # Each step feeds the previous token embedding joined with a context vector.
        self.lstm = nn.LSTM(embedding_size+hidden_size, hidden_size, layers,batch_first=True)
        # Scores are computed from [hidden state ; context].
        self.linear = nn.Linear(hidden_size*2, input_size)
        self.attn = nn.Linear(self.hidden_size,self.hidden_size)

    def init_hidden(self,inputs):
        """Return zero-filled (h0, c0) sized for the batch in ``inputs``."""
        shape = (self.layers, inputs.size(0), self.hidden_size)
        hidden = (Variable(torch.zeros(*shape)), Variable(torch.zeros(*shape)))
        if use_gpu:
            return (hidden[0].cuda(), hidden[1].cuda())
        return hidden

    def init_weight(self):
        """Xavier-initialise the LSTM (layer 0), output and attention weights.

        The embedding table is deliberately left alone. Re-initialising it
        here would overwrite the pretrained vectors copied in ``__init__``.
        """
        # Use the in-place-named initialiser where this torch version has it,
        # and fall back to the older name otherwise.
        xavier = getattr(nn.init, 'xavier_uniform_', None) or nn.init.xavier_uniform
        xavier(self.lstm.weight_hh_l0)
        xavier(self.lstm.weight_ih_l0)
        xavier(self.linear.weight)
        xavier(self.attn.weight)

    def Attention(self, hidden, encoder_outputs, encoder_maskings):
        """Compute an attention-weighted context over the encoder outputs.

        Args:
            hidden: LSTM hidden state h of shape (layers, batch, hidden_size);
                only the first layer is used.
            encoder_outputs: shape (batch, length, hidden_size).
            encoder_maskings: optional (batch, length) mask that is non-zero
                at padded positions, or ``None`` for no masking.

        Returns:
            ``(context, alpha)`` with shapes (batch, 1, hidden_size) and
            (batch, 1, length).
        """
        hidden = hidden[0].unsqueeze(2)  # (batch, hidden_size, 1)
        batch_size = encoder_outputs.size(0)
        length = encoder_outputs.size(1)
        energies = self.attn(encoder_outputs.contiguous().view(batch_size*length,-1))
        energies = energies.view(batch_size,length,-1)
        attn_energies = energies.bmm(hidden).squeeze(2)  # (batch, length)
        if encoder_maskings is not None:
            # Exclude padded positions so no attention weight is spent on them.
            # Comparing with 0 yields a mask type masked_fill accepts, whatever
            # integer/byte type the caller built.
            attn_energies = attn_energies.masked_fill(encoder_maskings != 0, float('-inf'))
        alpha = F.softmax(attn_energies, dim=1)
        alpha = alpha.unsqueeze(1)
        context = alpha.bmm(encoder_outputs)
        return context, alpha

    def forward(self,inputs,encode_hidden,max_length,encoder_outputs,encoder_maskings,is_training=False):
        """Greedily decode ``max_length`` steps for a batch.

        Args:
            inputs: (batch, 1) LongTensor holding the start token per row.
            encode_hidden: (batch, 1, hidden_size) context for the first step.
            max_length: number of decoding steps to run.
            encoder_outputs: (batch, length, hidden_size) encoder outputs.
            encoder_maskings: padding mask forwarded to ``Attention``.
            is_training: when true, dropout is applied to the embeddings.

        Returns:
            Log-probabilities of shape (batch*max_length, input_size), with
            the steps of one row stored consecutively.

        Each step feeds back its own arg-max prediction; the target sequence
        is not used as input.
        """
        embedded = self.embedding(inputs)
        hidden = self.init_hidden(inputs)
        if is_training:
            embedded = self.dropout(embedded)
        # The first step uses the encoder summary as context. Every later step
        # uses the attention context computed at the end of the previous step,
        # which matches what decode() does at inference time.
        context = encode_hidden
        decode=[]
        for _ in range(max_length):
            _, hidden = self.lstm(torch.cat((embedded,context),2), hidden)
            concated = torch.cat((hidden[0],context.transpose(0,1)),2)
            score = self.linear(concated.squeeze(0))
            softmaxed = F.log_softmax(score, dim=1)
            decode.append(softmaxed)
            decoded = softmaxed.max(1)[1]
            embedded = self.embedding(decoded).unsqueeze(1)
            if is_training:
                embedded = self.dropout(embedded)
            context, alpha = self.Attention(hidden[0], encoder_outputs,encoder_maskings)

        scores = torch.cat(decode,1)
        return scores.view(inputs.size(0)*max_length,-1)

    #this is for evaluate
    def decode(self,context,encoder_outputs,max_length=100):
        """Greedily decode a single sequence until ``<EOS>``.

        Args:
            context: (1, 1, hidden_size) context for the first step.
            encoder_outputs: (1, length, hidden_size) encoder outputs.
            max_length: upper bound on the number of generated tokens, so that
                a model that never emits ``<EOS>`` cannot loop forever. At
                least one step is always run.

        Returns:
            ``(tokens, attentions)``: the predicted index per step and the
            attention weights per step, of shape (steps, length).
        """
        if use_gpu:
            start_decode = Variable(torch.cuda.LongTensor([[target2index['<SOS>']]*1])).transpose(0,1)
        else: start_decode = Variable(torch.LongTensor([[target2index['<SOS>']]*1])).transpose(0,1)
        embedded = self.embedding(start_decode)
        hidden = self.init_hidden(start_decode)
        eos_index = target2index['<EOS>']
        decodes=[]
        attentions=[]
        while True:
            _, hidden = self.lstm(torch.cat((embedded,context),2), hidden)
            concated = torch.cat((hidden[0],context.transpose(0,1)),2)
            score = self.linear(concated.squeeze(0))
            softmaxed = F.log_softmax(score, dim=1)
            decodes.append(softmaxed)
            decoded = softmaxed.max(1)[1]
            embedded = self.embedding(decoded).unsqueeze(1)
            context, alpha = self.Attention(hidden[0], encoder_outputs,None)
            attentions.append(alpha.squeeze(1))
            if decoded.data.tolist()[0]==eos_index or len(decodes)>=max_length:
                break

        return torch.cat(decodes).max(1)[1], torch.cat(attentions)


In [None]:
#define hyper parameters
epochs=1000            # number of full passes over train_data in the training loop
batch_size = 64        # number of examples requested per batch from getBatch
embedding_size = 300   # embedding width passed to Encoder and Decoder
hidden_size = 512      # encoder hidden width; the decoder is built with hidden_size*2
learning_rate = 0.001  # Adam learning rate for the encoder (the decoder uses 5x this)

In [None]:
#initialize model
# Encoder over the input vocabulary, built with 3 LSTM layers.
encoder = Encoder(len(input2index),embedding_size,hidden_size,layers=3)
# Decoder over the target vocabulary. Its hidden width is hidden_size*2 and it
# uses a single LSTM layer with dropout probability 0.5.
decoder = Decoder(len(target2index),embedding_size,hidden_size*2,layers=1,dropout=0.5)
# Apply each module's own init_weight() initialisation.
encoder.init_weight()
decoder.init_weight()

# Move both models to the GPU when one is available.
if use_gpu:
    encoder = encoder.cuda()
    decoder = decoder.cuda()

# ignore_index=0: targets equal to 0 (the '<PAD>' index in target2index) are
# excluded from the loss.
loss_function = nn.CrossEntropyLoss(ignore_index=0)
# Separate Adam optimizers; the decoder's learning rate is 5x the encoder's.
enc_optimizer = optim.Adam(encoder.parameters(),lr=learning_rate)
dec_optimizer = optim.Adam(decoder.parameters(),lr=learning_rate*5)

In [None]:
# Checkpoint writer. The author notes the saved file is about 1.5GB, so mind
# the disk space.
def save_checkpoint(state,filename='lstmnewscheckpoint.tar'):
    """Serialise ``state`` to the path ``filename`` using ``torch.save``."""
    torch.save(obj=state, f=filename)

In [None]:
# Break the data into batches; this is especially useful when running on HPC.
def getBatch(batch_size,traindata):
    """Shuffle ``traindata`` in place and yield it in consecutive batches.

    Args:
        batch_size: maximum number of items per batch (must be positive).
        traindata: list of examples; it is shuffled in place on every call.

    Yields:
        Slices of ``traindata`` of length ``batch_size``. The last slice is
        shorter when the data does not divide evenly. Nothing is yielded for
        empty data, so callers never receive an empty batch.
    """
    random.shuffle(traindata)
    for start in range(0, len(traindata), batch_size):
        yield traindata[start:start + batch_size]


In [None]:
# Pad every sequence in a batch so all rows of the batch share one length.
def pad_to_batch(batch,input2index,target2index):
    """Sort a batch by input length and right-pad inputs and targets.

    Args:
        batch: non-empty list of ``(input, target)`` pairs, each a
            ``(1, length)`` LongTensor Variable.
        input2index: input vocabulary; ``'<PAD>'`` gives the input pad index.
        target2index: target vocabulary; ``'<PAD>'`` gives the target pad index.

    Returns:
        ``(input_variable, target_variable, input_length, target_length)``.
        The two tensors have shape ``(batch, max_len)``. Rows are ordered by
        decreasing input length. The two lists hold each row's unpadded
        length in that same order.

    Raises:
        ValueError: if ``batch`` is empty.

    The lengths are taken from the row sizes before padding. This equals the
    number of non-pad entries as long as the unpadded rows contain no pad
    index themselves (assumed for rows built by ``to_variable``; confirm for
    other callers).
    """
    if not batch:
        raise ValueError("pad_to_batch() requires a non-empty batch")

    def pad_rows(rows, to_index):
        """Right-pad each (1, len) row to the longest length and stack them."""
        lengths = [row.size(1) for row in rows]
        width = max(lengths)
        padded = []
        for row, length in zip(rows, lengths):
            if length < width:
                # .new() allocates a tensor of the row's own type on the row's
                # own device, so no explicit CPU/GPU branch is needed.
                filler = row.data.new(1, width - length).fill_(to_index['<PAD>'])
                row = torch.cat([row, Variable(filler)], 1)
            padded.append(row)
        return torch.cat(padded), lengths

    # Longest input first.
    sorted_batch = sorted(batch, key=lambda pair: pair[0].size(1), reverse=True)
    x, y = zip(*sorted_batch)
    input_variable, input_length = pad_rows(x, input2index)
    target_variable, target_length = pad_rows(y, target2index)
    return input_variable, target_variable, input_length, target_length


We trained this model on a GPU; training will be very slow on a CPU alone.

In [None]:
# Training loop: one optimisation step per batch, one checkpoint and one set
# of loss logs per epoch.
for epoch in range(epochs):
    losses=[]
    for i,batch in enumerate(getBatch(batch_size,train_data)):
        inputs,targets,input_lengths,target_lengths = pad_to_batch(batch,input2index,target2index)

        # Padding mask, non-zero where the input is '<PAD>'. A single tensor
        # comparison replaces building the mask element by element in Python.
        masks = inputs == input2index['<PAD>']

        # One '<SOS>' token per row, shape (batch, 1), to start decoding.
        long_tensor = torch.cuda.LongTensor if use_gpu else torch.LongTensor
        start_decode = Variable(long_tensor([[target2index['<SOS>']]] * targets.size(0)))

        encoder.zero_grad()
        decoder.zero_grad()
        output, encode_hidden = encoder(inputs,input_lengths)
        predict_prob = decoder(start_decode,encode_hidden, targets.size(1), output, masks, True)

        loss = loss_function(predict_prob,targets.view(-1))
        # Flatten before indexing so the scalar is extracted whether the loss
        # is stored as a one-element tensor or as a zero-dimensional one.
        losses.append(float(loss.data.view(-1)[0]))
        loss.backward()
        # Clip gradient norms at 50.0 before stepping.
        torch.nn.utils.clip_grad_norm(encoder.parameters(), 50.0)
        torch.nn.utils.clip_grad_norm(decoder.parameters(), 50.0)
        enc_optimizer.step()
        dec_optimizer.step()
        print(loss)
###comment out this part if don't want to save output
    save_checkpoint({'encoder_dict': encoder.state_dict(),'decoder_dict': decoder.state_dict(),'enc_optimizer' : enc_optimizer.state_dict(),'dec_optimizer' : dec_optimizer.state_dict()})
    with open('lstmnewsmeanloss.txt', 'a') as f:
        print(np.mean(losses), file=f)
    with open('lstmnewsloss_v2.txt', 'a') as f:
        print('losses', file=f)
        print(losses, file=f)
    with open('finalloss.txt', 'a') as f:
        print("mean_loss : %0.2f" %np.mean(losses),file=f)
####
    losses=[]


In [None]:
##This is for loading the saved model
# checkpoint = torch.load('finalcheckpoint.tar', map_location={'cuda:0': 'cpu'})
# encoder.load_state_dict(checkpoint['encoder_dict'])
# decoder.load_state_dict(checkpoint['decoder_dict'])
# enc_optimizer.load_state_dict(checkpoint['enc_optimizer'])
# dec_optimizer.load_state_dict(checkpoint['dec_optimizer'])


In [None]:
# Evaluate on one randomly chosen example.

def evaluate(traindata):
    """Pick a random (input, target) pair, decode it and print all three texts.

    Uses the module-level ``encoder``, ``decoder``, ``index2input`` and
    ``index2target``. ``'<EOS>'`` is dropped from the printed input and
    prediction; target indices 2 and 3 are dropped from the printed target.
    """
    sample = random.choice(traindata)
    source, reference = sample[0], sample[1]

    # Encode the single example, then decode greedily from the encoder summary.
    encoder_outputs, context = encoder(source, [source.size(1)])
    predicted_ids, _ = decoder.decode(context, encoder_outputs)

    source_words = [index2input[idx] for idx in source.data.tolist()[0]]
    predicted_words = [index2target[idx] for idx in predicted_ids.data.tolist()]

    print('Input : ', ' '.join(word for word in source_words if word != '<EOS>'))
    reference_words = [index2target[idx] for idx in reference.data.tolist()[0] if idx not in (2, 3)]
    print('Target : ', ' '.join(reference_words))
    print('Prediction : ', ' '.join(word for word in predicted_words if word != '<EOS>'))

# Print five randomly chosen examples.
for _ in range(5):
    evaluate(train_data)