In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch import optim
from torch.autograd import Variable
from torch.nn.utils import clip_grad_norm_

In [3]:
from __future__ import unicode_literals, print_function, division

# import basic lib
import re
import math
import random
import string
import unicodedata
import numpy as np
from io import open

# import pytorch
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch import optim
from torch.autograd import Variable
from torch.nn.utils import clip_grad_norm_

# import loss func
import masked_cross_entropy

# check device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
SOS_idx = 0
EOS_idx = 1
UNK_idx = 2
PAD_idx = 3

USE_CUDA = True

class Preprocessor:
    '''
    class for preprocessing
    '''
    def __init__(self, name):
        '''
        initialize vocab and counter
        '''
        self.name = name
        self.w2idx = {"<sos>" : 0, "<eos>" : 1, "<unk>" : 2, "<pad>" : 3}
        self.counter = {}
        self.idx2w = {0: "<sos>", 1: "<eos>", 2:"<unk>", 3:"<pad>"}
        self.num = 4

    def SentenceAdder(self, sentence):
        '''
        Add a sentence to dataset
        '''
        for word in sentence.split(' '):
            self.WordAdder(word)

    def WordAdder(self, word):
        '''
        Add single word to dataset and update vocab and counter
        '''
        if word in self.w2idx:
            self.counter[word] += 1
        else:
            self.w2idx[word] = self.num
            self.counter[word] = 1
            self.idx2w[self.num] = word
            self.num += 1
            
    def trim(self, min_count=5):
        '''
        Trim to remove non-frequent word
        '''
        keep = []
        for k, v in self.counter.items():
            if v >= min_count: keep.append(k)
        print(self.name+':')
        print('Total words', len(self.w2idx))
        print('After Trimming', len(keep))
        print('Keep Ratio %', 100 * len(keep) / len(self.w2idx))
        self.w2idx = {"<sos>" : 0, "<eos>" : 1, "<unk>" : 2, "<pad>" : 3}
        self.counter = {}
        self.idx2w = {0: "<sos>", 1: "<eos>", 2:"<unk>", 3:"<pad>"}
        self.num = 4
        for w in keep:
            self.WordAdder(w)

In [5]:
def Uni2Ascii(s):
    '''
    transfer from unicode to ascii
    '''
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                    if unicodedata.category(c) != 'Mn')

def StrCleaner(s):
    '''
    trim, delete non-letter and lowercase string
    '''
    s = Uni2Ascii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

def DataReader(path, lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
    lines = open(path, encoding='utf-8').\
        read().strip().split('\n')

    # Split every line into pairs and normalize
    #pairs = [[StrCleaner(s) for s in l.split('<------>')] for l in lines]
    pairs = [[s.lower() for s in l.split('<------>')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Preprocessor(lang2)
        output_lang = Preprocessor(lang1)
    else:
        input_lang = Preprocessor(lang1)
        output_lang = Preprocessor(lang2)

    return input_lang, output_lang, pairs

In [6]:
MIN_LENGTH = 10
MAX_LENGTH = 50

def filterPair(p):
    '''
    Filter to get expected pairs with specific length
    '''
    return MIN_LENGTH <= len(p[0].split(' ')) <= MAX_LENGTH and \
        MIN_LENGTH <= len(p[1].split(' ')) < MAX_LENGTH

def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [8]:
def prepareData(path, lang1, lang2, reverse=True):
    input_lang, output_lang, pairs = DataReader(path, lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.SentenceAdder(pair[0])
        output_lang.SentenceAdder(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.num)
    print(output_lang.name, output_lang.num)
    return input_lang, output_lang, pairs


src, tgt, pairs = prepareData('data/train.txt', 'english', 'chinese')
src.trim()
tgt.trim()
print(random.choice(pairs))

Reading lines...
Read 1500000 sentence pairs
Trimmed to 1188373 sentence pairs
Counting words...
Counted words:
chinese 303831
english 262492
chinese:
Total words 303831
After Trimming 95724
Keep Ratio % 31.505672561391037
english:
Total words 262492
After Trimming 63444
Keep Ratio % 24.16987946299316
['那样 的 爱好 就 要求 人们 静下来 呀 , 电脑游戏 也 属于 这种 。 ', 'they are not active hobbies . they are passive . but playing computer game is also sedentary .']


In [7]:
def sentence2idx(preprocessor, sentence):
    '''
    Read sentence and translate into word index plus eos
    '''
    return [SOS_idx] + [preprocessor.w2idx[w] if w in preprocessor.w2idx \
            else UNK_idx for w in sentence.split(' ')] + [EOS_idx]

def pad(seq, max_len):
    '''
    Add padding to sentence with different length
    '''
    seq += [PAD_idx for i in range(max_len - len(seq))]
    return seq

def random_batch(src, tgt, batch_size=5):
    '''
    Randomly generate batch data
    '''
    inputs, target = [], []
    
    # Choose batch randomly
    for _ in range(batch_size):
        pair = random.choice(pairs)
        inputs.append(sentence2idx(src, pair[0]))
        target.append(sentence2idx(tgt, pair[1]))
        
    # Sort by length
    seq_pairs = sorted(zip(inputs, target), key=lambda p: len(p[0]), reverse=True)
    inputs, target = zip(*seq_pairs)
    
    # Obtain length of each sentence and pad
    input_lens = [len(s) for s in inputs]
    input_max = max(input_lens)
    input_padded = [pad(s, input_max) for s in inputs]
    target_lens = [len(s) for s in target]
    target_max = max(target_lens)
    target_padded = [pad(s, target_max) for s in target]

    # Create Variable
    if USE_CUDA:
        input_vars = Variable(torch.LongTensor(input_padded).cuda()).transpose(0, 1)
        input_lens = Variable(torch.LongTensor(input_lens).cuda())
        target_vars = Variable(torch.LongTensor(target_padded).cuda()).transpose(0, 1)
        target_lens = Variable(torch.LongTensor(target_lens).cuda())
    else:
        input_vars = Variable(torch.LongTensor(input_padded)).transpose(0, 1)
        input_lens = Variable(torch.LongTensor(input_lens))
        target_vars = Variable(torch.LongTensor(target_padded)).transpose(0, 1)
        target_lens = Variable(torch.LongTensor(target_lens))

    return input_vars, input_lens, target_vars, target_lens

In [9]:
class Encoder(nn.Module):
    '''
    Define encoder and forward process
    '''
    def __init__(self, dim_input, dim_embed, dim_hidden, num_layers, dropout):
        super(Encoder, self).__init__()
        self.dim_input = dim_input
        self.dim_hidden = dim_hidden
        self.dim_embed = dim_embed
        self.embed = nn.Embedding(dim_input, dim_embed)
        self.cell = nn.GRU(dim_embed, dim_hidden, 
                          num_layers, dropout=dropout, 
                          bidirectional=True)
        
    def forward(self, inputs, inputs_lens, hidden=None):
        '''
        We need to sum the outputs since bi-diretional is used
        '''
        #print('e')
        embedded = self.embed(inputs)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, inputs_lens)
        outputs, hidden = self.cell(packed, hidden)
        outputs, output_lengths = nn.utils.rnn.pad_packed_sequence(outputs)
        outputs = outputs[:, :, :self.dim_hidden] + \
                    outputs[:, :, self.dim_hidden:]
        return outputs, hidden


class Attention(nn.Module):
    '''
    Define attention mechanism
    '''
    def __init__(self, dim_hidden):
        super(Attention, self).__init__()
        self.dim_hidden = dim_hidden
        # 2*dim_hidden is needed since bi-direction is used
        self.attn = nn.Linear(2*self.dim_hidden, dim_hidden)
        self.v = nn.Parameter(torch.rand(dim_hidden))
        stdv = 1. / math.sqrt(self.v.size(0))
        self.v.data.uniform_(-stdv, stdv)

    def forward(self, hidden, encoder_outputs):
        #print('a')
        timestep = encoder_outputs.size(0)
        h = hidden.repeat(timestep, 1, 1).transpose(0, 1)
        encoder_outputs = encoder_outputs.transpose(0, 1)
        scores = self.score(h, encoder_outputs)
        return F.relu(scores).unsqueeze(1)

    def score(self, hidden, encoder_outputs):
        e = F.softmax(self.attn(torch.cat([hidden, encoder_outputs], 2)),dim=1)
        e = e.transpose(1, 2)
        v = self.v.repeat(encoder_outputs.size(0), 1).unsqueeze(1)
        e = torch.bmm(v, e)
        return e.squeeze(1)


class Decoder(nn.Module):
    '''
    Define decoder with attention
    '''
    def __init__(self, dim_embed, dim_hidden, dim_output, num_layers, dropout):
        super(Decoder, self).__init__()
        self.dim_embed = dim_embed
        self.dim_hidden = dim_hidden
        self.dim_output = dim_output
        self.num_layers = num_layers

        self.embed = nn.Embedding(dim_output, dim_embed)
        self.dropout = nn.Dropout(dropout, inplace=True)
        self.attention = Attention(dim_hidden)
        self.cell = nn.GRU(dim_hidden + dim_embed, dim_hidden,
                          num_layers, dropout=dropout)
        self.out = nn.Linear(2*dim_hidden, dim_output)

    def forward(self, inputs, last_hidden, encoder_outputs):
        #print('d')
        # Get the embedding of the current input word (last output word)
#         print(inputs)
        embedded = self.embed(inputs).unsqueeze(0)  # (1,B,N)
        embedded = self.dropout(embedded)
        # Calculate attention weights and apply to encoder outputs
        attn_weights = self.attention(last_hidden[-1], encoder_outputs)
        context = attn_weights.bmm(encoder_outputs.transpose(0, 1))  # (B,1,N)
        context = context.transpose(0, 1)  # (1,B,N)
        # Combine embedded input word and attended context, run through RNN
        
#         print(embedded.size())
#         print(context.size())
        rnn_input = torch.cat([embedded, context], 2)
#         print(rnn_input.size())
#         print(last_hidden.size())
        output, hidden = self.cell(rnn_input, last_hidden)
        output = output.squeeze(0)  # (1,B,N) -> (B,N)
        context = context.squeeze(0)
        # For Debug
#         print("flag3")
#         print(output.size())
#         print(context.size())
        torch.cat([output, context], 1)
        output = self.out(torch.cat([output, context], 1))
        output = F.log_softmax(output, dim=1)
        return output, hidden, attn_weights

In [195]:
class BeamSearch(nn.Module):
    '''
    Implement BeamSearch for testing
    '''
    
    def __init__(self, encoder, decoder, width):
        super(BeamSearch, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.width = width
        
    def forward(self, src, src_len, tgt, tgt_len, width):
        
        pred_result = []
        batch_size = src.size(1)
        max_len = tgt.size(0)
        vocab_size = self.decoder.dim_output
        encoder_output, hidden = self.encoder(src, src_len)
#         print(encoder_output.size())
        hidden = hidden[:self.decoder.num_layers]
        #record most possible beams from index 1-t 
#         outputs = Variable(torch.zeros(width, batch_size, max_len)).cuda().long()
        #record most possible word at index t
#         output = Variable(torch.zeros(width)).cuda().long()
        
        #beam options, all possible width * width options
        beam_options = Variable(torch.zeros(width * width, max_len)).cuda()
        prob_options = Variable(torch.zeros(width * width, 1)).cuda()
#         output[0] = Variable(tgt.data[0, :]).cuda()
        output = Variable(tgt.data[0, :])
        output, hidden, attn_weights = self.decoder(
            output, hidden, encoder_output)
        
        #val: prob, idx: index
        val, idx = output.data.topk(k = width, dim = 1)
        
#         print(idx)
#         print('size1' + str(val.size()))
        #print(idx.size(1))
        #print(idx[0][0])
        #print(val[0][0])
        for i in range(idx.size(1)):
            for j in range(idx.size(1)):
                if j == 0:
                    beam_options[i*10 + j][1] = idx[0][i]
                    prob_options[i*10 + j] = val[0][i].exp()
        
        #print('beam' + str(beam_options[0]))
        #print('beam' + str(prob_options[0]))
        #the probability is log softmax
        #the prediction score ranging from -10 to -12
        #print(output[0].data)
        for t in range(2, max_len):
            cnt = 0
            for i in range(idx.size(1)):
                #probability of previous vector
                curr_prob = val[0][i]
                
                pop_input = idx[0][i].reshape(1)
#                 print(pop_input)
                output, hidden, attn_weights = self.decoder(
                        pop_input, hidden, encoder_output)
                
                output.data = output.data.exp() * curr_prob #use add because this is log here
                val1, idx1 = output.data.topk(k = width, dim = 1)
                for j in range(idx1.size(1)):
                    beam_options[cnt][t] = idx1[0][j]
                    prob_options[cnt].data = val[0][j]
                    cnt += 1
            
            #update val, idx from beam_options
#             print(prob_options)
        
            topVal, topInx = prob_options.topk(k = width, dim = 0)
#             print(topVal.size())
#             print(idx.size())
#             print(topInx.size())
#             idx.transpose_(0, 1)
#             val.transpose_(0, 1)
            for j in range(topInx.size(0)):
                val[0][j] = prob_options[topInx[j][0]]  #val stores the probablity
                idx[0][j] = beam_options[topInx[j][0]][t]  #idx stores the new index

#             print('val' + str(val))
#             print('idx' + str(idx))
        
#         print('beam' + str(beam_options[0]))
#         print('beam' + str(beam_options[0].data))
        
#         print(beam_options.data.max(1)[1])
#         best_trans = beam_options[beam_options.data.max(1)[1]]
#         print(best_trans[0][2])
        
#         print(best_trans[0][10])
        maxIndex = prob_options.max(0)[1].item()
        for i in range(0, max_len):
            if beam_options[maxIndex][i] == 2:
                continue
            elif beam_options[maxIndex][i] == 1:
                break
            else:
                pred_result.append(beam_options[maxIndex][i].item())
            
#         return best_translate
#         print(prob_options.max(0)[1].item())
        return pred_result

In [196]:
batch_size = 1
hidden_size = 5
embed_size = 10
n_layers = 4
width = 10
encoder_test = Encoder(src.num, embed_size, hidden_size, n_layers, dropout=0.5)
decoder_test = Decoder(embed_size, hidden_size, tgt.num, n_layers, dropout=0.5)

In [197]:
net = BeamSearch(encoder_test,decoder_test, width).cuda()
opt = optim.Adam(net.parameters(),lr=0.01)
print(net)

BeamSearch(
  (encoder): Encoder(
    (embed): Embedding(95728, 10)
    (cell): GRU(10, 5, num_layers=4, dropout=0.5, bidirectional=True)
  )
  (decoder): Decoder(
    (embed): Embedding(63448, 10)
    (dropout): Dropout(p=0.5, inplace)
    (attention): Attention(
      (attn): Linear(in_features=10, out_features=5, bias=True)
    )
    (cell): GRU(15, 5, num_layers=4, dropout=0.5)
    (out): Linear(in_features=10, out_features=63448, bias=True)
  )
)


In [205]:
grad_clip = 10

input_batches, input_lengths,\
        target_batches, target_lengths = random_batch(src,tgt,batch_size)
output = net(input_batches, input_lengths, target_batches, target_lengths, width)

# print(output)
string = ""
for i in output:
    string += tgt.idx2w[int(i)]
    string += " "
print(output)
print(string)
# for step in range(1,500):
#     input_batches, input_lengths,\
#         target_batches, target_lengths = random_batch(src,tgt,batch_size)
#     output = net(input_batches, input_lengths, target_batches, target_lengths, width, upper_limit)
#     print(output)

[0.0, 5515.0, 25645.0, 5515.0, 5515.0, 1610.0, 5515.0, 5515.0, 30528.0, 30528.0, 5515.0, 5515.0, 5515.0]
<sos> inn categorically inn inn dedication inn inn mandating mandating inn inn inn 


In [203]:
tgt.idx2w[100]

'countermeasure'