In [0]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [0]:
SOS_token = 0
EOS_token = 1
class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [0]:
def unicodeToAscii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

In [0]:
def readLangs(lang1, lang2):
    print("Reading lines...")

    lines_data = open('title_train_data.txt', encoding='utf-8').\
        read().strip().split('\n')
    
    lines_label = open('title_train_label.txt', encoding='utf-8').\
        read().strip().split('\n')

    train_pairs = list()
    test_pairs = list()
    for i in range (len(lines_data)):
        temp = list()
        temp.append(lines_data[i])
        temp.append(lines_label[i])
        if i < (len(lines_data)/3):
            train_pairs.append(temp)
        else:
            test_pairs.append(temp)

    input_lang = Lang(lang1)
    output_lang = Lang(lang2)

    return input_lang, output_lang, train_pairs, test_pairs

In [0]:
MAX_LENGTH = 25

def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH

def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [6]:
def prepareData(lang1, lang2):
    input_lang, output_lang, pairs,test = readLangs(lang1, lang2)

    pairs = filterPairs(pairs)
    test = filterPairs(test)

    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    for temp in test:
        input_lang.addSentence(temp[0])
        output_lang.addSentence(temp[1])

    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs,test


input_lang, output_lang, pairs,test = prepareData('text', 'title')
print(random.choice(pairs))

Reading lines...
text 61109
title 21166
['A longer stroke would be nice so it would evacuate more quickly but it gets the job done and it does hold a seal. A very handy tool!', 'Yes it works']


In [0]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size) 
        self.gru = nn.GRU(hidden_size, hidden_size) 

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1) 
        output = embedded
        output, hidden = self.gru(output, hidden)
        return output, hidden
    
    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [0]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights
    
    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [0]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]


def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

In [0]:
teacher_forcing_ratio = 0.5

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]

    else:
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [0]:
def trainIters(encoder, decoder, n_iters, print_every=1000, learning_rate=0.001):
    plot_losses = []
    print_loss_total = 0
    plot_loss_total = 0

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)] 
    criterion = nn.NLLLoss()
    # print("done.")
    for iter in range(1, n_iters + 1):
        # print(iter)
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print(iter / n_iters * 100, print_loss_avg)
            

In [0]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(output_lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

In [0]:
def evaluateRandomly(encoder, decoder, n=1000):
    for i in range(n):
        pair = random.choice(test)
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        if output_sentence != "Great <EOS>":
            print('text: ', pair[0])
            print('titls: ', pair[1])
            print('predict: ', output_sentence)
            print('')
    return pair[1],output_sentence

In [0]:
from nltk.translate.bleu_score import sentence_bleu
total_score = 0
def compute_BLEU(encoder, decoder):
    global total_score 
    
    fileObject = open('predict.txt', 'w')
    for i in range(len(test)):
        result = list()
        pair = test[i]
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        result.append(pair[0])
        result.append(output_sentence)
        fileObject.write(str(result))
        fileObject.write('\n')
        total_score = total_score + sentence_bleu(pair[1], output_sentence)
        if i % 500 == 0:
            print(total_score, i/len(test))
    print(total_score/len(test))
    fileObject.close()
    return 0

In [27]:
hidden_size = 256
encoder1 = EncoderRNN(input_lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1).to(device)

trainIters(encoder1, attn_decoder1,50000, print_every=200)

2.0 7.61419933500744
4.0 4.443823688824972
6.0 5.350110034942627
8.0 5.370601828893025
10.0 5.506299050649007
12.0 6.090205070858909
14.000000000000002 6.947001113891602
16.0 7.421654551823934
18.0 7.115130188351586
20.0 6.126129935582479
22.0 7.631723483197101
24.0 4.8056346257527665
26.0 5.554920352300008
28.000000000000004 5.68193473815918
30.0 6.725165571712313
32.0 6.775475212732951
34.0 5.755916161589569
36.0 5.827496519088745
38.0 6.063457586651757
40.0 5.671056847572326
42.0 6.039765370005654
44.0 5.994796178908575
46.0 6.124028835296632
48.0 5.482612615766979
50.0 5.073507807519702
52.0 5.168522497812907
54.0 5.124506147702535
56.00000000000001 7.02450527826945
57.99999999999999 4.964296480019888
60.0 6.998600899378459
62.0 6.60289470695314
64.0 5.402724662054153
66.0 5.033140703837077
68.0 5.5760384405226935
70.0 5.800476741790772
72.0 6.442473587459989
74.0 5.925675959814162
76.0 4.528161048964849
78.0 6.162072556359427
80.0 7.071530186108181
82.0 5.316745942433675
84.0 5.27

In [67]:
evaluateRandomly(encoder1, attn_decoder1)

text:  This is my favorite of all the coffees I've tried. Smooth and flavorful without being a flavored coffee, its a definite treat anytime of day.
titls:  Great brew
predict:  Love this <EOS>

text:  This is my favorite of all the coffees I've tried. Smooth and flavorful without being a flavored coffee, its a definite treat anytime of day.
titls:  Great brew
predict:  Love this <EOS>

text:  Better than many I have tried.  With a little honey this is a terrific cup of tea.  Going to order more.  Nice and smooth.
titls:  Love this tea
predict:  Love this <EOS>

text:  K-cup brewing system makes this wonderful tea become superb. Twinings got it just right.<br />It comes out sweet and flavorful. A nice end to a long day.
titls:  great herbal tea
predict:  Love this <EOS>

text:  this product is very fresh has a great taste<br />come on time<br />it is a great deal for what you get<br />love the way you can store it
titls:  Great Fresh
predict:  Great this <EOS>

text:  Better than many 

('quick delivery', 'Great <EOS>')

In [71]:
compute_BLEU(encoder1, attn_decoder1)

Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


0.8593887047640296 0.0
370.1116673427414 0.012075252976549858
738.3935338964305 0.024150505953099716
1110.3250034642813 0.036225758929649576


KeyboardInterrupt: ignored