### 数据读取

数据格式

`raw_addr_words_train` and `raw_addr_words_test`: [['s.', 'par', '53', 'sidanegara', '4', 'cilacap', 'tengah', 'BOS'], ['angg', 'per,', 'baloi', 'indah', 'kel.', 'lubuk', 'baja', 'BOS'], ['asma', 'laun,', 'mand', 'imog,', 'BOS']]

`POIs_words` and `streets_words`: [['kakap', 'raya', 'EOS'], ['jend', 'ahmad', 'yani', 'EOS'], ['raya', 'cila', 'kko', 'EOS'], ['EOS']]

`sents` concatenates `raw_addr_words_train`, `raw_addr_words_test`, `POIs_words` and `streets_words`.

In [None]:
import pandas as pd

df_train = pd.read_csv("train.csv")
df_train.head()
df_test = pd.read_csv("test.csv")
df_test.head()

# 处理raw_addr_words_train和raw_addr_words_test：split和append('BOS')
raw_addr_train = df_train['raw_address'].tolist()
raw_addr_test = df_test['raw_address'].tolist()
raw_addr_words_train = []
raw_addr_words_test = []
for sentence in raw_addr_train:
    sentence = sentence.replace(",", "")
    sentence = sentence.split()
    sentence.append('BOS')
    raw_addr_words_train.append(sentence)
for sentence in raw_addr_test:
    sentence = sentence.replace(",", "")
    sentence = sentence.split()
    sentence.append('BOS')
    raw_addr_words_test.append(sentence)

# 把raw_addr_words_train和raw_addr_words_test都添加到sents
sents = raw_addr_words_train
sents.extend(raw_addr_words_test)

# 求出input中sentence最长的长度，作为LSTM的time_step的参考
max_sentence_length = 0
for sentence in sents:
    if len(sentence) > max_sentence_length:
        max_sentence_length = len(sentence)

# 处理POIs_words和streets_words：split和append('EOS')
POIs_words = []
streets_words = []
for gt in df_train['POI/street'].tolist():
    POI = gt.split('/', 1)[0]
    street = gt.split('/', 1)[1]
    POI = POI.split()
    POI.append('EOS')
    street = street.split()
    street.append('EOS')
    POIs_words.append(POI)
    streets_words.append(street)

# 求出ground_truth中sentence最长的长度，作为LSTM的time_step的参考
max_poi_length = 0
for sentence in POIs_words:
    if len(sentence) > max_poi_length:
        max_poi_length = len(sentence)
max_street_length = 0
for sentence in streets_words:
    if len(sentence) > max_street_length:
        max_street_length = len(sentence)

# 把POIs_words和streets_words都添加到sents
sents.extend(POIs_words)
sents.extend(streets_words)

print(sents[:10])
print(sents[-10:])
print(len(sents))
print("max_sentence_length: ", max_sentence_length)
print("max_poi_length: ", max_poi_length)
print("max_street_length: ", max_street_length)
# 综合考虑，time_step为64就够用

# df_words = pd.DataFrame()

[['jl', 'kapuk', 'timur', 'delta', 'sili', 'iii', 'lippo', 'cika', '11', 'a', 'cicau', 'cikarang', 'pusat', 'BOS'], ['aye', 'jati', 'sampurna', 'BOS'], ['setu', 'siung', '119', 'rt', '5', '1', '13880', 'cipayung', 'BOS'], ['toko', 'dita', 'kertosono', 'BOS'], ['jl.', 'orde', 'baru', 'BOS'], ['raya', 'samb', 'gede', '299', 'toko', 'bb', 'kids', 'BOS'], ['kem', 'mel', 'raya', 'no', '4', 'bojong', 'rawalumbu', 'rt', '1', '36', 'rawalumbu', 'BOS'], ['tela', 'keuramat', 'kuta', 'alam', 'BOS'], ['gg.', 'i', 'wates', 'magersari', 'BOS'], ['bunga', 'ncole', 'ix', '2', 'BOS']]
[['EOS'], ['prib', '3', 'EOS'], ['EOS'], ['perum', 'tata', 'resid', 'nirwana', 'EOS'], ['kakap', 'raya', 'EOS'], ['jend', 'ahmad', 'yani', 'EOS'], ['raya', 'cila', 'kko', 'EOS'], ['EOS'], ['EOS'], ['EOS']]
950000
max_sentence_length:  33
max_poi_length:  21
max_street_length:  16


### Word2Vec

In [None]:
import gensim

# https://radimrehurek.com/gensim/models/word2vec.html
# 一般小语料库的vector维度用200-300
# sentences (iterable of iterables, optional) – 供训练的句子，可以使用简单的列表，但是对于大语料库，建议直接从磁盘/网络流迭代传输句子。参阅word2vec模块中的BrownCorpus，Text8Corpus或LineSentence。
# corpus_file (str, optional) – LineSentence格式的语料库文件路径。
# size (int, optional) – word向量的维度。
# window (int, optional) – 一个句子中当前单词和被预测单词的最大距离。
# min_count (int, optional) – 忽略词频小于此值的单词。
# workers (int, optional) – 训练模型时使用的线程数。

model = gensim.models.Word2Vec(sentences=sents, size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")
# model.load("word2vec.model")

In [None]:
print(model.wv['<BOS>'])
print(model.wv['<EOS>'])
print(model.wv['kapuk'])
print(model.similarity('kapuk','timur'))
print(model.similarity('yaya','yayasan'))

[-4.7510145e-03 -4.7370796e-03  2.9411297e-03  4.8859799e-03
  2.8488715e-03 -2.6719654e-03  3.6631844e-03  2.3172596e-03
  2.5307646e-03 -2.2685218e-03  1.7151979e-04  3.7271092e-03
 -3.4571113e-03  1.6590450e-03 -1.7351321e-03  3.1850750e-03
 -4.2197909e-03  3.4302576e-03 -1.9578892e-03  3.5440323e-03
  3.2221025e-04  1.5887568e-03  3.7067465e-03  1.3883339e-03
 -2.5920195e-03 -9.9590898e-04 -3.0845886e-03  3.4212242e-03
 -3.2305140e-03  2.5452189e-03 -4.3210844e-04  4.2407182e-03
  4.6338956e-03 -8.9179759e-04 -4.9878997e-03 -4.3554399e-03
 -3.0786749e-03  2.9000796e-03 -2.4078618e-05  7.2029402e-04
 -1.3743703e-03 -1.7121357e-03  3.6064379e-03  4.6777684e-04
  3.5667121e-03 -2.5101295e-03 -3.6852087e-03  2.1823538e-03
 -3.2539712e-04 -4.3696836e-03 -1.6757927e-03 -1.2041670e-03
 -2.9920565e-03  1.9305609e-03 -1.3281346e-03  1.8970171e-03
 -1.9230577e-03  2.5611972e-03  4.3853882e-04  1.3362422e-03
 -3.8731010e-03  6.1253677e-06  1.9853523e-03  2.8401176e-03
  9.6844560e-05 -3.59047

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


### 预处理

In [None]:
import numpy as np

# raw_addr_words_train
# raw_addr_words_test
# POIs_words
# streets_words

def word2idx(word):
    return word_model.wv.vocab[word].index
def idx2word(idx):
    return word_model.wv.index2word[idx]

# em_weights = model.wv
# embedding = nn.Embedding.from_pretrained(em_weights)

print(POIs)

# max_sentence_length = 33
# max_poi_length = 21
# max_street_length = 16
train_input = np.zeros([len(raw_addr_words_train), 64], dtype=np.int32)
test_input = np.zeros([len(raw_addr_words_test), 64], dtype=np.int32)
for i, sentence in enumerate(raw_addr_words_train):
    for t, word in enumerate(sentence[:-1]):
        train_input[i, t] = word2idx(word)
for i, sentence in enumerate(raw_addr_words_test):
    for t, word in enumerate(sentence[:-1]):
        test_input[i, t] = word2idx(word)

AttributeError: 'Word2VecKeyedVectors' object has no attribute 'dim'

### 模型部分

torch.nn.LSTM(*args, **kwargs)
- input_size – The number of expected features in the input x
- hidden_size – The number of features in the hidden state h
- num_layers – Number of recurrent layers. E.g., setting num_layers=2 would mean stacking two LSTMs together to form a stacked LSTM, with the second LSTM taking in outputs of the first LSTM and computing the final results. Default: 1
- bias – If False, then the layer does not use bias weights b_ih and b_hh. Default: True
- batch_first – If True, then the input and output tensors are provided as (batch, seq, feature). Default: False
- dropout – If non-zero, introduces a Dropout layer on the outputs of each LSTM layer except the last layer, with dropout probability equal to dropout. Default: 0
- bidirectional – If True, becomes a bidirectional LSTM. Default: False
- proj_size – If > 0, will use LSTM with projections of corresponding size. Default: 0

关于batch_first
RNN的输入是(seq_len, batch_size, input_size)，batch_size位于第二维度
https://www.jianshu.com/p/41c15d301542
https://www.cnblogs.com/picassooo/p/13637140.html

In [None]:
# Hyper Parameters
EPOCH = 1               
BATCH_SIZE = 32
TIME_STEP = 66          # rnn time step / image height # 设置为max_sentence_length的两倍，应该够用
INPUT_SIZE = 100         # rnn input size / image width # 和前面的vector的长度保持一致
HIDDEN_SIZE = 100
LR = 0.01

In [None]:
import pytorch
from torch import nn
class RNN(nn.Module):
    def __init__(self):
        super(RNN, self).__init__()

        self.rnn = nn.LSTM(         
            input_size=INPUT_SIZE,
            hidden_size=HIDDEN_SIZE,         # rnn hidden unit
            num_layers=1,           # number of rnn layer
            batch_first=True,       # input & output will has batch size as 1s dimension. e.g. (batch, time_step, input_size)
        )

        self.out = nn.Linear(100, 100)

    def forward(self, x):
        # x shape (batch, time_step, input_size)
        # r_out shape (batch, time_step, output_size)
        # h_n shape (n_layers, batch, hidden_size)
        # h_c shape (n_layers, batch, hidden_size)
        r_out, (h_n, h_c) = self.rnn(x, None)   # None represents zero initial hidden state

        # choose r_out at the last time step
        out = self.out(r_out[:, -1, :])
        return out

In [None]:
import numpy as np
import matplotlib.pyplot as plt

test_x = test_data.test_data.type(torch.FloatTensor)[:2000]/255.   # shape (2000, 28, 28) value in range(0,1) 注意训练数据会自动规范化，但测试数据不会，所以这里要手动除以255，否则会导致训练不收敛
test_y = test_data.test_labels.numpy()[:2000]    # covert to numpy array

rnn = RNN()
print(rnn)

optimizer = torch.optim.Adam(rnn.parameters(), lr=LR) 
loss_func = nn.CrossEntropyLoss()

# training and testing
for epoch in range(EPOCH):
    for step, (b_x, b_y) in enumerate(train_loader):        # gives batch data
        b_x = b_x.view(-1, INPUT_SIZE)              # reshape x to (batch, time_step, input_size)

        output = rnn(b_x)                               # rnn output
        loss = loss_func(output, b_y)                   # cross entropy loss
        optimizer.zero_grad()                           # clear gradients for this training step
        loss.backward()                                 # backpropagation, compute gradients
        optimizer.step()                                # apply gradients

        if step % 50 == 0:
            test_output = rnn(test_x)                   # (samples, time_step, input_size)
            pred_y = torch.max(test_output, 1)[1].data.numpy()
            accuracy = float((pred_y == test_y).astype(int).sum()) / float(test_y.size)
            print('Epoch: ', epoch, '| train loss: %.4f' % loss.data.numpy(), '| test accuracy: %.2f' % accuracy)

# print 10 predictions from test data
test_output = rnn(test_x[:10].view(-1, 28, 28))
pred_y = torch.max(test_output, 1)[1].data.numpy()
print(pred_y, 'prediction number')
print(test_y[:10], 'real number')

### Encoder-Decoder translation模型
https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
os.chdir('/content/drive/MyDrive/Colab Notebooks/shopee contest2')

In [3]:
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#### 辅助代码

In [4]:
import time
import math


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [140]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np
%matplotlib inline

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

#### 数据预处理

In [141]:
SOS_token = 0
EOS_token = 1


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [142]:
import pandas as pd
def readLangs(lang_name):
    print("Reading lines...")

    # Read the file and split into lines
    df_train = pd.read_csv("train.csv")
    raw_addr_train = df_train['raw_address'].tolist()
    df_test = pd.read_csv("test.csv")
    raw_addr_test = df_test['raw_address'].tolist()
    POIs = []
    # streets = []
    for gt in df_train['POI/street'].tolist():
        POI = gt.split('/', 1)[0]
        POIs.append(POI)
        # street = gt.split('/', 1)[1]
        # streets.append(street)

    # Combine addresses and POIs into pairs
    pairs = []
    for i in range(len(raw_addr_train)):
        pairs.append([raw_addr_train[i].replace(",", ""), POIs[i]])
        # pairs.append([raw_addr_train[i].replace(",", ""), streets[i]])
    
    tests = []
    for sent in raw_addr_test:
        tests.append(sent.replace(",",""))
    
    # Reverse pairs, make Lang instances
    lang = Lang(lang_name)

    return lang, pairs, tests

In [153]:
def prepareData(lang_name):
    lang, pairs, tests = readLangs(lang_name)
    print("Read %s sentence pairs" % len(pairs))
    print("Read %s testing sentence" % len(tests))
    print("Counting words...")
    for pair in pairs:
        lang.addSentence(pair[0])
        lang.addSentence(pair[1])
    # 取前20000个作为验证集validation
    validates = pairs[:20000]
    pairs = pairs[20000:]
    for test in tests:
        lang.addSentence(test)
    print("Counted words:")
    print(lang.name, lang.n_words)
    return lang, pairs, validates, tests

lang, pairs, validates, tests = prepareData('All_words')
print(random.choice(pairs))

Reading lines...
Read 300000 sentence pairs
Read 50000 testing sentence
Counting words...
Counted words:
All_words 120699
['kuningan timur men anugrah 19th floor kan taman e 3.3 jl mega kuni rt 1 2 setia budi', 'menara anugrah 19th floor kantor taman e 3.3']


In [144]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]

def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)

def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(lang, pair[0])
    target_tensor = tensorFromSentence(lang, pair[1])
    return (input_tensor, target_tensor)

In [93]:
MAX_LENGTH = 33

#### 核心代码

In [145]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden) # 上一个unit的output和hidden到这一个unit来
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [146]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [147]:
teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

In [148]:
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [149]:
def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            else:
                decoded_words.append(lang.index2word[topi.item()])

            decoder_input = topi.squeeze().detach()

        return decoded_words, decoder_attentions[:di + 1]

In [152]:
def evaluateRandomly(encoder, decoder, n=20):
    for i in range(n):
        # 取validates验证集来查看
        pair = random.choice(validates)
        print('>', pair[0])
        print('=', pair[1])
        output_words, attentions = evaluate(encoder, decoder, pair[0])
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [154]:
import pandas as pd
def saveSentence(encoder, decoder, sentence, max_length=MAX_LENGTH):
    with torch.no_grad():
        input_tensor = tensorFromSentence(lang, sentence)
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = torch.tensor([[SOS_token]], device=device)  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.data.topk(1)
            if topi.item() == EOS_token:
                break
            else:
                decoded_words.append(lang.index2word[topi.item()])
            decoder_input = topi.squeeze().detach()

        output_sentence = ' '.join(decoded_words)
        return output_sentence

def saveTrainOutput(encoder, decoder, pairs, name):
    output_sentences = []
    for i, sent in enumerate(pairs):
        if i%10000 == 0:
            print("%d %d%%" % (i, i*100/len(pairs)))
        if name == 'POI':
            output_sentences.append({'POI': saveSentence(encoder, decoder, sent[0])})
        elif name == 'street':
            output_sentences.append({'street': saveSentence(encoder, decoder, sent[0])})
    df_output = pd.DataFrame(output_sentences)
    if name == 'POI':
        df_output.to_csv("saved_answers/train_POIs.csv")
    elif name == 'street':
        df_output.to_csv("saved_answers/train_streets.csv")

def saveValOutput(encoder, decoder, validates, name):
    output_sentences = []
    for i, sent in enumerate(validates):
        if i%5000 == 0:
            print("%d %d%%" % (i, i*100/len(validates)))
        if name == 'POI':
            output_sentences.append({'POI': saveSentence(encoder, decoder, sent[0])})
        elif name == 'street':
            output_sentences.append({'street': saveSentence(encoder, decoder, sent[0])})
    df_output = pd.DataFrame(output_sentences)
    if name == 'POI':
        df_output.to_csv("saved_answers/val_POIs.csv")
    elif name == 'street':
        df_output.to_csv("saved_answers/val_streets.csv")

def saveTestOutput(encoder, decoder, tests, name):
    output_sentences = []
    for i, sent in enumerate(tests):
        if i%5000 == 0:
            print("%d %d%%" % (i, i*100/len(tests)))
        if name == 'POI':
            output_sentences.append({'POI': saveSentence(encoder, decoder, sent)})
        elif name == 'street':
            output_sentences.append({'street': saveSentence(encoder, decoder, sent)})
    df_output = pd.DataFrame(output_sentences)
    if name == 'POI':
        df_output.to_csv("saved_answers/test_POIs.csv")
    elif name == 'street':
        df_output.to_csv("saved_answers/test_streets.csv")

In [134]:
# continue-train
n_iterations = 100000
encoder1 = torch.load('saved_models/POI-encoder1-%diters.pt'%(300000-n_iterations))
attn_decoder1 = torch.load('saved_models/POI-attn_decoder1-%diters.pt'%(300000-n_iterations))
trainIters(encoder1, attn_decoder1, n_iterations, print_every=5000)
torch.save(encoder1, 'saved_models/POI-encoder1-300000iters.pt')
torch.save(attn_decoder1, 'saved_models/POI-attn_decoder1-300000iters.pt')

1m 49s (- 34m 49s) (5000 5%) 1.9025
3m 35s (- 32m 16s) (10000 10%) 1.8816
5m 20s (- 30m 17s) (15000 15%) 1.8796
7m 5s (- 28m 20s) (20000 20%) 1.7657
8m 50s (- 26m 30s) (25000 25%) 1.8195
10m 35s (- 24m 42s) (30000 30%) 1.8460
12m 19s (- 22m 53s) (35000 35%) 1.8601
14m 4s (- 21m 6s) (40000 40%) 1.8202
15m 48s (- 19m 19s) (45000 45%) 1.7702
17m 34s (- 17m 34s) (50000 50%) 1.7759
19m 19s (- 15m 48s) (55000 55%) 1.7290
21m 5s (- 14m 3s) (60000 60%) 1.8111
22m 50s (- 12m 17s) (65000 65%) 1.7963
24m 35s (- 10m 32s) (70000 70%) 1.7362
26m 21s (- 8m 47s) (75000 75%) 1.7547
28m 6s (- 7m 1s) (80000 80%) 1.7239
29m 51s (- 5m 16s) (85000 85%) 1.7196
31m 37s (- 3m 30s) (90000 90%) 1.6645
33m 23s (- 1m 45s) (95000 95%) 1.7495
35m 8s (- 0m 0s) (100000 100%) 1.7176


In [None]:
hidden_size = 256
encoder1 = EncoderRNN(lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, lang.n_words, dropout_p=0.1).to(device)

n_iterations = 400000
trainIters(encoder1, attn_decoder1, n_iterations, print_every=5000)
torch.save(encoder1, 'saved_models/POI-encoder1-%diters.pt'%n_iterations)
torch.save(attn_decoder1, 'saved_models/POI-attn_decoder1-%diters.pt'%n_iterations)

2m 2s (- 161m 8s) (5000 1%) 3.2216
3m 44s (- 145m 54s) (10000 2%) 2.8536
5m 26s (- 139m 50s) (15000 3%) 2.8211
7m 9s (- 136m 5s) (20000 5%) 2.6845
8m 52s (- 133m 7s) (25000 6%) 2.6744
10m 35s (- 130m 38s) (30000 7%) 2.6410
12m 18s (- 128m 21s) (35000 8%) 2.5117
14m 1s (- 126m 16s) (40000 10%) 2.5091
15m 45s (- 124m 20s) (45000 11%) 2.5061
17m 28s (- 122m 21s) (50000 12%) 2.4185
19m 12s (- 120m 28s) (55000 13%) 2.3698
20m 55s (- 118m 36s) (60000 15%) 2.3372
22m 39s (- 116m 44s) (65000 16%) 2.3538
24m 22s (- 114m 54s) (70000 17%) 2.2624
26m 6s (- 113m 6s) (75000 18%) 2.2959
27m 50s (- 111m 20s) (80000 20%) 2.3513
29m 34s (- 109m 36s) (85000 21%) 2.2181
31m 18s (- 107m 50s) (90000 22%) 2.1396
33m 2s (- 106m 5s) (95000 23%) 2.1507
34m 46s (- 104m 20s) (100000 25%) 2.1505
36m 30s (- 102m 33s) (105000 26%) 2.1149
38m 14s (- 100m 49s) (110000 27%) 2.0905
39m 58s (- 99m 5s) (115000 28%) 2.1121
41m 43s (- 97m 21s) (120000 30%) 2.0849
43m 28s (- 95m 38s) (125000 31%) 2.1054
45m 13s (- 93m 55s) (

In [116]:
hidden_size = 256
encoder1 = EncoderRNN(lang.n_words, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, lang.n_words, dropout_p=0.1).to(device)

n_iterations = 400000
trainIters(encoder1, attn_decoder1, n_iterations, print_every=5000)
torch.save(encoder1, 'saved_models/street-encoder1-%diters.pt'%n_iterations)
torch.save(attn_decoder1, 'saved_models/street-attn_decoder1-%diters.pt'%n_iterations)

1m 53s (- 112m 3s) (5000 1%) 4.9881
3m 32s (- 102m 42s) (10000 3%) 4.4488
5m 11s (- 98m 43s) (15000 5%) 4.2210
6m 50s (- 95m 51s) (20000 6%) 4.0978
8m 30s (- 93m 37s) (25000 8%) 3.9365
10m 11s (- 91m 41s) (30000 10%) 3.7642
11m 51s (- 89m 50s) (35000 11%) 3.6750
13m 32s (- 88m 1s) (40000 13%) 3.4709
15m 14s (- 86m 23s) (45000 15%) 3.3909
16m 56s (- 84m 40s) (50000 16%) 3.2648
18m 38s (- 83m 0s) (55000 18%) 3.1567
20m 20s (- 81m 21s) (60000 20%) 3.1587
22m 2s (- 79m 40s) (65000 21%) 3.0064
23m 44s (- 78m 0s) (70000 23%) 2.9263
25m 27s (- 76m 21s) (75000 25%) 2.8871
27m 8s (- 74m 38s) (80000 26%) 2.7798
28m 50s (- 72m 56s) (85000 28%) 2.7262
30m 32s (- 71m 16s) (90000 30%) 2.6267
32m 15s (- 69m 35s) (95000 31%) 2.6481
33m 57s (- 67m 54s) (100000 33%) 2.5631
35m 39s (- 66m 13s) (105000 35%) 2.5133
37m 22s (- 64m 33s) (110000 36%) 2.4839
39m 4s (- 62m 52s) (115000 38%) 2.4444
40m 47s (- 61m 11s) (120000 40%) 2.3950
42m 30s (- 59m 30s) (125000 41%) 2.3498
44m 13s (- 57m 49s) (130000 43%) 2.

In [139]:
n_iterations = 200000
encoder1 = torch.load('saved_models/POI-encoder1-%diters.pt'%n_iterations)
attn_decoder1 = torch.load('saved_models/POI-attn_decoder1-%diters.pt'%n_iterations)
# saveTrainOutput(encoder1, attn_decoder1, pairs[:1000], 'POI')
saveTestOutput(encoder1, attn_decoder1, tests, 'POI')

0 0%
5000 10%
10000 20%
15000 30%
20000 40%
25000 50%
30000 60%
35000 70%
40000 80%
45000 90%


In [120]:
n_iterations = 300000
encoder1 = torch.load('saved_models/street-encoder1-%diters.pt'%n_iterations)
attn_decoder1 = torch.load('saved_models/street-attn_decoder1-%diters.pt'%n_iterations)
# saveTrainOutput(encoder1, attn_decoder1, pairs[:1000], 'street')
saveTestOutput(encoder1, attn_decoder1, tests, 'street')

0 0%
5000 10%
10000 20%
15000 30%
20000 40%
25000 50%
30000 60%
35000 70%
40000 80%
45000 90%


In [None]:
# 存储validates和tests的结果
n_iterations = 400000
encoder1 = torch.load('saved_models/POI-encoder1-%diters.pt'%n_iterations)
attn_decoder1 = torch.load('saved_models/POI-attn_decoder1-%diters.pt'%n_iterations)
saveValOutput(encoder1, attn_decoder1, validates, 'POI')
saveTestOutput(encoder1, attn_decoder1, tests, 'POI')
encoder1 = torch.load('saved_models/street-encoder1-%diters.pt'%n_iterations)
attn_decoder1 = torch.load('saved_models/street-attn_decoder1-%diters.pt'%n_iterations)
saveValOutput(encoder1, attn_decoder1, validates, 'street')
saveTestOutput(encoder1, attn_decoder1, tests, 'street')

In [117]:
evaluateRandomly(encoder1, attn_decoder1)

> bagus advert raya batu bulan 61 sukawati
= raya batu bulan
< raya batu <EOS>

> rumah tumbuh blok t nomer 5 bapak romza aidi dekat taman ribang gale
= 
<  <EOS>

> bumi alam pertiwi cv balow 2
= balowerti 2
<  <EOS>

> war leo raya purwo purwodadi
= raya purwo
< raya purwo <EOS>

> seja 88-104 cimekar cileunyi
= seja
<  <EOS>

> toko fatih raya lab karaton
= raya labuan
< raya lab <EOS>

> g h. timan no 101 ceger rt 5 5 cipayung
= g h. timan
< h. h. <EOS>

> lubang buaya gg. h. jirin 10 rt 5 7 cipayung
= 
< gg. h. h. <EOS>

> kiara sari vii margasari kel.
= kiara sari vii
< kiara sari vii <EOS>

> jl tera 89a medan estate percut sei tuan
= jl tera
< jl tera <EOS>

