References:
- https://torchtext.readthedocs.io/en/latest/data.html
- [Torchtext使用教程](https://blog.csdn.net/JWoswin/article/details/92821752)
- [Pytorch学习记录-更深的TorchText学习01](https://www.jianshu.com/p/da3a5d5ed2ba)
- [Kaggle Competetion: Seq2Seq Implementation - Failed Experiment](https://www.kaggle.com/mylee2009/seq2seq-implementation-failed-experiment)
- https://zhuanlan.zhihu.com/p/87708546
- https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

In [15]:
import torch
DEVICE= torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE

device(type='cpu')

### I. 数据预处理
1. 读取数据
2. 使用torchtext构建词表（转化为vector）
3. 构建Iterator方便以batch进行训练

#### 1.读取数据

In [2]:
# 数据读取
import pandas as pd
import re

df = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

df['POI'] = df['POI/street'].str.extract(r'(.*)/', expand=True)
df['street'] = df['POI/street'].str.extract(r'/(.*)', expand=True)
df.sample(10)

df.to_csv('train_POI.csv', columns=('id', 'raw_address', 'POI'), index=False)
df.to_csv('train_street.csv', columns=('id', 'raw_address', 'street'), index=False)

In [3]:
# 数据读取，加入BOS和EOS字段，并增加valid.csv
import pandas as pd
import re
import numpy as np

df = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

df['POI'] = df['POI/street'].str.extract(r'(.*)/', expand=True)
df['street'] = df['POI/street'].str.extract(r'/(.*)', expand=True)

# df['POI'] = df['POI'].apply(lambda x : 'BOS '+ x + ' EOS')
# df['street'] = df['street'].apply(lambda x : 'BOS '+ x + ' EOS')

msk = np.random.rand(len(df)) < 0.9

train = df[msk]
val = df[~msk]

train.to_csv('train_POI.csv', columns=('id', 'raw_address', 'POI'), index=False)
train.to_csv('train_street.csv', columns=('id', 'raw_address', 'street'), index=False)
val.to_csv('val_POI.csv', columns=('id', 'raw_address', 'POI'), index=False)
val.to_csv('val_street.csv', columns=('id', 'raw_address', 'street'), index=False)

In [None]:
# 数据存储
import pandas as pd
import numpy as np

df_POI = pd.read_csv("test_POIs.csv")
df_street = pd.read_csv("test_streets.csv")

POIs = df_POI['POI'].tolist()
streets = df_street['street'].tolist()

output = []
for i in range(len(POIs)):
    if pd.isnull(POIs[i]):
        POI = ''
    else:
        POI = str(POIs[i])
    if pd.isnull(streets[i]):
        street = ''
    else:
    street = str(streets[i])
    output.append({'id':i, 'POI/street': POI + '/' + street})

df_output = pd.DataFrame(output)
df_output.to_csv("test_answers.csv", columns=('id','POI/street'), index=False)

#### 2.构建词表和对应的train、validate、test数据集

In [44]:
# for torchtext<=0.8.0
# from torchtext.data import Field
# from torchtext.data import TabularDataset
# for torchtext>=0.9.0
from torchtext.legacy.data import Field
from torchtext.legacy.data import TabularDataset

# 建立trn、val、tst数据集
tokenize=lambda x: x.split()
TEXT=Field(sequential=True, tokenize=tokenize, lower=True)

tv_datafields=[
    ('id',None),
    ('raw_address',TEXT),
    ('POI',TEXT)
]
trn,vld=TabularDataset.splits(
    path=r'.',
    train='train_POI.csv',
    validation='val_POI.csv',
    format='csv',
    skip_header=True,
    fields=tv_datafields
)

tst_datafields=[
    ('id',None),
    ('raw_address',TEXT)
]
tst=TabularDataset(
    path=r'test.csv',
    format='csv',
    skip_header=True,
    fields=tst_datafields
)

print(trn[0].raw_address, trn[0].POI)
print(vld[1].raw_address, vld[1].POI)
print(tst[0].raw_address)

# 构建词表
# TEXT.build_vocab(trn)
TEXT.build_vocab(trn,val,tst)
print(TEXT.vocab.freqs.most_common(10))

['jl', 'kapuk', 'timur', 'delta', 'sili', 'iii', 'lippo', 'cika', '11', 'a', 'cicau', 'cikarang', 'pusat'] []
['toko', 'dita,', 'kertosono'] ['toko', 'dita']
['s.', 'par', '53', 'sidanegara', '4', 'cilacap', 'tengah']
[('no', 52513), ('rt', 45538), ('raya', 32701), ('1', 23636), ('2', 21756), ('rw', 21667), ('3', 18371), ('toko', 16667), ('4', 15146), ('barat', 14815)]


#### 3.建立Iterator方便以batch训练

In [45]:
from torchtext.legacy.data import Iterator, BucketIterator

# sort_key就是告诉BucketIterator使用哪个key值去进行组合，很明显，在这里是comment_text
# repeat设定为False是因为之后要打包这个迭代层
# train_iter, val_iter=BucketIterator.splits(
#     (trn,vld),
#     batch_sizes=(64,64),
#     device=DEVICE,
#     sort_key=lambda x:len(x.raw_address),
#     sort_within_batch=False,
#     repeat=False
# )

train_iter=Iterator(
    trn,
    batch_size=64,
    device=DEVICE,
    sort_key=lambda x:len(x.raw_address),
    sort_within_batch=False,
    repeat=False
)

val_iter=Iterator(
    vld,
    batch_size=64,
    device=DEVICE,
    sort_key=lambda x:len(x.raw_address),
    sort_within_batch=False,
    repeat=False
)

test_iter = Iterator(
    tst, 
    batch_size=64, 
    device=DEVICE, 
    sort=False, 
    sort_within_batch=False, 
    repeat=False
)

# 显示一个batch的结构
batch=next(train_iter.__iter__());batch


[torchtext.legacy.data.batch.Batch of size 64]
	[.raw_address]:[torch.LongTensor of size 16x64]
	[.POI]:[torch.LongTensor of size 7x64]

In [49]:
# BucketIterator或Iterator返回一个名为torchtext.data.Batch的自定义数据类型
# 这使得代码重用变得困难（因为每次列名更改时，我们都需要修改代码）
# 并且使得torchtext很难与其他库一起用于某些用例（例如torchsample和fastai）
# 因此定义一个Iterator的包装器

class BatchWrapper:
    def __init__(self, dl, x_var, y_vars):
        # self.dl, self.x_var, self.y_vars = dl, x_var, y_vars # we pass in the list of attributes for x and y
        self.dl, self.x_var, self.y_var = dl, x_var, y_vars
    
    def __iter__(self):
        for batch in self.dl:
            x = getattr(batch, self.x_var) # we assume only one input in this wrapper
            
            if self.y_var is not None: # we will concatenate y into a single tensor
                # y = torch.cat([getattr(batch, feat).unsqueeze(1) for feat in self.y_vars], dim=1).float()
                y = getattr(batch, self.y_var)
            else:
                y = torch.zeros((1))
            
            yield (x, y)
    
    def __len__(self):
        return len(self.dl)

train_dl = BatchWrapper(train_iter, "raw_address", "POI")
valid_dl = BatchWrapper(val_iter, "raw_address", "POI")
test_dl = BatchWrapper(test_iter, "raw_address", None)

# 显示一个batch的数据
next(train_dl.__iter__())

(tensor([[   120,    969,    321,   1667,   1899,   3015,     57,   4042,   1389,
            7472,   1363,  10310,     20,    239,    511,     74,     42,    393,
             165,   3290,    311,  13113,   4767,   2770,     42,   1422,     63,
             744,  53226,     42,     17,    533,    141,   1142,  12395,      4,
           21514,   1082,   7480,    765,   1266,  12355,     60,    269,   1857,
            3827,   3733,     53,    508,     82,    666,      4,     71,    947,
              75,  43942,   8965,    587,    251,   1660,    750,    361,    182,
          133730],
         [   993,   1308,     40,     94,   1737,  26091,     73,      2,   5891,
              41,  28830,    508,  26759,   1739,     16,   4514,  43730,   6047,
             235,    588,   6282,    181,    932,   3972,  14991,   1378,   1036,
               4,    114,  49963,   4358,  99370,  13692,    113,    291,  42180,
              11,   3050,    536,   1393,     46,   7954,  14690,     32,    68

In [48]:
len(TEXT.vocab)

141476

### II. LSTM分类模型

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

class EncoderLSTM(nn.Module):
    def __init__(self, in_size, hid_size=256, emb_size=256):
        super().__init__()
        self.embedding == nn.Embedding(in_size, emb_size)
        self.lstm = nn.LSTM(emb_size, hid_size)
        
    def forward(self, inp, hid, cel):
        embedded = self.embedding(inp)
        output, (hidden, cell) = self.lstm(embedded, (hid, cel))
        return output, hidden, cell

class DecoderLSTM(nn.Module):
    def __init__(self, out_size, hid_size=256, emb_size=256, drop_rate=0.5):
        super().__init__()
        self.embedding = nn.Embedding(out_size, emb_size)
        self.lstm = nn.LSTM(emb_size, hid_size)
        self.out = nn.Linear(hid_size, out_size)
        self.dropout = nn.Dropout(drop_rate)
    
    def forward(self, inp, hid, cel):
        # embedded = self.embedding(inp)
        embedded = F.relu(self.embedding(inp))
        output, (hidden, cell) = self.lstm(embedded, (hid, cel))
        output = self.out(output)
        output = self.dropout(output)
        return output, hidden, cell

class seq2seq(nn.Module):
    def __init__(self, in_out_size, hid_size, emb_size, drop_rate):
        self.encoder = EncoderLSTM(in_out_size)
        self.decoder = DecoderLSTM(in_out_size)
    
    def forward(self, seq, hid, cel):
        output, hidden, cell = self.encoder(seq, hid, cel)
        output hidden, cell = self.decoder(seq, hidden, cell)
        return output

In [None]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output = embedded
        output, hidden = self.gru(output, hidden) # 上一个unit的output和hidden到这一个unit来
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        output = self.embedding(input).view(1, 1, -1)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size, device=device)

In [None]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.output_dim = output_dim
        self.n_layers = n_layers
        self.dropout = dropout

        self.embedding = nn.Embedding(output_dim, emb_dim)

        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout)

        self.out = nn.Linear(hid_dim, output_dim)

        self.dropout = nn.Dropout(dropout)


    def forward(self, input, hidden, cell):
        # input = [batch size]
        # hidden = [n layers * n directions, batch size, hid dim]
        # cell = [n layers * n directions, batch size, hid dim]

        # n directions in the decoder will both always be 1, therefore:
        # hidden = [n layers, batch size, hid dim]
        # context = [n layers, batch size, hid dim]

        input = input.unsqueeze(0)

        # input = [1, batch size]

        embedded = self.dropout(self.embedding(input))

        # embedded = [1, batch size, emb dim]

        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))

        # output = [sent len, batch size, hid dim * n directions]
        # hidden = [n layers * n directions, batch size, hid dim]
        # cell = [n layers * n directions, batch size, hid dim]

        # !! sent len and n directions will always be 1 in the decoder,therefore:

        # output = [1, batch size, hid dim]
        # hidden = [n layers, batch size, hid dim]
        # cell = [n layers, batch size, hid dim]

        prediction = self.out(output.squeeze(0))

        # prediction = [batch size, output dim]

        return prediction, hidden, cell

    # seq2seq
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        # src = [src sent len, batch size]
        # trg = [trg sent len, batch size]
        # teacher_forcing_ratio is probability to use teacher forcing
        # e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time

        batch_size = trg.shape[1]
        max_len = trg.shape[0]
        trg_vocab_size = self.decoder.output_dim

        # tensor to store decoder outputs
        outputs = torch.zeros(max_len, batch_size,
                              trg_vocab_size).to(self.device)

        # last hidden state of the encoder is used as the initial hidden state of the decoder

        hidden, cell = self.encoder.forward(src)

        # first input to the decoder is the <sos> tokens
        input = trg[0, :]

        for t in range(1, max_len):
            # insert input token embedding, previous hidden and previous cell states

            # receive output tensor (predictions) and new hidden and cell states

            output, hidden, cell = self.decoder.forward(input, hidden, cell)

            # place predictions in a tensor holding predictions for each token

            outputs[t] = output

            # decide if we are going to use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio

            # get the highest predicted token from our predictions
            top1 = output.argmax(1)

            # if teacher forcing, use actual next token as next input
            # if not, 203 use predicted token
            # 在 模型训练速度 和 训练测试差别不要太大 作一个均衡
            input = trg[t] if teacher_force else top1

        return outputs

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

class LSTM(nn.Module):
    def __init__(self, hidden_dim, emb_dim=300, num_linear=1):
        super().__init__()
        self.embedding = nn.Embedding(len(TEXT.vocab), emb_dim)
        self.encoder = nn.LSTM(emb_dim, hidden_dim, num_layers=1)
        self.linear_layers = []

        for _ in range(num_linear - 1):
            self.linear_layers.append(nn.Linear(hidden_dim, hidden_dim))
            self.linear_layer = nn.ModuleList(self.linear_layers)

        self.predictor = nn.Linear(hidden_dim, 6)

    def forward(self, seq):
        hdn, _ = self.encoder(self.embedding(seq))
        feature = hdn[-1, :, :]
        for layer in self.linear_layers:
            feature = layer(feature)
        preds = self.predictor(feature)

        return preds
em_sz = 100
nh = 500
nl = 3
model = LSTM(nh, emb_dim=em_sz)

In [None]:
import tqdm
opt=optim.Adam(model.parameters(),lr=1e-2)
loss_func=nn.BCEWithLogitsLoss()
epochs=2

In [None]:
for epoch in range(1, epochs + 1):
    running_loss = 0.0
    running_corrects = 0
    model.train()
    for x, y in tqdm.tqdm(train_dl):
        opt.zero_grad()
        preds = model(x)
        loss = loss_func(y, preds)
        loss.backward()
        opt.step()

        running_loss += loss.item()* x.size(0)
    epoch_loss = running_loss / len(trn)

    val_loss = 0.0
    model.eval()  # 评估模式
    for x, y in valid_dl:
        preds = model(x)
        loss = loss_func(y, preds)
        val_loss += loss.item()* x.size(0)

    val_loss /= len(vld)
    print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, epoch_loss, val_loss))
test_preds = []
for x, y in tqdm.tqdm(test_dl):
    preds = model(x)
    preds = preds.data.numpy()
    # 模型的实际输出是logit，所以再经过一个sigmoid函数
    preds = 1 / (1 + np.exp(-preds))
    test_preds.append(preds)
    test_preds = np.hstack(test_preds)

print(test_preds)