## Read data

In [None]:
import os
from google.colab import drive
# Mount Google Drive into the Colab filesystem so the data directory is reachable.
drive.mount('/content/drive')

# Directory on Drive that holds the data files used by this notebook.
path = "/content/drive/My Drive/Q-A summary/"

# Make it the working directory so later cells can open files by bare name,
# then list its contents as the cell output.
os.chdir(path)
os.listdir(path)

Mounted at /content/drive


['AutoMaster_TrainSet.csv',
 'AutoMaster_TestSet.csv',
 'stop',
 'merged_train_test_seg_data.csv',
 'word2vec.model',
 'wordcloud.png',
 'new_word2vec_model',
 'train_x_pad',
 'train_y_pad',
 'test_x_pad',
 'save_embedding_matrix_path',
 'embedding_matrix.txt',
 'embedding_matrix.txt.npy',
 'train_x_pad.txt',
 'train_y_pad.txt',
 'test_x_pad.txt',
 'train_x_pad.txt.npy']

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Text dump of the embedding matrix, relative to the working directory set above.
embedding_matrix_path='embedding_matrix.txt'

In [None]:
# Parse the text dump of the embedding matrix: one row per line, values
# separated by whitespace.
lines = []
with open(embedding_matrix_path) as f:
    for line in f:
        # split() (no argument) tolerates trailing spaces, tabs and repeated
        # separators; split(" ") would yield empty fields that float() rejects.
        fields = line.split()
        if not fields:
            continue  # ignore blank lines (e.g. a trailing newline at EOF)
        lines.append([float(value) for value in fields])

# Cache the parsed rows in NumPy binary form. np.save appends ".npy" when the
# file name does not already end with it, so this writes
# "<embedding_matrix_path>.npy" and leaves the text file untouched.
np.save(embedding_matrix_path, lines)

In [None]:
# Turn the parsed rows into a NumPy array for use as the embedding weights.
embedding_matrix = np.asarray(lines)

In [None]:
# Sanity check: display the array's dimensions (rows, columns).
embedding_matrix.shape

(31937, 200)

In [None]:
def read_data(path):
    """Load a text file of whitespace-separated numbers into an integer array.

    Each non-blank line becomes one row. Every field is parsed as a float and
    then truncated to ``int`` (so ``"12.0"`` becomes ``12``).

    Args:
        path: Path of the text file to read.

    Returns:
        ``np.ndarray`` built from the parsed rows (2-D when every row has the
        same number of fields).

    Raises:
        ValueError: If a field cannot be parsed as a number.
    """
    rows = []
    with open(path) as f:
        for line in f:
            # split() without an argument tolerates trailing spaces, tabs and
            # repeated separators; split(" ") would produce empty fields that
            # float() rejects.
            fields = line.split()
            if not fields:
                continue  # ignore blank lines
            rows.append([int(float(field)) for field in fields])
    return np.array(rows)

# Load the padded token-id files for the training inputs, training targets
# and test inputs, in that order.
train_x, train_y, test_x = map(
    read_data,
    ("train_x_pad.txt", "train_y_pad.txt", "test_x_pad.txt"),
)


## Simple Seq-to-Seq Model 

In [None]:
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional
from tensorflow.keras.layers import Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy
def seq2seq(input_length, output_sequence_length, embedding_matrix, vocab_size, hidden_units=200):
    """Build and compile a simple encoder/decoder sequence model.

    Architecture, in order:
      1. Frozen ``Embedding`` initialised from ``embedding_matrix``.
      2. Bidirectional GRU encoder that returns only its final state.
      3. ``Dense`` + ReLU projection of that state.
      4. ``RepeatVector`` copying the projected state once per output step, so
         the decoder receives the same vector at every time step.
      5. Bidirectional GRU decoder that returns the full sequence.
      6. ``TimeDistributed(Dense(vocab_size, softmax))`` over every step.

    Args:
        input_length: Length of each (padded) input sequence.
        output_sequence_length: Number of time steps the decoder emits.
        embedding_matrix: 2-D array of shape ``(vocab_size, embedding_dim)``
            used as the fixed embedding weights.
        vocab_size: Number of token ids; must equal the number of rows of
            ``embedding_matrix``.
        hidden_units: Width of each GRU direction and of the intermediate
            Dense layer. Defaults to 200, the value that used to be hard-coded.

    Returns:
        The compiled Keras ``Sequential`` model (its summary is printed as a
        side effect).

    Raises:
        ValueError: If ``embedding_matrix`` is not 2-D or its row count does
            not match ``vocab_size``.
    """
    embedding_matrix = np.asarray(embedding_matrix)
    # Fail early with a clear message instead of a weight-shape error from Keras.
    if embedding_matrix.ndim != 2:
        raise ValueError(
            "embedding_matrix must be 2-D (vocab_size, embedding_dim), got shape %s"
            % (embedding_matrix.shape,)
        )
    if embedding_matrix.shape[0] != vocab_size:
        raise ValueError(
            "embedding_matrix has %d rows but vocab_size is %d"
            % (embedding_matrix.shape[0], vocab_size)
        )
    # Take the embedding width from the matrix rather than assuming 200.
    embedding_dim = embedding_matrix.shape[1]

    model = Sequential()
    # NOTE(review): `weights=` / `input_length=` are kept exactly as the notebook
    # used them; newer Keras releases may want `embeddings_initializer` instead
    # -- confirm against the installed version.
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix],
                        trainable=False, input_length=input_length))
    # Encoder: only the final state is needed as the summary vector.
    model.add(Bidirectional(GRU(hidden_units, return_sequences=False)))
    model.add(Dense(hidden_units, activation="relu"))
    # Feed the same encoder summary to every decoder time step.
    model.add(RepeatVector(output_sequence_length))
    # Decoder: must emit an output at every time step.
    model.add(Bidirectional(GRU(hidden_units, return_sequences=True)))
    model.add(TimeDistributed(Dense(vocab_size, activation='softmax')))
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(1e-3))
    model.summary()
    return model

In [None]:
# Encoder input length: padded length (max_len) of each x sequence.
input_length = train_x.shape[1]
# Decoder output length: padded length (max_len) of each y sequence.
output_sequence_length = train_y.shape[1]
# Vocabulary size: the Embedding layer needs one row of weights per token id,
# so take it from the loaded matrix instead of hard-coding 31937.
vocab_size = embedding_matrix.shape[0]

In [None]:
# Build and compile the model from the dimensions computed above.
model = seq2seq(
    input_length=input_length,
    output_sequence_length=output_sequence_length,
    embedding_matrix=embedding_matrix,
    vocab_size=vocab_size,
)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 415, 200)          6387400   
                                                                 
 bidirectional (Bidirectiona  (None, 400)              482400    
 l)                                                              
                                                                 
 dense (Dense)               (None, 200)               80200     
                                                                 
 repeat_vector (RepeatVector  (None, 35, 200)          0         
 )                                                               
                                                                 
 bidirectional_1 (Bidirectio  (None, 35, 400)          482400    
 nal)                                                            
                                                        

In [None]:
# Train the model, holding out 20% of the training data for validation.
# NOTE(review): the recorded output below shows "Epoch 1/10 ... 10/10", which
# does not match epochs=5 here -- the cell was presumably edited after its last
# run. Re-run to refresh the output, or confirm the intended epoch count.
model.fit(train_x, train_y, batch_size=32, epochs=5, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fd7900d1610>

* Encoder(即第一个GRU) 只在序列结束时输出一个语义向量，所以其 "return_sequences" 参数设置为 "False"  
* Decoder(即第二个GRU) 需要在每一个 time step 都输出，所以其 "return_sequences" 参数设置为 "True"  
* 使用 "RepeatVector" 将 Encoder 的输出(最后一个 time step)复制 N 份作为 Decoder 的N次输入  
* TimeDistributed 将同一个 Dense 层独立地应用到 Decoder 每一个 time step 的输出上，从而在每个位置都得到一个词表上的 softmax 分布  
* 但这个模型其实并不符合 Seq2Seq 论文中的结构：这里 Decoder 每一个时刻的输入都是同一个语义向量，而不是把上一个时刻的输出作为下一个时刻的输入  