In [4]:
import os
from google.colab import drive
# Mount Google Drive so the project folder is reachable from the Colab runtime.
drive.mount('/content/drive')

# Folder on Drive that holds the datasets and the preprocessed artifacts.
path = "/content/drive/My Drive/Q-A summary/"

# Work from the data folder so later cells can open files by bare name.
os.chdir(path)
os.listdir(path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


['AutoMaster_TrainSet.csv',
 'AutoMaster_TestSet.csv',
 'stop',
 'save_embedding_matrix_path',
 'train_x_pad.txt',
 'train_y_pad.txt',
 'test_x_pad.txt',
 'train_x_pad.txt.npy',
 'embedding_matrix.txt.npy',
 'merged_train_test_seg_data.csv',
 'word2vec.model',
 'wordcloud.png',
 'train_x_pad',
 'train_y_pad',
 'test_x_pad',
 'new_word2vec_model',
 'embedding_matrix.txt',
 'vocab.json',
 'reverse_vocab.json']

In [5]:
import pandas as pd
import numpy as np

In [6]:
# Text file holding the embedding matrix; it is parsed below as one row of
# space-separated floats per line.
embedding_matrix_path='embedding_matrix.txt'

In [7]:
# Parse the embedding matrix text file into a list of float rows.
# `str.split()` with no argument is used instead of `split(" ")` so that
# trailing whitespace does not yield empty tokens (which `float()` rejects),
# and blank lines are skipped instead of aborting the load.
lines = []
with open(embedding_matrix_path, encoding='utf-8') as f:
    for line in f:
        fields = line.split()
        if not fields:
            continue
        lines.append([float(field) for field in fields])
# Cache the parsed matrix in binary form; np.save appends ".npy" to the name.
np.save(embedding_matrix_path, lines)

In [8]:
# Convert the parsed rows into a 2-D array; it is later passed as the
# pre-trained weights of the encoder/decoder embedding layers.
embedding_matrix=np.array(lines)

In [9]:
# Sanity check: rows and columns must match the (vocab_size, embedding_dim)
# used to build the Embedding layers below.
embedding_matrix.shape

(31937, 200)

In [10]:
def read_data(path):
    """Load a whitespace-separated matrix of token ids from a text file.

    Each non-blank line becomes one row. Values may be written as floats
    (e.g. ``"3.0"``) and are truncated to ``int``.

    :param path: path of the text file to read
    :return: 2-D ``np.ndarray`` of ints (one row per non-blank line)
    :raises ValueError: if a token cannot be parsed as a number
    """
    rows = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            # split() (no argument) tolerates trailing spaces and tabs;
            # split(" ") would yield '' / '\n' tokens that float() rejects.
            fields = line.split()
            if not fields:
                # Skip blank lines rather than failing or adding ragged rows.
                continue
            rows.append([int(float(field)) for field in fields])
    return np.array(rows)

# Load the padded, index-encoded sequences written by the preprocessing step:
# encoder inputs, decoder targets, and test-set inputs.
train_x=read_data("train_x_pad.txt")
train_y=read_data("train_y_pad.txt")
test_x=read_data("test_x_pad.txt")

In [11]:
import json
# Token -> id mapping; it is indexed by token strings such as '<PAD>' and
# '<START>' further down.
with open("vocab.json",'r', encoding='UTF-8') as f:
     vocab = json.load(f)

In [12]:
# Id -> token mapping used to decode predictions.
# NOTE(review): if this file is a JSON object, json.load returns its keys as
# strings, so integer ids need str() before lookup — confirm the file format.
with open("reverse_vocab.json",'r', encoding='UTF-8') as f:
     reverse_vocab = json.load(f)

## Seq-to-Seq Model with attention

In [13]:
import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
import unicodedata
import re
import numpy as np
import os
import io
import time

In [14]:
# Shuffle buffer spans the whole training set.
BUFFER_SIZE = len(train_x)

# Padded length of the encoder inputs and of the decoder targets.
max_length_inp = train_x.shape[1]
max_length_targ = train_y.shape[1]

BATCH_SIZE = 64

# Number of optimisation steps in one epoch.
steps_per_epoch = len(train_x) // BATCH_SIZE

# Number of hidden units in the encoder/decoder GRUs.
units = 1024

# Vocabulary size and embedding width are read from the pre-trained matrix
# instead of being hard-coded (previously 31937 and 200): the Embedding layers
# are initialised with this matrix, so the three must always agree.
vocab_size, embedding_dim = embedding_matrix.shape

# Training dataset: shuffled (input, target) pairs in fixed-size batches.
# drop_remainder=True keeps every batch at exactly BATCH_SIZE rows, which the
# encoder's fixed-size initial hidden state relies on.
dataset = tf.data.Dataset.from_tensor_slices((train_x, train_y)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)



In [15]:
class Encoder(tf.keras.Model):
    """GRU encoder on top of a frozen, pre-trained embedding layer."""

    def __init__(self, vocab_size, embedding_dim ,embedding_matrix , enc_units, batch_sz):
        """
        :param vocab_size: number of rows of the embedding table
        :param embedding_dim: width of each embedding vector
        :param embedding_matrix: pre-trained weights loaded into the embedding layer
        :param enc_units: number of GRU units
        :param batch_sz: batch size used for the initial hidden state
        """
        super().__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        # Embedding weights come from the pre-trained matrix and are not trained.
        self.embedding = tf.keras.layers.Embedding(
            vocab_size,
            embedding_dim,
            weights=[embedding_matrix],
            trainable=False)
        # Return both the per-step outputs (for attention) and the final state.
        self.gru = tf.keras.layers.GRU(
            self.enc_units,
            return_sequences=True,
            return_state=True,
            recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        """Encode a batch of token ids; returns (per-step outputs, final state)."""
        embedded = self.embedding(x)
        output, state = self.gru(embedded, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        """Return an all-zero initial state of shape (batch_sz, enc_units)."""
        state_shape = (self.batch_sz, self.enc_units)
        return tf.zeros(state_shape)

In [16]:
# Build the encoder and run one dummy batch through it to check output shapes.
encoder = Encoder(vocab_size, embedding_dim,embedding_matrix, units, BATCH_SIZE)
# Dummy batch of token ids (all ones) with the padded input length.
example_input_batch = tf.ones(shape=(BATCH_SIZE,max_length_inp), dtype=tf.int32)
# Zero initial hidden state, then a single forward pass.
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)
print ('Encoder output shape: (batch size, sequence length, units) {}'.format(sample_output.shape))
print ('Encoder Hidden state shape: (batch size, units) {}'.format(sample_hidden.shape))

Encoder output shape: (batch size, sequence length, units) (64, 415, 1024)
Encoder Hidden state shape: (batch size, units) (64, 1024)


In [17]:
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive attention: score = V(tanh(W1(values) + W2(query)))."""

    def __init__(self, units):
        """:param units: width of the two projection layers W1 and W2."""
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        """Compute the context vector and the attention weights.

        :param query: previous decoder hidden state
        :param values: encoder outputs (``enc_output``)
        :return: ``(context_vector, attention_weights)``
        """
        # Insert a time axis into the query so it broadcasts against every
        # encoder position when added to the projected values.
        query_with_time_axis = tf.expand_dims(query, 1)

        # Unnormalised score for each encoder position.
        projected = self.W1(values) + self.W2(query_with_time_axis)
        score = self.V(tf.nn.tanh(projected))

        # Normalise over the sequence axis.
        # attention_weights shape == (batch_size, max_length, 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # Weighted sum of the encoder outputs; this is fed to the decoder.
        # context_vector shape after sum == (batch_size, hidden_size)
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)

        return context_vector, attention_weights

In [18]:
class Decoder(tf.keras.Model):
    """Single-step GRU decoder with Bahdanau attention over the encoder outputs."""

    def __init__(self, vocab_size, embedding_dim,embedding_matrix, dec_units, batch_sz):
        """
        :param vocab_size: number of rows of the embedding table / size of the output layer
        :param embedding_dim: width of each embedding vector
        :param embedding_matrix: pre-trained weights loaded into the embedding layer
        :param dec_units: number of GRU units (also the attention projection width)
        :param batch_sz: batch size (stored for reference)
        """
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        # Embedding weights come from the pre-trained matrix and are not trained.
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim,weights=[embedding_matrix],trainable=False)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        # Projects the GRU output to vocabulary logits.
        self.fc = tf.keras.layers.Dense(vocab_size)

        # used for attention
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):
        """Run one decoding step.

        :param x: token ids fed at this step, shape (batch_size, 1)
        :param hidden: previous decoder state (the encoder state on the first step)
        :param enc_output: encoder outputs, shape (batch_size, max_length, hidden_size)
        :return: ``(logits, new_state, attention_weights)``
        """
        # Attend over the encoder outputs using the previous hidden state.
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # x shape after passing through embedding == (batch_size, 1, embedding_dim)
        x = self.embedding(x)

        # Concatenate the context vector with the embedded input token.
        # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # Carry the previous state into the GRU. Without `initial_state` the
        # GRU restarts from zeros on every step and `hidden` would only
        # influence the output through the attention weights.
        output, state = self.gru(x, initial_state=hidden)

        # output shape == (batch_size * 1, hidden_size)
        output = tf.reshape(output, (-1, output.shape[2]))

        # output shape == (batch_size, vocab)
        x = self.fc(output)

        return x, state, attention_weights


In [19]:
decoder = Decoder(vocab_size, embedding_dim,embedding_matrix, units, BATCH_SIZE)

# Smoke test: one decoding step on a dummy batch of integer token ids.
# The batch dimension uses BATCH_SIZE (not a literal 64) so it always matches
# sample_hidden / sample_output, and the ids are int32 like real inputs.
sample_decoder_output, _, _ = decoder(tf.ones(shape=(BATCH_SIZE, 1), dtype=tf.int32),
                                      sample_hidden, sample_output)

print ('Decoder output shape: (batch_size, vocab size) {}'.format(sample_decoder_output.shape))



Decoder output shape: (batch_size, vocab size) (64, 31937)


In [20]:
optimizer = tf.keras.optimizers.Adam()
# reduction='none' keeps one loss value per example so that loss_function can
# zero out the padded positions before averaging.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
 
# Vocabulary id of the padding token.
pad_index=vocab['<PAD>']
 
def loss_function(real, pred):
    """Cross-entropy for one decoding step with padded positions zeroed out.

    :param real: target token ids for this step
    :param pred: logits predicted for this step
    :return: scalar mean of the masked per-example losses
    """
    # Per-example loss (loss_object is built with reduction='none').
    per_example_loss = loss_object(real, pred)

    # 1.0 where the target is a real token, 0.0 where it is <PAD>.
    is_real_token = tf.math.not_equal(real, pad_index)
    per_example_loss = per_example_loss * tf.cast(is_real_token, dtype=per_example_loss.dtype)

    # NOTE(review): the mean runs over every position, padded ones included,
    # so heavily padded steps are scaled down.
    return tf.reduce_mean(per_example_loss)



In [21]:
# Checkpoints are written under this directory, relative to the current
# working directory, with the file prefix "ckpt".
checkpoint_dir = 'data/checkpoints/training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
# Track the optimizer together with both models so they are saved as one unit.
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)


In [22]:
@tf.function
def train_step(inp, targ, enc_hidden):
    """Run one teacher-forced training step on a batch and update the weights.

    :param inp: batch of encoder input ids
    :param targ: batch of decoder target ids
    :param enc_hidden: initial encoder hidden state
    :return: summed step loss divided by the target length
    """
    loss = 0

    # Only the forward pass is recorded on the tape.
    with tf.GradientTape() as tape:
        # 1. Encode the input batch.
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        # 2. The decoder starts from the encoder's final state.
        dec_hidden = enc_hidden
        # 3. First decoder input: <START> repeated BATCH_SIZE times.
        dec_input = tf.expand_dims([vocab['<START>']] * BATCH_SIZE, 1)

        # Teacher forcing: position t is predicted from the true token at t-1.
        for t in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_function(targ[:, t], predictions)
            # Feed the ground-truth token as the next input.
            dec_input = tf.expand_dims(targ[:, t], 1)

    # Reported loss is normalised by the target length; the gradient is taken
    # on the un-normalised sum.
    batch_loss = (loss / int(targ.shape[1]))

    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [None]:
# Number of passes over the training set.
EPOCHS = 10
 
for epoch in range(EPOCHS):
    start = time.time()
    
    # Fresh all-zero encoder state for this epoch; the same tensor is passed
    # to every train_step call.
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0
 
    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        # One optimisation step; returns the length-normalised batch loss.
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss
 
        # NOTE(review): `batch % 1 == 0` is always true, so every batch is
        # logged; raise the modulus to log less often.
        if batch % 1 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         batch_loss.numpy()))
    # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)
 
    # Mean batch loss over the epoch, and wall-clock time for the epoch.
    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))


Epoch 1 Batch 0 Loss 3.9419
Epoch 1 Batch 1 Loss 4.0303
Epoch 1 Batch 2 Loss 4.1316
Epoch 1 Batch 3 Loss 4.1997
Epoch 1 Batch 4 Loss 4.1552
Epoch 1 Batch 5 Loss 3.9734
Epoch 1 Batch 6 Loss 2.8997
Epoch 1 Batch 7 Loss 3.3855
Epoch 1 Batch 8 Loss 2.6335
Epoch 1 Batch 9 Loss 2.7535
Epoch 1 Batch 10 Loss 2.7068
Epoch 1 Batch 11 Loss 2.9738
Epoch 1 Batch 12 Loss 2.8820
Epoch 1 Batch 13 Loss 2.8849
Epoch 1 Batch 14 Loss 3.0601
Epoch 1 Batch 15 Loss 2.8586
Epoch 1 Batch 16 Loss 2.8794
Epoch 1 Batch 17 Loss 2.8129
Epoch 1 Batch 18 Loss 2.8019
Epoch 1 Batch 19 Loss 2.9293
Epoch 1 Batch 20 Loss 2.5359
Epoch 1 Batch 21 Loss 3.0643
Epoch 1 Batch 22 Loss 2.9162
Epoch 1 Batch 23 Loss 2.6880
Epoch 1 Batch 24 Loss 2.8615
Epoch 1 Batch 25 Loss 2.7165
Epoch 1 Batch 26 Loss 2.7638
Epoch 1 Batch 27 Loss 2.5627
Epoch 1 Batch 28 Loss 2.7990
Epoch 1 Batch 29 Loss 2.9568
Epoch 1 Batch 30 Loss 2.8978
Epoch 1 Batch 31 Loss 3.2291
Epoch 1 Batch 32 Loss 2.6036
Epoch 1 Batch 33 Loss 2.7034
Epoch 1 Batch 34 Loss 2.

In [None]:
def clean_sentence(sentence):
    '''
    Replace runs of special symbols and leading greeting phrases with one space.
    :param sentence: string to clean
    :return: cleaned string, or a single space when the input is not a string
    '''
    # Non-string input (e.g. a missing value) collapses to a single space.
    if not isinstance(sentence, str):
        return ' '

    # Alternative, more aggressive pattern kept for reference:
    # r'[\s+\-\!\/\[\]\{\}_,.$%^*(+\"\')]+|[:：+——()?【】“”！，。？、~@#￥%……&*（）]+|车主说|技师说|语音|图片|你好|您好',
    noise_pattern = r'[\s+\-\/\[\]\{\}_$%^*(+\"\')]+|[+——()【】“”~@#￥%……&*（）]+|你好,|您好,|你好，|您好，'
    return re.sub(noise_pattern, ' ', sentence)

In [None]:
def seg_proc(sentence):
    """Segment each '|'-delimited part of `sentence` and rejoin them with ' | '."""
    segmented_parts = (cut_sentence(part) for part in sentence.split('|'))
    return ' | '.join(segmented_parts)

In [None]:
import jieba
def cut_sentence(line):
    """Segment `line` with jieba and return the tokens joined by single spaces."""
    # jieba.cut is called with its default arguments here; cut_all=True would
    # switch to full mode.
    return ' '.join(jieba.cut(line))

In [None]:
remove_words = ['|', '[', ']', '语音', '图片']
def filter_words(sentence):
    '''
    Drop empty tokens, markup tokens and stop words from a segmented sentence.
    :param sentence: string of space-separated tokens
    :return: list of the tokens that survive filtering
    '''
    kept = []
    for word in sentence.split(' '):
        # Skip empty strings (from repeated spaces) and markup tokens.
        if not word or word in remove_words:
            continue
        # Skip stop words; `stop_words` must already be defined at module level.
        if word in stop_words:
            continue
        kept.append(word)
    return kept

In [None]:
def sentence_proc(sentence):
    '''
    Preprocessing pipeline for one sentence: clean -> segment -> filter.
    :param sentence: raw string to process
    :return: the remaining tokens joined by single spaces
    '''
    cleaned = clean_sentence(sentence)   # strip special symbols
    segmented = seg_proc(cleaned)        # segment each '|'-delimited part
    kept_words = filter_words(segmented) # drop stop words / markup tokens
    return ' '.join(kept_words)

In [None]:
def pad_proc(sentence, max_len, vocab):
    '''
    Truncate, mark unknown words, wrap in start/stop tokens and pad.
    The result always contains max_len + 2 space-separated tokens.
    :param sentence: string of space-separated words
    :param max_len: maximum number of words kept (start/stop tokens excluded)
    :param vocab: container of known words
    :return: the padded sentence as one space-joined string
    '''
    # 0/1. Split on single spaces and keep at most max_len words.
    tokens = sentence.strip().split(' ')[:max_len]
    # 2. Replace out-of-vocabulary words with the unknown token.
    body = []
    for token in tokens:
        body.append(token if token in vocab else Vocab.UNKNOWN_TOKEN)
    # 3. Wrap with the start and stop tokens.
    padded = [Vocab.START_DECODING, *body, Vocab.STOP_DECODING]
    # 4. Pad up to the fixed length.
    padded.extend([Vocab.PAD_TOKEN] * (max_len - len(tokens)))
    return ' '.join(padded)

In [None]:
def transform_data(sentence, vocab):
    """
    Convert a space-separated sentence into a list of vocabulary ids.
    :param sentence: string of space-separated words
    :param vocab: word -> id mapping
    :return: list of ids, one per word
    """
    ids = []
    for word in sentence.split(' '):
        if word in vocab:
            ids.append(vocab[word])
        else:
            # The unknown-token id is only looked up for words missing from vocab.
            ids.append(Vocab.UNKNOWN_TOKEN_INDEX)
    return ids

In [None]:
def preprocess_sentence(sentence, max_len, vocab):
    """
    Turn one raw sentence into a (1, max_len) array of vocabulary ids.
    """
    # 1. Clean, segment and filter.
    processed = sentence_proc(sentence)
    # 2. Pad to max_len tokens; 2 slots are reserved for the start/stop tokens.
    padded = pad_proc(processed, max_len - 2, vocab)
    # 3. Map tokens to ids and add a leading batch axis.
    token_ids = transform_data(padded, vocab)
    return np.array([token_ids])

In [None]:
def evaluate(sentence):
    """Greedy-decode a summary for one sentence.

    :param sentence: raw input sentence
    :return: ``(result, sentence, attention_plot)`` where ``result`` is the
        space-joined predicted tokens (including a trailing '<STOP>' if it was
        produced), ``sentence`` is the unchanged input, and ``attention_plot``
        holds one row of attention weights per decoding step
    """
    inputs = preprocess_sentence(sentence,max_length_inp,vocab)
    inputs = tf.convert_to_tensor(inputs)

    # One row per decoding step, one column per encoder position. The width is
    # taken from the encoded input itself: preprocess_sentence yields
    # max_length_inp ids, so the former `max_length_inp+2` width could not
    # receive a row of attention weights.
    attention_plot = np.zeros((max_length_targ, inputs.shape[1]))

    def _id_to_token(token_id):
        # reverse_vocab is loaded with json.load, which turns object keys into
        # strings; fall back to the string key when the int key is absent.
        if isinstance(reverse_vocab, dict) and token_id not in reverse_vocab:
            return reverse_vocab[str(token_id)]
        return reverse_vocab[token_id]

    result = ''

    # Batch of one: zero initial state for the encoder.
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden

    dec_input = tf.expand_dims([vocab['<START>']], 0)

    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                             dec_hidden,
                                                             enc_out)

        # storing the attention weights to plot later on
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()

        # Greedy choice; cast to a plain int so it can be used as a lookup key.
        predicted_id = int(tf.argmax(predictions[0]).numpy())
        token = _id_to_token(predicted_id)

        result += token + ' '
        if token == '<STOP>':
            return result, sentence, attention_plot

        # the predicted ID is fed back into the model
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence, attention_plot



In [None]:
# Example input (already segmented into space-separated words).
sentence='漏机油 具体 部位 发动机 变速器 正中间 位置 拍 中间 上面 上 已经 看见'

# No `translate` function is defined in the cells above; `evaluate` is the
# inference entry point, so call it directly and display the decoded text.
result, _sentence, attention_plot = evaluate(sentence)
result