In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def load_ag_news(split='train'):
    data = pd.read_csv('data/ag_news_csv/{}.csv'.format(split),header=-1)
    labels = data[0].values.tolist()
    texts = data[1]+data[2]
    texts = texts.values.tolist()
    return texts,labels

In [3]:
texts,labels = load_ag_news()

In [4]:
def text_preprocessing(texts):
    #只保留字母和数字空格
    texts = [re.sub(r'[^A-Za-z0-9 ]','',text) for text in texts]
    #去空格字母小写
    texts = [[word.strip().lower() for word in text.split(' ')] for text in texts]
    #去停用词
    english_stopwords = stopwords.words('english')
    texts = [list(filter(lambda x:x not in english_stopwords,text)) for text in texts]
    #Stemming
    stemmer = SnowballStemmer('english')
    texts = [[stemmer.stem(word) for word in text] for text in texts]
    #Lemma
    lemma = WordNetLemmatizer()
    texts = [[lemma.lemmatize(word) for word in text] for text in texts]
    texts = [' '.join(text) for text in texts]
    return texts

In [5]:
texts=text_preprocessing(texts)
texts[0]

'wall st bear claw back black reutersreut  shortsel wall street dwindlingband ultracyn see green'

In [6]:
def preprocess_label_for_model(labels):
    onehot_labels = np_utils.to_categorical(labels)
    onehot_labels = onehot_labels[:,1:]
    return onehot_labels

In [7]:
onehot_labels = preprocess_label_for_model(labels)

In [8]:
x_train,x_val,y_train,y_val = train_test_split(texts,onehot_labels,test_size=0.001,shuffle=True,random_state=42)

In [9]:
def preprocess_text_for_model(x,token=None,dict_size=8000,max_length=50):
    if token is None:
        token = Tokenizer(num_words=dict_size)
        token.fit_on_texts(x)
    texts_seq = token.texts_to_sequences(x)
    texts_seq_pad = sequence.pad_sequences(texts_seq,maxlen=max_length)
    return texts_seq_pad,token

In [10]:
x_train_seq,token = preprocess_text_for_model(x_train)
x_val_seq,_ = preprocess_text_for_model(x_val,token)

In [12]:
dimension_output=4
dict_size=8000
embedded_size=128
maxlen=50
pooling_size=5
num_layers=2
size_layer=128
num_heads = 8

In [16]:
tf.reset_default_graph()

In [14]:
def layer_norm(inputs,epsilon=1e-8):
    mean,var = tf.nn.moments(inputs,[-1],keep_dims=True) #以一条数据（一个embedding word）为单位进行normalization (10,50,1)
    norm = (inputs - mean)/(tf.sqrt(var+epsilon))
    params_shape = inputs.get_shape()[-1:] #128
    #防止layer normalization破坏学出的数据分布，通过gamma，beta两个学习的参数恢复norm之前的分布
    gamma = tf.get_variable('gamma', params_shape, tf.float32, tf.ones_initializer())
    beta = tf.get_variable('beta', params_shape, tf.float32, tf.zeros_initializer())
    return gamma*norm+beta

def multihead_attention(inputs,masks):
    '''
    把WQ，WK，WV三个矩阵拼起来直接计算，矩阵的第一个维度要和embedding_size相同，第二个维度为embedding乘以矩阵后所得向量的维度（自定义）
    embedding乘以一个WQ，WK，WV沿Axis=0方向拼接矩阵，同时得到q,k,v三个向量
    '''
    Q_K_V = tf.layers.dense(inputs,3*embedded_size,activation=tf.nn.relu) #(10,50,128) dot (128,384) = (10,50,384)
    Q,K,V = tf.split(Q_K_V,3,-1) #(10,50,128)
    Q_ = tf.concat(tf.split(Q,num_heads,axis=2),axis=0)#(80,50,16)
    K_ = tf.concat(tf.split(K,num_heads,axis=2),axis=0)#(80,50,16)
    V_ = tf.concat(tf.split(V,num_heads,axis=2),axis=0)#(80,50,16)
    weight_bf_softmax = tf.matmul(Q_,tf.transpose(K_,[0,2,1]))/np.sqrt(K_.get_shape().as_list()[-1]) #(80,50,50)  根号里的是k向量的长度
    #padding mask因为输入的句子要做padding，不够长的要补0，对于这些0不应该给予attention，通过赋负无穷，使得在softmax过后，权重非常小
    if masks is not None:
        paddings = tf.fill(tf.shape(weight_bf_softmax),float('-inf'))
        weight_bf_softmax = tf.where(tf.equal(masks,0),paddings,weight_bf_softmax)
    weight = tf.nn.softmax(weight_bf_softmax) #(80,50,50)
    outputs = tf.matmul(weight,V_)  #(80,50,50) dot (80,50,16) = (80,50,16)
    outputs = tf.concat(tf.split(outputs,num_heads,0),2) # (10,50,128)
    outputs += inputs #残差连接
    return layer_norm(outputs)

def window_mask(size):
    masks = np.zeros([maxlen,maxlen])
    for i in range(maxlen):
        if i < size:
            masks[i,:i+size+1]=1.
        elif i > maxlen - size - 1:
            masks[i,i-size:]=1.
        else:
            masks[i,i-size:i+size+1]=1.
    masks = tf.convert_to_tensor(masks) #(50,50)
    #(1,50,50)  -> (batch_size*heads,50,50),
    return tf.tile(tf.expand_dims(masks,0),[tf.shape(x)[0]*num_heads,1,1]) 

def position_encoding(inputs,embed_dim):
    T = inputs.get_shape().as_list()[1] #50
    v = tf.range(T) #(50,)
    m = tf.expand_dims(v,0) #(1,50)
    m = tf.tile(m,[tf.shape(inputs)[0],1]) #(batch_size,50)
    lookup_table = tf.get_variable('lookup_table', dtype=tf.float32, shape=[T, embed_dim]) #(50,128)
    outputs = tf.nn.embedding_lookup(lookup_table, m) #(batch_size,50,128)
    return outputs

def pointwise_feedforward(inputs,num_units=[None,None],activation=None):
    outputs = tf.layers.conv1d(inputs,num_units[0],kernel_size=1,activation=activation) #(batch_size,50,512)
    outputs = tf.layers.conv1d(outputs,num_units[1],kernel_size=1) #(batch_size,50,128)
    outputs += inputs #(batch_size,50,128)
    return layer_norm(outputs)

In [17]:
x = tf.placeholder(tf.int32,shape=[None,maxlen],name='x')
y = tf.placeholder(tf.float32,shape=[None,dimension_output],name='y')
with tf.name_scope('embedding'):
    embedding_matrix = tf.Variable(tf.random_normal([dict_size,embedded_size],-1,1))
    feed = tf.nn.embedding_lookup(embedding_matrix,x)
with tf.name_scope('multihead_attention'):
    for window_size in range(1,6):
        with tf.variable_scope('mask_window_%d' % window_size):
            feed = multihead_attention(feed,window_mask(window_size))
    feed = tf.add(feed,position_encoding(feed,embedded_size)) #加上position encoding
    with tf.variable_scope('multihead'):
        feed = multihead_attention(feed, None)#（batch_size,50,128）
with tf.name_scope('feed_forward'):
    with tf.variable_scope('pointwise'):
        feed = pointwise_feedforward(feed, num_units=[4*embedded_size, 
                                                          embedded_size], activation=tf.nn.relu)
with tf.name_scope('output'):
    logits = tf.layers.dense(feed,dimension_output)[:,-1]
    pred = tf.argmax(logits,1)
with tf.name_scope('train'):
    loss_func = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits,labels=y))
    optimizer = tf.train.AdamOptimizer(learning_rate=1e-3).minimize(loss_func)
with tf.name_scope('eval'):
    accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(logits,1),tf.argmax(y,1)),tf.float32))

In [18]:
sess = tf.Session()
sess.run(tf.global_variables_initializer())

In [23]:
epochs=10
batch_size=20

In [24]:
def shuffle_batch(x,y,batch_size):
    index = np.random.permutation(len(x))
    n_batch = len(x) // batch_size
    for batch_index in (np.array_split(index,n_batch)):
        x_batch,y_batch = x[batch_index],y[batch_index]
        yield x_batch,y_batch

In [25]:
for epoch in range(epochs):
    for x_batch,y_batch in shuffle_batch(x_train_seq,y_train,batch_size):
        sess.run(optimizer,feed_dict={x:x_batch,y:y_batch})
    l,a = sess.run([loss_func,accuracy],feed_dict={x:x_val_seq,y:y_val})
    print('epoch:%02d loss:%.2f acc:%.2f' %(epoch,l,a))

epoch:00 loss:0.42 acc:0.88
epoch:01 loss:0.30 acc:0.89
epoch:02 loss:0.30 acc:0.90
epoch:03 loss:0.38 acc:0.88
epoch:04 loss:0.32 acc:0.88
epoch:05 loss:0.30 acc:0.92
epoch:06 loss:0.29 acc:0.93
epoch:07 loss:0.33 acc:0.90
epoch:08 loss:0.31 acc:0.89
epoch:09 loss:0.29 acc:0.93


In [26]:
sents_test,labels_test=load_ag_news('test')

In [27]:
texts_test=text_preprocessing(sents_test)

In [28]:
onehot_labels_test = preprocess_label_for_model(labels_test)
x_test_seq,_=preprocess_text_for_model(sents_test,token)

In [29]:
y_pred=[]
for batch in np.array_split(np.arange(x_test_seq.shape[0]),100):
    p = sess.run(pred,feed_dict={x:x_test_seq[batch]})
    y_pred.extend(p)

In [30]:
final_result = [item+1 for item in y_pred]
from sklearn.metrics import accuracy_score
accuracy_score(labels_test,final_result)

0.8063157894736842

In [31]:
sess.close()