In [1]:
import numpy as np
from collections import Counter
import tensorflow as tf
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def load_ag_news(split='train'):
    data = pd.read_csv('data/ag_news_csv/{}.csv'.format(split),header=-1)
    labels = data[0].values.tolist()
    texts = data[1]+data[2]
    texts = texts.values.tolist()
    return texts,labels

In [3]:
texts,labels = load_ag_news()

In [4]:
def text_preprocessing(texts):
    #只保留字母和数字空格
    texts = [re.sub(r'[^A-Za-z0-9 ]','',text) for text in texts]
    #去空格字母小写
    texts = [[word.strip().lower() for word in text.split(' ')] for text in texts]
    #去停用词
    english_stopwords = stopwords.words('english')
    texts = [list(filter(lambda x:x not in english_stopwords,text)) for text in texts]
    #Stemming
    stemmer = SnowballStemmer('english')
    texts = [[stemmer.stem(word) for word in text] for text in texts]
    #Lemma
    lemma = WordNetLemmatizer()
    texts = [[lemma.lemmatize(word) for word in text] for text in texts]
    texts = [' '.join(text) for text in texts]
    return texts

In [5]:
texts=text_preprocessing(texts)

In [6]:
def preprocess_label_for_model(labels):
    onehot_labels = np_utils.to_categorical(labels)
    onehot_labels = onehot_labels[:,1:]
    return onehot_labels

In [7]:
onehot_labels = preprocess_label_for_model(labels)

In [8]:
x_train,x_val,y_train,y_val = train_test_split(texts,onehot_labels,test_size=0.001,shuffle=True,random_state=42)

In [9]:
def preprocess_text_for_model(x,token=None,dict_size=8000,max_length=128):
    if token is None:
        token = Tokenizer(num_words=dict_size)
        token.fit_on_texts(x)
    texts_seq = token.texts_to_sequences(x)
    texts_seq_pad = sequence.pad_sequences(texts_seq,maxlen=max_length)
    return texts_seq_pad,token

In [10]:
x_train_seq,token = preprocess_text_for_model(x_train)
x_val_seq,_ = preprocess_text_for_model(x_val,token)

In [11]:
dimension_output=4
dict_size=8000
embedded_size=128
maxlen=128
pooling_size=5
num_layers=2
size_layer=128

In [12]:
tf.reset_default_graph()

In [13]:
def cells(reuse=False):
    return tf.nn.rnn_cell.LSTMCell(num_units=size_layer,initializer=tf.variance_scaling_initializer,reuse=reuse)

In [14]:
x = tf.placeholder(tf.int32,shape=(None,None),name='x')
y = tf.placeholder(tf.float32,shape=(None,dimension_output),name='y')
is_training = tf.placeholder_with_default(True,shape=(),name='is_training')

In [15]:
with tf.name_scope('embedding'):
    embedding_matrix = tf.Variable(tf.random_normal([dict_size,embedded_size],-1,1))
    embedding = tf.nn.embedding_lookup(embedding_matrix,x)
with tf.name_scope('rnn'):
    rnn_layers = tf.nn.rnn_cell.MultiRNNCell([cells() for _ in range(num_layers)])
    #state:(state_c,state_h)
    outputs,_ = tf.nn.dynamic_rnn(rnn_layers,embedding,dtype=tf.float32) #outputs dim batch_size * maxlen * embedded_size
with tf.name_scope('output'):
    #去最后一个时刻的输出
    logits = tf.layers.dense(outputs[:,-1],units=4,kernel_initializer=tf.variance_scaling_initializer,name='logits')
    pred = tf.argmax(logits, 1)
with tf.name_scope('train'):
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits = logits, labels = y))
    optimizer = tf.train.AdamOptimizer(learning_rate = 1e-4).minimize(cost)
with tf.name_scope('eval'):
    correct_pred = tf.equal(tf.argmax(logits, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [16]:
def shuffle_batch(x,y,batch_size):
    index = np.random.permutation(len(x))
    n_batch = len(x) // batch_size
    for batch_index in np.array_split(index,n_batch):
        x_batch,y_batch = x[batch_index],y[batch_index]
        yield x_batch,y_batch

In [17]:
epochs=10
batch_size=128

In [18]:
saver = tf.train.Saver()
sess = tf.Session()
saver.restore(sess,'model/lstm_model.ckpt')

# sess.run(tf.global_variables_initializer())
# for epoch in range(epochs):
#     for x_batch,y_batch in shuffle_batch(x_train_seq,y_train,batch_size):
#         sess.run(optimizer,feed_dict={x:x_batch,y:y_batch})
#     l,a = sess.run([cost,accuracy],feed_dict={x:x_val_seq,y:y_val})
#     print('epoch:%02d loss:%.2f acc:%.2f' %(epoch,l,a))

INFO:tensorflow:Restoring parameters from model/lstm_model.ckpt


In [19]:
sents_test,labels_test=load_ag_news('test')

In [20]:
texts_test=text_preprocessing(sents_test)

In [21]:
onehot_labels_test = preprocess_label_for_model(labels_test)
x_test_seq,_=preprocess_text_for_model(sents_test,token)

In [23]:
y_pred=[]
for item in x_test_seq:
    p = sess.run(pred,feed_dict={x:item.reshape(-1,128)})
    y_pred.append(p)

In [24]:
final_result = [item.tolist()[0]+1 for item in y_pred]
from sklearn.metrics import accuracy_score
accuracy_score(labels_test,final_result)

0.7872368421052631

In [72]:
saver = tf.train.Saver()
saver.save(sess, "model/lstm_model.ckpt")

'model/lstm_model.ckpt'

In [25]:
sess.close()