In [None]:
import numpy as np
from collections import Counter
import tensorflow as tf
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
import itertools
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

In [2]:
def load_ag_news(split='train'):
    data = pd.read_csv('data/ag_news_csv/{}.csv'.format(split),header=-1)
    labels = data[0].values.tolist()
    texts = data[1]+data[2]
    texts = texts.values.tolist()
    return texts,labels

In [3]:
texts,labels = load_ag_news()

In [4]:
texts[0]

"Wall St. Bears Claw Back Into the Black (Reuters)Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again."

In [5]:
def text_preprocessing(texts):
    #只保留字母和数字空格
    texts = [re.sub(r'[^A-Za-z0-9 ]','',text) for text in texts]
    #去空格字母小写
    texts = [[word.strip().lower() for word in text.split(' ')] for text in texts]
    #去停用词
    english_stopwords = stopwords.words('english')
    texts = [list(filter(lambda x:x not in english_stopwords,text)) for text in texts]
    #Stemming
    stemmer = SnowballStemmer('english')
    texts = [[stemmer.stem(word) for word in text] for text in texts]
    #Lemma
    lemma = WordNetLemmatizer()
    texts = [[lemma.lemmatize(word) for word in text] for text in texts]
    texts = [' '.join(text) for text in texts]
    return texts

In [6]:
texts=text_preprocessing(texts)
texts[0]

'wall st bear claw back black reutersreut  shortsel wall street dwindlingband ultracyn see green'

label不是从0开始的，而是1，2，3，4需要手动处理去掉第一维的onehot

In [7]:
def preprocess_label_for_model(labels):
    onehot_labels = np_utils.to_categorical(labels)
    onehot_labels = onehot_labels[:,1:]
    return onehot_labels

In [8]:
onehot_labels = preprocess_label_for_model(labels)

In [9]:
x_train,x_val,y_train,y_val = train_test_split(texts,onehot_labels,test_size=0.001,shuffle=True,random_state=42)

In [10]:
len(x_val)

120

In [11]:
def preprocess_text_for_model(x,token=None,dict_size=8000,max_length=128):
    if token is None:
        token = Tokenizer(num_words=dict_size)
        token.fit_on_texts(x)
    texts_seq = token.texts_to_sequences(x)
    texts_seq_pad = sequence.pad_sequences(texts_seq,maxlen=max_length)
    return texts_seq_pad,token

In [12]:
x_train_seq,token = preprocess_text_for_model(x_train)
x_val_seq,_ = preprocess_text_for_model(x_val,token)

In [13]:
dimension_output=4
dict_size=8000
embedded_size=128
maxlen=128
pooling_size=5
num_layers=2
size_layer=128

In [14]:
tf.reset_default_graph()

In [15]:
X = tf.placeholder(tf.int32, [None, maxlen])
Y = tf.placeholder(tf.float32, [None, dimension_output])
is_training = tf.placeholder_with_default(True,shape=(),name='is_training')

with tf.name_scope('embedding'):
    encoder_embeddings = tf.Variable(tf.random_uniform([dict_size, embedded_size], -1, 1))
    encoder_embedded = tf.nn.embedding_lookup(encoder_embeddings, X)
    encoder_embedded = tf.expand_dims(encoder_embedded,-1)
    drop1=tf.layers.dropout(encoder_embedded,0.5,training=is_training)

with tf.name_scope('conv'):
    conv1 = tf.layers.conv2d(drop1,filters=16,kernel_size=3,strides=1,padding='VALID',name='conv1',activation='relu')
    pool1 = tf.layers.max_pooling2d(conv1,pool_size=2,strides=2,padding='VALID',name='pooling1')
    conv2 = tf.layers.conv2d(pool1,filters=32,kernel_size=4,strides=1,padding='VALID',name='conv2',activation='relu')
    pool2 = tf.layers.max_pooling2d(conv2,pool_size=2,strides=2,padding='VALID',name='pooling2')

with tf.name_scope('hidden'):
    flatten = tf.reshape(pool2,shape=(-1,32*30*30),name='flatten')
    hidden = tf.layers.dense(flatten,units=128,kernel_initializer=tf.variance_scaling_initializer,activation='relu',name='hidden')
    drop2=tf.layers.dropout(hidden,0.5,training=is_training)
with tf.name_scope('output'):
    logits = tf.layers.dense(hidden,units=4,kernel_initializer=tf.variance_scaling_initializer,name='logits')
    pred = tf.argmax(logits, 1)
with tf.name_scope('train'):
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits = logits, labels = Y))
    optimizer = tf.train.AdamOptimizer(learning_rate = 1e-3).minimize(cost)
with tf.name_scope('eval'):
    correct_pred = tf.equal(tf.argmax(logits, 1), tf.argmax(Y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [16]:
def shuffle_batch(x,y,batch_size):
    index = np.random.permutation(len(x))
    n_batch = len(x) // batch_size
    for batch_index in np.array_split(index,n_batch):
        x_batch,y_batch = x[batch_index],y[batch_index]
        yield x_batch,y_batch

In [17]:
epochs=5
batch_size = 128

In [18]:
x_val_seq.shape

(120, 128)

In [21]:
saver = tf.train.Saver()
sess = tf.Session()
saver.restore(sess, "model/cnn_model.ckpt")
# sess.run(tf.global_variables_initializer())

# for epoch in range(epochs):
#     for x_batch,y_batch in shuffle_batch(x_train_seq,y_train,batch_size):
#         sess.run(optimizer,feed_dict={X:x_batch,Y:y_batch,is_training:True})
#     l,a = sess.run([cost,accuracy],feed_dict={X:x_val_seq,Y:y_val,is_training:False})
#     print('epoch:%d loss:%.2f acc:%.2f' %(epoch,l,a))
        

INFO:tensorflow:Restoring parameters from model/cnn_model.ckpt


In [22]:
sents_test,labels_test=load_ag_news('test')

In [23]:
texts_test=text_preprocessing(sents_test)

In [24]:
onehot_labels_test = preprocess_label_for_model(labels_test)

In [25]:
x_test_seq,_=preprocess_text_for_model(sents_test,token)

In [26]:
y_pred=[]
for item in x_test_seq:
    p = sess.run(pred,feed_dict={X:item.reshape(-1,128),is_training:False})
    y_pred.append(p)

In [27]:
final_result = [item.tolist()[0]+1 for item in y_pred]

In [28]:
from sklearn.metrics import accuracy_score
accuracy_score(labels_test,final_result)

0.8210526315789474

In [41]:
saver = tf.train.Saver()
saver.save(sess, "model/cnn_model.ckpt")

'model/cnn_model.ckpt'

In [29]:
sess.close()