In [None]:
# 数据可视化
# import matplotlib.pyplot as plt
# df_data['sentence_len'].hist(bins=100)
# plt.xlim(0, 100)
# plt.xlabel('sentence_length')
# plt.ylabel('sentence_num')
# plt.title('Distribution of the Length of Sentence')
# plt.show()

In [1]:
import pandas as pd
import numpy as np
import re
from itertools import chain
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import np_utils

# filename = './data/sentiment_word_tagging_train.csv'
def word_to_index(sentence, flag, tokenizer, oov_index=0):
    """Convert one sentence's characters or tags into integer ids.

    Args:
        sentence: Iterable of character strings when ``flag`` is
            ``'content'``, otherwise an iterable of tag letters.
        flag: ``'content'`` looks each character up through ``tokenizer``;
            any other value maps tags through the fixed B/M/E/S/N table.
        tokenizer: Object exposing ``texts_to_sequences``. Only used when
            ``flag`` is ``'content'``.
        oov_index: Id emitted for a character for which the tokenizer
            returns an empty sequence. Defaults to 0.

    Returns:
        A one-element list wrapping the id list, i.e. ``[[id1, id2, ...]]``.

    Raises:
        KeyError: If a tag is not one of ``B``, ``M``, ``E``, ``S``, ``N``.
    """
    if flag == 'content':
        # Emit exactly one id per input character. Dropping characters the
        # tokenizer cannot encode would make this list shorter than the tag
        # list built for the same sentence, shifting every later
        # character/tag pair out of alignment.
        ids = [seq[0] if seq else oov_index
               for seq in tokenizer.texts_to_sequences(sentence)]
    else:
        tag_dict = {'B':1, 'M':2, 'E':3, 'S':4, 'N':5}
        ids = [tag_dict[tag] for tag in sentence]
    return [ids]

def get_Xy(sentence):
    """Split a tagged sentence into parallel character and tag arrays.

    Every ``char/tag`` pair (one character, a slash, one character) found in
    ``sentence`` contributes one entry. A literal ``//`` is first rewritten
    to ``$/`` so that a slash character appears as ``$`` in the output.

    Returns:
        ``(words, tags)`` as two 1-D numpy string arrays of equal length, or
        ``None`` when the sentence contains no ``char/tag`` pair.
    """
    normalized = sentence.replace("//", '$/')
    pairs = re.findall('(.)/(.)', normalized)
    if not pairs:
        return None
    # Column 0 holds the characters, column 1 the tags.
    pair_table = np.asarray(pairs)
    return pair_table[:, 0], pair_table[:, 1]

def initTokenizer():
    """Build a Keras ``Tokenizer`` fitted on the vocabulary file.

    Reads ``./data/allwords.csv`` without a header row, loading every value
    as a string, and fits a case-preserving ``Tokenizer`` on the first column.

    Returns:
        The fitted ``Tokenizer``.
    """
    # Builtin ``str`` replaces the ``np.str`` alias, which NumPy deprecated
    # in 1.20 and removed in 1.24; the two were the same type.
    df = pd.read_csv('./data/allwords.csv', dtype=str, header=None)
    all_words = df[0].values
    tokenizer = Tokenizer(lower=False)
    tokenizer.fit_on_texts(all_words)
    return tokenizer
def generate_data(filename, maxlen=40, num_classes=6, tokenizer=None):
    """Turn a tagged corpus file into padded inputs and one-hot tag targets.

    The file is read as headerless tab-separated text and the first column
    of every row is joined with single spaces. The joined text is split into
    sentences at each tagged punctuation mark, and every sentence is parsed
    by ``get_Xy`` into parallel character and tag arrays. Characters and tags
    are mapped to integer ids by ``word_to_index``, then padded at the end
    with 0 up to ``maxlen``. Longer sentences are cut by ``pad_sequences``
    using its default truncation side.

    Args:
        filename: Path of the tagged corpus file.
        maxlen: Number of timesteps per sentence after padding/truncation.
        num_classes: Width of the one-hot tag vectors. The default of 6
            covers the five tag ids 1..5 plus 0 for padding.
        tokenizer: Optional already-fitted tokenizer. When ``None`` a new one
            is built with ``initTokenizer()``.

    Returns:
        ``(X, y)`` where ``X`` is an int32 array of shape
        ``(n_sentences, maxlen)`` and ``y`` is a one-hot array of shape
        ``(n_sentences, maxlen, num_classes)``.
    """
    raw_data = pd.read_csv(filename, header=None, delimiter='\t')
    # A single join replaces row-by-row string concatenation, which copies
    # the whole accumulated text on every row.
    text = ' '.join(raw_data.iloc[:, 0])
    sentences = re.split(u'[，。！？、‘’“”]/[BMENS]', text)

    datas = []
    labels = []
    for sentence in sentences:
        parsed = get_Xy(sentence)
        if parsed:
            datas.append(parsed[0])
            labels.append(parsed[1])

    if tokenizer is None:
        tokenizer = initTokenizer()
    # word_to_index wraps its id list in a one-element list; unwrap it so
    # each entry is a flat id sequence.
    x_ids = [word_to_index(words, 'content', tokenizer)[0] for words in datas]
    y_ids = [word_to_index(tags, 'sentiment', tokenizer)[0] for tags in labels]
    print("finish word_to_index")

    # Pad all sentences in one call; the result is already 2-D, so no
    # per-row padding and reshaping is needed.
    X = pad_sequences(x_ids, maxlen=maxlen, dtype='int32', padding='post')
    y = pad_sequences(y_ids, maxlen=maxlen, dtype='int32', padding='post')

    # One-hot encode every tag id at once. The explicit target shape (rather
    # than -1) also works when no sentence was parsed.
    y = np_utils.to_categorical(y.ravel(), num_classes)
    y = y.reshape(X.shape[0], maxlen, num_classes)
    return X, y

Using TensorFlow backend.


In [2]:
# Build the padded id sequences (X) and one-hot tag targets (y) from the
# test-split file. The training, evaluation and prediction cells below all
# read these two arrays.
X, y = generate_data('./data/sentiment_word_tagging_test.csv')

finish word_to_index


# 1. 模型训练

In [None]:
# Build and train a bidirectional-LSTM tagger that outputs a 6-way softmax
# for each of the `maxlen` timesteps.
from keras.layers import Dense, Embedding, LSTM, TimeDistributed, Input, Bidirectional
from keras.models import Model

maxlen = 40
word_size = 128
batch_size = 1024

# `all_words` only exists inside initTokenizer(), so it cannot be used here.
# Take the vocabulary size from a fitted tokenizer instead: its ids run from
# 1 to len(word_index), and the +1 leaves room for index 0, the padding value
# that mask_zero=True tells the embedding to mask.
tokenizer = initTokenizer()
vocab_size = len(tokenizer.word_index) + 1

sequence = Input(shape=(maxlen,), dtype='int32')
embedded = Embedding(vocab_size, word_size, input_length=maxlen, mask_zero=True)(sequence)
blstm = Bidirectional(LSTM(64, return_sequences=True), merge_mode='sum')(embedded)
output = TimeDistributed(Dense(6, activation='softmax'))(blstm)
# The Keras 2 keyword names are `inputs`/`outputs`; `input`/`output` are the
# Keras 1 spellings.
model = Model(inputs=sequence, outputs=output)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# NOTE(review): X and y were loaded from sentiment_word_tagging_test.csv in
# the cell above -- confirm whether training should use a separate train file.
history = model.fit(X, y, batch_size=batch_size, epochs=10)


# 2. 模型测试

In [3]:
# Load the saved tagger and score it on the arrays built above.
from keras.models import load_model
model = load_model('./model/sentiment_model.hdf5')
# predict() cannot be called without inputs. The output recorded for this
# cell is a two-number list, which matches evaluate()'s [loss, accuracy]
# result for a model compiled with one metric -- presumably the call that
# originally produced it.
model.evaluate(X, y)




[0.25671975493597587, 0.91665562338003237]

In [6]:
# Show the first padded id sequence. `x1` is only assigned in a later cell,
# so displaying it here fails when the notebook is run top to bottom on a
# fresh kernel; index X directly instead.
X[0]

array([10, 90,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0], dtype=int32)

In [10]:
# Take the first sentence's target and wrap its input as a batch of one
# (shape (1, maxlen)) so it can be fed to the model.
y1 = y[0]
x1 = np.array(X[0], ndmin=2)

predict = model.predict(x1)

In [15]:
# Drop the batch axis: one row of 6 class probabilities per timestep.
predict = np.reshape(predict, (40, 6))


In [23]:
# Class probabilities for the first ten timesteps.
predict[:10]

array([[  1.76341156e-04,   8.56741488e-01,   2.01830436e-02,
          1.45407394e-03,   2.97523191e-04,   1.21147558e-01],
       [  4.59006010e-03,   1.69442650e-02,   4.09273691e-02,
          8.89589727e-01,   1.18091851e-02,   3.61393802e-02],
       [  7.62225222e-03,   1.04676122e-02,   5.20272665e-02,
          8.52759242e-01,   1.55924875e-02,   6.15311526e-02],
       [  7.62225222e-03,   1.04676122e-02,   5.20272665e-02,
          8.52759242e-01,   1.55924875e-02,   6.15311526e-02],
       [  7.62225222e-03,   1.04676122e-02,   5.20272665e-02,
          8.52759242e-01,   1.55924875e-02,   6.15311526e-02],
       [  7.62225222e-03,   1.04676122e-02,   5.20272665e-02,
          8.52759242e-01,   1.55924875e-02,   6.15311526e-02],
       [  7.62225222e-03,   1.04676122e-02,   5.20272665e-02,
          8.52759242e-01,   1.55924875e-02,   6.15311526e-02],
       [  7.62225222e-03,   1.04676122e-02,   5.20272665e-02,
          8.52759242e-01,   1.55924875e-02,   6.15311526e-02],


In [21]:
# Index of the most probable class at every timestep.
res = [np.argmax(step_probs) for step_probs in predict]
res

[1,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3]

In [19]:
# Class probabilities predicted for the first timestep.
predict[0]

array([  1.76341156e-04,   8.56741488e-01,   2.01830436e-02,
         1.45407394e-03,   2.97523191e-04,   1.21147558e-01], dtype=float32)