In [1]:
import os

import numpy as np
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.layers.recurrent import SimpleRNN
from keras.layers.recurrent import LSTM
from keras.layers import Dense,Dropout,Flatten,Conv2D,MaxPooling2D

# Directory that holds the two review corpora (negitive.txt / postive.txt).
# Keep the trailing slash so that `text_path + file_name` forms a valid path.
text_path = '../data/text/'

Using TensorFlow backend.


In [2]:
# Read the review files and label every line: 1 = negative, 0 = positive
def read_files(text_dir=None):
    """Load the raw review lines and build the matching label list.

    The negative reviews are read first, followed by the positive ones, so
    the returned data is NOT shuffled.

    Args:
        text_dir: directory containing ``negitive.txt`` and ``postive.txt``.
            Defaults to the module-level ``text_path``.

    Returns:
        A tuple ``(all_text, all_label)`` where ``all_text`` is a list of raw
        ``bytes`` lines (line endings kept) and ``all_label`` is a list of the
        same length holding 1 for every negative line and 0 for every
        positive line.
    """
    base = text_path if text_dir is None else text_dir

    with open(os.path.join(base, 'negitive.txt'), 'rb') as fp:
        negative_lines = fp.readlines()
    with open(os.path.join(base, 'postive.txt'), 'rb') as fp:
        positive_lines = fp.readlines()

    all_text = negative_lines + positive_lines
    # One label per line, in the same order as all_text.
    all_label = [1] * len(negative_lines) + [0] * len(positive_lines)
    return all_text, all_label

In [3]:
# Build the character dictionary
def create_dict(text_dir=None):
    """Map every character found in the two review files to an integer index.

    Indices are assigned in sorted character order, so the mapping is the
    same on every run (plain ``set`` iteration order changes between
    interpreter sessions). Numbering starts at 1: index 0 is left unused so
    that it can serve as the padding value that ``pad_sequences`` inserts.

    Args:
        text_dir: directory containing ``negitive.txt`` and ``postive.txt``.
            Defaults to the module-level ``text_path``.

    Returns:
        dict mapping each distinct character (including line-ending
        characters present in the files) to an int in ``1..len(result)``.
    """
    base = text_path if text_dir is None else text_dir

    raw = b''
    for file_name in ('negitive.txt', 'postive.txt'):
        # `with` closes the handle; the files are read as bytes and decoded once.
        with open(os.path.join(base, file_name), 'rb') as fp:
            raw += fp.read()

    chars = sorted(set(raw.decode('utf8')))
    return {char: index for index, char in enumerate(chars, start=1)}

In [4]:
# Convert the review texts into sequences of character indices
def create_seq(all_text, dicts=None):
    """Encode every text as a list of character indices.

    Exactly one sequence is produced per input item, in input order, so the
    result stays aligned with a label list built for the same items. A blank
    line therefore yields an empty sequence rather than being dropped.

    Args:
        all_text: iterable of lines, either UTF-8 encoded ``bytes`` or ``str``.
            Leading/trailing whitespace is stripped before encoding.
        dicts: character -> index mapping. When omitted it is built with
            ``create_dict()``.

    Returns:
        A tuple ``(sequences, seq_len, dicts)``: the encoded sequences, the
        length of each sequence, and the mapping that was used.

    Raises:
        KeyError: if a text contains a character missing from ``dicts``.
    """
    if dicts is None:
        dicts = create_dict()

    sequences = []
    for text in all_text:
        text = text.strip()
        if isinstance(text, bytes):
            text = text.decode('utf8')
        sequences.append([dicts[char] for char in text])

    seq_len = [len(sequence) for sequence in sequences]
    return sequences, seq_len, dicts

In [5]:
def text_model(max_seq, max_features):
    """Build and compile a dense classifier on top of a character embedding.

    Layers: Embedding -> Dropout(0.2) -> Flatten -> Dense(256, relu)
    -> Dropout(0.35) -> Dense(1, sigmoid). The layer summary is printed
    before the model is compiled with binary cross-entropy and Adam.

    Args:
        max_seq: fixed length of every input sequence.
        max_features: size of the embedding table (largest input index + 1).

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    embedding = Embedding(
        input_dim=max_features,  # vocabulary size, i.e. largest input index + 1
        output_dim=128,          # width of each embedding vector
        input_length=max_seq,    # fixed input length; required so Flatten -> Dense can infer its shape
    )
    model = Sequential([
        embedding,
        Dropout(0.2),
        Flatten(),
        Dense(256, activation='relu'),
        Dropout(0.35),
        Dense(1, activation='sigmoid'),
    ])
    model.summary()
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [6]:
def text_rnn_model(max_seq, max_features):
    """Build and compile a SimpleRNN classifier on top of a character embedding.

    Layers: Embedding -> Dropout(0.35) -> SimpleRNN(16) -> Dense(256, relu)
    -> Dropout(0.35) -> Dense(1, sigmoid). The layer summary is printed
    before the model is compiled with binary cross-entropy and Adam.

    Args:
        max_seq: fixed length of every input sequence.
        max_features: size of the embedding table (largest input index + 1).

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    embedding = Embedding(
        input_dim=max_features,  # vocabulary size, i.e. largest input index + 1
        output_dim=128,          # width of each embedding vector
        input_length=max_seq,    # fixed length of the input sequences
    )
    model = Sequential([
        embedding,
        Dropout(0.35),
        SimpleRNN(units=16),
        Dense(256, activation='relu'),
        Dropout(0.35),
        Dense(1, activation='sigmoid'),
    ])
    model.summary()
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [7]:
def text_lstm_model(max_seq, max_features):
    """Build and compile an LSTM classifier on top of a character embedding.

    Layers: Embedding -> Dropout(0.35) -> LSTM(32) -> Dense(256, relu)
    -> Dropout(0.35) -> Dense(1, sigmoid). The layer summary is printed
    before the model is compiled with binary cross-entropy and Adam.

    Args:
        max_seq: fixed length of every input sequence.
        max_features: size of the embedding table (largest input index + 1).

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    embedding = Embedding(
        input_dim=max_features,  # vocabulary size, i.e. largest input index + 1
        output_dim=128,          # width of each embedding vector
        input_length=max_seq,    # fixed length of the input sequences
    )
    model = Sequential([
        embedding,
        Dropout(0.35),
        LSTM(units=32),
        Dense(256, activation='relu'),
        Dropout(0.35),
        Dense(1, activation='sigmoid'),
    ])
    model.summary()
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [8]:
all_text, all_label = read_files()

sequences, seq_len, dicts = create_seq(all_text)

print(max(seq_len))
# Cap every sequence at half the length of the longest review.
max_seq = max(seq_len) // 2
print('max_seq:', max_seq)

# One more than the number of known characters, so every index produced by
# the dictionary fits inside the Embedding table.
max_features = len(dicts) + 1

# Pad / truncate every sequence to exactly max_seq entries.
padded = pad_sequences(sequences, maxlen=max_seq)
labels = np.asarray(all_label)

# read_files() returns every negative sample followed by every positive one,
# and Keras carves validation_split off the END of the arrays before it
# shuffles anything. Without this shuffle the validation set comes entirely
# from the positive tail of the data. The fixed seed keeps the split
# reproducible.
rng = np.random.RandomState(42)
order = rng.permutation(len(padded))
train_data = padded[order]
train_labels = labels[order]

#model = text_model(max_seq, max_features)
model = text_rnn_model(max_seq, max_features)
#model = text_lstm_model(max_seq, max_features)

# Optional: resume from previously saved weights.
#try:
#    model.load_weights("saveModel/textModel.h5")
#    print ("加载模型成功")
#except:
#    print ("加载模型失败")

model.fit(train_data, train_labels, validation_split=0.2, batch_size=256, epochs=5, verbose=2)

#model.save_weights("saveModel/textModel.h5")
#print ("保存成功")

500
max_seq: 250
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 250, 128)          405248    
_________________________________________________________________
dropout_1 (Dropout)          (None, 250, 128)          0         
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 16)                2320      
_________________________________________________________________
dense_1 (Dense)              (None, 256)               4352      
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total params: 412,177
Trainable params: 412,177
Non-trainable params: 0
_____________________________________________________

<keras.callbacks.History at 0x1d8f8f7bcf8>

In [9]:
# Smoke test: run the trained model on a few hand-written reviews
test = ["这个货很好，很流畅", "这个东西真好吃，",
        "服务太糟糕,味道差", "你他妈的是个傻逼",
        "这个贴花的款式好看",
        "看着不错，生产日期也是新的是16年12月份的，就是有点小贵",
        "一股淡淡的腥味.每次喝完都会吃一口白糖",
        "还没喝，不过，看着应该不错哟",
        "用来看电视还是不错的，就是有些大打字不习惯，要是可以换输入法就好了！",
        "嗯，中间出了点小问题已经联系苹果客服解决了，打游戏也没有卡顿，总体来讲还不错吧！",
        "下软件下的多的时候死了一回机，强制重启之后就恢复了",
        "东西用着还可以很流畅！"]

# Encode each sentence with the training dictionary. Characters that never
# appeared in the training files have no index, so they are dropped instead
# of aborting the whole cell with a KeyError.
test_sequences = [[dicts[char] for char in line if char in dicts] for line in test]

test_data = pad_sequences(test_sequences, maxlen=max_seq)

# Single forward pass: `result` holds the sigmoid output for each sentence.
result = model.predict(test_data)
# Threshold at 0.5 to get the class (the rule predict_classes applied for a
# single sigmoid unit), without running the model a second time.
predict = (result > 0.5).astype('int32')
# Flatten to a 1-D array of class ids
predict_classes = predict.reshape(-1)
sentimentDict = {1:'负面的',0:'正面的'}

for i, sentence in enumerate(test):
    print(sentence, '[预测结果:',sentimentDict[int(predict_classes[i])], ']', '[Probability:', result[i], ']')
    print()

这个货很好，很流畅 [预测结果: 正面的 ] [Probability: [ 0.00202224] ]

这个东西真好吃， [预测结果: 正面的 ] [Probability: [ 0.35236385] ]

服务太糟糕,味道差 [预测结果: 负面的 ] [Probability: [ 0.99977666] ]

你他妈的是个傻逼 [预测结果: 负面的 ] [Probability: [ 0.87382108] ]

这个贴花的款式好看 [预测结果: 正面的 ] [Probability: [ 0.02073266] ]

看着不错，生产日期也是新的是16年12月份的，就是有点小贵 [预测结果: 正面的 ] [Probability: [ 0.29493278] ]

一股淡淡的腥味.每次喝完都会吃一口白糖 [预测结果: 负面的 ] [Probability: [ 0.75243485] ]

还没喝，不过，看着应该不错哟 [预测结果: 正面的 ] [Probability: [ 0.05180429] ]

用来看电视还是不错的，就是有些大打字不习惯，要是可以换输入法就好了！ [预测结果: 正面的 ] [Probability: [ 0.02825704] ]

嗯，中间出了点小问题已经联系苹果客服解决了，打游戏也没有卡顿，总体来讲还不错吧！ [预测结果: 负面的 ] [Probability: [ 0.5875963] ]

下软件下的多的时候死了一回机，强制重启之后就恢复了 [预测结果: 负面的 ] [Probability: [ 0.99715388] ]

东西用着还可以很流畅！ [预测结果: 正面的 ] [Probability: [ 0.0041736] ]

