In [1]:
from keras.layers.core import Activation,Dense
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
import nltk#分词
import collections#用来统计词频
import numpy as np



In [35]:
# First pass over the corpus: find out how many distinct words the data
# contains and how many tokens the longest sentence has.
# Each non-empty line is expected to be "<label>\t<sentence>".
maxlen = 0                          # token count of the longest sentence seen
word_freqs = collections.Counter()  # token -> number of occurrences
num_recs = 0                        # number of samples (non-empty lines)
# The file is only read, so open it read-only ('r+' would also demand write
# permission on the data file).
with open('train_data.txt', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        # Split on the first tab only, so a tab inside the sentence text
        # does not break the two-value unpacking.
        label, sentence = line.split('\t', 1)
        words = nltk.word_tokenize(sentence.lower())
        maxlen = max(maxlen, len(words))
        word_freqs.update(words)
        num_recs += 1
    
            


In [36]:
# Report the longest sentence length (in tokens) and the number of distinct tokens.
print('max_len:{}'.format(maxlen), 'nb_words:{}'.format(len(word_freqs)), sep='\n')

max_len:42
nb_words:2330


In [37]:
# Based on the number of distinct words (nb_words), the vocabulary is capped at a
# fixed size: words outside the vocabulary are replaced by the pseudo-word UNK.
# Based on the maximum sentence length (max_len), all sentences are brought to a
# common length, with short sentences padded with 0.
# The vocabulary holds the MAX_FEATURES (2000) most frequent training words plus
# the pseudo-word UNK and the padding entry 0, i.e. 2002 entries for this corpus.
# The maximum sentence length MAX_SENTENCE_LENGTH is set to 40.
MAX_FEATURES=2000
MAX_SENTENCE_LENGTH=40

In [38]:
# Build the two lookup tables, word2index and index2word, used to convert
# between words and integer ids. Ids 0 and 1 are reserved for the padding
# entry and the unknown-word entry, so real words are numbered from 2 upwards
# in order of decreasing frequency.
vocab_size = 2 + min(len(word_freqs), MAX_FEATURES)
word2index = {
    token: idx
    for idx, (token, _) in enumerate(word_freqs.most_common(MAX_FEATURES), start=2)
}
word2index.update({'PAD': 0, 'UNK': 1})
index2word = dict(zip(word2index.values(), word2index.keys()))


In [39]:
# Second pass over the corpus: use the lookup table to turn every sentence into
# a sequence of word ids, then bring all sequences to MAX_SENTENCE_LENGTH
# (shorter ones are filled with 0, longer ones are cut).
# The rows are collected in plain lists, so this cell does not rely on a record
# count or a running index left over from another cell.
unk_index = word2index['UNK']
encoded_sentences = []  # one list of word ids per sample
label_values = []       # integer class label per sample
# The file is only read, so open it read-only.
with open('train_data.txt', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        # Split on the first tab only, so a tab inside the sentence text
        # does not break the two-value unpacking.
        label, sentence = line.split('\t', 1)
        words = nltk.word_tokenize(sentence.lower())
        # Words without an entry in the lookup table map to the UNK id.
        encoded_sentences.append([word2index.get(word, unk_index) for word in words])
        label_values.append(int(label))
X = sequence.pad_sequences(encoded_sentences, maxlen=MAX_SENTENCE_LENGTH)
# Labels are kept as a float64 vector, matching the previous np.zeros-based array.
y = np.array(label_values, dtype=np.float64)
        
    

In [41]:
# Hold out 20% of the encoded sentences and their labels (test_size=0.2);
# random_state is fixed so the same split is produced on every run.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

In [43]:
# With the data prepared, the model can be defined. The loss function is
# binary_crossentropy and the optimizer is adam. EMBEDDING_SIZE,
# HIDDEN_LAYER_SIZE and the training-time BATCH_SIZE and NUM_EPOCHS are
# hyperparameters tuned by experience over several runs.
EMBEDDING_SIZE = 128
HIDDEN_LAYER_SIZE = 64

# Notes on the Embedding layer (it can only be used as the first layer of a model):
#   input_dim: integer >= 0, the dictionary size, i.e. the largest input
#       index + 1.
#   output_dim: integer > 0, the dimension of the dense embedding.
#   embeddings_initializer: initialisation method for the embedding matrix,
#       given as the name of a predefined initialiser or as an initialiser
#       object (see initializers).
#   embeddings_regularizer: regulariser for the embedding matrix, a
#       Regularizer object.
#   embeddings_constraint: constraint for the embedding matrix, a Constraints
#       object.
#   mask_zero: boolean, whether the input value 0 is treated as padding that
#       should be ignored. Useful when recurrent layers process variable-length
#       input. If True, every later layer in the model must support masking or
#       an exception is raised; index 0 then cannot be used in the dictionary
#       and input_dim should be set to |vocabulary| + 1.
#   input_length: the length of the input sequences when it is fixed. It must
#       be given if a Flatten layer followed by a Dense layer comes after this
#       layer, otherwise the Dense output dimension cannot be inferred.
#
# Note on the LSTM layer:
#   recurrent_dropout: float between 0 and 1, the fraction of units dropped
#       for the linear transformation of the recurrent state.
model = Sequential([
    Embedding(vocab_size, EMBEDDING_SIZE, input_length=MAX_SENTENCE_LENGTH),
    LSTM(HIDDEN_LAYER_SIZE, dropout=0.2, recurrent_dropout=0.2),
    Dense(1),
    Activation('sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [45]:
# Training hyperparameters: mini-batch size and number of passes over Xtrain.
BATCH_SIZE=32
NUM_EPOCHS=10
# NOTE(review): Xtest/ytest are passed as validation data here and are also the
# set given to model.evaluate in the evaluation cell, so the reported test score
# is not measured on data kept apart from training-time monitoring -- confirm
# this is intended.
model.fit(Xtrain,ytrain,batch_size=BATCH_SIZE,epochs=NUM_EPOCHS,
         validation_data=(Xtest,ytest))

Train on 5668 samples, validate on 1418 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x8eead30>

In [55]:
# Use the trained LSTM on the held-out split to see how well it does: print the
# overall loss and accuracy, then the predicted label, true label and original
# sentence for 5 randomly drawn held-out samples.

score,acc=model.evaluate(Xtest,ytest,batch_size=BATCH_SIZE)
print("\nTest score : %.3f,accuracy:%.3f"%(score,acc))

print('{}  {}    {}'.format('预测','真实','句子'))
for i in range(5):
    idx=np.random.randint(len(Xtest))
    # Turn the single sample into a batch of one. The row length is taken from
    # the data itself (-1) rather than hard-coded as 40, so this keeps working
    # when MAX_SENTENCE_LENGTH is changed.
    xtest=Xtest[idx].reshape(1,-1)

    ylabel=ytest[idx]

    # predict returns one row per sample with a single sigmoid output.
    ypred=model.predict(xtest)[0][0]

    # Id 0 is padding, so it is dropped when rebuilding the sentence text.
    sent=' '.join([index2word[x] for x in xtest[0] if x!=0])
    print(' {}      {}     {}'.format(int(round(ypred)), int(ylabel), sent))



Test score : 0.048,accuracy:0.991
预测  真实    句子
 1      1     i am going to start reading the harry potter series again because that is one awesome story .
 1      1     the last stand and mission impossible 3 both were awesome movies .
 1      1     mission impossible 3 was awesome..
 0      0     i think i hate harry potter because it outshines much better reading material out there and the movies are just plain stupid to begin with .
 1      1     sunday before that we went and saw mission impossible 3 so that was awesome .


In [57]:
# Classify a couple of hand-written sentences with the trained model.
INPUT_SENTENCES = ['I love reading.', 'You are so boring.']
label2word = {1: '积极', 0: '消极'}

# Encode each sentence exactly like the training data: lower-case, tokenise,
# map every token to its id (UNK id for tokens missing from the lookup table),
# then pad/cut to MAX_SENTENCE_LENGTH.
XX = [
    [word2index.get(token, word2index['UNK']) for token in nltk.word_tokenize(text.lower())]
    for text in INPUT_SENTENCES
]
XX = sequence.pad_sequences(XX, maxlen=MAX_SENTENCE_LENGTH)

# Round each sigmoid output to a 0/1 class label and print it next to its sentence.
labels = [int(round(row[0])) for row in model.predict(XX)]
for label, text in zip(labels, INPUT_SENTENCES):
    print('{}   {}'.format(label2word[label], text))

积极   I love reading.
消极   You are so boring.
