In [58]:
#LSTM 原理展示

![lstm展示](lstm.png)

In [59]:
from sklearn.model_selection import train_test_split #划分train test
import multiprocessing #多进程模块
import numpy as np #numpy 模块
import gensim
from gensim.models.word2vec import Word2Vec #引入word2vec
from gensim.corpora.dictionary import Dictionary #引入字典

In [60]:
from keras.preprocessing import sequence 
from keras.models import Sequential #序列模型
from keras.layers import Bidirectional #双向模块
from keras.callbacks import EarlyStopping #早停机制
from keras.layers.embeddings import Embedding #嵌入层
from keras.layers.recurrent import LSTM #循环神经网络
from keras.layers.core import Dense, Dropout,Activation #全联接层，dropout防过拟合， 激活层
from keras.models import load_model #导入模型层

In [61]:
import jieba #结巴分词
import pandas as pd #pandas包，数据表格处理模块
import yaml #数据序列化

In [62]:
def is_chinese(uchar):
#   判断一个unicode是否是汉字
    if uchar >= u'\u4e00' and uchar <= u'\u9fa5':
        return True
    else:
        return False

In [63]:
def get_data_df():
#   获取正负例dataframe 和 停用词list
    neg_df=pd.read_table('neg.txt',sep='\t',names=['label','chat'])
    neg_df['label'] = 0
    pos_df=pd.read_table('pos.txt',sep='\t',names=['label','chat'])
    stop_list=list(set(pd.read_table('chinese-stopword.txt',sep='\t',names=['stop'])['stop']))
    return neg_df,pos_df,stop_list

In [64]:
def get_xdata_label(method_select):
#   词频特征的向量化
    neg_df,pos_df,stop_list = get_data_df()
    neg_df['cut']=neg_df['chat'].map(str).apply(method_select)
    del neg_df['chat']
    pos_df['cut']=pos_df['chat'].map(str).apply(method_select)
    del pos_df['chat']
#     neg_df,pos_df = get_cut_word(method_select)
    xdata=[]
    ylabel=[]
    neg_pos = pd.concat([neg_df,pos_df])
    for i in range(len(neg_pos)):
        ixdata = neg_pos.iloc[i,1]
        ilabel = neg_pos.iloc[i,0]
        if ixdata!='':
            xdata.append(ixdata)
            ylabel.append(ilabel)
    return xdata,ylabel

In [65]:
def jieba_getdata(str_str):
#   结巴切词
    stopwords=list(set(pd.read_table('chinese-stopword.txt',sep='\t',names=['stop'])['stop']))
    cut_list=jieba.lcut(str_str)
    cut_list=[iword for iword in cut_list if iword not in stopwords]
    cut_list=[iword for iword in cut_list if is_chinese(iword) is True]
    cut_str=' '.join(cut_list)
    return cut_str

In [66]:
def create_dictionaries(p_model):
    # 根据现有的word2vec模型构建词向量
    gensim_dict = Dictionary()
    gensim_dict.doc2bow(p_model.wv.vocab.keys(), allow_update=True) #doc2bow, 计算机只认识数字
    w2indx = {v: k + 1 for k, v in gensim_dict.items()}  # 词语的索引，从1开始编号,频数较小的词语索引为0
    w2vec = {word: p_model[word] for word in w2indx.keys()}  # 词语的词向量
    return w2indx, w2vec

In [67]:
def parse_dataset(xdata,w2indx):
#   把word变为整数
    data=[]
    for sentence in xdata:
        new_txt = []
        for word in sentence:
            try:
                new_txt.append(w2indx[word])
            except:
                new_txt.append(0)
        data.append(new_txt)
    return data

In [68]:
def input_transform(string,p_model):
    words=jieba.lcut(string)
    words=np.array(words).reshape(1,-1)
    _,_,combined=create_dictionaries(p_model,words)
    return combined

In [69]:
p_model = gensim.models.Word2Vec.load('w2v_input.model') #导入w2v
w2indx, w2vec = create_dictionaries(p_model) #构建字典



In [70]:
xdata,ylabel=get_xdata_label(jieba_getdata) # 分词

In [None]:
xdata

In [71]:
data = parse_dataset(xdata,w2indx) #word转id

In [None]:
data

In [72]:
train_data= sequence.pad_sequences(data, maxlen=100) #padding 填充

In [27]:
train_data.shape

(9897, 100)

In [73]:
x_train, x_test, y_train, y_test = train_test_split(train_data, ylabel, test_size=0.2) #划分数据集

In [36]:
my_callbacks = [EarlyStopping(monitor='accuracy', patience=2, verbose=1, mode='max')]

# '''
# 如果epoch数量太少，网络有可能发生欠拟合（即对于定型数据的学习不够充分）；
# 如果epoch数量太多，则有可能发生过拟合（即网络对定型数据中的“噪声”而非信号拟合）。

# 早停法旨在解决epoch数量需要手动设置的问题。它也可以被视为一种能够避免网络发生过拟合的正则化方法（与L1/L2权重衰减和丢弃法类似）。

# 目的还是解决过拟合！
# 早停法背后的原理其实不难理解：

# 将数据分为定型集和测试集
# 每个epoch结束后（或每N个epoch后）：
# 用测试集评估网络性能
# 如果网络性能表现优于此前最好的模型：保存当前这一epoch的网络副本
# 将测试性能最优的模型作为最终网络模型
# keras.callbacks.EarlyStopping(monitor='val_loss', patience=0, verbose=0, mode='auto')

# monitor：需要监视的值

# verbose：信息展示模式，0或1

# save_best_only：当设置为True时，将只保存在验证集上性能最好的模型

# mode：‘auto’，‘min’，‘max’之一，在save_best_only=True时决定性能最佳模型的评判准则，例如，当监测值为val_acc时，模式应为max，当检测值为val_loss时，模式应为min。在auto模式下，评价准则由被监测值的名字自动推断。

# '''

In [37]:
vocab_dim = 128
n_symbols = len(w2indx)+1 # 词典数+1
batch_size = 64
n_epoch = 3

embedding_weights = np.zeros((n_symbols, 128))#索引为0的词语，词向量全为0
for word, index in w2indx.items():#从索引为1的词语开始，对每个词语对应其词向量
    embedding_weights[index, :] = w2vec[word]

model = Sequential()  # or Graph or whatever
model.add(Embedding(output_dim=vocab_dim,
                    input_dim=n_symbols,
                    mask_zero=True,
                    weights =[embedding_weights],
                    input_length=100))  # Adding Input Length
model.add(Bidirectional(LSTM(output_dim=50, activation='relu')))
model.add(Dropout(0.5))
model.add(Dense(1)) # 一个神经元
model.add(Activation('sigmoid'))



In [38]:
n_symbols

12028

In [39]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 128)          1539584   
_________________________________________________________________
bidirectional_3 (Bidirection (None, 100)               71600     
_________________________________________________________________
dropout_3 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 101       
_________________________________________________________________
activation_3 (Activation)    (None, 1)                 0         
Total params: 1,611,285
Trainable params: 1,611,285
Non-trainable params: 0
_________________________________________________________________


In [40]:
print ('Compiling the Model...')
model.compile(loss='binary_crossentropy',
              optimizer='adam',metrics=['accuracy'])

print ("Train...")
model.fit(x_train, np.array(y_train), batch_size=batch_size, nb_epoch=n_epoch,
          verbose=1, validation_data=(x_test, np.array(y_test)), callbacks=my_callbacks)

print ("Evaluate...")
score = model.evaluate(x_test, np.array(y_test),
                            batch_size=batch_size)

Compiling the Model...
Train...




Train on 7917 samples, validate on 1980 samples
Epoch 1/3
Epoch 2/3




Epoch 3/3
Evaluate...


In [None]:
## predict

In [45]:
string = '标准间太差房间还不如3星的而且设施非常陈旧.建议酒店把老的标准间从新改善'
# string = '我住的行政房是全新的，有五星级标准*v*。只可惜楼顶的中餐厅在停业装修，我挺喜欢这个餐厅的，正宗广东菜，希望下次来时复业了。似乎酒店花了很多精力在装修更新。'
str_list = jieba_getdata(string)
new_txt = []
for word in str_list:
    try:
        new_txt.append(w2indx[word])
    except:
        new_txt.append(0)

pre_text = sequence.pad_sequences([new_txt], maxlen=100)

In [46]:
pre_text

array([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,  7381,  2476, 11393,     0,  4297,
            0,  5040,     0,  5891, 11393,     0,  6889,     0, 10294,
         6695,     0, 11452,  6802,     0,  5312,     0,     0, 11156,
         5269,     0,  7381,  2476, 11393,     0,  6651,     0,  6513,
            0]], dtype=int32)

In [47]:
result=model.predict_proba(pre_text)
if result[0][0]>0.5:
    print (string,'positive')
else:
    print (string,'negative')

标准间太差房间还不如3星的而且设施非常陈旧.建议酒店把老的标准间从新改善 positive


In [48]:
result

array([[0.57297057]], dtype=float32)

In [None]:
############## text CNN ###########

![textcnn](textcnn.png)

In [74]:
from keras import Input, Model
from keras.layers import Embedding, Dense, Conv1D, GlobalMaxPooling1D, Concatenate, Dropout

In [75]:
maxlen = 100
n_symbols = len(w2indx)+1 # 词典数+1
input = Input((maxlen,))

In [76]:
embedding = Embedding(n_symbols, 128, weights = [embedding_weights],input_length=maxlen)(input)

In [77]:
convs = []
for kernel_size in [3, 4, 5]:
    c = Conv1D(64, kernel_size, activation='relu')(embedding) #卷积
    c = GlobalMaxPooling1D()(c)
    convs.append(c)
x = Concatenate()(convs)

output = Dense(1, activation='relu')(x)
model = Model(inputs=input, outputs=output)

In [78]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 100, 128)     1539584     input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_4 (Conv1D)               (None, 98, 64)       24640       embedding_5[0][0]                
__________________________________________________________________________________________________
conv1d_5 (Conv1D)               (None, 97, 64)       32832       embedding_5[0][0]                
__________________________________________________________________________________________________
conv1d_6 (

In [79]:
batch_size = 128
epochs = 3

In [80]:
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

In [81]:
early_stopping = EarlyStopping(monitor='val_acc', patience=3, mode='max')
model.fit(x_train, np.array(y_train),
          batch_size=batch_size,
          epochs=epochs,
          callbacks=[early_stopping],
          validation_data=(x_test, np.array(y_test)))

Train on 7917 samples, validate on 1980 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x10c028dd8>

In [84]:
# string = '标准间太差房间还不如3星的而且设施非常陈旧.建议酒店把老的标准间从新改善'
string = '我住的行政房是全新的，有五星级标准*v*。只可惜楼顶的中餐厅在停业装修，我挺喜欢这个餐厅的，正宗广东菜，希望下次来时复业了。似乎酒店花了很多精力在装修更新。'
str_list = jieba_getdata(string)
new_txt = []
for word in str_list:
    try:
        new_txt.append(w2indx[word])
    except:
        new_txt.append(0)

pre_text = sequence.pad_sequences([new_txt], maxlen=100)

In [85]:
result=model.predict(pre_text)
if result[0][0]>0.5:
    print (string,'positive')
else:
    print (string,'negative')

我住的行政房是全新的，有五星级标准*v*。只可惜楼顶的中餐厅在停业装修，我挺喜欢这个餐厅的，正宗广东菜，希望下次来时复业了。似乎酒店花了很多精力在装修更新。 positive
