In [1]:
# 导入numpy 
import numpy as np
# 导入word2vec 文字转向量包
from gensim.models.word2vec import Word2Vec
from gensim.corpora.dictionary import Dictionary
# 导入jieba分词
import jieba
import jieba.analyse
from keras.utils import to_categorical

Using TensorFlow backend.


In [2]:
# 从文本读取数据
def load_data(file):
    data = open(file)
    return data.readlines()

In [37]:
data = load_data('data3.txt')
# 处理文本数据分离文本和标签
X=[]
Y_=[]
for i in data:
    text = i.split('***',1)
    if len(text) == 2:
        X.append(text[0]) 
        Y_.append(text[1].replace('\n',''))
Y_value = list(set(Y_))
class_data = {i:Y_value.index(i) for i in Y_value}
Y_data = [class_data[i] for i in Y_]
# 转为onehot编码
Y = to_categorical(np.array(Y_data))
print(Y)
# 处理文本
sentences_list = []
for line in X:
    single_list = line.strip().split(' ')
    single_list = jieba.analyse.extract_tags(single_list[0],topK=20,withWeight=False,allowPOS=())
    while '' in single_list:
        single_list.remove('')
    sentences_list.append(single_list)
print(sentences_list)
with open('class_data.txt','w') as f:
    f.write(str(class_data))

[[0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [11]:
# 调用word2vec 模型返回字典和词的向量

def create_dictionaries(model):
    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.wv.vocab.keys(),allow_update=True)
    w2indx = {v:k+1 for k,v in gensim_dict.items()}
    w2vec = {word:model[word] for word in w2indx.keys()}
    return w2indx,w2vec

In [39]:
# word2vec 方法转向量
model = Word2Vec(sentences_list,size=100,min_count=5,window=5)
index_dict,word_vectors = create_dictionaries(model)

with open('index_dict.txt','w') as f:
    f.write(str(index_dict))

  import sys


In [13]:
# 生成词嵌入向量
# 把上边一个词向量的长度转为100
n_symbols = len(index_dict) +1
embedding_weights = np.zeros((n_symbols,100))
for w,index in index_dict.items():
    embedding_weights[index,:] = word_vectors[w]
print(embedding_weights.shape)
# 词有7767个

(13745, 100)


In [14]:
# 在字典中找到词返回索引
# 文本和词典匹配将我们的词特征转为数字
def text_to_index_array(dic,sentence):
    new_sentence = []
    for sen in sentence:
        new_sen = []
        for word in sen:
            try:
                new_sen.append(dic[word])
            except:
                new_sen.append(0)
        new_sentence.append(new_sen)
    return np.array(new_sentence)

x = text_to_index_array(index_dict,sentences_list)
print(x[10])

[33, 484, 9854, 3471, 2577, 3804, 6853, 4996, 3005, 10648]


In [15]:
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense,Dropout,Activation
from sklearn.model_selection import train_test_split

In [16]:
# 划分数据集生成训练和测试4:1
# sklearn包划分数据
x_train,x_test,y_train,y_test = train_test_split(x,Y,test_size=0.2)
# 把x输入特征标准化，不够的补0是每一个输入的x 为50长度
x_train = sequence.pad_sequences(x_train,maxlen=50)
x_test = sequence.pad_sequences(x_test,maxlen=50)
x_train = np.array(x_train)
x_test = np.array(x_test)
y_train = np.array(y_train)
y_test = np.array(y_test)
print(x_train.shape)

(88053, 50)


In [30]:
model = Sequential()
# 输入特征
model.add(Embedding(output_dim=100,input_dim=n_symbols,mask_zero=True,weights=[embedding_weights]))
# model.add(Dense(12))
# lstm隐藏
model.add(LSTM(output_dim=50,activation='relu',inner_activation='hard_sigmoid'))
# 随机失活
model.add(Dropout(0.5))
# 全连接层 输出12个分类
model.add(Dense(12))
# softmax激活
model.add(Activation('softmax'))
# 多分类损失函数，梯度下降
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

  


In [31]:
model.fit(x_train,y_train,batch_size=128,epochs=20,validation_data=(x_test,y_test))
score,acc = model.evaluate(x_test,y_test,batch_size=128)
print(score,acc)
print(model.summary())

Train on 88053 samples, validate on 22014 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
1.184155183266454 0.8137548831748824
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, None, 100)         1374500   
_________________________________________________________________
lstm_7 (LSTM)                (None, 50)                30200     
_________________________________________________________________
dropout_7 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 12)                612       
_________________________________________________________________
activation_7 (Activati

In [32]:
model.save('weight.h5')

In [33]:
from keras.models import load_model

this_model = load_model('weight.h5')

In [35]:
# 直接输入一句话对其进行预测
def convert_vector_predict(str_r):
    new_str = jieba.analyse.extract_tags(str_r,topK=20,withWeight=False,allowPOS=())
#     print(new_str)
    x = text_to_index_array(index_dict,[new_str])
    x = sequence.pad_sequences(x,maxlen=50)
#     print(x)
    y = this_model.predict_classes(x)
    return y
value = convert_vector_predict('孩子出不出色，关注在于这个……')
print([k for k,v in class_data.items() if v==value[0]][0])

通知提醒
