In [1]:
import pandas as pd
import os
import re
import numpy as np
import jieba
from gensim.models import word2vec
import multiprocessing
from gensim.corpora.dictionary import Dictionary
from keras.preprocessing import sequence
from sklearn.cross_validation import train_test_split
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Dropout,Activation
from keras.models import model_from_yaml
import yaml

Using TensorFlow backend.


In [2]:
#读取数据
data = pd.read_csv('app_data_new.csv',sep=None,encoding='utf8',engine = 'python')

In [3]:
app_tags_list = data['app_tags']

In [4]:
#APP标签
app_tags = []
for d in app_tags_list:
    app_tag = d.strip().split()
    for x in app_tag:
        app_tags.append(x)

In [5]:
#标签去重
app_tags = list(set(app_tags))

In [6]:
#定义函数增0列
def get_zero_list():
    zero_list = []
    for i in range(len(app_tags)):
        zero_list.append(0)
    return zero_list

In [7]:
#APP标签0-1向量化为列表
app_tags_matrix = []
zero_list = get_zero_list()
for a in app_tags_list:
    zero_list = get_zero_list()
    for per_tag in a:
        for app_tag in app_tags:
            if(per_tag == app_tag):
                zero_list[app_tags.index(app_tag)] = 1
    app_tags_matrix.append(zero_list)

In [8]:
#APP描述
app_discribe = data['app_discribe']

In [9]:
#创建停用词列表
def stopwordslist(filepath):  
    stopwords = [line.strip() for line in open(filepath, 'r').readlines()]  
    return stopwords 
def dt_remove(data):
    pattern = u'[a-zA-Z0-9]+'
    wd_length = len(re.findall(pattern,data))
    return wd_length

In [10]:
#jieba分词
def seg_sentence(sentence):  
    sentence_seged = jieba.cut(sentence.strip())
    stopwords = stopwordslist('stopwords.txt')   
    outstr = []
    for word in sentence_seged:  
        if word not in stopwords:  
            if word != '\t': 
                if word != '\xa0':
                    if dt_remove(word) ==0:
                        outstr.append(word) 
    return outstr 

In [11]:
#对APP描述进行分词
app_discribe_list = []
for app in app_discribe:
    app_discribe_list.append(seg_sentence(app))
with open('app_discribe_cut_list.txt','w',encoding='utf-8') as output:
    for app in app_discribe_list: 
        output.write(' '.join(app) + '\n')

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Ax\AppData\Local\Temp\jieba.cache
Loading model cost 0.571 seconds.
Prefix dict has been built succesfully.


In [12]:
model = word2vec.Word2Vec(app_discribe_list,min_count=2,size=100) 
model.save('Word2vec_model20180108.pkl')

In [13]:
#创建词语字典，并返回word2vec模型中词语的索引，词向量
def create_dictionaries(model):
    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.wv.vocab.keys(), allow_update=True)
    w2indx = {v: k + 1 for k, v in gensim_dict.items()} #词语的索引，从1开始编号
    w2vec = {word: model[word] for word in w2indx.keys()} #词语的词向量
    return w2indx, w2vec

In [14]:
#词向量模型训练
index_dict, word_vectors= create_dictionaries(model)

In [15]:
#每个APP描述对应的词索引
discribe_index = []
for sentence in app_discribe_list:
    new_txt = []
    for word in sentence:
        try:
            new_txt.append(w2indx[word])
        except:
            new_txt.append(0)
    discribe_index.append(new_txt)
discribe_index = sequence.pad_sequences(discribe_index, maxlen=100)

In [16]:
label_array = np.array(app_tags_matrix)
n_symbols = len(index_dict) + 1  #所有单词的索引数，频数小于10的词语索引为0，所以加1
embedding_weights = np.zeros((n_symbols, 100)) #索引为0的词语，词向量全为0
for word, index in index_dict.items(): #从索引为1的词语开始，对每个词语对应其词向量
    embedding_weights[index, :] = word_vectors[word]
x_train, x_test, y_train, y_test = train_test_split(discribe_index, label_array, test_size=0.2)

In [21]:
#定义网络结构
def train_lstm(n_symbols, embedding_weights, x_train, y_train, x_test, y_test):
    model = Sequential()
    model.add(Embedding(output_dim=100, input_dim=n_symbols, mask_zero=True,weights=[embedding_weights], input_length=100)) 
    model.add(LSTM(output_dim=100, activation='sigmoid', inner_activation='hard_sigmoid'))
    model.add(Dropout(0.5))
    model.add(Dense(4703))
    model.add(Activation('sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])
    model.fit(x_train, y_train, batch_size=1000, verbose=1,epochs=3, shuffle=True, validation_split=0.2)
    score = model.evaluate(x_test, y_test, batch_size=100)
    yaml_string = model.to_yaml()
    model.save_weights('lstm.h5')
    print('Test score:', score)

In [17]:
model = Sequential()
model.add(Embedding(output_dim=100, input_dim=n_symbols, mask_zero=True,weights=[embedding_weights], input_length=100)) 
model.add(LSTM(output_dim=100, activation='sigmoid', inner_activation='hard_sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(4703))
model.add(Activation('sigmoid'))
#model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=1000, verbose=1,epochs=3, validation_split=0.1)
score = model.evaluate(x_test, y_test, batch_size=1000)
yaml_string = model.to_yaml()
model.save_weights('lstm.h5')
print('Test score:', score)

  This is separate from the ipykernel package so we can avoid doing imports until


Train on 108394 samples, validate on 12044 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Test score: [0.69297343492507935, 0.99997634471591146]


In [22]:
train_lstm(n_symbols,embedding_weights,x_train,y_train,x_test,y_test)

  """


Train on 96350 samples, validate on 24088 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3
Test score: [0.69299240149520469, 0.99997630403588988]
