## Session 5: 训练情感分类模型 (Train an emotion classifier with keras)

___

### 1. set up your data

In [121]:
from auto_everything.base import Terminal
t = Terminal()

commands = """
mkdir data
cd data

mkdir positive
mkdir negative

cd ..
cp data.txt data/negative/data.txt
cp another_data.txt data/positive/another_data.txt
"""

t.run(commands)

mkdir: cannot create directory ‘data’: File exists
mkdir: cannot create directory ‘positive’: File exists
mkdir: cannot create directory ‘negative’: File exists



___

### 2. build model

#### import packages

In [122]:
from os import listdir
from os.path import isfile, join
from pprint import pprint
from auto_everything.base import IO
io = IO()

import jieba
import codecs
import pickle
import random

from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU
from keras.preprocessing.text import Tokenizer
from keras.layers.core import Dense
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

import numpy as np

#### define save or load function for reuse var

In [123]:
def __pickleStuff(filename, stuff):
    save_stuff = open(filename, "wb")
    pickle.dump(stuff, save_stuff)
    save_stuff.close()
    
def __loadStuff(filename):
    saved_stuff = open(filename,"rb")
    stuff = pickle.load(saved_stuff)
    saved_stuff.close()
    return stuff

#### load data

In [124]:
dataBaseDirPos = "./data/positive/"
dataBaseDirNeg = "./data/negative/"

positiveFiles = [dataBaseDirPos + f for f in listdir(dataBaseDirPos) if isfile(join(dataBaseDirPos, f)) and '.txt' in f]
negativeFiles = [dataBaseDirNeg + f for f in listdir(dataBaseDirNeg) if isfile(join(dataBaseDirNeg, f)) and '.txt' in f]

print(positiveFiles)
print(negativeFiles)

['./data/positive/msgs.txt', './data/positive/another_data.txt']
['./data/negative/data.txt', './data/negative/BilibiliComments.txt']


In [125]:
documents = []
positive_nums = 0
negative_nums = 0

for filename in positiveFiles:
    all_text = io.read(filename)
    text_list = all_text.split("\n\n——————————————\n\n")
    for text in text_list:
        documents.append((text, "pos"))
        positive_nums += 1

for filename in negativeFiles:
    all_text = io.read(filename)
    text_list = all_text.split("\n\n——————————————\n\n")
    for text in text_list:
        documents.append((text, "neg"))
        negative_nums += 1

print('positive_nums:', positive_nums)
print('negative_nums:', negative_nums)
pprint(documents[:3])

positive_nums: 7975
negative_nums: 10600
[('我对垃圾的断绝能力一直很低\n\n导致我在现实中经常很不爽\n\n\n\n\n要是拒绝可以更坚决一点，就没那么多伤害了', 'pos'),
 ('喜剧之王 一点都不好看', 'pos'),
 ('构建一套系统真的没那么容易\n'
  '\n'
  '比如 找工作APP\n'
  '\n'
  '\n'
  '\n'
  '\n'
  '\n'
  '如何构建一个诚信机制，既能让没有任何认证的人找到工作，又不让企业吃亏\n'
  '\n'
  '\n'
  '\n'
  '(淘宝是怎么做的？让人数少的想赚钱的商家交保证金，人数多的消费者不交钱；\n'
  '\n'
  '当把这一套逻辑放在程序员身上，各种问题：\n'
  '\n'
  '万一公司想空手套白狼招人免费干活怎么办？\n'
  '\n'
  '万一有一些傻逼啥都不会恶意给企业差评怎么办？\n'
  '\n'
  '万一有企业派人刷好评洗白怎么办？\n'
  '\n'
  '这些都是问题)',
  'pos')]


#### shuffle data

In [126]:
random.shuffle(documents)

pprint(documents[:3])

[('好像已经被盗了', 'neg'), ('龙哥日常忘词', 'neg'), ('腿超伸怎么办', 'neg')]


#### prepare data for model 1

In [127]:
# Tokenize only
totalX = []
totalY = [str(document[1]) for document in documents]

for document in documents:
    seg_list = jieba.cut(document[0], cut_all=False)
    seg_list = list(seg_list)
    totalX.append(seg_list)
    
print(totalX[:1])
print(totalY[:1])

[['好像', '已经', '被盗', '了']]
['neg']


#### prepare data for model 2

In [128]:
h = sorted([len(sentence) for sentence in totalX])
maxLength = h[int(len(h) * 0.60)]



# Keras Tokenizer expect the words tokens to be seperated by space 
totalX = [" ".join(wordslist) for wordslist in totalX]

input_tokenizer = Tokenizer(30000) # Initial vocab size
input_tokenizer.fit_on_texts(totalX)

vocab_size = len(input_tokenizer.word_index) + 1
print("input vocab_size:",vocab_size)

totalX = np.array(pad_sequences(input_tokenizer.texts_to_sequences(totalX), maxlen=maxLength))
print(totalX[:1])



__pickleStuff("./data/input_tokenizer_chinese.p", input_tokenizer)

input vocab_size: 40821
[[    0     0     0     0     0     0     0     0     0     0     0     0
      0   446   268 11059     4]]


#### prepare data for model 3

In [129]:
target_tokenizer = Tokenizer(3)
target_tokenizer.fit_on_texts(totalY)
print("output vocab_size:",len(target_tokenizer.word_index) + 1)

totalY = np.array(target_tokenizer.texts_to_sequences(totalY)) -1
totalY = totalY.reshape(totalY.shape[0])

print(totalY[3], documents[3])

output vocab_size: 3
0 ('233评论', 'neg')


In [130]:
totalY = to_categorical(totalY, num_classes=2)
print(totalY[:3])

[[1. 0.]
 [1. 0.]
 [1. 0.]]


In [131]:
output_dimen = totalY.shape[1]

target_reverse_word_index = {v: k for k, v in list(target_tokenizer.word_index.items())}
sentiment_tag = [target_reverse_word_index[1],target_reverse_word_index[2]] 
metaData = {"maxLength":maxLength,"vocab_size":vocab_size,"output_dimen":output_dimen,"sentiment_tag":sentiment_tag}
__pickleStuff("./data/meta_sentiment_chinese.p", metaData)

#### main model

In [132]:
embedding_dim = 256


model = Sequential()
model.add(Embedding(vocab_size, embedding_dim,input_length = maxLength))
# Each input would have a size of (maxLength x 256) and each of these 256 sized vectors are fed into the GRU layer one at a time.
# All the intermediate outputs are collected and then passed on to the second GRU layer.
model.add(GRU(256, dropout=0.9, return_sequences=True))
# Using the intermediate outputs, we pass them to another GRU layer and collect the final output only this time
model.add(GRU(256, dropout=0.9))
# The output is then sent to a fully connected layer that would give us our final output_dim classes
model.add(Dense(output_dimen, activation='softmax'))

In [133]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [134]:
model.fit(totalX, totalY, validation_split=0.1, batch_size=32, epochs=1, verbose=1)

Train on 16717 samples, validate on 1858 samples
Epoch 1/1


<keras.callbacks.History at 0x7f943214a278>

In [135]:
model.save('./data/sentiment_chinese_model.H5')

___

### 3. predict

In [136]:
from keras.models import load_model

model = load_model('./data/sentiment_chinese_model.H5')

In [137]:
def findFeatures(text):
    seg_list = jieba.cut(text, cut_all=False)
    seg_list = list(seg_list)
    text = " ".join(seg_list)
    textArray = [text]
    input_tokenizer_load = __loadStuff("./data/input_tokenizer_chinese.p")
    textArray = np.array(pad_sequences(input_tokenizer_load.texts_to_sequences(textArray), maxlen=maxLength))
    return textArray

def predict(text):
    if model is None:
        print("Please run \"loadModel\" first.")
        return None
    features = findFeatures(text)
    predicted = model.predict(features)[0] # we have only one sentence to predict, so take index 0
    predicted = np.array(predicted)
    probab = predicted.max()
    predition = sentiment_tag[predicted.argmax()]
    return predition, probab

In [139]:
predict("还好，床很大而且很干净，前台很友好，很满意，下次还来。")

('neg', 0.9712805)

In [140]:
predict("体验太差")

('neg', 0.74777484)

In [142]:
predict("hi, i'm yingshaoxo")

('pos', 0.9814556)

In [143]:
predict("说了多少次了！老子不叫塞尔达！那个臭女人才叫塞尔达")

('neg', 0.97021693)

In [144]:
predict("""这上面讲只有 Open your mind，才更容易学会新知识
我同意，因为只有在放长假的时候，我才能不担心忘记学校教的垃圾知识，
我才能全身心地投入新知识的学习""")

('pos', 0.9988325)