In [2]:
# coding: utf-8

import csv
import os
import random

import gensim
import jieba
import matplotlib as mpt
import numpy as np
import tensorflow as tf

# sklearn.externals.joblib was removed from newer scikit-learn releases;
# fall back to the standalone joblib package that scikit-learn depends on.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib
# Notebook-wide size constants.
hidden_size = 768
seq_len = 128  # number of token rows in each deep-learning sentence matrix

# Directory that holds "data.csv" (produced by extract_trainset.ipynb).
data_path = "../../mid_data/training_data/mda_data"
# Directory where the trained models are saved.
model_path = "../../model/word2vec_model"

# Create the model output directory on first run.
if not os.path.exists(model_path):
    os.makedirs(model_path)

In [3]:
# Load the pre-prepared word-vector model file.
# NOTE(review): the file has a .bin extension but is read with binary=False
# (text word2vec format) - confirm this matches the actual file.
VECTOR_DIR = './dictionary/word_vector.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(VECTOR_DIR, binary=False)

# Building Training set, Test set
####  根据extract_trainset.ipynb 中生成的data.csv，得到 train.csv,dev.csv,test.csv(如果已经存在此类的csv，可以跳过)
- 从原csv数据中抽取训练模型时需要的句子内容，具体标签。
- 按照标签分为不同的list，-1为消极，0为中性，1为积极。统计各个标签的数据量。
- 由于数据的不均衡，按照最少标签数据量向其他两个list中随机取相同数量的数据。
- 按比例将标签均衡的数据集分为train set 和dev set。
- 把剩下所有其他数据归入test set。
- 返回三个csv 文件 后续使用

In [4]:
def _rows_not_in(rows, excluded):
    """Return the rows whose (label, sentence) value does not occur in ``excluded``."""
    excluded_keys = {tuple(row) for row in excluded}
    return [row for row in rows if tuple(row) not in excluded_keys]


def _write_rows(filename, rows):
    """Write ``rows`` to ``filename`` inside ``data_path`` as an Excel-dialect CSV."""
    with open(os.path.join(data_path, filename), mode="w", encoding="utf-8-sig", newline="") as handle:
        csv.writer(handle, dialect='excel').writerows(rows)


def build_trainset_testset(scale):
    """Split data.csv into balanced train/dev sets plus a leftover test set.

    Reads ``data.csv`` from ``data_path``, taking the label from column 4 and
    the sentence from column 5. Every class ('1', '0', '-1') is randomly
    down-sampled to the size of the smallest class, each balanced class is
    split into train/dev by ``scale``, and all rows that were not selected
    for the balanced pool go to the test set. The results are written to
    ``train.csv``, ``dev.csv`` and ``test.csv`` in ``data_path`` as
    ``[label, sentence]`` rows.

    Rows are excluded by value, exactly like the original list-membership
    checks: a row identical to a selected row (same label and sentence) is
    left out of the complementary split as well.

    Args:
        scale: fraction (0..1) of each balanced class that goes to the train set.
    """
    rows_by_label = {'1': [], '0': [], '-1': []}
    data_all = []
    with open(os.path.join(data_path, "data.csv"), 'r', encoding='utf-8-sig', newline='') as source:
        for line in csv.reader(source):
            # Skip blank or truncated rows instead of failing with IndexError.
            if len(line) < 6:
                continue
            label = line[4]
            if label in rows_by_label:
                row = [label, line[5]]
                rows_by_label[label].append(row)
                data_all.append(row)

    data_positive = rows_by_label['1']
    data_neutral = rows_by_label['0']
    data_negative = rows_by_label['-1']
    print("len_positive:"+str(len(data_positive))+" len_neutral:"+str(len(data_neutral))+" len_negative:"+str(len(data_negative)) )
    data_min_num = min([len(data_positive), len(data_neutral), len(data_negative)])

    # Balance the classes; the sampling order (positive, negative, neutral) is
    # kept so that a seeded run draws the same rows as before.
    data_positive = random.sample(data_positive, data_min_num)
    data_negative = random.sample(data_negative, data_min_num)
    data_neutral = random.sample(data_neutral, data_min_num)
    # Set-based lookup replaces the quadratic "item not in list" scans.
    data_test = _rows_not_in(data_all, data_positive + data_neutral + data_negative)

    train_size = int(scale * data_min_num)
    data_positive_train = random.sample(data_positive, train_size)
    data_negative_train = random.sample(data_negative, train_size)
    data_neutral_train = random.sample(data_neutral, train_size)
    data_positive_test = _rows_not_in(data_positive, data_positive_train)
    data_negative_test = _rows_not_in(data_negative, data_negative_train)
    data_neutral_test = _rows_not_in(data_neutral, data_neutral_train)

    print("len_positive_test:"+str(len(data_positive_test))+" len_neutral_test:"+str(len(data_neutral_test))+" len_negative_test:"+str(len(data_negative_test)) )
    data_train = data_positive_train + data_negative_train + data_neutral_train
    data_dev = data_positive_test + data_negative_test + data_neutral_test
    print(len(data_train))
    print(len(data_dev))
    print(len(data_test))

    _write_rows("train.csv", data_train)
    _write_rows("dev.csv", data_dev)
    _write_rows("test.csv", data_test)


# 80% of every balanced class goes to the train set; the remainder forms the dev set.
build_trainset_testset(0.8)

len_positive:18464 len_neutral:20097 len_negative:4504
len_positive_test:899 len_neutral_test:891 len_negative_test:889
10809
2679
29406


# Building array
#### 从train.csv,dev.csv,test.csv 中读取句子和相应标签并构建数组。由于word2vec比较快，不需要保存向量，可以随时建立数据随时训练。
- 对读入csv的每个句子，用jieba（全模式）进行分词，并删除分词结果中的空字符串，再在词向量模型中查找每个词语对应的向量（词表中不存在的词按零向量处理）。
- 如果是为了训练深度学习的模型，每一个句子对应的向量是（1，128，250）。128是一个句子最长的分词数量，250是每个词语在词向量文件中对应的编码长度。
- 如果是为了训练机器学习的模型，每一个句子对应的向量是（1,250），具体做法在上一步的基础上对向量进行平均处理即可。

In [5]:

def rep_sentencevector(sentence,if_deep=False):
    """Encode a sentence with the word2vec ``model``.

    The sentence is segmented with jieba in full mode and empty tokens are
    dropped. Tokens that are missing from ``model`` contribute zeros.

    Args:
        sentence: text to encode.
        if_deep: when False, return the averaged word vector of shape (250,);
            when True, return a (seq_len, 250) matrix with one row per token,
            zero-padded and truncated to the first ``seq_len`` tokens.

    Returns:
        numpy.ndarray with the shape described above. A sentence with no
        tokens yields all zeros (the original code divided by zero and
        returned NaN).
    """
    word_list = [word for word in jieba.lcut(sentence, cut_all=True) if word != '']
    embedding_dim = 250
    if not if_deep:
        embedding_matrix = np.zeros(embedding_dim)
        if not word_list:
            return embedding_matrix
        for word in word_list:
            try:
                embedding_matrix += model[word]
            except KeyError:
                # Out-of-vocabulary token: adds nothing to the sum.
                continue
        # The divisor counts every token, including out-of-vocabulary ones,
        # so features stay compatible with models trained on the old vectors.
        return embedding_matrix / len(word_list)

    max_words = seq_len
    embedding_matrix = np.zeros((max_words, embedding_dim))
    # Truncate explicitly; the original relied on a swallowed IndexError.
    for index, word in enumerate(word_list[:max_words]):
        try:
            embedding_matrix[index] = model[word]
        except KeyError:
            # Out-of-vocabulary token: its row stays zero.
            continue
    return embedding_matrix

   

def _load_split(csv_file, if_deep):
    """Read one ``[label, sentence]`` CSV and return aligned (features, labels) lists."""
    if if_deep:
        # One-hot order is [negative, neutral, positive].
        label_map = {'1': [0, 0, 1], '0': [0, 1, 0], '-1': [1, 0, 0]}
    else:
        label_map = {'1': 1, '0': 0, '-1': -1}
    features = []
    labels = []
    with open(csv_file, mode='r', encoding='utf-8-sig') as handle:
        for line in csv.reader(handle):
            if not line:
                continue
            if len(line) < 2 or line[0] not in label_map:
                # Validate the label BEFORE storing the features so that the
                # feature and label lists can never get out of step.
                print("wrong!")
                continue
            features.append(rep_sentencevector(line[1], if_deep))
            labels.append(label_map[line[0]])
    return features, labels


def build_traindata(if_deep=False):
    """Build feature/label arrays from train.csv, dev.csv and test.csv in ``data_path``.

    Args:
        if_deep: False -> sentences become averaged vectors and labels are the
            integers 1 / 0 / -1; True -> sentences become per-token matrices
            and labels are one-hot lists ([0,0,1] / [0,1,0] / [1,0,0]).

    Returns:
        Tuple ``(X_train, Y_train, X_dev, Y_dev, X_test, Y_test)`` of numpy
        arrays. Rows with an unknown label print "wrong!" and are skipped in
        both X and Y.
    """
    X_train, Y_train = _load_split(os.path.join(data_path, "train.csv"), if_deep)
    X_dev, Y_dev = _load_split(os.path.join(data_path, "dev.csv"), if_deep)
    X_test, Y_test = _load_split(os.path.join(data_path, "test.csv"), if_deep)
    return np.array(X_train), np.array(Y_train), np.array(X_dev), np.array(Y_dev),np.array(X_test), np.array(Y_test)


# Averaged sentence vectors with integer labels, used by the scikit-learn models.
X_train, Y_train, X_dev, Y_dev,X_test,Y_test = build_traindata()
# Per-token sentence matrices with one-hot labels, used by the CNN and LSTM models.
X_train_2, Y_train_2, X_dev_2,Y_dev_2, X_test_2, Y_test_2 = build_traindata(if_deep=True)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Lenovo\AppData\Local\Temp\jieba.cache
Loading model cost 1.675 seconds.
Prefix dict has been built succesfully.


# SVM

In [6]:
def train_svm(X_train, Y_train):
    """Fit a linear-kernel SVC with probability estimates and save it under model_path."""
    from sklearn.svm import SVC
    output_file = os.path.join(model_path, "sentiment_svm_model.m")
    classifier = SVC(kernel='linear', probability=True)
    classifier.fit(X_train, Y_train)
    joblib.dump(classifier, output_file)


def evaluate_svm(model_filepath, X_test, Y_test):
    """Load a saved classifier and report its accuracy on (X_test, Y_test).

    Args:
        model_filepath: path of a model saved with joblib.
        X_test: 2-D array of feature vectors, one row per sample.
        Y_test: true labels, one per row of X_test.

    Returns:
        Fraction of samples whose predicted label equals the true label.

    Raises:
        ValueError: if X_test is empty or X_test and Y_test differ in length.
    """
    model = joblib.load(model_filepath)
    if len(X_test) == 0:
        raise ValueError("X_test is empty; accuracy is undefined")
    # One batched predict call instead of one call per sample.
    predicted = np.asarray(model.predict(np.asarray(X_test))).astype(int)
    expected = np.asarray(Y_test).astype(int)
    if len(expected) != len(predicted):
        raise ValueError("X_test and Y_test must have the same length")
    score = int(np.sum(predicted == expected)) / len(predicted)
    print('model accuray is :{0}'.format(score))
    return score


def predict_svm(model_filepath):
    """Print the saved model's class probabilities for two sample sentences."""
    classifier = joblib.load(model_filepath)
    samples = (
        ('sentence1', '在经营中努力为客户提供快捷优质的信息、仓储、物流、类金融等服务，利用自身资源积极拓展新的客户，同时维护与上游客户良好的关系，总体保持持续稳定的发展。'),
        ('sentence2', '(3)  应收账款期末较期初减少 59,289,691.24 元，减少 35.01%，主要系本公司之子公司西藏泰达厚生医药有限公司本期销售收入下降以及整体出售原子公司四川禾正制药有限责任公司导致应收账款减少。'),
    )
    # Encode both sentences before printing anything, as the original did.
    encoded = [(tag, np.array(rep_sentencevector(text)).reshape(1, -1)) for tag, text in samples]
    for tag, features in encoded:
        print(tag, classifier.predict_proba(features))

In [7]:
# Show the shapes of the averaged-vector splits before training.
print(X_train.shape, Y_train.shape)
print(X_dev.shape, Y_dev.shape)
print(X_test.shape, Y_test.shape)
# Train and save the SVM; the steps below reload it from this file.
train_svm(X_train, Y_train)
model_filepath_svm = os.path.join(model_path,'sentiment_svm_model.m')
# Accuracy on the dev split, then on the test split.
evaluate_svm(model_filepath_svm, X_dev, Y_dev)
evaluate_svm(model_filepath_svm, X_test, Y_test)
# Class probabilities for the two built-in sample sentences.
predict_svm(model_filepath_svm)

(10809, 250) (10809,)
(2679, 250) (2679,)
(29406, 250) (29406,)
model accuray is :0.7592385218365062
model accuray is :0.6866965925321363
sentence1 [[0.00178768 0.10630487 0.89190745]]
sentence2 [[0.78767601 0.16386698 0.048457  ]]


# Bayes

In [26]:

def train_bayes(X_train, Y_train):
    """Fit a Gaussian naive Bayes classifier and save it under model_path."""
    from sklearn.naive_bayes import GaussianNB
    output_file = os.path.join(model_path, "sentiment_bayes_model.m")
    classifier = GaussianNB()
    classifier.fit(X_train, Y_train)
    joblib.dump(classifier, output_file)


def evaluate_bayes(model_filepath, X_test, Y_test):
    """Load a saved classifier and report its accuracy on (X_test, Y_test).

    Args:
        model_filepath: path of a model saved with joblib.
        X_test: 2-D array of feature vectors, one row per sample.
        Y_test: true labels, one per row of X_test.

    Returns:
        Fraction of samples whose predicted label equals the true label.

    Raises:
        ValueError: if X_test is empty or X_test and Y_test differ in length.
    """
    model = joblib.load(model_filepath)
    if len(X_test) == 0:
        raise ValueError("X_test is empty; accuracy is undefined")
    # One batched predict call instead of one call per sample.
    predicted = np.asarray(model.predict(np.asarray(X_test))).astype(int)
    expected = np.asarray(Y_test).astype(int)
    if len(expected) != len(predicted):
        raise ValueError("X_test and Y_test must have the same length")
    score = int(np.sum(predicted == expected)) / len(predicted)
    print('model accuray is :{0}'.format(score)) 
    return score


def predict_bayes(model_filepath):
    """Print the saved model's class probabilities for two sample sentences."""
    classifier = joblib.load(model_filepath)
    samples = (
        ('sentence1', '在经营中努力为客户提供快捷优质的信息、仓储、物流、类金融等服务，利用自身资源积极拓展新的客户，同时维护与上游客户良好的关系，总体保持持续稳定的发展。'),
        ('sentence2', '(3)  应收账款期末较期初减少 59,289,691.24 元，减少 35.01%，主要系本公司之子公司西藏泰达厚生医药有限公司本期销售收入下降以及整体出售原子公司四川禾正制药有限责任公司导致应收账款减少。'),
    )
    # Encode both sentences before printing anything, as the original did.
    encoded = [(tag, np.array(rep_sentencevector(text)).reshape(1, -1)) for tag, text in samples]
    for tag, features in encoded:
        print(tag, classifier.predict_proba(features))


In [27]:
model_filepath = os.path.join(model_path,"sentiment_bayes_model.m")
# Show the shapes of the averaged-vector splits before training.
print(X_train.shape, Y_train.shape)
print(X_dev.shape, Y_dev.shape)
print(X_test.shape, Y_test.shape)
train_bayes(X_train, Y_train)
# Use the Bayes evaluate/predict functions defined for this section; the cell
# previously called the SVM-named ones on the Bayes model file.
evaluate_bayes(model_filepath, X_dev, Y_dev)
evaluate_bayes(model_filepath, X_test, Y_test)
predict_bayes(model_filepath)

(10809, 250) (10809,)
(2684, 250) (2684,)
(29429, 250) (29429,)
model accuray is :0.690387481371088
model accuray is :0.6553399707771246
sentence1 [[2.29578690e-15 3.08096752e-20 1.00000000e+00]]
sentence2 [[6.91483362e-01 3.08509873e-01 6.76560446e-06]]


# KNN

In [30]:
def train_knn(X_train, Y_train, X_test, Y_test, n_neighbors=16, k_candidates=()):
    """Train a KNN classifier and save it as sentiment_knn_model.m under model_path.

    Args:
        X_train, Y_train: training features and labels.
        X_test, Y_test: held-out features and labels; only used to score
            the values in ``k_candidates``.
        n_neighbors: k of the model that is trained and saved. Defaults to
            16, the value that used to be hard-coded.
        k_candidates: optional iterable of k values to fit and score on the
            held-out data before the final model is trained. The default is
            empty, matching the original search loop over ``range(1, 1)``,
            which never executed.
    """
    from sklearn.neighbors import KNeighborsClassifier

    k_candidates = list(k_candidates)
    if k_candidates:
        expected = np.asarray(Y_test).astype(int)
        for k in k_candidates:
            candidate = KNeighborsClassifier(n_neighbors=k)
            candidate.fit(X_train, Y_train)
            preds = np.asarray(candidate.predict(X_test)).astype(int)
            hits = int(np.sum(preds == expected))
            print('K= ' + str(k) + ', precision_score:' + str(float(hits) / len(preds)))

    # Train the final model with the chosen k and persist it.
    model = KNeighborsClassifier(n_neighbors=n_neighbors)
    model.fit(X_train, Y_train)
    joblib.dump(model, os.path.join(model_path,"sentiment_knn_model.m"))


def evaluate_knn(model_filepath, X_test, Y_test):
    """Load a saved classifier and report its accuracy on (X_test, Y_test).

    Args:
        model_filepath: path of a model saved with joblib.
        X_test: 2-D array of feature vectors, one row per sample.
        Y_test: true labels, one per row of X_test.

    Returns:
        Fraction of samples whose predicted label equals the true label.

    Raises:
        ValueError: if X_test is empty or X_test and Y_test differ in length.
    """
    model = joblib.load(model_filepath)
    if len(X_test) == 0:
        raise ValueError("X_test is empty; accuracy is undefined")
    # One batched predict call; labels are compared as integer scalars, the
    # same way the other evaluate_* functions in this notebook compare them.
    predicted = np.asarray(model.predict(np.asarray(X_test))).astype(int)
    expected = np.asarray(Y_test).astype(int)
    if len(expected) != len(predicted):
        raise ValueError("X_test and Y_test must have the same length")
    score = int(np.sum(predicted == expected)) / len(predicted)
    print('model accuray is :{0}'.format(score))
    return score

def predict_knn(model_filepath):
    """Print the saved model's class probabilities for two sample sentences."""
    classifier = joblib.load(model_filepath)
    samples = (
        ('sentence1', '在经营中努力为客户提供快捷优质的信息、仓储、物流、类金融等服务，利用自身资源积极拓展新的客户，同时维护与上游客户良好的关系，总体保持持续稳定的发展。'),
        ('sentence2', '(3)  应收账款期末较期初减少 59,289,691.24 元，减少 35.01%，主要系本公司之子公司西藏泰达厚生医药有限公司本期销售收入下降以及整体出售原子公司四川禾正制药有限责任公司导致应收账款减少。'),
    )
    # Encode both sentences before printing anything, as the original did.
    encoded = [(tag, np.array(rep_sentencevector(text)).reshape(1, -1)) for tag, text in samples]
    for tag, features in encoded:
        print(tag, classifier.predict_proba(features))


In [31]:
model_filepath = os.path.join(model_path,"sentiment_knn_model.m")
# Show the shapes of the averaged-vector splits before training.
print(X_train.shape, Y_train.shape)
print(X_dev.shape, Y_dev.shape)
print(X_test.shape, Y_test.shape)
# The dev split is passed as the held-out data of train_knn.
train_knn(X_train, Y_train, X_dev, Y_dev)
# Accuracy on the dev split, then on the test split.
evaluate_knn(model_filepath, X_dev, Y_dev)
evaluate_knn(model_filepath, X_test, Y_test)
# Class probabilities for the two built-in sample sentences.
predict_knn(model_filepath)

(10809, 250) (10809,)
(2684, 250) (2684,)
(29429, 250) (29429,)
model accuray is :0.7228017883755589
model accuray is :0.6204424207414455
sentence1 [[0. 0. 1.]]
sentence2 [[0.5625 0.3125 0.125 ]]


# Decision Tree

In [14]:
def train_decisiontree(X_train, Y_train):
    """Fit a decision-tree classifier with default settings and save it under model_path."""
    from sklearn import tree
    output_file = os.path.join(model_path, 'sentiment_decisiontree_model.m')
    classifier = tree.DecisionTreeClassifier()
    classifier.fit(X_train, Y_train)
    joblib.dump(classifier, output_file)

def evaluate_decisiontree(model_filepath, X_test, Y_test):
    """Load a saved classifier and report its accuracy on (X_test, Y_test).

    Args:
        model_filepath: path of a model saved with joblib.
        X_test: 2-D array of feature vectors, one row per sample.
        Y_test: true labels, one per row of X_test.

    Returns:
        Fraction of samples whose predicted label equals the true label.

    Raises:
        ValueError: if X_test is empty or X_test and Y_test differ in length.
    """
    model = joblib.load(model_filepath)
    if len(X_test) == 0:
        raise ValueError("X_test is empty; accuracy is undefined")
    # One batched predict call instead of one call per sample.
    predicted = np.asarray(model.predict(np.asarray(X_test))).astype(int)
    expected = np.asarray(Y_test).astype(int)
    if len(expected) != len(predicted):
        raise ValueError("X_test and Y_test must have the same length")
    score = int(np.sum(predicted == expected)) / len(predicted)
    print('model accuracy is :{0}'.format(score)) 
    return score

def predict_decisiontree(model_filepath):
    """Print the saved model's class probabilities for two sample sentences."""
    classifier = joblib.load(model_filepath)
    samples = (
        ('sentence1', '在经营中努力为客户提供快捷优质的信息、仓储、物流、类金融等服务，利用自身资源积极拓展新的客户，同时维护与上游客户良好的关系，总体保持持续稳定的发展。'),
        ('sentence2', '(3)  应收账款期末较期初减少 59,289,691.24 元，减少 35.01%，主要系本公司之子公司西藏泰达厚生医药有限公司本期销售收入下降以及整体出售原子公司四川禾正制药有限责任公司导致应收账款减少。'),
    )
    # Encode both sentences before printing anything, as the original did.
    encoded = [(tag, np.array(rep_sentencevector(text)).reshape(1, -1)) for tag, text in samples]
    for tag, features in encoded:
        print(tag, classifier.predict_proba(features))


In [15]:
model_filepath = os.path.join(model_path,'sentiment_decisiontree_model.m')
# Show the shapes of the averaged-vector splits before training.
print(X_train.shape, Y_train.shape)
print(X_dev.shape, Y_dev.shape)
print(X_test.shape, Y_test.shape)
# Train and save the decision tree; the steps below reload it from model_filepath.
train_decisiontree(X_train, Y_train)
# Accuracy on the dev split, then on the test split.
evaluate_decisiontree(model_filepath, X_dev, Y_dev)
evaluate_decisiontree(model_filepath, X_test, Y_test)
# Class probabilities for the two built-in sample sentences.
predict_decisiontree(model_filepath)

(7521, 250) (7521,)
(1865, 250) (1865,)
(19213, 250) (19213,)
model accuracy is :0.6026809651474531
model accuracy is :0.5516577317441316
sentence1 [[0. 0. 1.]]
sentence2 [[1. 0. 0.]]


# CNN

In [8]:
def train_cnn(X_train, Y_train, X_test, Y_test):
    """Train a 1-D CNN sentiment classifier and save it as sentiment_cnn_model.h5.

    The labels are one-hot vectors over three mutually exclusive classes, so
    the output layer uses softmax with categorical cross-entropy, the same
    setup as train_lstm. The earlier sigmoid + binary_crossentropy
    combination scored every output unit independently, which makes the
    reported accuracy an element-wise figure rather than per-sentence accuracy.

    Args:
        X_train: array of shape (n_samples, seq_len, 250).
        Y_train: one-hot labels of shape (n_samples, 3).
        X_test, Y_test: data used as Keras validation data during training.

    Returns:
        The History object returned by ``model.fit``.
    """
    from keras.models import Sequential
    from keras.layers import Dense, Dropout
    from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D

    model = Sequential()

    # Three stacks of paired convolutions with shrinking filter counts.
    model.add(Conv1D(128, 3, activation='relu', input_shape=(seq_len, 250)))
    model.add(Conv1D(128, 3, activation='relu'))
    model.add(MaxPooling1D(3))
    model.add(Conv1D(64, 3, activation='relu'))
    model.add(Conv1D(64, 3, activation='relu'))
    model.add(MaxPooling1D(3))

    model.add(Conv1D(32, 3, activation='relu'))
    model.add(Conv1D(32, 3, activation='relu'))
    model.add(GlobalAveragePooling1D())
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])

    history=model.fit(X_train, Y_train, batch_size=100, epochs=4,shuffle=True,validation_data=(X_test, Y_test))
    model.save(os.path.join(model_path,'sentiment_cnn_model.h5'))
    return history
    
def evaluate_cnn(X_test,Y_test,model_filepath):
    """Load the saved Keras model, then print and return its accuracy on (X_test, Y_test).

    Returns:
        The accuracy value reported by ``model.evaluate``. It is returned so
        callers can use it, as with the scikit-learn evaluate_* functions
        (this function used to return None).
    """
    from keras.models import load_model
    model=load_model(model_filepath)
    # model.evaluate yields [loss, accuracy] because the model was compiled with metrics=['accuracy'].
    _loss, accuracy = model.evaluate(X_test,Y_test)
    print('model accuracy is :{0}'.format(accuracy))
    return accuracy

    
def predict_cnn(model_filepath):
    """Print the saved Keras model's output for two sample sentences."""
    from keras.models import load_model
    network = load_model(model_filepath)
    samples = (
        '在经营中努力为客户提供快捷优质的信息、仓储、物流、类金融等服务，利用自身资源积极拓展新的客户，同时维护与上游客户良好的关系，总体保持持续稳定的发展。',
        '(3)  应收账款期末较期初减少 59,289,691.24 元，减少 35.01%，主要系本公司之子公司西藏泰达厚生医药有限公司本期销售收入下降以及整体出售原子公司四川禾正制药有限责任公司导致应收账款减少。',
    )
    # Each sentence becomes a batch of one (1, seq_len, 250) matrix; both are
    # encoded before the first prediction, as in the original.
    batches = [np.array([rep_sentencevector(text, if_deep=True)]) for text in samples]
    for batch in batches:
        print('test after load: ', network.predict(batch))

In [None]:
model_filepath = os.path.join(model_path,'sentiment_cnn_model.h5')
# Show the shapes of the per-token matrices and one-hot labels.
print(X_train_2.shape, Y_train_2.shape)
print(X_dev_2.shape, Y_dev_2.shape)
print(X_test_2.shape, Y_test_2.shape)
# The dev split serves as validation data during training.
history=train_cnn(X_train_2, Y_train_2, X_dev_2, Y_dev_2)
# Final accuracy is measured on the test split with the model reloaded from disk.
evaluate_cnn(X_test_2,Y_test_2,model_filepath)
predict_cnn(model_filepath)

(10809, 128, 250) (10809, 3)
(2679, 128, 250) (2679, 3)
(29406, 128, 250) (29406, 3)
Train on 10809 samples, validate on 2679 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4

# LSTM

In [20]:
def train_lstm(X_train, Y_train, X_test, Y_test):
    """Train a stacked bidirectional LSTM classifier and save it as sentiment_lstm_model.h5.

    Args:
        X_train: array of shape (n_samples, seq_len, 250).
        Y_train: one-hot labels of shape (n_samples, 3).
        X_test, Y_test: data used as Keras validation data during training.

    Returns:
        The History object returned by ``model.fit``.
    """
    from keras.models import Sequential
    from keras.layers import LSTM, Dense, Bidirectional
    data_dim = 250
    timesteps = seq_len

    # Expected input data shape: (batch_size, timesteps, data_dim).
    model = Sequential()
    # input_shape belongs on the Bidirectional wrapper (the layer that is
    # actually added to the model); on the wrapped LSTM it is not picked up
    # by Sequential, which leaves the model unbuilt until fit() is called.
    model.add(Bidirectional(LSTM(64, return_sequences=True),
                            input_shape=(timesteps, data_dim)))  # sequence output, 64 units per direction
    model.add(Bidirectional(LSTM(32, return_sequences=True)))  # sequence output, 32 units per direction
    model.add(Bidirectional(LSTM(32)))  # single vector per sentence, 32 units per direction
    model.add(Dense(3, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])

    history=model.fit(X_train, Y_train, batch_size=200, epochs=4,shuffle=True,validation_data=(X_test, Y_test))
    model.save(os.path.join(model_path,'sentiment_lstm_model.h5'))
    return history


def evaluate_lstm(X_test,Y_test,model_filepath):
    """Load the saved Keras model, then print and return its accuracy on (X_test, Y_test).

    Returns:
        The accuracy value reported by ``model.evaluate``. It is returned so
        callers can use it, as with the scikit-learn evaluate_* functions
        (this function used to return None).
    """
    from keras.models import load_model
    model=load_model(model_filepath)
    # model.evaluate yields [loss, accuracy] because the model was compiled with metrics=['accuracy'].
    _loss, accuracy = model.evaluate(X_test,Y_test)
    print('model accuracy is :{0}'.format(accuracy))
    return accuracy

def predict_lstm(model_filepath):
    """Print the saved Keras model's output for two sample sentences."""
    from keras.models import load_model
    network = load_model(model_filepath)
    samples = (
        '在经营中努力为客户提供快捷优质的信息、仓储、物流、类金融等服务，利用自身资源积极拓展新的客户，同时维护与上游客户良好的关系，总体保持持续稳定的发展。',
        '(3)  应收账款期末较期初减少 59,289,691.24 元，减少 35.01%，主要系本公司之子公司西藏泰达厚生医药有限公司本期销售收入下降以及整体出售原子公司四川禾正制药有限责任公司导致应收账款减少。',
    )
    # Each sentence becomes a batch of one (1, seq_len, 250) matrix; both are
    # encoded before the first prediction, as in the original.
    batches = [np.array([rep_sentencevector(text, if_deep=True)]) for text in samples]
    for batch in batches:
        print('test after load: ', network.predict(batch))


In [21]:

model_filepath = os.path.join(model_path,'sentiment_lstm_model.h5')
# Show the shapes of the per-token matrices and one-hot labels.
print(X_train_2.shape, Y_train_2.shape)
print(X_dev_2.shape, Y_dev_2.shape)   
print(X_test_2.shape, Y_test_2.shape)
# The dev split serves as validation data during training.
history_lstm=train_lstm(X_train_2, Y_train_2, X_dev_2, Y_dev_2)
# Final accuracy is measured on the test split with the model reloaded from disk.
evaluate_lstm(X_test_2,Y_test_2,model_filepath)
predict_lstm(model_filepath)


(10809, 128, 250) (10809, 3)
(2687, 128, 250) (2687, 3)
(29434, 128, 250) (29434, 3)
Train on 10809 samples, validate on 2687 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
model accuracy is :0.7423727661806612
test after load:  [[0.0088307  0.10095891 0.89021033]]
test after load:  [[0.63351744 0.27766407 0.08881846]]
