# 语料库分词处理

In [1]:
# -*- coding: utf-8 -*-

import pandas as pd
import jieba
import numpy as np

# Load the negative and positive corpora. header=None tells pandas the
# sheets have no header row, so the text is addressed as column 0 below.
# The former `index=None` argument was dropped: it is not a parameter of
# `pd.read_excel`.
neg = pd.read_excel('data_09/data/neg.xls', header=None)
pos = pd.read_excel('data_09/data/pos.xls', header=None)


def word_cut(text):
    """Tokenize one corpus cell with jieba and return the list of tokens.

    The cell is converted with ``str`` first so that non-string cells are
    tokenized as text, the same way the prediction cell tokenizes comments
    (``jieba.lcut(str(string))``).
    """
    return jieba.lcut(str(text))


# Tokenize every sentence of both corpora
pos['words'] = pos[0].apply(word_cut)
neg['words'] = neg[0].apply(word_cut)

# Concatenate positives first, then negatives; labels follow the same order
# (1 = positive sentiment, 0 = negative sentiment)
x = np.concatenate((pos['words'], neg['words']))
y = np.concatenate((np.ones(len(pos)), np.zeros(len(neg))))

# Save both arrays as .npy files for the later steps.
# NOTE: `x` holds Python lists of tokens, so NumPy stores it via pickle and
# it has to be read back with np.load(..., allow_pickle=True).
np.save('data_09/data/x_train.npy', x)
np.save('data_09/data/y_train.npy', y)

print('done.')

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/9f/ztmq3pbj1dnfh4wctr4nbnd80000gn/T/jieba.cache
Loading model cost 1.644 seconds.
Prefix dict has been built succesfully.


done.


# Word2vec处理

In [None]:
# -*- coding: utf-8 -*-

import numpy as np
from gensim.models.word2vec import Word2Vec

# Load the tokenized sentences saved by the segmentation step. The array
# stores Python lists (object dtype), which NumPy >= 1.16.3 refuses to
# unpickle unless allow_pickle=True is passed.
# NOTE: unpickling executes arbitrary code; only load files produced by
# this pipeline.
x_train = np.load('data_09/data/x_train.npy', allow_pickle=True)

# Train the Word2Vec model: 300-dimensional vectors, min_count=10.
# The vocabulary is built first, then the model is trained on the same
# sentences for the model's configured number of epochs.
w2v = Word2Vec(size=300, min_count=10)
w2v.build_vocab(x_train)
w2v.train(x_train, total_examples=w2v.corpus_count, epochs=w2v.iter)

# Combine the word vectors of one sentence into a single feature vector
def average_vec(text):
    """Combine the Word2Vec vectors of the tokens in *text* into one vector.

    Despite the name, the vectors are summed and not divided by the token
    count. This is kept as is so the features stay identical to the ones the
    prediction step computes.

    Args:
        text: iterable of tokens (one tokenized sentence).

    Returns:
        A ``(1, 300)`` float array; all zeros when no token is found in the
        module-level ``w2v`` model.
    """
    vec = np.zeros((1, 300))
    for word in text:
        try:
            # Look the word up through `.wv`; indexing the model object
            # directly (`w2v[word]`) is deprecated in gensim.
            vec += w2v.wv[word].reshape((1, 300))
        except KeyError:
            # Word is not in the model vocabulary: it contributes nothing
            continue
    return vec

# Compute one (1, 300) feature row per training sentence, then join the
# rows into a single 2-D ndarray
sentence_rows = list(map(average_vec, x_train))
train_vec = np.concatenate(sentence_rows)

# Persist the trained Word2Vec model and the sentence feature matrix
w2v.save('data_09/data/w2v_model.pkl')
np.save('data_09/data/x_train_vec.npy', train_vec)

# 训练支持向量机情绪分类模型

In [None]:
# -*- coding: utf-8 -*-

import numpy as np
from sklearn.externals import joblib
from sklearn.svm import SVC

# Target values: the sentiment labels saved by the segmentation step
y = np.load('data_09/data/y_train.npy')

# Training features: the sentence vectors saved by the Word2Vec step
x = np.load('data_09/data/x_train_vec.npy')

# Build the RBF-kernel support vector classifier and train it in one
# expression (fit() returns the fitted estimator itself)
model = SVC(kernel='rbf', verbose=True).fit(x, y)

# Store the trained model as a binary file
joblib.dump(model, 'data_09/data/svm_model.pkl')

from sklearn.model_selection import cross_val_score

[LibSVM]

In [None]:
# Print the cross-validation scores of the model
cv_scores = cross_val_score(model, x, y)
print(cv_scores)

# 对实验楼评论进行情绪判断

In [None]:
# -*- coding: utf-8 -*-

from gensim.models.word2vec import Word2Vec
import numpy as np
import jieba
from sklearn.externals import joblib
import pandas as pd

# Loaded Word2Vec models keyed by file path, so the model is read from disk
# once instead of once per comment
_W2V_MODELS = {}


def _load_w2v(path='data_09/data/w2v_model.pkl'):
    """Return the Word2Vec model stored at *path*, loading it on first use."""
    if path not in _W2V_MODELS:
        _W2V_MODELS[path] = Word2Vec.load(path)
    return _W2V_MODELS[path]


# Turn the tokens of a new input into one Word2Vec feature vector
def average_vec(words):
    """Combine the Word2Vec vectors of the tokens in *words* into one vector.

    Despite the name, the vectors are summed and not divided by the token
    count.

    Args:
        words: iterable of tokens (one tokenized comment).

    Returns:
        A ``(1, 300)`` float array; all zeros when no token is found in the
        saved Word2Vec model.
    """
    w2v = _load_w2v()
    vec = np.zeros((1, 300))
    for word in words:
        try:
            # Look the word up through `.wv`; indexing the model object
            # directly (`w2v[word]`) is deprecated in gensim.
            vec += w2v.wv[word].reshape((1, 300))
        except KeyError:
            # Word is not in the model vocabulary: it contributes nothing
            continue
    return vec

# Classify the sentiment of every Shiyanlou comment
def svm_predict():
    """Predict the sentiment of each comment and save the labelled data.

    Reads ``data_09/comments.csv``, tokenizes each entry of the ``评论内容``
    column with jieba, vectorizes it with ``average_vec`` and classifies it
    with the saved SVM model. Each comment is printed with its label as soon
    as it is classified. The predictions are appended to the original table
    as a ``用户情绪`` column and written to
    ``data_09/comment_sentiment.csv``.
    """
    # Read the comments; the first row of the file is the header
    df = pd.read_csv("data_09/comments.csv", header=0)

    # Load the SVM model once, before the loop, instead of once per comment.
    # NOTE: joblib.load unpickles the file; only load models produced by
    # this pipeline.
    model = joblib.load('data_09/data/svm_model.pkl')

    comment_sentiment = []
    for string in df['评论内容']:
        # Tokenize the comment (str() guards against non-string cells)
        words = jieba.lcut(str(string))
        result = model.predict(average_vec(words))
        comment_sentiment.append(result[0])

        # Report the result right away: label 1 is positive, else negative
        label = '[积极]' if int(result[0]) == 1 else '[消极]'
        print(string, label)

    # Append the predictions to the original table as a new column
    merged = pd.concat([df, pd.Series(comment_sentiment, name='用户情绪')], axis=1)

    # Write the merged table to disk
    merged.to_csv('data_09/comment_sentiment.csv')
    print('done.')

# Run the sentiment prediction over all comments
svm_predict()