In [34]:
# -*- coding: utf-8 -*-
import os
import re
import csv
import random

import numpy as np
import jieba

from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

jieba.load_userdict("./data_set/userdict.txt")
file_path = './data_set/review.csv'
stopword_path = './data_set/stopwords.txt'


#加载语料库
def load_corpus(corpus_path):
    with open(corpus_path, 'r', encoding='UTF-8') as f:
        reader = csv.reader(f)
        rows = [row for row in reader]
    review_data = np.array(rows).tolist()
    # 打乱数据顺序
    random.shuffle(review_data)

    review_list = []
    sentiment_list = []
    for words in review_data:
        review_list.append(words[1])
        sentiment_list.append(words[0])

    return review_list, sentiment_list


#加载停用词
def load_stopwords(stopword_path):
    stop_words = []
    with open(stopword_path, encoding='UTF-8') as words:
        stop_words.extend([i.strip() for i in words.readlines()])
    return stop_words


# jieba分词
def review_to_text(review):
    stop_words = load_stopwords(stopword_path)
    review = jieba.cut(review)
    all_stop_words = set(stop_words)
    # 去掉停用词
    review_words = [w for w in review if w not in all_stop_words]

    return review_words


# 过滤文字中的英文与无关文字
def replace_text(text):
    text = re.sub(
        '((https?|ftp|file)://)?[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|].(com|cn)',
        '', text)
    text = text.replace('\u3000', '').replace('\xa0',
                                              '').replace('”',
                                                          '').replace('"', '')
    text = text.replace(' ', '').replace('↵', '').replace('\n', '').replace(
        '\r', '').replace('\t', '').replace('）', '')
    text_corpus = re.split('[！。？；……;]', text)
    return text_corpus


def cut_word(sentence, stop_words):
    words = [i for i in jieba.Tokenizer().cut(sentence) if i not in stop_words]
    result = ' '.join(words)
    return result


# 预测情绪指数
def predict_score(text_corpus, tfidftransformer, clf, vectorizer, stop_words):
    # 分词
    docs = [cut_word(sentence, stop_words) for sentence in text_corpus]
    new_tfidf = tfidftransformer.transform(vectorizer.transform(docs))
    predicted = clf.predict_proba(new_tfidf)
    # 四舍五入，保留三位
    result = np.around(predicted, decimals=3)
    return result


review_list, sentiment_list = load_corpus(file_path)

stop_words = load_stopwords(stopword_path)

train_review_list, test_review_list, train_sentiment_list, test_sentiment_list = train_test_split(
    review_list, sentiment_list, test_size=0.2, random_state=420)

print('训练集数量： {}'.format(str(len(train_review_list))))
print('测试集数量： {}'.format(str(len(test_review_list))))

review_train = [
    ' '.join(review_to_text(review)) for review in train_review_list
]
sentiment_train = train_sentiment_list

review_test = [' '.join(review_to_text(review)) for review in test_review_list]
sentiment_test = test_sentiment_list

vectorizer = CountVectorizer(max_df=0.8, min_df=3)

tfidftransformer = TfidfTransformer()

# 先转换成词频矩阵，再计算TFIDF值
tfidf = tfidftransformer.fit_transform(vectorizer.fit_transform(review_train))

# 朴素贝叶斯中的多项式分类器
clf = MultinomialNB().fit(tfidf, sentiment_train)

print("训练完成")

训练集数量： 41401
测试集数量： 10351
训练完成


In [35]:
test_tfidf = tfidftransformer.transform(vectorizer.transform(review_test))

test_labels = clf.predict(test_tfidf)

print ('测试集准确率为： {}'.format(metrics.accuracy_score(test_labels, sentiment_test)))

测试集准确率为： 0.798956622548546


In [36]:
scores = cross_val_score(clf,tfidf, sentiment_train,cv=5)

print('交叉验证准确率： {}'.format(scores.mean()))

交叉验证准确率： 0.7971304021136797


In [37]:
text = "如果你不喜欢这部动画电影，那么它不是为你准备的，故事的终章是为真正有童年，不忘初心，老漫迷的人而准备的。"
text_corpus = replace_text(text)
result = predict_score(text_corpus, tfidftransformer, clf, vectorizer,
                       stop_words)

neg = result[0][0]
pos = result[0][1]

print('差评： {} 好评： {}'.format(neg, pos))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\二龙熊\AppData\Local\Temp\jieba.cache
Loading model cost 0.669 seconds.
Prefix dict has been built successfully.


差评： 0.217 好评： 0.783
