In [30]:
# -*- coding: utf-8 -*-
import re
import random
import numpy as np
import csv
import jieba
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB


file_path = './data_set/review.csv'
jieba.load_userdict("./data_set/userdict.txt")
stopword_path = './data_set/stopwords.txt'

def load_corpus(corpus_path):
    with open(corpus_path, 'r',encoding='UTF-8') as f:
        reader = csv.reader(f)
        rows = [row for row in reader]
    # 将读取出来的语料转为list
    review_data = np.array(rows).tolist()
    # 打乱语料的顺序
    random.shuffle(review_data)

    review_list = []
    sentiment_list = []
    # 第一列为差评/好评， 第二列为评论
    for words in review_data:
        review_list.append(words[1])
        sentiment_list.append(words[0])

    return review_list, sentiment_list

def load_stopwords(file_path):
    stop_words = []
    with open(file_path, encoding='UTF-8') as words:
        stop_words.extend([i.strip() for i in words.readlines()])
    return stop_words


def review_to_text(review):
    stop_words = load_stopwords(stopword_path)
    # 去除英文
    review = re.sub("[^\u4e00-\u9fa5^a-z^A-Z]", '', review)
    review = jieba.cut(review)
    # 去掉停用词
    if stop_words:
        all_stop_words = set(stop_words)
        words = [w for w in review if w not in all_stop_words]
    return words

# 定义Pipeline对全部步骤的流式化封装和管理，可以很方便地使参数集在新数据集（比如测试集）上被重复使用。
def MNB_Classifier():
    return Pipeline([
        ('count_vec', CountVectorizer()),
        ('mnb', MultinomialNB())
    ])

review_list, sentiment_list = load_corpus(file_path)


n = len(review_list) // 5

train_review_list, train_sentiment_list = review_list[n:], sentiment_list[n:]
test_review_list, test_sentiment_list = review_list[:n], sentiment_list[:n]

print('训练集数量： {}'.format(str(len(train_review_list))))
print('测试集数量： {}'.format(str(len(test_review_list))))

# 用于训练的评论
review_train = [' '.join(review_to_text(review)) for review in train_review_list]
# 对于训练评论对应的好评/差评
sentiment_train = train_sentiment_list

# 用于测试的评论
review_test = [' '.join(review_to_text(review)) for review in test_review_list]
# 对于测试评论对应的好评/差评
sentiment_test = test_sentiment_list

#count_vec = CountVectorizer(max_df=0.8, min_df=3)

vectorizer = CountVectorizer(max_df=0.8, min_df=3)
#tfidf_vec = TfidfVectorizer()
tfidftransformer = TfidfTransformer()
# 先转换成词频矩阵，再计算TFIDF值
tfidf = tfidftransformer.fit_transform(vectorizer.fit_transform(review_train))
# 朴素贝叶斯中的多项式分类器
clf = MultinomialNB().fit(tfidf, sentiment_train)

#mnbc_clf = MNB_Classifier()

# 进行训练
#mnbc_clf.fit(review_train, sentiment_train)

# 测试集准确率
print('测试集准确率： {}'.format(clf.score(review_test, sentiment_test)))

# 收集测试集错误
a = mnbc_clf.predict(review_test).tolist()
err_list = []
for i in range(len(review_test)):
    data = {'sentiment': '', 'review': ''}
    if a[i] != sentiment_test[i]:
        data['sentiment'] = sentiment_test[i]
        data['review'] = review_test[i]

        err_list.append(data)

训练集数量： 41402
测试集数量： 10350


ValueError: Expected 2D array, got 1D array instead:
array=['看着 恶心 死 多人 打星 给星 好'
 '好 无聊 两 男主角 不来电 好 中国 特色 片子 一股 浓浓的 曼哈顿 下城 chinatown 杂货店 尘味' '小半截 太次' ...
 '太 妈 感人' '冗长 平淡'
 '电影 风格 适合 废柴 看 美国 拍 日剧 换 日本 演员 来演 毫无 违和感 泡 一杯 廉价 雀巢 最低 耗氧量 心跳 面 无表情 屏幕 前 呆坐 俩 小时 结尾 忘 喝咖啡 倒掉 上床睡觉 thisisit'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [34]:
# -*- coding: utf-8 -*-
import re
import pickle

import numpy as np
import jieba


class SentimentAnalyzer(object):
    def __init__(self, model_path, userdict_path, stopword_path):
        self.clf = None
        self.vectorizer = None
        self.tfidftransformer = None
        self.model_path = model_path
        self.stopword_path = stopword_path
        self.userdict_path = userdict_path
        self.stop_words = []
        self.tokenizer = jieba.Tokenizer()
        self.initialize()

    # 加载模型
    def initialize(self):
        with open(self.stopword_path, encoding='UTF-8') as words:
            self.stop_words = [i.strip() for i in words.readlines()]

        with open(self.model_path, 'rb') as file:
            model = pickle.load(file)
            self.clf = model['clf']
            self.vectorizer = model['vectorizer']
            self.tfidftransformer = model['tfidftransformer']
        if self.userdict_path:
            self.tokenizer.load_userdict(self.userdict_path)

    # 过滤文字中的英文与无关文字
    def replace_text(self, text):
        text = re.sub('((https?|ftp|file)://)?[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|].(com|cn)', '', text)
        text = text.replace('\u3000', '').replace('\xa0', '').replace('”', '').replace('"', '')
        text = text.replace(' ', '').replace('↵', '').replace('\n', '').replace('\r', '').replace('\t', '').replace('）', '')
        text_corpus = re.split('[！。？；……;]', text)
        return text_corpus

    # 情感分析计算
    def predict_score(self, text_corpus):
        # 分词
        docs = [self.__cut_word(sentence) for sentence in text_corpus]
        new_tfidf = self.tfidftransformer.transform(self.vectorizer.transform(docs))
        predicted = self.clf.predict_proba(new_tfidf)
        # 四舍五入，保留三位
        result = np.around(predicted, decimals=3)
        return result

    # jieba分词
    def __cut_word(self, sentence):
        words = [i for i in self.tokenizer.cut(sentence) if i not in self.stop_words]
        result = ' '.join(words)
        return result

    def analyze(self, text):
        text_corpus = self.replace_text(text)
        result = self.predict_score(text_corpus)

        neg = result[0][0]
        pos = result[0][1]

        print('差评： {} 好评： {}'.format(neg, pos))


In [31]:
# -*- coding: utf-8 -*-
import os
import csv
import random
import pickle

import numpy as np
import jieba


from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB


jieba.load_userdict("./data/userdict.txt")


file_path = './data/review.csv'
model_export_path = './data/bayes.pkl'
stopword_path = './data/stopwords.txt'


def load_corpus(corpus_path):
    with open(corpus_path, 'r') as f:
        reader = csv.reader(f)
        rows = [row for row in reader]

    review_data = np.array(rows).tolist()
    # 打乱数据顺序
    random.shuffle(review_data)

    review_list = []
    sentiment_list = []
    for words in review_data:
        review_list.append(words[1])
        sentiment_list.append(words[0])

    return review_list, sentiment_list


def load_stopwords(file_path):
    stop_words = []
    with open(file_path, encoding='UTF-8') as words:
       stop_words.extend([i.strip() for i in words.readlines()])
    return stop_words


# jieba分词
def review_to_text(review):
    stop_words = load_stopwords(stopword_path)
    review = jieba.cut(review)
    all_stop_words = set(stop_words)
    # 去掉停用词
    review_words = [w for w in review if w not in all_stop_words]

    return review_words


review_list, sentiment_list = load_corpus(file_path)
n = len(review_list) // 5

train_review_list, train_sentiment_list = review_list[n:], sentiment_list[n:]
test_review_list, test_sentiment_list = review_list[:n], sentiment_list[:n]

print('训练集数量： {}'.format(str(len(train_review_list))))
print('测试集数量： {}'.format(str(len(test_review_list))))

review_train = [' '.join(review_to_text(review)) for review in train_review_list]
sentiment_train = train_sentiment_list

review_test = [' '.join(review_to_text(review)) for review in test_review_list]
sentiment_test = test_sentiment_list


vectorizer = CountVectorizer(max_df=0.8, min_df=3)

tfidftransformer = TfidfTransformer()

# 先转换成词频矩阵，再计算TFIDF值
tfidf = tfidftransformer.fit_transform(vectorizer.fit_transform(review_train))
# 朴素贝叶斯中的多项式分类器
clf = MultinomialNB().fit(tfidf, sentiment_train)

# 将模型保存pickle文件
with open(model_export_path, 'wb') as file:
    d = {
        "clf": clf,
        "vectorizer": vectorizer,
        "tfidftransformer": tfidftransformer,
    }
    pickle.dump(d, file)

print("训练完成")


FileNotFoundError: [Errno 2] No such file or directory: './data/userdict.txt'

In [33]:
# -*- coding: utf-8 -*-
import os
import csv
import random
import pickle

import numpy as np
import jieba


from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB


jieba.load_userdict("./data_set/userdict.txt")


file_path = './data_set/review.csv'
model_export_path = './data_set/bayes.pkl'
stopword_path = './data_set/stopwords.txt'


def load_corpus(corpus_path):
    with open(corpus_path, 'r',encoding='utf-8') as f:
        reader = csv.reader(f)
        rows = [row for row in reader]

    review_data = np.array(rows).tolist()
    # 打乱数据顺序
    random.shuffle(review_data)

    review_list = []
    sentiment_list = []
    for words in review_data:
        review_list.append(words[1])
        sentiment_list.append(words[0])

    return review_list, sentiment_list


def load_stopwords(file_path):
    stop_words = []
    with open(file_path, encoding='UTF-8') as words:
        stop_words.extend([i.strip() for i in words.readlines()])
    return stop_words


# jieba分词
def review_to_text(review):
    stop_words = load_stopwords(stopword_path)
    review = jieba.cut(review)
    all_stop_words = set(stop_words)
    # 去掉停用词
    review_words = [w for w in review if w not in all_stop_words]

    return review_words


review_list, sentiment_list = load_corpus(file_path)
n = len(review_list) // 5

train_review_list, train_sentiment_list = review_list[n:], sentiment_list[n:]
test_review_list, test_sentiment_list = review_list[:n], sentiment_list[:n]

print('训练集数量： {}'.format(str(len(train_review_list))))
print('测试集数量： {}'.format(str(len(test_review_list))))

review_train = [' '.join(review_to_text(review)) for review in train_review_list]
sentiment_train = train_sentiment_list

review_test = [' '.join(review_to_text(review)) for review in test_review_list]
sentiment_test = test_sentiment_list


vectorizer = CountVectorizer(max_df=0.8, min_df=3)

tfidftransformer = TfidfTransformer()

# 先转换成词频矩阵，再计算TFIDF值
tfidf = tfidftransformer.fit_transform(vectorizer.fit_transform(review_train))
# 朴素贝叶斯中的多项式分类器
clf = MultinomialNB().fit(tfidf, sentiment_train)

# 将模型保存pickle文件
with open(model_export_path, 'wb') as file:
    d = {
        "clf": clf,
        "vectorizer": vectorizer,
        "tfidftransformer": tfidftransformer,
    }
    pickle.dump(d, file)

print("训练完成")


训练集数量： 41402
测试集数量： 10350
训练完成


In [42]:
# -*- coding: utf-8 -*-



model_path = './data_set/bayes.pkl'
userdict_path = './data_set/userdict.txt'
stopword_path = './data_set/stopwords.txt'
corpus_path = './data_set/review.csv'


analyzer = SentimentAnalyzer(model_path=model_path, stopword_path=stopword_path, userdict_path=userdict_path)
text = '加油'
analyzer.analyze(text=text)


Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\二龙熊\AppData\Local\Temp\jieba.cache
Loading model cost 0.841 seconds.
Prefix dict has been built successfully.


差评： 0.701 好评： 0.299
