In [1]:
import pandas as pd
import numpy as np
import jieba
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
data = pd.read_csv("../data/书籍评价.csv", encoding="gbk")
data.columns = ['index', 'content', 'eval']
data.head()

Unnamed: 0,index,content,eval
0,0,从编程小白的角度看，入门极佳。,好评
1,1,很好的入门书，简洁全面，适合小白。,好评
2,2,讲解全面，许多小细节都有顾及，三个小项目受益匪浅。,好评
3,3,前半部分讲概念深入浅出，要言不烦，很赞,好评
4,4,看了一遍还是不会写，有个概念而已,差评


In [5]:
data.loc[data.loc[:, 'eval']=='好评', 'score']=1
data.loc[data.loc[:, 'eval']=='差评', 'score']=0
data

Unnamed: 0,index,content,eval,score
0,0,从编程小白的角度看，入门极佳。,好评,1.0
1,1,很好的入门书，简洁全面，适合小白。,好评,1.0
2,2,讲解全面，许多小细节都有顾及，三个小项目受益匪浅。,好评,1.0
3,3,前半部分讲概念深入浅出，要言不烦，很赞,好评,1.0
4,4,看了一遍还是不会写，有个概念而已,差评,0.0
5,5,中规中矩的教科书，零基础的看了依旧看不懂,差评,0.0
6,6,内容太浅显，个人认为不适合有其它语言编程基础的人,差评,0.0
7,7,破书一本,差评,0.0
8,8,适合完完全全的小白读，有其他语言经验的可以去看别的书,差评,0.0
9,9,基础知识写的挺好的！,好评,1.0


In [6]:
content = data['content']
content

0                从编程小白的角度看，入门极佳。
1              很好的入门书，简洁全面，适合小白。
2      讲解全面，许多小细节都有顾及，三个小项目受益匪浅。
3            前半部分讲概念深入浅出，要言不烦，很赞
4               看了一遍还是不会写，有个概念而已
5           中规中矩的教科书，零基础的看了依旧看不懂
6       内容太浅显，个人认为不适合有其它语言编程基础的人
7                           破书一本
8     适合完完全全的小白读，有其他语言经验的可以去看别的书
9                     基础知识写的挺好的！
10                           太基础
11            略_嗦。。适合完全没有编程经验的小白
12                      真的真的不建议买
Name: content, dtype: object

In [8]:
stopwords=[]
with open('../data/stopwords.txt','r',encoding='utf-8') as f:
    lines = f.readlines()
    for line in lines:
        word = line.strip()
        stopwords.append(word)
stopwords = list(set(stopwords))
stopwords[:5]

['', '［③①］', '此次', '产生', '总之']

In [10]:
content_list = []
for tmp in content:
    seg_list=jieba.cut(tmp, cut_all=False)
    seg_str = ','.join(seg_list)
    content_list.append(seg_str)
content_list

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/96/01myvpm532l7jxr5thqt_tj80000gn/T/jieba.cache
Loading model cost 0.709 seconds.
Prefix dict has been built successfully.


[' ,从,编程,小白,的,角度看,，,入门,极佳,。',
 '很,好,的,入门,书,，,简洁,全面,，,适合,小白,。',
 '讲解,全面,，,许多,小,细节,都,有,顾及,，,三个,小,项目,受益匪浅,。',
 '前半部,分讲,概念,深入浅出,，,要言不烦,，,很赞',
 '看,了,一遍,还是,不会,写,，,有个,概念,而已',
 '中规中矩,的,教科书,，,零,基础,的,看,了,依旧,看不懂',
 '内容,太,浅显,，,个人,认为,不,适合,有,其它,语言,编程,基础,的,人',
 '破书,一本',
 '适合,完完全全,的,小白读,，,有,其他,语言,经验,的,可以,去,看,别的,书',
 '基础知识,写,的,挺,好,的,！',
 '太,基础',
 '略,_,嗦,。,。,适合,完全,没有,编程,经验,的,小白',
 '真的,真的,不,建议,买']

In [11]:
con = CountVectorizer(stop_words = stopwords)
X=con.fit_transform(content_list)
name = con.get_feature_names()
print(name)
X.toarray()

['一本', '一遍', '三个', '中规中矩', '依旧', '入门', '内容', '分讲', '前半部', '受益匪浅', '基础', '基础知识', '完完全全', '小白', '小白读', '建议', '很赞', '教科书', '有个', '极佳', '概念', '浅显', '深入浅出', '看不懂', '真的', '破书', '简洁', '细节', '经验', '编程', '要言不烦', '角度看', '讲解', '语言', '适合', '项目', '顾及']


array([[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0

In [15]:
good_bad = data['score'].values
good_bad

array([1., 1., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0.])

In [18]:
x_train = X.toarray()[:10,:]
y_train = good_bad[:10]

x_test = X.toarray()[:10,:]
y_train = good_bad[:10]

array([[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0