In [3]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
import numpy

### 读取商品评论  df_comments为最初DataFrame

In [4]:
# 读取包含好评、差评的Excel文件
df_comments = pd.read_excel('path')

In [5]:
df_test = DataFrame(df_comments,columns=['评价星级','评价内容'])
df_test_new = df_test.drop_duplicates() # 去掉重复的数据,5-18更新:通过Excel已经去重


# 取出好评、差评
active_com = df_test_new[df_test_new['评价星级']=='star5']
negative_com = df_test_new[df_test_new['评价星级']=='star1'] 

### 将Series转为Str (用于后面的分词以及TF-IDF)

In [6]:
df_comments_str_active = "".join(i for i in active_com['评价内容'][:])
df_comments_str_negative = "".join(i for i in negative_com['评价内容'][:])

### 利用jieba分词 && 去除停用词 

In [7]:
import jieba

# 加载停用词
stopwords=pd.read_csv('stopwords.txt',index_col=False,sep="\t",quoting=3,names=['stopword'], encoding='utf-8')
stopwords2=pd.read_csv('stopwords2.txt',index_col=False,sep="\t",quoting=3,names=['stopword'], encoding='utf-8')

def drop_stopwords(contents,stopwords):
    contents_clean = []
    for line in contents:
        line_clean = []
        if line in stopwords:
                continue
        line_clean.append(line)
        contents_clean.append(line_clean)
    return contents_clean

        
# 更改分词器,采用自己的自定义词典,并通过notepad++改变txt文件编码
jieba.load_userdict("userdict.txt")
active_cut_contents = jieba.lcut(df_comments_str_active) #利用jieba分词  https://github.com/fxsjy/jieba
negative_cut_contents = jieba.lcut(df_comments_str_negative)
# contents = df_comments['评价内容'][:20].values.tolist() #评论转为list

stopwords = stopwords.stopword.values.tolist()
stopwords2 = stopwords2.stopword.values.tolist()


# 调用drop_stopwords() 去除停用词
active_contents_clean = drop_stopwords(active_cut_contents,stopwords)
negative_contents_clean = drop_stopwords(negative_cut_contents,stopwords2)


# 加一步清除 contents_clean_test 中的空格,contents_clean_test = [['超级'],['棒'],[' '],['体验'],['超乎'],[' ']]
def clean_none(list):
    for i in list:
        for j in i:
            if ' ' in i:
                i.remove(' ')

    list1 = []
    for i in list:
        if i:
            list1.append(i)
    return list1


# 得到去除空格的,格式为list of list :  [['超级'],['棒'],['体验'],['超乎']]
active_contents_clean = clean_none(active_contents_clean)
negative_contents_clean = clean_none(negative_contents_clean)


# 得到 ['超级','棒','体验','超乎','想象']
active_contents_clean_new = [str(x[0]) for x in active_contents_clean]
negative_contents_clean_new = [str(x[0]) for x in negative_contents_clean]


#  将['超级','棒','体验','超乎','想象'] 转为 DataFrame , 分为 积极 和 消极
df_all_words_active = DataFrame({'all_words':active_contents_clean_new})
df_all_words_negative = DataFrame({'all_words':negative_contents_clean_new})

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\asus\AppData\Local\Temp\jieba.cache
Loading model cost 1.050 seconds.
Prefix dict has been built succesfully.


###  加载用户字典 && 基于 TF-IDF 算法的关键词抽取(另TextRank)

In [8]:
import jieba.analyse
#  extract_tags() 传入一个list
jieba.analyse.set_stop_words("stopwords.txt")
tags_TF_IDF = jieba.analyse.extract_tags(df_comments_str_active, withWeight =False,topK=8)

tags_TextRank = jieba.analyse.textrank(df_comments_str_active, topK=8, withWeight=True)
#  df_comments_str_active
tags_TF_IDF,tags_TextRank

(['充电', '855', '游戏', '流畅', '吃鸡', '快充', '拍照', '玩游戏'],
 [('充电', 1.0),
  ('游戏', 0.8607247014505186),
  ('速度', 0.7555496223510816),
  ('拍照', 0.46874727314861575),
  ('屏幕', 0.46577129159118524),
  ('吃鸡', 0.45729860210380635),
  ('电池', 0.41671070308455643),
  ('玩游戏', 0.3958192765970956)])

### 统计词频

In [9]:
words_count_active=df_all_words_active.groupby(by=['all_words'])['all_words'].agg({"count":numpy.size})
words_count_active=words_count_active.reset_index().sort_values(by=["count"],ascending=False)

words_count_neg=df_all_words_negative.groupby(by=['all_words'])['all_words'].agg({"count":numpy.size})
words_count_neg=words_count_neg.reset_index().sort_values(by=["count"],ascending=False)

### 绘制词云

In [10]:
from pyecharts import options as opts
from pyecharts.charts import Page, WordCloud
from pyecharts.globals import SymbolType

word_frequence_active = [(x[0],x[1]) for x in words_count_active.head(800).values] # 生成词云所需要的格式 [('充电', 83), ('流畅', 53), ('玩游戏', 49)]

word_neg_frequence = [(x[0],x[1]) for x in words_count_neg.head(800).values]

def wordcloud_base(word_frequence) -> WordCloud:
    c = (
        WordCloud()
        .add("", word_frequence, word_size_range=[20, 100])
        .set_global_opts(title_opts=opts.TitleOpts(title="WordCloud"))
    )
    return c

# 正向评价词云
wordcloud_base(word_frequence_active).render_notebook()  
#  负向评价词云
wordcloud_base(word_neg_frequence).render_notebook()  

### 绘制正面负面评价分布图

In [11]:
from pyecharts import options as opts
from pyecharts.charts import Bar


'''
获取好评中各个属性数 : 价格 外观 续航 屏幕 内存 系统 解锁 游戏
'''
def get_property_nums(words_count):
    price_num = appearance_num= words_count[words_count['all_words'].str.contains('价格|贵|便宜|性价比高')].apply(np.sum)['count']
    photo = words_count[words_count['all_words'].str.contains('拍照|摄像|照相')].apply(np.sum)['count']
    appearance_num= words_count[words_count['all_words'].str.contains('外观|外形')].apply(np.sum)['count']
    battery_num = words_count[words_count['all_words'].str.contains('续航|电池|充电')].apply(np.sum)['count'] #得到int
    scree_num = words_count[words_count['all_words'].str.contains('屏幕')].apply(np.sum)['count']
    unlock = words_count[words_count['all_words'].str.contains('解锁|锁屏|指纹')].apply(np.sum)['count']
    game = words_count[words_count['all_words'].str.contains('游戏|电竞|王者|王者荣耀|吃鸡|刺激战场')].apply(np.sum)['count']
    system = words_count[words_count['all_words'].str.contains('系统|内存|流畅|配置|855')].apply(np.sum)['count']
    return price_num,photo,appearance_num,battery_num,scree_num,unlock,game,system



'''
add_xaxis()传入一个列表,作为横坐标
add_yaxis()传入一个列表,作为柱形图y数值
'''
def bar_stack0() -> Bar:
    properties = ['价格','拍照','外观','电池','屏幕','解锁','游戏','系统']
    c = (
        Bar()
        .add_xaxis(properties)
        .add_yaxis("好评",list(get_property_nums(words_count_active)), stack="stack1")
        .add_yaxis("差评", list(get_property_nums(words_count_neg)), stack="stack1")
        .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
        .set_global_opts(title_opts=opts.TitleOpts(title="手机属性口碑分布图"))
    )
    return c

bar_stack0().render_notebook()

### 绘制容量颜色销量

In [12]:
# 获取颜色
df_color = DataFrame(df_comments,columns=['颜色'])
# 去重参考 https://www.jb51.net/article/155577.htm
df_color_new = df_color.drop_duplicates()
df_color_new_group =df_color.groupby(by=['颜色'])['颜色'].agg({"count":numpy.size})
df_color_new_group = df_color_new_group.reset_index().sort_values(by=["count"],ascending=False)

color = [x for x in df_color_new_group['颜色'][:]] #填入颜色饼状图的名称列表
color_num = [float(str(x)) for x in df_color_new_group['count'][:]]


#  获取容量
df_volume = DataFrame(df_comments,columns=['容量'])
df_volume_new = df_volume.drop_duplicates()
df_volume_new_group =df_volume.groupby(by=['容量'])['容量'].agg({"count":numpy.size})
df_volume_new_group = df_volume_new_group.reset_index().sort_values(by=["count"],ascending=False)

volume = [x for x in df_volume_new_group['容量'][:]] #填入颜色饼状图的名称列表
volume_num = [float(str(x)) for x in df_volume_new_group['count'][:]]


volume,volume_num
str(color_num)
color,color_num

(['电光蓝', '熔岩橙', '武士黑'], [1481.0, 445.0, 69.0])

In [13]:
from pyecharts import options as opts
from pyecharts.charts import Page, Pie


# 获取好评数,差评数,得到int
positive_num = df_test_new[df_test_new['评价星级']=='star5'].shape[0]
negative_num = df_test_new[df_test_new['评价星级']=='star1'].shape[0]


# 计算占比
positive_num_proportion = positive_num / (positive_num + negative_num)
negative_num_proportion = negative_num / (positive_num + negative_num)


def pie_color() -> Pie:
    c = (
        Pie()
        .add("", [list(z) for z in zip(color, color_num)])
        .set_global_opts(title_opts=opts.TitleOpts(title="各种颜色销量占比"))
        .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
    )
    return c

def pie_volume() -> Pie:
    c = (
        Pie()
        .add("", [list(z) for z in zip(volume, volume_num)])
        .set_global_opts(title_opts=opts.TitleOpts(title="各种容量销量占比"))
        .set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
    )
    return c
# pie_volume().render_notebook()
pie_color().render_notebook()

In [14]:
pie_volume().render_notebook()

### gensim LDA

In [15]:
from gensim import corpora, models, similarities
import gensim
from gensim.models import word2vec

In [16]:
#  参数texts就是若干个被拆成单词集合的文档的集合，而Dictionary就是把所有单词取一个set(),并对set中每个单词分配一个id号的map;  猜测是(1,'超级')
dictionary = corpora.Dictionary(active_contents_clean)
#  doc2bow把文档 doc变成一个稀疏向量，[(0, 1), (1, 1)]，表明id为0,1的词汇出现了1次，至于其他词汇，没有出现。
corpus = [dictionary.doc2bow(sentence) for sentence in active_contents_clean]
# corpus[:20]

lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=3)  # 自己指定主题


In [17]:
def get_LDA(contents_clean):
    dictionary = corpora.Dictionary(contents_clean)
    corpus = [dictionary.doc2bow(sentence) for sentence in contents_clean]
    lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=3)  # 自己指定主题
    topic = "<p>"+lda.print_topic(0, topn=5)+"</p>"+"<p>"+lda.print_topic(1, topn=5)+"</p>"+"<p>"+lda.print_topic(2, topn=5)+"</p>"
    return topic

lda_active = '<p>[正向评价LDA]</p>'+get_LDA(active_contents_clean)
lda_negative = '<p>[负向向评价LDA]</p>'+get_LDA(negative_contents_clean)

In [18]:
for topic in lda.print_topics(num_topics=5, num_words=5):
    print (topic[1])

0.026*"游戏" + 0.026*"855" + 0.025*"体验" + 0.015*"杠杠" + 0.014*"抢"
0.045*"充电" + 0.038*"性能" + 0.027*"流畅" + 0.023*"外观" + 0.022*"吃鸡"
0.025*"手感" + 0.023*"玩游戏" + 0.023*"速度" + 0.021*"超级" + 0.021*"快充"


In [19]:
lda_active + '<br>' + lda_negative
# active_contents_clean

'<p>[正向评价LDA]</p><p>0.044*"充电" + 0.026*"流畅" + 0.023*"玩游戏" + 0.022*"速度" + 0.021*"超级"</p><p>0.028*"手感" + 0.026*"游戏" + 0.025*"体验" + 0.024*"吃鸡" + 0.022*"王者"</p><p>0.039*"性能" + 0.024*"外观" + 0.024*"855" + 0.022*"快充" + 0.015*"拍照"</p><br><p>[负向向评价LDA]</p><p>0.026*"充电" + 0.018*"感觉" + 0.014*"卡" + 0.013*"荣耀" + 0.011*"吃鸡"</p><p>0.051*"耳机" + 0.024*"王者" + 0.019*"差评" + 0.015*"京东" + 0.013*"系统"</p><p>0.023*"客服" + 0.016*"垃圾" + 0.015*"屏幕" + 0.013*"玩游戏" + 0.013*"电池"</p>'

### 将分析结果写入elasticsearch

In [23]:
from elasticsearch import Elasticsearch

client = Elasticsearch(hosts=["127.0.0.1"])

title = df_comments.loc[0,'页面标题']
url = df_comments.loc[0,'页面网址']
star=df_comments.loc[0,'评价星级']
content = lda_active + '<br>' + lda_negative

wordcloud_positive = str(word_frequence_active)
wordcloud_negative = str(word_neg_frequence)
# proportion_positive = str(float('%.2f' % positive_num_proportion))
# proportion_negative =  str(float('%.2f' % negative_num_proportion))
koubei_positive = str(list(get_property_nums(words_count_active)))
koubei_negative = str(list(get_property_nums(words_count_neg)))

doc_index ="jingdong"
doc_type = "comment"
doc_body = {
    "title":title,
    "create_date": "2019-06-01",
    "content": content,
    "url": url,
    "tags": str(tags_TF_IDF),
    "star": "star5",
    "wordcloud_positive":wordcloud_positive,
    "wordcloud_negative":wordcloud_negative,
    "color":str(color),
    "color_num1":str(color_num),
    "volume":str(volume),
    "volume_num1":str(volume_num),
    "koubei_positive":koubei_positive,
    "koubei_negative":koubei_negative
}
try:
    # client.bulk(index=doc_index, doc_type=doc_type, body=doc_body)
    client.index(index=doc_index,doc_type=doc_type,body = doc_body,id=1)
    print("success")
except Exception as e:
    print(e)

    

success
