In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pymongo import MongoClient
from pandas.io.json import json_normalize

plt.style.use('ggplot')
from pylab import mpl
# Use the SimHei font so Chinese text in seaborn/matplotlib figures is displayed correctly.
mpl.rcParams['font.sans-serif'] = ['SimHei']
# Make the default matplotlib figure size a bit larger.
plt.rc('figure', figsize=(10, 10))
# Stop the minus sign '-' from being drawn as a box in saved figures.
# Only this key is assigned: figure.dpi must stay numeric, so it is deliberately
# not chained into this assignment and keeps matplotlib's default.
mpl.rcParams['axes.unicode_minus'] = False
%matplotlib inline

In [2]:
# Create the MongoDB client for the local server.
conn = MongoClient('127.0.0.1', 27017)
# Select the 'KrisWu' database.
db = conn['KrisWu']

# Select the 'repost' collection and query every record stored in it.
repost = db['repost']
mon_data = repost.find()

In [3]:
# Flatten the nested repost documents into one table; nested keys become
# dotted column names such as 'user.gender'.
data = json_normalize(list(mon_data))

In [4]:
# Summarize the flattened frame: number of rows and columns, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102118 entries, 0 to 102117
Columns: 111 entries, _id to version
dtypes: bool(10), float64(59), int64(19), object(23)
memory usage: 79.7+ MB


In [5]:
# Show five randomly chosen rows as a spot check of the raw fields.
data.sample(5)

Unnamed: 0,_id,ad_state,attitudes_count,bid,can_edit,cardid,comments_count,content_auth,created_at,darwin_tags,...,user.profile_image_url,user.profile_url,user.screen_name,user.statuses_count,user.urank,user.verified,user.verified_reason,user.verified_type,user.verified_type_ext,version
71066,5cb9e2c4b4fbcfda28bdb098,,0,HqnZC1nct,False,,0,0,4小时前,[],...,https://tvax4.sinaimg.cn/crop.0.0.750.750.180/...,https://m.weibo.cn/u/7042796303?uid=7042796303,不吃辣怎么活得下去,23,4,False,,-1,,
47793,5cb9a121b4fbcfda28bca1d3,,0,HqmZzzffi,False,,0,0,25分钟前,[],...,https://tvax4.sinaimg.cn/crop.0.0.512.512.180/...,https://m.weibo.cn/u/5273246409?uid=5273246409,Joeyyll,210,9,False,,-1,,
79861,5cb9e91db4fbcfda28be1d3b,,0,HqnKc6bnk,False,,0,0,4小时前,[],...,https://tvax1.sinaimg.cn/crop.0.0.996.996.180/...,https://m.weibo.cn/u/6039314470?uid=6039314470,黎黎哩嘿,56,4,False,,-1,,
25852,5cb98bc7b4fbcfda28bbb805,,0,Hqms1EPTQ,False,,0,0,8小时前,[],...,https://tvax2.sinaimg.cn/crop.0.0.512.512.180/...,https://m.weibo.cn/u/5641981190?uid=5641981190,cococonutpalm,205,28,False,,-1,,
50153,5cb9a332b4fbcfda28bcbd8e,,0,HqmY30Dsf,False,star_095,0,0,6小时前,[],...,https://tvax2.sinaimg.cn/crop.0.0.996.996.180/...,https://m.weibo.cn/u/5208873191?uid=5208873191,DK哥哥的cute妹妹粉,86,9,False,,-1,,


#### 1. 数据清洗
由于数据入库的时候没有进行清洗，所以数据多出了很多没用的字段，需要先清洗掉

In [8]:
# Print every column name of the flattened frame so the useful ones can be picked.
print(list(data.columns))

['_id', 'ad_state', 'attitudes_count', 'bid', 'can_edit', 'cardid', 'comments_count', 'content_auth', 'created_at', 'darwin_tags', 'edit_at', 'edit_count', 'expire_time', 'favorited', 'hide_flag', 'hide_hot_flow', 'id', 'isLongText', 'is_imported_topic', 'is_paid', 'mblog_vip_type', 'mblogtype', 'mid', 'more_info_type', 'pending_approval_count', 'pic_ids', 'pic_types', 'pid', 'raw_text', 'reposts_count', 'reward_exhibition_type', 'show_additional_indication', 'source', 'sync_mblog', 'topic_id', 'user.avatar_hd', 'user.badge.ali_1688', 'user.badge.anniversary', 'user.badge.asiad_2018', 'user.badge.avengers_2019', 'user.badge.bind_taobao', 'user.badge.cz_wed_2017', 'user.badge.dailv', 'user.badge.dailv_2018', 'user.badge.denglong_2019', 'user.badge.double11_2018', 'user.badge.dzwbqlx_2016', 'user.badge.follow_whitelist_video', 'user.badge.fools_day_2016', 'user.badge.fu_2019', 'user.badge.gongyi', 'user.badge.gongyi_level', 'user.badge.hongbaofei_2019', 'user.badge.inspector', 'user.badg

In [9]:
# Columns kept for the analysis; every other field of the flattened frame is dropped.
in_columns = [
    # fields of the repost itself
    'attitudes_count',
    'comments_count',
    'reposts_count',
    'mid',
    'raw_text',
    'source',
    # fields of the reposting account
    'user.description',
    'user.follow_count',
    'user.followers_count',
    'user.gender',
    'user.id',
    'user.mbrank',
    'user.mbtype',
    'user.profile_url',
    'user.profile_image_url',
    'user.screen_name',
    'user.statuses_count',
    'user.urank',
    'user.verified',
    'user.verified_reason',
]

In [10]:
# Keep only the columns listed in in_columns. This rebinds `data`, so the
# full flattened frame is no longer reachable under that name afterwards.
data = data.loc[:, in_columns]

In [11]:
# Confirm the reduced frame: remaining columns, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102118 entries, 0 to 102117
Data columns (total 20 columns):
attitudes_count           102118 non-null int64
comments_count            102118 non-null int64
reposts_count             102118 non-null int64
mid                       102118 non-null object
raw_text                  102118 non-null object
source                    102118 non-null object
user.description          102118 non-null object
user.follow_count         102118 non-null int64
user.followers_count      102118 non-null int64
user.gender               102118 non-null object
user.id                   102118 non-null int64
user.mbrank               102118 non-null int64
user.mbtype               102118 non-null int64
user.profile_url          102118 non-null object
user.profile_image_url    102118 non-null object
user.screen_name          102118 non-null object
user.statuses_count       102118 non-null int64
user.urank                102118 non-null int64
user.verified    

In [12]:
# Save the reduced frame to 'kriswu.csv' without writing the row index.
data.to_csv('kriswu.csv', index=False)

问题：
1. 吴亦凡该微博的转发是否存在假流量？
2. 大家对于《大碗宽面》怎么看？
3. 有多少人拿吴亦凡跟蔡徐坤做对比？
4. 有多少人开始路转粉了？
5. 评论的词云图

### 1. 吴亦凡该微博的转发是否存在假流量？

In [16]:
# First, look at the gender split of Kris Wu's fans (rows counted per 'user.gender' value).
fans_num = data['user.gender'].value_counts()
fans_num

f    77279
m    24839
Name: user.gender, dtype: int64

In [376]:
from pyecharts import Bar

# value_counts() orders its entries by count, so the x-axis labels are derived
# from the index of fans_num instead of assuming that 'f' always comes first.
gender_labels = {'f': '女', 'm': '男'}
bar = Bar("吴亦凡粉丝性别比例初探", width = 600,height=500)
bar.add("(总数据102118条)", [gender_labels.get(g, g) for g in fans_num.index], fans_num.values, is_stack=True, 
       xaxis_label_textsize=20, yaxis_label_textsize=14, is_label_show=True)
bar

In [18]:
# Gender split expressed as percentages, rounded to two decimals.
(fans_num / fans_num.sum() * 100).round(2)

f    75.68
m    24.32
Name: user.gender, dtype: float64

In [378]:
# Spot-check five random reposts from accounts whose gender field is 'f'.
data[data['user.gender']=='f'].sample(5)

Unnamed: 0,attitudes_count,comments_count,reposts_count,mid,raw_text,source,user.description,user.follow_count,user.followers_count,user.gender,user.id,user.mbrank,user.mbtype,user.profile_url,user.profile_image_url,user.screen_name,user.statuses_count,user.urank,user.verified,user.verified_reason
41706,0,0,0,4362807157558622,😂️😂️😂️,前置双摄vivo X9,💛想要那种无所畏惧的心甘情愿,295,196,f,5662041771,2,2,https://m.weibo.cn/u/5662041771?uid=5662041771,https://tvax1.sinaimg.cn/crop.0.0.996.996.180/...,精分少女不太冷,710,24,False,
39079,0,0,0,4362798714648990,嘻嘻,iPhone客户端,我要好运👍,331,245,f,5031367390,2,11,https://m.weibo.cn/u/5031367390?uid=5031367390,https://tvax1.sinaimg.cn/crop.0.0.512.512.180/...,不吃胡萝卜的小王子yoo,806,30,False,
45108,0,0,0,4362847078920385,//@舒淇:肚子暖暖 心就暖 [色][色][色],荣耀手机 勇敢做自己,唯早晨和吴亦凡不可辜负♥,170,99,f,5540629792,1,2,https://m.weibo.cn/u/5540629792?uid=5540629792,https://tvax1.sinaimg.cn/crop.0.0.996.996.180/...,F君的Rachel,3988,4,False,
51390,0,0,0,4362855135966200,转发微博,前后2000万 OPPO R11,从你的全世界路过。,197,54,f,5469881353,0,0,https://m.weibo.cn/u/5469881353?uid=5469881353,https://tvax4.sinaimg.cn/crop.0.0.996.996.180/...,大长腿mimo,45,9,False,
88521,0,0,0,4362871112369317,respect,vivo X20全面屏手机,( • ̀ω ⁃᷄)✧,48,181,f,6574764125,0,0,https://m.weibo.cn/u/6574764125?uid=6574764125,https://tvax1.sinaimg.cn/crop.0.0.996.996.180/...,你是我的小哥哥呐_,51,9,False,


In [473]:
# Spot-check five random reposts from accounts whose gender field is 'm'.
data[data['user.gender']=='m'].sample(5)

Unnamed: 0,attitudes_count,comments_count,reposts_count,mid,raw_text,source,user.description,user.follow_count,user.followers_count,user.gender,user.id,user.mbrank,user.mbtype,user.profile_url,user.profile_image_url,user.screen_name,user.statuses_count,user.urank,user.verified,user.verified_reason
58187,0,0,0,4362908001173820,吴亦凡对不起[二哈][二哈][二哈][二哈],前后2000万 OPPO R11,喝了王老吉成为基佬王的男人,569,165,m,3607455341,0,0,https://m.weibo.cn/u/3607455341?uid=3607455341,https://tvax1.sinaimg.cn/crop.0.0.996.996.180/...,香香鸡啊,48,14,False,
10835,0,0,0,4362811440290233,竟然很好听？😂 😂 不错哦,OPPO R11s Plus,陪在身边才算拥有，爱到习惯才叫长久。,297,100,m,5628399052,0,0,https://m.weibo.cn/u/5628399052?uid=5628399052,https://tvax1.sinaimg.cn/crop.0.0.996.996.180/...,今年要更加努力xx,764,14,False,
12116,0,0,0,4362800471640602,吴亦凡牛逼,Android客户端,,0,1,m,6721476489,0,0,https://m.weibo.cn/u/6721476489?uid=6721476489,https://tvax2.sinaimg.cn/default/images/defaul...,用户6721476489,5,2,False,
107,0,0,0,4362776065372823,今天我就是51粉丝了[doge],HUAWEI P20,到底我也是个高傲的成年…,369,248,m,5171942624,0,0,https://m.weibo.cn/u/5171942624?uid=5171942624,https://tvax4.sinaimg.cn/crop.0.0.996.996.180/...,小羊的百事快落水,1054,31,False,
41846,1,0,0,4362810878222659,奈何桥上。 ...,HUAWEI Mate 10 Pro,,188,62,m,5621943005,1,11,https://m.weibo.cn/u/5621943005?uid=5621943005,https://tvax1.sinaimg.cn/crop.0.0.664.664.180/...,HaKuNa玛挞挞,62,9,False,


In [25]:
# First heuristic for suspected fake traffic. A repost is flagged when the
# account follows at most 5 users or has at most 5 followers, its description
# is empty, the repost has no comments, attitudes or reposts of its own, and
# the account's mbrank is 0.
few_connections = (data['user.follow_count'] <= 5) | (data['user.followers_count'] <= 5)
empty_description = data['user.description'] == ''
no_interaction = (
    (data['comments_count'] == 0)
    & (data['attitudes_count'] == 0)
    & (data['reposts_count'] == 0)
)
zero_mbrank = data['user.mbrank'] == 0
data_fake = data[few_connections & empty_description & no_interaction & zero_mbrank]
data_fake.sample(5)

Unnamed: 0,attitudes_count,comments_count,reposts_count,mid,raw_text,source,user.description,user.follow_count,user.followers_count,user.gender,user.id,user.mbrank,user.mbtype,user.profile_url,user.profile_image_url,user.screen_name,user.statuses_count,user.urank,user.verified,user.verified_reason
10715,0,0,0,4362800564822206,吴亦凡牛逼,Android客户端,,0,1,m,6693614926,0,0,https://m.weibo.cn/u/6693614926?uid=6693614926,https://tvax3.sinaimg.cn/default/images/defaul...,用户6693614926,40,4,False,
46174,0,0,0,4362800585122065,吴亦凡牛逼,Android客户端,,0,1,f,6693651578,0,0,https://m.weibo.cn/u/6693651578?uid=6693651578,https://tvax3.sinaimg.cn/default/images/defaul...,用户6693651578,39,4,False,
14330,0,0,0,4362806692434481,转发微博,OPPO智能手机,,83,2,f,6981995378,0,0,https://m.weibo.cn/u/6981995378?uid=6981995378,https://tvax1.sinaimg.cn/crop.0.0.100.100.180/...,激奋_欧耶,11,4,False,
67636,0,0,0,4362925572072904,转发微博,iPhone客户端,,2,2,m,6580608026,0,0,https://m.weibo.cn/u/6580608026?uid=6580608026,https://tvax4.sinaimg.cn/crop.0.0.1125.1125.18...,LiZyuuu,2,3,False,
33961,0,0,0,4362800232918724,吴亦凡牛逼,Android客户端,,0,1,m,6693960080,0,0,https://m.weibo.cn/u/6693960080?uid=6693960080,https://tvax1.sinaimg.cn/default/images/defaul...,用户6693960080,38,4,False,


In [26]:
# Number of reposts flagged by the first heuristic (rows, columns).
data_fake.shape

(5667, 20)

In [27]:
# A screen name containing "用户" can almost certainly be treated as a fake fan.
# Only accounts with more than 5 follows and more than 5 followers are
# considered here; the index labels of the matching rows are kept.
follows_enough = data['user.follow_count'] > 5
followed_enough = data['user.followers_count'] > 5
generic_name = data['user.screen_name'].str.contains('用户')
data_fake2_index = data[follows_enough & followed_enough & generic_name].index

In [28]:
# Combine the reposts flagged by both heuristics.
# data_fake2_index holds index labels taken from `data`, so the rows are
# selected by label with .loc instead of by position with .iloc.
# Note: this appends to data_fake itself, so running the cell a second time
# would add the second group again.
data_fake = pd.concat([data_fake, data.loc[data_fake2_index]])

In [29]:
# Number of flagged reposts after combining both heuristics (rows, columns).
data_fake.shape

(6100, 20)

In [30]:
# Keep the reposts of genuine fans: every row of `data` whose index label is
# not among the flagged rows.
data_true = data.drop(index=data_fake.index)

In [31]:
# Number of reposts left after removing the flagged ones (rows, columns).
data_true.shape

(96018, 20)

In [32]:
# Share of genuine and of flagged reposts among all reposts, as percentages
# rounded to two decimals.
total_reposts = data.shape[0]
true_share = np.round(data_true.shape[0] / total_reposts * 100, 2)
fake_share = np.round(data_fake.shape[0] / total_reposts * 100, 2)
print('真粉丝转发数占总转发数的{}%'.format(true_share))
print('假粉丝转发数占总转发数的{}%'.format(fake_share))

真粉丝转发数占总转发数的94.03%
假粉丝转发数占总转发数的5.97%


In [33]:
# Bar chart comparing the total, flagged and genuine repost counts.
repost_labels = ['总转发量', '假粉丝转发量', '真粉丝转发量']
repost_counts = [len(data), len(data_fake), len(data_true)]
bar = Bar("吴亦凡真假流量的转发量", width=600, height=500)
bar.add(
    "(总数据102118条)",
    repost_labels,
    repost_counts,
    is_stack=True,
    xaxis_label_textsize=20,
    yaxis_label_textsize=14,
    is_label_show=True,
)
bar

In [34]:
# Number of distinct accounts behind the genuine reposts (one row kept per 'user.id').
real_fans_num = len(data_true.drop_duplicates(subset=['user.id']))

In [35]:
# Same comparison as the repost-count chart, with the number of distinct
# genuine accounts added as a fourth bar.
volume_labels = ['总转发量', '假粉丝转发量', '真粉丝转发量', '真实转发粉丝量']
volume_counts = [len(data), len(data_fake), len(data_true), real_fans_num]
bar = Bar("吴亦凡真假流量的转发量与真实转发粉丝量(总数据102118条)", width=600, height=500)
bar.add(
    '',
    volume_labels,
    volume_counts,
    is_stack=True,
    xaxis_label_textsize=20,
    yaxis_label_textsize=14,
    is_label_show=True,
    xaxis_rotate=20,
)
bar

In [36]:
# Distinct genuine accounts as a percentage of all reposts, rounded to two decimals.
real_fans_share = np.round(real_fans_num / data.shape[0] * 100, 2)
print('真实转发粉丝量占总转发数的{}%'.format(real_fans_share))

真实转发粉丝量占总转发数的80.17%


### 2. 大家对于《大碗宽面》怎么看？

In [39]:
# Re-check the columns and dtypes of the full repost frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102118 entries, 0 to 102117
Data columns (total 20 columns):
attitudes_count           102118 non-null int64
comments_count            102118 non-null int64
reposts_count             102118 non-null int64
mid                       102118 non-null object
raw_text                  102118 non-null object
source                    102118 non-null object
user.description          102118 non-null object
user.follow_count         102118 non-null int64
user.followers_count      102118 non-null int64
user.gender               102118 non-null object
user.id                   102118 non-null int64
user.mbrank               102118 non-null int64
user.mbtype               102118 non-null int64
user.profile_url          102118 non-null object
user.profile_image_url    102118 non-null object
user.screen_name          102118 non-null object
user.statuses_count       102118 non-null int64
user.urank                102118 non-null int64
user.verified    

In [53]:
from snownlp import SnowNLP


def get_sent_snownlp(data):
    """Return the SnowNLP sentiment score of one piece of text.

    Args:
        data: The text to score.

    Returns:
        The value of ``SnowNLP(data).sentiments``. SnowNLP's documentation
        describes it as a score between 0 and 1, higher meaning more positive.
    """
    return SnowNLP(data).sentiments

In [57]:
# Keep only the text in front of the first "//", i.e. drop everything from
# the first "//" onwards. Splitting once (n=1) and taking the first piece
# gives the same value without building one column per "//" segment.
data_true['clean_text'] = data_true['raw_text'].str.split("//", n=1).str[0]

In [59]:
# Score every non-empty clean_text with get_sent_snownlp. Rows whose
# clean_text is empty are not scored and get NaN through index alignment.
has_text = data_true['clean_text'] != ''
data_true['sent_nlp'] = data_true.loc[has_text, 'clean_text'].apply(get_sent_snownlp)

In [433]:
# Spot-check five random rows of cleaned text together with their sentiment score.
data_true[['clean_text', 'sent_nlp']].sample(5)

Unnamed: 0,clean_text,sent_nlp
30370,哈哈哈哈，第一次转发吴亦凡的微博，之前都是在鬼畜区见到，莫名觉得好听,0.967346
91789,我觉得不错啊？,0.861213
79664,转发微博,0.643891
82108,你看这个面它又长又宽，你看这个碗它又大又圆[允悲] mv好Q啊,0.903441
46573,我晕我现在心情真的好复杂,0.111104


In [80]:
# Mean sentiment score of the genuine reposts, leaving out rows whose cleaned
# text is only a default repost phrase. The boolean mask is inverted with `~`,
# the operator meant for boolean masks.
data_true.loc[~data_true['clean_text'].isin(['转发微博', 'repost', '轉發微博']), 'sent_nlp'].mean()

0.6860448043677209

In [107]:
from pyecharts import Gauge, Page


# Compute the gauge value from the data instead of hard-coding it: the mean
# sentiment score of reposts with text of their own, scaled to 0-100 and
# rounded to one decimal.
own_text = ~data_true['clean_text'].isin(['转发微博', 'repost', '轉發微博'])
mean_score = float(round(data_true.loc[own_text, 'sent_nlp'].mean() * 100, 1))

g = Gauge()
g.add('', ['评论对《大碗宽面》\n的平均评分'], [mean_score])
g

In [165]:
# Spot-check five random reposts whose sentiment score is above 0.9.
data_true.loc[data_true['sent_nlp']>0.9, ['clean_text', 'sent_nlp']].sample(5)

Unnamed: 0,clean_text,sent_nlp
39966,吴亦凡做的太酷了,0.909254
49314,啊啊啊啊啊啊，牛鹿锁死,0.924473
82926,讲真，这首歌很有旋律啊[赞][赞][赞],0.999965
10412,瑞思拜瑞思拜,0.988395
3454,很接地气了哈哈哈哈哈,0.972198


In [170]:
# The ten genuine reposts with the highest attitudes_count (presumably the
# number of likes - confirm against the Weibo API), highest first.
top10_index = data_true['attitudes_count'].sort_values(ascending=False).index[:10]
data_true.loc[top10_index, ['user.screen_name', 'clean_text', 'attitudes_count']]

Unnamed: 0,user.screen_name,clean_text,attitudes_count
20939,何炅,吴亦凡好有趣一男的。,30891
74001,舒淇,肚子暖暖 心就暖 [色][色][色],29275
90481,包贝尔,饿了，你吃啥呢？,14388
25295,BeatsbyDre,朋友，吃面吗？又长又宽的那种👇,1287
52621,鹿透社,鹿晗也发歌了，而且还是认真用心做的好歌，希望你朋友圈的微商每次找你转发的时候也能回馈一下，哦...,709
72937,Clear清扬,清扬能让你头发无懈可击，这也确是我本意！明天的演唱会，我在南京等你！Ah wu ah nah...,556
1410,闫紫境GwAwa,这就是hiphop！Hiphop的精神最重要！[good][good][good],555
25552,湖南卫视七十二层奇楼,我听这歌又酷又甜[心] 凡凡 好久不见甚是想念～,549
19120,限定热狗丨思聪,吃了这碗面🍜我们就是一家人[加油],418
20671,西西里岛岛主金女士,哈哈哈哈哈哈哈哈哈他真的好可爱哦！！！！！！！！！,318


In [173]:
# Five random rows drawn from the 100 genuine reposts with the highest attitudes_count.
top100_index = data_true['attitudes_count'].sort_values(ascending=False).index[:100]
data_true.loc[top100_index, ['user.screen_name', 'clean_text', 'attitudes_count']].sample(5)

Unnamed: 0,user.screen_name,clean_text,attitudes_count
38474,slayerboom,高啊~,30
64727,婕大酱,可可爱爱吴亦凡[嘿哈][嘿哈],25
16831,小精灵real,支持吴老师的娱乐精神,61
53112,张金堡,你看这个驰，他又帅又俊✌,158
21270,大葱哥聊KPL,曾经有一碗真挚的宽面放在葱面前，葱没有珍惜，甚至还嗤之以鼻。经历了种种对耳朵的洗礼，我才懂食...,18


### 3. 有多少人拿吴亦凡跟蔡徐坤做对比？

In [174]:
# Inspect the genuine-repost frame: row count, columns, non-null counts and dtypes.
data_true.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 96018 entries, 0 to 102117
Data columns (total 22 columns):
attitudes_count           96018 non-null int64
comments_count            96018 non-null int64
reposts_count             96018 non-null int64
mid                       96018 non-null object
raw_text                  96018 non-null object
source                    96018 non-null object
user.description          96018 non-null object
user.follow_count         96018 non-null int64
user.followers_count      96018 non-null int64
user.gender               96018 non-null object
user.id                   96018 non-null int64
user.mbrank               96018 non-null int64
user.mbtype               96018 non-null int64
user.profile_url          96018 non-null object
user.profile_image_url    96018 non-null object
user.screen_name          96018 non-null object
user.statuses_count       96018 non-null int64
user.urank                96018 non-null int64
user.verified             96018 non-

In [290]:
# Browse ten random cleaned repost texts (a look at typical wording before the keyword match that follows).
data_true['clean_text'].sample(10)

65939                                              
18957                            何老师～这个语气好像粉丝说的哈哈哈哈
53931                 我家这位兄弟真的是又酷又超级有梗的[喵喵][佩奇][爱你]
53496                                     何老师也是可爱的！
20893                  这个真的好听。。。我他妈真的，反差太大了，多出这种歌啊！
39047                                          转发微博
10531                    为什么有了蔡徐坤 感觉吴亦凡没那么讨厌了[doge]
53778                                         期待inh
3962     想起哥哥在中国新说唱的时候说想把中国风带到rap里，哥哥真的有一直在努力。[好爱哦]
74339                         作为吃货团队的一名成员，必须得大碗吃面呀！
Name: clean_text, dtype: object

In [347]:
# Count the genuine reposts whose raw text matches any comparison keyword.
# This is a loose substring heuristic: single characters such as '比' or '弟'
# match any text that contains them.
kun_pattern = 'kun|坤|律师|球|函|cxk|比|弟|CXK|胸|格局|气度|衬托'
data_true['raw_text'].str.contains(kun_pattern).sum()

6229

In [436]:
# Genuine reposts whose raw text matches any comparison keyword, reduced to
# the screen name, the raw text and the attitudes count.
kun_pattern = 'kun|坤|律师|球|函|cxk|比|弟|CXK|胸|格局|气度|衬托'
mentions_kun = data_true['raw_text'].str.contains(kun_pattern)
data_kun = data_true.loc[mentions_kun, ['user.screen_name', 'raw_text', 'attitudes_count']]

In [457]:
# The ten keyword-matching reposts with the highest attitudes_count.
kun_top10_index = data_kun['attitudes_count'].nlargest(10).index
data_kun.loc[kun_top10_index, ['raw_text', 'attitudes_count']]

Unnamed: 0,raw_text,attitudes_count
92800,现在艺人公关真的是很厉害，能把嘲点转换为自己的亮点，就比如这个大碗宽面，这么一发想嘲的人可能...,43
27741,凡凡趁着坤坤这波居然完美洗白了？？关键的是歌还不错？！//@藤新Jiven:老吳這個可以，哈...,42
1329,妙妙妙！大气又可爱！才华横溢我的大凡凡！瑞思拜[米妮爱你]#吴亦凡[超话]# [米奇比心][...,37
82171,自黑新高度[允悲]这心态不得不佩服！我凡真不是普通人的气度，爱了爱了！永远支持你呀！[心][...,21
30017,出大事了 cxk要g了吗，凡少也开始玩怪东西了,20
64918,和律师函相比真的是高下立判了,17
84339,还是凡凡格局大[赞]cxk就是个弟弟,16
5722,我的宝贝今天真的好棒[羞嗒嗒] 顺便瞎说一番 今天打开空间朋友圈全部都是对不起吴亦凡的发言 ...,12
30014,我也欠吴老师一个道歉，起码是条会打篮球的刚烈汉子。,12
50951,天蝎气场让人觉得有距离且紧绷给人霸道的假象，好多天蝎明星都经历过被人diss人设太霸道总裁太...,11


In [349]:
# Shape of the genuine reposts whose raw_text is not empty.
data_true[data_true['raw_text']!=''].shape

(96018, 22)

In [350]:
# Share of genuine reposts that match the comparison keywords: 6229 matches
# out of 96018 genuine reposts. Both figures are typed in by hand from the
# outputs of earlier cells, so they must be updated if the data changes.
6229/96018

0.06487325293174197

### 4. 有多少人开始路转粉了？

In [346]:
# Browse ten random cleaned repost texts (a look at typical wording before the keyword match that follows).
data_true['clean_text'].sample(10)

4777                                                  转发微博
54202                                                 那个磊呢
49000                                                     
62390                                             卧槽，凡聪是真的
84587                                                     
95667    有人把综艺梗代入到你的音乐作品 你用自己的方式 放过他们和自己 对于今天的我来说 非常应景了...
43513                                                 转发微博
15210                                        只有我一个人想哭吗....
58018    不过综艺随便的一个梗 走到哪都有人黑 也不知道为啥都拿来当祖训背 然后嘻嘻哈哈的 我哥真是太...
87223                                                 姐姐晚安
Name: clean_text, dtype: object

In [351]:
# Count the genuine reposts whose raw text matches any "became a fan" keyword.
# This is a loose substring heuristic: '路' matches any text containing that
# character, and 'espect' covers both "Respect" and "respect".
convert_pattern = '转粉|爱上|重新|路|圈粉|espect|瑞思拜'
data_true['raw_text'].str.contains(convert_pattern).sum()

3646

In [362]:
# Spot-check five random reposts that match the "became a fan" keywords.
convert_pattern = '转粉|爱上|重新|路|圈粉|espect|瑞思拜'
is_convert = data_true['raw_text'].str.contains(convert_pattern)
data_true.loc[is_convert, ['user.screen_name', 'raw_text']].sample(5)

Unnamed: 0,user.screen_name,raw_text
89866,从没选对过,黑转路带点粉了，毕竟那么多人说咱像
1744,玺欢侬吖,被圈粉了[喵喵]
1097,是灰灰呀是灰灰,圈粉
31501,明明爱kris,#吴亦凡[超话]#//@PP音乐官方微博:我不得不瑞思拜 从去年的diss track 到今...
61191,蟹小排,哇，好听，转粉了呀！好有趣啊


In [363]:
# Gender counts among the genuine reposts that match the "became a fan" keywords.
convert_pattern = '转粉|爱上|重新|路|圈粉|espect|瑞思拜'
is_convert = data_true['raw_text'].str.contains(convert_pattern)
fans = data_true.loc[is_convert, 'user.gender'].value_counts()

In [364]:
# value_counts() orders its entries by count, so the x-axis labels are derived
# from the index of `fans` instead of assuming that 'f' always comes first.
gender_labels = {'f': '女', 'm': '男'}
bar = Bar("路转粉的男女性别比例", width = 600,height=500)
bar.add("", [gender_labels.get(g, g) for g in fans.index], fans.values, is_stack=True, 
       xaxis_label_textsize=20, yaxis_label_textsize=14, is_label_show=True)
bar

In [365]:
# Gender shares (as fractions of 1) among the keyword-matching reposts.
fans/fans.sum()

f    0.669501
m    0.330499
Name: user.gender, dtype: float64

### 5. 评论的词云图

In [369]:
import jieba
from collections import Counter
from pyecharts import WordCloud

# Add the names that matter for this analysis to jieba's dictionary.
for custom_word in ('吴亦凡', '蔡徐坤', 'ikun', '凡凡', 'Kris'):
    jieba.add_word(custom_word)

# Load the stop words, one stripped entry per line; the `with` block closes
# the file once it has been read.
# NOTE(review): no encoding is passed, so the platform default is used -
# confirm that it matches the encoding of stopwords.txt.
with open('stopwords.txt') as stopword_file:
    swords = [line.strip() for line in stopword_file]

In [373]:
def plot_word_cloud(data, swords):
    """Build a pyecharts word cloud of the 100 most frequent words in `data`.

    Each text is tokenized on its own with jieba, so the end of one text can
    never merge with the start of the next one into a single token. Tokens of
    length 1 and tokens listed in `swords` are ignored.

    Args:
        data: Iterable of texts (for example a pandas Series of strings).
            Entries that are not strings, such as missing values, are skipped.
        swords: Collection of stop words to exclude.

    Returns:
        A pyecharts ``WordCloud`` (1300x620) holding up to 100 words, with
        the word frequencies as weights.
    """
    # A set makes the stop-word test constant-time per token.
    stop_words = set(swords)
    word_counts = Counter(
        word
        for text in data
        if isinstance(text, str)
        for word in jieba.cut(text)
        if len(word) > 1 and word not in stop_words
    )
    top_words = word_counts.most_common(100)
    words = [word for word, _ in top_words]
    counts = [count for _, count in top_words]
    wordcloud = WordCloud(width=1300, height=620)
    wordcloud.add("", words, counts, word_size_range=[20, 100])
    return wordcloud

In [375]:
# Word cloud of the genuine reposts' cleaned text, leaving out every text that
# contains "转发". The boolean mask is inverted with `~`, the operator meant
# for boolean masks.
plot_word_cloud(data=data_true.loc[~data_true['clean_text'].str.contains("转发"), 'clean_text'], swords=swords)