In [1]:
import numpy as np
import pandas as pd
import time
import matplotlib
matplotlib.rcParams['font.family'] = 'STSong'
import matplotlib.pyplot as plt
import seaborn as sns
from pyecharts import options as opts
from pyecharts.charts import ThemeRiver, Geo, Map, Timeline
from pyecharts.globals import CurrentConfig, NotebookType, ThemeType
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_NOTEBOOK
from gensim import corpora
from gensim.corpora import Dictionary
import jieba
import stylecloud
from IPython.display import Image
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the preprocessed rumour text data and preview its first three rows.
df = pd.read_csv('data/rumor_text_data_processed.csv')
df.head(3)

Unnamed: 0,date,source,content,core,province,like,views,all_topic,all_prob,all_emotion,sub_topic,sub_prob,sub_emotion
0,2022-02-18,北京日报客户端,“有人从香港游泳偷渡到深圳湾被捕？”假的！系轻生被救！,有人从香港游泳偷渡到深圳湾被捕？,广东省,0,0,4,0.923976,0.974462,6,0.869034,0.89157
1,2022-02-18,大河报、羊城晚报,近日，社交媒体上一则题为“河南一位老爷爷考上清华大学被顶替”的消息引发网友关注和转发。消息称...,河南一位老爷爷考上清华大学被顶替,河南省,41,1,3,0.987694,1.0,11,0.847208,0.812769
2,2022-02-17,南方都市报、大众网,近日，网传“湖南衡阳南岳衡山寺庙高薪招聘住持，月薪高达5万元，弹性工作制，下班后原则上不干预...,湖南衡阳南岳衡山寺庙高薪招聘住持，月薪高达5万元，弹性工作制，下班后原则上不干预私生活,湖南省,26,1,3,0.984797,0.999992,11,0.94907,0.992183


# 4 时间信息挖掘

## 4.1 主题河流图

首先按照date从过去到现在进行排序。

In [3]:
# Order the rows by date, keep the previous row labels in an 'index' column
# (reset_index without drop=True), then parse the date strings to datetimes.
df = (
    df.sort_values('date')
    .reset_index()
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

以约一周为频率取样（每月按 1–7 日、8–15 日、16–23 日、24 日及以后分为 4 段）。

In [4]:
def resample_week(x):
    """Map a day of month to the two-digit label of its roughly week-long bucket.

    Days 1-7 give '01', 8-15 give '07', 16-23 give '15', and anything
    larger gives '23'.
    """
    # (inclusive upper day, label) pairs, checked in ascending order.
    upper_bounds = ((7, '01'), (15, '07'), (23, '15'))
    for last_day, label in upper_bounds:
        if x <= last_day:
            return label
    return '23'

In [5]:
# Day of month, then the start day of the bucket that resample_week assigns to it.
df['day'] = df['date'].dt.day
week_start_day = df['day'].apply(resample_week).astype(int)
# Assemble each bucket's start date from its year / month / day components.
df['week'] = pd.to_datetime(pd.DataFrame({
    'year': df['date'].dt.year,
    'month': df['date'].dt.month,
    'day': week_start_day,
}))

以半个月为频率取样（每月按 1–15 日、16 日及以后分为 2 段）。

In [6]:
# Days 1-15 are labelled with day 1 of the month, later days with day 15.
half_start_day = df['day'].apply(lambda d: 1 if d <= 15 else 15)
# Assemble the half-month label date from its year / month / day components.
df['half_month'] = pd.to_datetime(pd.DataFrame({
    'year': df['date'].dt.year,
    'month': df['date'].dt.month,
    'day': half_start_day,
}))

以1个月为频率取样。

In [7]:
# Label every row with the first day of its calendar month.
df['month'] = df['date'].dt.to_period('M').dt.to_timestamp()
df.head(3)

Unnamed: 0,index,date,source,content,core,province,like,views,all_topic,all_prob,all_emotion,sub_topic,sub_prob,sub_emotion,day,week,half_month,month
0,365,2021-11-07,天津日报,近日，网传“天津医科大学总医院出现疫情”“总医院一、二、六层已全封”等消息。经天津市疫情防控...,天津医科大学总医院出现疫情，总医院一、二、六层已全封,天津市,30,2,0,0.635143,1.073142e-12,9,0.869032,0.697871,7,2021-11-01,2021-11-01,2021-11-01
1,359,2021-11-08,健康青羊、四川日报,11月8日，有网友在社交媒体上发布消息称“成都市青羊区蜀鑫路附近正在建方舱医院。”经成都青羊...,成都市青羊区蜀鑫路附近正在建方舱医院。,四川省,15,3,0,0.980937,0.3106715,9,0.869038,0.828448,8,2021-11-07,2021-11-01,2021-11-01
2,360,2021-11-08,成都锦城学院,近日，网传视频称“成都锦城学院发现两例新冠”“全部戒严，武警在这扎起”。经成都锦城学院官方证...,成都锦城学院发现两例新冠，全部戒严，武警在这扎起,四川省,18,3,2,0.982923,0.4592922,3,0.908321,0.121564,8,2021-11-07,2021-11-01,2021-11-01


### 4.1.1 谣言数量大主题河流图

以1个月为频率。

In [8]:
# Count rows per (month, major topic); each chart row is [date, value, series name].
monthly_counts = (
    df.groupby(['month', 'all_topic'])['content']
    .count()
    .reset_index()
    .assign(all_topic=lambda g: '大主题' + g['all_topic'].astype(str))
)
tmp = monthly_counts[['month', 'content', 'all_topic']].values.tolist()
series = [f'大主题{i}' for i in range(6)]

wc = ThemeRiver(init_opts=opts.InitOpts(height='600px', theme=ThemeType.LIGHT))
wc.add(series_name=series, data=tmp, singleaxis_opts=opts.SingleAxisOpts(type_='time'))
wc.set_global_opts(
    title_opts=opts.TitleOpts(title='谣言数量大主题河流图（频率：月）',
                              pos_bottom='90%', pos_right='center'),
    tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="line"),
)
wc.render('timeriver/谣言数量大主题河流图（频率：月）.html')
wc.render_notebook()

以半个月为频率。

In [9]:
# Count rows per (half month, major topic); each chart row is [date, value, series name].
half_month_counts = (
    df.groupby(['half_month', 'all_topic'])['content']
    .count()
    .reset_index()
    .assign(all_topic=lambda g: '大主题' + g['all_topic'].astype(str))
)
tmp = half_month_counts[['half_month', 'content', 'all_topic']].values.tolist()
series = [f'大主题{i}' for i in range(6)]

wc = ThemeRiver(init_opts=opts.InitOpts(height='600px', theme=ThemeType.LIGHT))
wc.add(series_name=series, data=tmp, singleaxis_opts=opts.SingleAxisOpts(type_='time'))
wc.set_global_opts(
    title_opts=opts.TitleOpts(title='谣言数量大主题河流图（频率：半月）',
                              pos_bottom='90%', pos_right='center'),
    tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="line"),
)
wc.render('timeriver/谣言数量大主题河流图（频率：半月）.html')
wc.render_notebook()

以周为频率。

In [10]:
# Count rows per (major topic, week bucket).
weekly_counts = df.groupby(['all_topic', 'week'])['content'].count()

# Expand to the full topic x week grid so that combinations without any row
# appear as explicit zeros. reindex() replaces the earlier element-wise
# chained assignment (tmp1['content'][mask] = ...), which pandas warns about
# and which does not write through once copy-on-write is active.
week_buckets = weekly_counts.index.get_level_values('week').unique().sort_values()
full_grid = pd.MultiIndex.from_product([range(6), week_buckets], names=['all_topic', 'week'])
tmp1 = (
    weekly_counts.reindex(full_grid, fill_value=0)
    .astype(float)
    .reset_index()
    .assign(all_topic=lambda g: '大主题' + g['all_topic'].astype(str))
    .loc[:, ['week', 'content', 'all_topic']]
)
# Each chart row is [date, value, series name].
tmp = tmp1.values.tolist()
series = ['大主题' + str(i) for i in range(6)]

wc = ThemeRiver(init_opts=opts.InitOpts(height='600px', theme=ThemeType.LIGHT))\
    .add(series_name=series, data=tmp, singleaxis_opts=opts.SingleAxisOpts(type_='time'))\
    .set_global_opts(
        title_opts=opts.TitleOpts(title='谣言数量大主题河流图（频率：周）',
                                  pos_bottom='90%', pos_right='center'),
        tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="line"),
    )
wc.render('timeriver/谣言数量大主题河流图（频率：周）.html')
wc.render_notebook()

### 4.1.2 谣言数量小主题河流图

以1个月为频率。

In [11]:
# Count rows per (month, sub topic); each chart row is [date, value, series name].
monthly_counts = (
    df.groupby(['month', 'sub_topic'])['content']
    .count()
    .reset_index()
    .assign(sub_topic=lambda g: '小主题' + g['sub_topic'].astype(str))
)
tmp = monthly_counts[['month', 'content', 'sub_topic']].values.tolist()
series = [f'小主题{i}' for i in range(12)]

wc = ThemeRiver(init_opts=opts.InitOpts(width='960px', height='720px', theme=ThemeType.LIGHT))
wc.add(series_name=series, data=tmp, singleaxis_opts=opts.SingleAxisOpts(type_='time'))
wc.set_global_opts(
    title_opts=opts.TitleOpts(title='谣言数量小主题河流图（频率：月）', pos_left='center'),
    tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="line"),
    legend_opts=opts.LegendOpts(pos_right='right', pos_top='top'),
)
wc.render('timeriver/谣言数量小主题河流图（频率：月）.html')
wc.render_notebook()

以半个月为频率。

In [12]:
# Count rows per (half month, sub topic); each chart row is [date, value, series name].
half_month_counts = (
    df.groupby(['half_month', 'sub_topic'])['content']
    .count()
    .reset_index()
    .assign(sub_topic=lambda g: '小主题' + g['sub_topic'].astype(str))
)
tmp = half_month_counts[['half_month', 'content', 'sub_topic']].values.tolist()
series = [f'小主题{i}' for i in range(12)]

wc = ThemeRiver(init_opts=opts.InitOpts(width='960px', height='720px', theme=ThemeType.LIGHT))
wc.add(series_name=series, data=tmp, singleaxis_opts=opts.SingleAxisOpts(type_='time'))
wc.set_global_opts(
    title_opts=opts.TitleOpts(title='谣言数量小主题河流图（频率：半月）', pos_left='center'),
    tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="line"),
    legend_opts=opts.LegendOpts(pos_right='right', pos_top='top'),
)
wc.render('timeriver/谣言数量小主题河流图（频率：半月）.html')
wc.render_notebook()

以周为频率。

In [13]:
# Count rows per (sub topic, week bucket).
weekly_counts = df.groupby(['sub_topic', 'week'])['content'].count()

# Expand to the full topic x week grid so that combinations without any row
# appear as explicit zeros. reindex() replaces the earlier element-wise
# chained assignment (tmp1['content'][mask] = ...), which pandas warns about
# and which does not write through once copy-on-write is active.
week_buckets = weekly_counts.index.get_level_values('week').unique().sort_values()
full_grid = pd.MultiIndex.from_product([range(12), week_buckets], names=['sub_topic', 'week'])
tmp1 = (
    weekly_counts.reindex(full_grid, fill_value=0)
    .astype(float)
    .reset_index()
    .assign(sub_topic=lambda g: '小主题' + g['sub_topic'].astype(str))
    .loc[:, ['week', 'content', 'sub_topic']]
)
# Each chart row is [date, value, series name].
tmp = tmp1.values.tolist()
series = ['小主题' + str(i) for i in range(12)]

wc = ThemeRiver(init_opts=opts.InitOpts(width='960px', height='720px', theme=ThemeType.LIGHT))\
    .add(series_name=series, data=tmp, singleaxis_opts=opts.SingleAxisOpts(type_='time'))\
    .set_global_opts(
        title_opts=opts.TitleOpts(title='谣言数量小主题河流图（频率：周）', pos_left = 'center'),
        tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="line"),
        legend_opts=opts.LegendOpts(pos_right='right', pos_top='top')
    )
wc.render('timeriver/谣言数量小主题河流图（频率：周）.html')
wc.render_notebook()

### 4.1.3 谣言热度大主题河流图

以每个主题的平均点赞数作为谣言热度的指标。

以1个月为频率。

In [14]:
# Mean likes per (month, major topic), rounded to 2 decimals;
# each chart row is [date, value, series name].
monthly_likes = (
    df.groupby(['month', 'all_topic'])['like']
    .mean()
    .reset_index()
    .assign(
        like=lambda g: g['like'].round(2),
        all_topic=lambda g: '大主题' + g['all_topic'].astype(str),
    )
)
tmp = monthly_likes[['month', 'like', 'all_topic']].values.tolist()
series = [f'大主题{i}' for i in range(6)]

wc = ThemeRiver(init_opts=opts.InitOpts(height='600px', theme=ThemeType.WALDEN))
wc.add(series_name=series, data=tmp, singleaxis_opts=opts.SingleAxisOpts(type_='time'))
wc.set_global_opts(
    title_opts=opts.TitleOpts(title='谣言热度大主题河流图（频率：月）',
                              pos_bottom='90%', pos_right='center'),
    tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="line"),
)
wc.render('timeriver/谣言热度大主题河流图（频率：月）.html')
wc.render_notebook()

以半个月为频率。

In [15]:
# Mean likes per (half month, major topic), rounded to 2 decimals;
# each chart row is [date, value, series name].
half_month_likes = (
    df.groupby(['half_month', 'all_topic'])['like']
    .mean()
    .reset_index()
    .assign(
        like=lambda g: g['like'].round(2),
        all_topic=lambda g: '大主题' + g['all_topic'].astype(str),
    )
)
tmp = half_month_likes[['half_month', 'like', 'all_topic']].values.tolist()
series = [f'大主题{i}' for i in range(6)]

wc = ThemeRiver(init_opts=opts.InitOpts(height='600px', theme=ThemeType.WALDEN))
wc.add(series_name=series, data=tmp, singleaxis_opts=opts.SingleAxisOpts(type_='time'))
wc.set_global_opts(
    title_opts=opts.TitleOpts(title='谣言热度大主题河流图（频率：半月）',
                              pos_bottom='90%', pos_right='center'),
    tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="line"),
)
wc.render('timeriver/谣言热度大主题河流图（频率：半月）.html')
wc.render_notebook()

以周为频率。

In [16]:
# Mean likes per (major topic, week bucket).
weekly_likes = df.groupby(['all_topic', 'week'])['like'].mean()

# Expand to the full topic x week grid so that combinations without any row
# appear as explicit zeros. reindex() replaces the earlier element-wise
# chained assignment (tmp1['like'][mask] = ...), which pandas warns about
# and which does not write through once copy-on-write is active.
week_buckets = weekly_likes.index.get_level_values('week').unique().sort_values()
full_grid = pd.MultiIndex.from_product([range(6), week_buckets], names=['all_topic', 'week'])
tmp1 = (
    weekly_likes.reindex(full_grid, fill_value=0.0)
    .reset_index()
    .assign(
        # Round the values that are actually plotted; the rounding used to be
        # applied to the intermediate groupby frame and was therefore lost.
        like=lambda g: g['like'].round(2),
        all_topic=lambda g: '大主题' + g['all_topic'].astype(str),
    )
    .loc[:, ['week', 'like', 'all_topic']]
)
# Each chart row is [date, value, series name].
tmp = tmp1.values.tolist()
series = ['大主题' + str(i) for i in range(6)]

wc = ThemeRiver(init_opts=opts.InitOpts(height='600px', theme=ThemeType.WALDEN))\
    .add(series_name=series, data=tmp, singleaxis_opts=opts.SingleAxisOpts(type_='time'))\
    .set_global_opts(
        title_opts=opts.TitleOpts(title='谣言热度大主题河流图（频率：周）',
                                  pos_bottom='90%', pos_right='center'),
        tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="line"),
    )
wc.render('timeriver/谣言热度大主题河流图（频率：周）.html')
wc.render_notebook()

### 4.1.4 谣言热度小主题河流图

以1个月为频率。

In [17]:
# Mean likes per (month, sub topic), rounded to 2 decimals;
# each chart row is [date, value, series name].
monthly_likes = (
    df.groupby(['month', 'sub_topic'])['like']
    .mean()
    .reset_index()
    .assign(
        sub_topic=lambda g: '小主题' + g['sub_topic'].astype(str),
        like=lambda g: g['like'].round(2),
    )
)
tmp = monthly_likes[['month', 'like', 'sub_topic']].values.tolist()
series = [f'小主题{i}' for i in range(12)]

wc = ThemeRiver(init_opts=opts.InitOpts(width='960px', height='720px', theme=ThemeType.WALDEN))
wc.add(series_name=series, data=tmp, singleaxis_opts=opts.SingleAxisOpts(type_='time'))
wc.set_global_opts(
    title_opts=opts.TitleOpts(title='谣言热度小主题河流图（频率：月）', pos_left='center'),
    tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="line"),
    legend_opts=opts.LegendOpts(pos_right='right', pos_top='top'),
)
wc.render('timeriver/谣言热度小主题河流图（频率：月）.html')
wc.render_notebook()

以半个月为频率。

In [18]:
# Mean likes per (half month, sub topic), rounded to 2 decimals;
# each chart row is [date, value, series name].
half_month_likes = (
    df.groupby(['half_month', 'sub_topic'])['like']
    .mean()
    .reset_index()
    .assign(
        sub_topic=lambda g: '小主题' + g['sub_topic'].astype(str),
        like=lambda g: g['like'].round(2),
    )
)
tmp = half_month_likes[['half_month', 'like', 'sub_topic']].values.tolist()
series = [f'小主题{i}' for i in range(12)]

wc = ThemeRiver(init_opts=opts.InitOpts(width='960px', height='720px', theme=ThemeType.WALDEN))
wc.add(series_name=series, data=tmp, singleaxis_opts=opts.SingleAxisOpts(type_='time'))
wc.set_global_opts(
    title_opts=opts.TitleOpts(title='谣言热度小主题河流图（频率：半月）', pos_left='center'),
    tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="line"),
    legend_opts=opts.LegendOpts(pos_right='right', pos_top='top'),
)
wc.render('timeriver/谣言热度小主题河流图（频率：半月）.html')
wc.render_notebook()

以周为频率。

In [19]:
# Mean likes per (sub topic, week bucket).
weekly_likes = df.groupby(['sub_topic', 'week'])['like'].mean()

# Expand to the full topic x week grid so that combinations without any row
# appear as explicit zeros. reindex() replaces the earlier element-wise
# chained assignment (tmp1['like'][mask] = ...), which pandas warns about
# and which does not write through once copy-on-write is active.
week_buckets = weekly_likes.index.get_level_values('week').unique().sort_values()
full_grid = pd.MultiIndex.from_product([range(12), week_buckets], names=['sub_topic', 'week'])
tmp1 = (
    weekly_likes.reindex(full_grid, fill_value=0.0)
    .reset_index()
    .assign(
        sub_topic=lambda g: '小主题' + g['sub_topic'].astype(str),
        like=lambda g: g['like'].round(2),
    )
    .loc[:, ['week', 'like', 'sub_topic']]
)
# Each chart row is [date, value, series name].
tmp = tmp1.values.tolist()
series = ['小主题' + str(i) for i in range(12)]

wc = ThemeRiver(init_opts=opts.InitOpts(width='960px', height='720px', theme=ThemeType.WALDEN))\
    .add(series_name=series, data=tmp, singleaxis_opts=opts.SingleAxisOpts(type_='time'))\
    .set_global_opts(
        title_opts=opts.TitleOpts(title='谣言热度小主题河流图（频率：周）', pos_left = 'center'),
        tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="line"),
        legend_opts=opts.LegendOpts(pos_right='right', pos_top='top')
    )
wc.render('timeriver/谣言热度小主题河流图（频率：周）.html')
wc.render_notebook()

### 4.1.5 谣言情感大主题河流图

以每个主题的平均情感强度作为谣言情感强烈程度的指标。

以1个月为频率。

In [20]:
# Mean all_emotion per (month, major topic), rounded to 2 decimals and turned
# into its absolute distance from 0.5; each chart row is [date, value, series name].
monthly_emotion = (
    df.groupby(['month', 'all_topic'])['all_emotion']
    .mean()
    .reset_index()
    .assign(
        all_emotion=lambda g: (g['all_emotion'].round(2) - 0.5).abs(),
        all_topic=lambda g: '大主题' + g['all_topic'].astype(str),
    )
)
tmp = monthly_emotion[['month', 'all_emotion', 'all_topic']].values.tolist()
series = [f'大主题{i}' for i in range(6)]

wc = ThemeRiver(init_opts=opts.InitOpts(height='600px', theme=ThemeType.WESTEROS))
wc.add(series_name=series, data=tmp, singleaxis_opts=opts.SingleAxisOpts(type_='time'))
wc.set_global_opts(
    title_opts=opts.TitleOpts(title='谣言情感大主题河流图（频率：月）',
                              pos_bottom='90%', pos_right='center'),
    tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="line"),
)
wc.render('timeriver/谣言情感大主题河流图（频率：月）.html')
wc.render_notebook()

以半个月为频率。

In [21]:
# Mean all_emotion per (half month, major topic), rounded to 2 decimals and turned
# into its absolute distance from 0.5; each chart row is [date, value, series name].
half_month_emotion = (
    df.groupby(['half_month', 'all_topic'])['all_emotion']
    .mean()
    .reset_index()
    .assign(
        all_emotion=lambda g: (g['all_emotion'].round(2) - 0.5).abs(),
        all_topic=lambda g: '大主题' + g['all_topic'].astype(str),
    )
)
tmp = half_month_emotion[['half_month', 'all_emotion', 'all_topic']].values.tolist()
series = [f'大主题{i}' for i in range(6)]

wc = ThemeRiver(init_opts=opts.InitOpts(height='600px', theme=ThemeType.WESTEROS))
wc.add(series_name=series, data=tmp, singleaxis_opts=opts.SingleAxisOpts(type_='time'))
wc.set_global_opts(
    title_opts=opts.TitleOpts(title='谣言情感大主题河流图（频率：半月）',
                              pos_bottom='90%', pos_right='center'),
    tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="line"),
)
wc.render('timeriver/谣言情感大主题河流图（频率：半月）.html')
wc.render_notebook()

以周为频率。

In [22]:
# Mean all_emotion per (major topic, week bucket), rounded to 2 decimals and
# turned into its absolute distance from 0.5.
# NOTE(review): this is |mean - 0.5|, whereas section 4.3.4 averages the
# per-row |x - 0.5|; confirm which definition of intensity is intended.
weekly_emotion = df.groupby(['all_topic', 'week'])['all_emotion'].mean()
weekly_intensity = (weekly_emotion.round(2) - 0.5).abs()

# Expand to the full topic x week grid AFTER the transform, so combinations
# without any row get intensity 0. Previously the grid was zero-filled first
# and the transform then mapped those zeros to |0 - 0.5| = 0.5, the maximum.
# reindex() also replaces the chained assignment
# (tmp1['all_emotion'][mask] = ...), which does not write through once
# copy-on-write is active.
week_buckets = weekly_intensity.index.get_level_values('week').unique().sort_values()
full_grid = pd.MultiIndex.from_product([range(6), week_buckets], names=['all_topic', 'week'])
tmp1 = (
    weekly_intensity.reindex(full_grid, fill_value=0.0)
    .reset_index()
    .assign(all_topic=lambda g: '大主题' + g['all_topic'].astype(str))
    .loc[:, ['week', 'all_emotion', 'all_topic']]
)
# Each chart row is [date, value, series name].
tmp = tmp1.values.tolist()
series = ['大主题' + str(i) for i in range(6)]

wc = ThemeRiver(init_opts=opts.InitOpts(height='600px', theme=ThemeType.WESTEROS))\
    .add(series_name=series, data=tmp, singleaxis_opts=opts.SingleAxisOpts(type_='time'))\
    .set_global_opts(
        title_opts=opts.TitleOpts(title='谣言情感大主题河流图（频率：周）',
                                  pos_bottom='90%', pos_right='center'),
        tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="line"),
    )
wc.render('timeriver/谣言情感大主题河流图（频率：周）.html')
wc.render_notebook()

### 4.1.6 谣言情感小主题河流图

以1个月为频率。

In [23]:
# Mean sub_emotion per (month, sub topic), rounded to 2 decimals and turned
# into its absolute distance from 0.5; each chart row is [date, value, series name].
monthly_emotion = (
    df.groupby(['month', 'sub_topic'])['sub_emotion']
    .mean()
    .reset_index()
    .assign(
        sub_topic=lambda g: '小主题' + g['sub_topic'].astype(str),
        sub_emotion=lambda g: (g['sub_emotion'].round(2) - 0.5).abs(),
    )
)
tmp = monthly_emotion[['month', 'sub_emotion', 'sub_topic']].values.tolist()
series = [f'小主题{i}' for i in range(12)]

wc = ThemeRiver(init_opts=opts.InitOpts(width='960px', height='720px', theme=ThemeType.WESTEROS))
wc.add(series_name=series, data=tmp, singleaxis_opts=opts.SingleAxisOpts(type_='time'))
wc.set_global_opts(
    title_opts=opts.TitleOpts(title='谣言情感小主题河流图（频率：月）', pos_left='center'),
    tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="line"),
    legend_opts=opts.LegendOpts(pos_right='right', pos_top='top'),
)
wc.render('timeriver/谣言情感小主题河流图（频率：月）.html')
wc.render_notebook()

以半个月为频率。

In [24]:
# Mean sub_emotion per (half month, sub topic), rounded to 2 decimals and turned
# into its absolute distance from 0.5; each chart row is [date, value, series name].
half_month_emotion = (
    df.groupby(['half_month', 'sub_topic'])['sub_emotion']
    .mean()
    .reset_index()
    .assign(
        sub_topic=lambda g: '小主题' + g['sub_topic'].astype(str),
        sub_emotion=lambda g: (g['sub_emotion'].round(2) - 0.5).abs(),
    )
)
tmp = half_month_emotion[['half_month', 'sub_emotion', 'sub_topic']].values.tolist()
series = [f'小主题{i}' for i in range(12)]

wc = ThemeRiver(init_opts=opts.InitOpts(width='960px', height='720px', theme=ThemeType.WESTEROS))
wc.add(series_name=series, data=tmp, singleaxis_opts=opts.SingleAxisOpts(type_='time'))
wc.set_global_opts(
    title_opts=opts.TitleOpts(title='谣言情感小主题河流图（频率：半月）', pos_left='center'),
    tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="line"),
    legend_opts=opts.LegendOpts(pos_right='right', pos_top='top'),
)
wc.render('timeriver/谣言情感小主题河流图（频率：半月）.html')
wc.render_notebook()

以周为频率。

In [25]:
# Mean sub_emotion per (sub topic, week bucket), rounded to 2 decimals and
# turned into its absolute distance from 0.5.
# NOTE(review): this is |mean - 0.5|, whereas section 4.3.4 averages the
# per-row |x - 0.5|; confirm which definition of intensity is intended.
weekly_emotion = df.groupby(['sub_topic', 'week'])['sub_emotion'].mean()
weekly_intensity = (weekly_emotion.round(2) - 0.5).abs()

# Expand to the full topic x week grid AFTER the transform, so combinations
# without any row get intensity 0. Previously the grid was zero-filled first
# and the transform then mapped those zeros to |0 - 0.5| = 0.5, the maximum.
# reindex() also replaces the chained assignment
# (tmp1['sub_emotion'][mask] = ...), which does not write through once
# copy-on-write is active.
week_buckets = weekly_intensity.index.get_level_values('week').unique().sort_values()
full_grid = pd.MultiIndex.from_product([range(12), week_buckets], names=['sub_topic', 'week'])
tmp1 = (
    weekly_intensity.reindex(full_grid, fill_value=0.0)
    .reset_index()
    .assign(sub_topic=lambda g: '小主题' + g['sub_topic'].astype(str))
    .loc[:, ['week', 'sub_emotion', 'sub_topic']]
)
# Each chart row is [date, value, series name].
tmp = tmp1.values.tolist()
series = ['小主题' + str(i) for i in range(12)]

wc = ThemeRiver(init_opts=opts.InitOpts(width='960px', height='720px', theme=ThemeType.WESTEROS))\
    .add(series_name=series, data=tmp, singleaxis_opts=opts.SingleAxisOpts(type_='time'))\
    .set_global_opts(
        title_opts=opts.TitleOpts(title='谣言情感小主题河流图（频率：周）', pos_left = 'center'),
        tooltip_opts=opts.TooltipOpts(trigger="axis", axis_pointer_type="line"),
        legend_opts=opts.LegendOpts(pos_right='right', pos_top='top')
    )
wc.render('timeriver/谣言情感小主题河流图（频率：周）.html')
wc.render_notebook()

## 4.2 词云图

不论是谣言数量的主题河流图，还是谣言热度的主题河流图，我们都可以从中看出，谣言的变化趋势是先增大后减小。结合实际，我们知道2021.11至2022.2可以分为3个阶段：疫情初期、疫情爆发期和疫情中期，这3个时期的谣言内容应该会有所不同。因此我们画出这3个时期的词云图，看看文本主题随时间的变化情况。

In [26]:
# One document per row of df, in df's current row order.
corpus = list(df.content)
# NOTE(review): no encoding is passed, so the stop-word file is decoded with the
# platform default; confirm the file's encoding and pass it explicitly.
with open('LDA/stopwords_all.txt', 'r') as f:
    stopwords = [line.strip() for line in f]
# Also treat the integers 0..2999, written as strings, as stop words.
num_words = [str(i) for i in range(3000)]
stopwords = stopwords + num_words
# Set membership is O(1) per token; scanning the list for every token was not.
stopword_set = set(stopwords)
tokenlist = [
    [token for token in jieba.lcut(text, HMM=True) if token not in stopword_set]
    for text in corpus
]
dictionary = corpora.Dictionary(tokenlist)

Building prefix dict from the default dictionary ...
Loading model from cache D:\Temp\jieba.cache
Loading model cost 0.381 seconds.
Prefix dict has been built successfully.


根据这3个时期重新取样。

In [27]:
def resample_period1(x):
    """Map a timestamp to the start date of the epidemic period it falls in.

    Parameters
    ----------
    x : a value comparable with pandas.Timestamp (e.g. one element of df.date).

    Returns
    -------
    pandas.Timestamp
        2021-11-01 for dates before 2021-12-01 (early period),
        2021-12-01 for dates before 2022-01-15 (outbreak period),
        2022-01-15 otherwise (middle period).
    """
    if x < pd.to_datetime('2021-12-01'):
        return pd.to_datetime('2021-11-01')  # early period
    elif x < pd.to_datetime('2022-01-15'):
        # The outbreak period starts on 2021-12-01; the year was previously
        # mistyped as 2022, which put this label after the following period.
        return pd.to_datetime('2021-12-01')  # outbreak period
    else:
        return pd.to_datetime('2022-01-15')  # middle period

In [28]:
def resample_period2(x):
    """Return the label of the epidemic period that timestamp ``x`` falls in.

    Dates before 2021-12-01 give '疫情初期' (early period), dates before
    2022-01-15 give '疫情爆发期' (outbreak period), and everything else gives
    '疫情中期' (middle period).
    """
    # (exclusive upper bound, label) pairs, checked in chronological order.
    period_bounds = (
        (pd.to_datetime('2021-12-01'), '疫情初期'),
        (pd.to_datetime('2022-01-15'), '疫情爆发期'),
    )
    for upper_bound, label in period_bounds:
        if x < upper_bound:
            return label
    return '疫情中期'

In [29]:
# Tag each row with its period's start date ('period') and label ('covid').
row_dates = df['date']
df['period'] = row_dates.apply(resample_period1)
df['covid'] = row_dates.apply(resample_period2)

画出词云图。

In [30]:
def draw_wordcloud(covid, color='Blues'):
    """Generate the word cloud image for one epidemic period and return it.

    Parameters
    ----------
    covid : value compared against ``df.covid`` to select the rows to include;
        it is also embedded in the output file name.
    color : name inserted into the palette string
        ``'colorbrewer.sequential.<color>_7'``.

    Returns
    -------
    IPython.display.Image
        The PNG file written to ``wordcloud/wordcloud<covid>.png``.
    """
    # tokenlist is looked up with df's index labels, so this relies on those
    # labels being valid positions into tokenlist.
    selected_rows = df[df.covid == covid].index
    words = [token for row in selected_rows for token in tokenlist[row]]
    output_path = f'wordcloud/wordcloud{covid}.png'
    stylecloud.gen_stylecloud(
        text=' '.join(words),
        font_path="wordcloud/STSong.ttf",
        size=1000,
        icon_name='fas fa-virus',
        palette='colorbrewer.sequential.' + color + '_7',
        background_color='white',
        max_words=200,
        stopwords=True,
        custom_stopwords=['例新冠', '发现', '发生', '包', '有人', '一名'],
        collocations=False,
        output_name=output_path,
    )
    return Image(filename=output_path)

In [31]:
# Palette names, paired by position with `periods` in the commented-out loop below.
colorlist = ['Blues', 'Reds', 'Oranges']
# Period labels; these are the three strings returned by resample_period2.
periods = ['疫情初期', '疫情爆发期', '疫情中期']

# NOTE(review): the rendering loop is commented out, so running this cell
# generates no word cloud; `periods` is still used by the map cells below.
# for i in range(3):
#     draw_wordcloud(periods[i], colorlist[i])

## 4.3 随时间变化的谣言分布地图

### 4.3.1 谣言数量分布

In [32]:
# Number of rows per (period label, province).
data = df.groupby(['covid', 'province'])['content'].count().reset_index()

timeline = Timeline(init_opts=opts.InitOpts(width='960px', height='540px', theme='light'))

# One map frame per period, in the order given by `periods`.
for period in periods:
    in_period = data.loc[data['covid'] == period, ['province', 'content']]
    data_pair = [[name, amount] for name, amount in zip(in_period['province'], in_period['content'])]
    m = Map()
    m.add(str(period), data_pair, 'china', is_map_symbol_show=False)
    m.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    m.set_global_opts(
        title_opts=opts.TitleOpts(title='随时间变化的谣言数量分布地图', pos_left='center'),
        visualmap_opts=opts.VisualMapOpts(min_=0, max_=10),
        legend_opts=opts.LegendOpts(pos_right='right', pos_top='top'),
    )
    timeline.add(chart=m, time_point=str(period))
timeline.add_schema(is_auto_play=True, play_interval=600)
timeline.render('map/随时间变化的谣言数量分布地图.html')
timeline.render_notebook()

### 4.3.2 谣言热度分布

点赞总量。

In [33]:
# Total likes per (period label, province).
data = df.groupby(['covid', 'province'])['like'].sum().reset_index()

timeline = Timeline(init_opts=opts.InitOpts(width='960px', height='540px', theme='light'))

# One map frame per period, in the order given by `periods`.
for period in periods:
    in_period = data.loc[data['covid'] == period, ['province', 'like']]
    data_pair = [[name, amount] for name, amount in zip(in_period['province'], in_period['like'])]
    m = Map()
    m.add(str(period), data_pair, 'china', is_map_symbol_show=False)
    m.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    m.set_global_opts(
        title_opts=opts.TitleOpts(title='随时间变化的谣言点赞总量分布地图', pos_left='center'),
        visualmap_opts=opts.VisualMapOpts(min_=0, max_=300),
        legend_opts=opts.LegendOpts(pos_right='right', pos_top='top'),
    )
    timeline.add(chart=m, time_point=str(period))
timeline.add_schema(is_auto_play=True, play_interval=600)
timeline.render('map/随时间变化的谣言点赞总量分布地图.html')
timeline.render_notebook()

点赞均值。

In [34]:
# Mean likes per (period label, province), rounded to 2 decimals.
data = (
    df.groupby(['covid', 'province'])['like']
    .mean()
    .reset_index()
    .assign(like=lambda g: g['like'].round(2))
)

timeline = Timeline(init_opts=opts.InitOpts(width='960px', height='540px', theme='light'))

# One map frame per period, in the order given by `periods`.
for period in periods:
    in_period = data.loc[data['covid'] == period, ['province', 'like']]
    data_pair = [[name, amount] for name, amount in zip(in_period['province'], in_period['like'])]
    m = Map()
    m.add(str(period), data_pair, 'china', is_map_symbol_show=False)
    m.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    m.set_global_opts(
        title_opts=opts.TitleOpts(title='随时间变化的谣言点赞均值分布地图', pos_left='center'),
        visualmap_opts=opts.VisualMapOpts(min_=0, max_=90),
        legend_opts=opts.LegendOpts(pos_right='right', pos_top='top'),
    )
    timeline.add(chart=m, time_point=str(period))
timeline.add_schema(is_auto_play=True, play_interval=600)
timeline.render('map/随时间变化的谣言点赞均值分布地图.html')
timeline.render_notebook()

### 4.3.3 谣言情感倾向分布

In [35]:
# Mean all_emotion per (period label, province), shifted by -0.5 and rounded
# to 2 decimals so the values are centred on 0.
data = (
    df.groupby(['covid', 'province'])['all_emotion']
    .mean()
    .reset_index()
    .assign(all_emotion=lambda g: (g['all_emotion'] - 0.5).round(2))
)

timeline = Timeline(init_opts=opts.InitOpts(width='960px', height='540px', theme='light'))

# One map frame per period, in the order given by `periods`.
for period in periods:
    in_period = data.loc[data['covid'] == period, ['province', 'all_emotion']]
    data_pair = [[name, score] for name, score in zip(in_period['province'], in_period['all_emotion'])]
    m = Map()
    m.add(str(period), data_pair, 'china', is_map_symbol_show=False)
    m.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    m.set_global_opts(
        title_opts=opts.TitleOpts(title='随时间变化的详细谣言情感倾向分布地图', pos_left='center'),
        visualmap_opts=opts.VisualMapOpts(min_=-0.5, max_=0.5),
        legend_opts=opts.LegendOpts(pos_right='right', pos_top='top'),
    )
    timeline.add(chart=m, time_point=str(period))
timeline.add_schema(is_auto_play=True, play_interval=600)
timeline.render('map/随时间变化的详细谣言情感倾向分布地图.html')
timeline.render_notebook()

In [36]:
# Mean sub_emotion per (period label, province), shifted by -0.5 and rounded
# to 4 decimals so the values are centred on 0.
data = (
    df.groupby(['covid', 'province'])['sub_emotion']
    .mean()
    .reset_index()
    .assign(sub_emotion=lambda g: (g['sub_emotion'] - 0.5).round(4))
)

timeline = Timeline(init_opts=opts.InitOpts(width='960px', height='540px', theme='light'))

# One map frame per period, in the order given by `periods`.
for period in periods:
    in_period = data.loc[data['covid'] == period, ['province', 'sub_emotion']]
    data_pair = [[name, score] for name, score in zip(in_period['province'], in_period['sub_emotion'])]
    m = Map()
    m.add(str(period), data_pair, 'china', is_map_symbol_show=False)
    m.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    m.set_global_opts(
        title_opts=opts.TitleOpts(title='随时间变化的核心谣言情感倾向分布地图', pos_left='center'),
        visualmap_opts=opts.VisualMapOpts(min_=-0.5, max_=0.5),
        legend_opts=opts.LegendOpts(pos_right='right', pos_top='top'),
    )
    timeline.add(chart=m, time_point=str(period))
timeline.add_schema(is_auto_play=True, play_interval=600)
timeline.render('map/随时间变化的核心谣言情感倾向分布地图.html')
timeline.render_notebook()

### 4.3.4 谣言情感强烈程度分布

In [37]:
# Per-row emotion intensity: absolute distance of each emotion score from 0.5.
for score_column in ('all_emotion', 'sub_emotion'):
    df['abs_' + score_column] = (df[score_column] - 0.5).abs()

In [38]:
# Mean abs_all_emotion per (period label, province), rounded to 2 decimals.
data = (
    df.groupby(['covid', 'province'])['abs_all_emotion']
    .mean()
    .reset_index()
    .assign(abs_all_emotion=lambda g: g['abs_all_emotion'].round(2))
)

timeline = Timeline(init_opts=opts.InitOpts(width='960px', height='540px', theme='light'))

# One map frame per period, in the order given by `periods`.
for period in periods:
    in_period = data.loc[data['covid'] == period, ['province', 'abs_all_emotion']]
    data_pair = [[name, score] for name, score in zip(in_period['province'], in_period['abs_all_emotion'])]
    m = Map()
    m.add(str(period), data_pair, 'china', is_map_symbol_show=False)
    m.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    m.set_global_opts(
        title_opts=opts.TitleOpts(title='随时间变化的详细谣言情感强烈程度分布地图', pos_left='center'),
        visualmap_opts=opts.VisualMapOpts(min_=0.45, max_=0.5),
        legend_opts=opts.LegendOpts(pos_right='right', pos_top='top'),
    )
    timeline.add(chart=m, time_point=str(period))
timeline.add_schema(is_auto_play=True, play_interval=600)
timeline.render('map/随时间变化的详细谣言情感强烈程度分布地图.html')
timeline.render_notebook()

In [39]:
# Mean abs_sub_emotion per (period label, province), rounded to 2 decimals.
data = (
    df.groupby(['covid', 'province'])['abs_sub_emotion']
    .mean()
    .reset_index()
    .assign(abs_sub_emotion=lambda g: g['abs_sub_emotion'].round(2))
)

timeline = Timeline(init_opts=opts.InitOpts(width='960px', height='540px', theme='light'))

# One map frame per period, in the order given by `periods`.
for period in periods:
    in_period = data.loc[data['covid'] == period, ['province', 'abs_sub_emotion']]
    data_pair = [[name, score] for name, score in zip(in_period['province'], in_period['abs_sub_emotion'])]
    m = Map()
    m.add(str(period), data_pair, 'china', is_map_symbol_show=False)
    m.set_series_opts(label_opts=opts.LabelOpts(is_show=False))
    m.set_global_opts(
        title_opts=opts.TitleOpts(title='随时间变化的核心谣言情感强烈程度分布地图', pos_left='center'),
        visualmap_opts=opts.VisualMapOpts(min_=0.2, max_=0.5),
        legend_opts=opts.LegendOpts(pos_right='right', pos_top='top'),
    )
    timeline.add(chart=m, time_point=str(period))
timeline.add_schema(is_auto_play=True, play_interval=600)
timeline.render('map/随时间变化的核心谣言情感强烈程度分布地图.html')
timeline.render_notebook()

将数据的索引恢复，保存数据到本地。

In [40]:
# Put the 'index' column (the row labels saved before the date sort) back as
# the index and sort by it, then write the frame without the index.
df = df.set_index('index')
df = df.sort_index()
df.to_csv('data/rumor_text_data_final.csv', index=False)