# Weibo Hot Topics

## Preprocessing

In [287]:
import pandas as pd
import datetime
from datetime import datetime, date
import numpy as np

# Load the raw hot-search records; the first CSV column is used as the row index.
df = pd.read_csv("./hot_search_data/weibo_hot_topic.csv", index_col=0)

# Parse both timestamps, then derive how long each topic stayed listed, in hours.
df['end_time'] = pd.to_datetime(df['end_time'])
df['start_time'] = pd.to_datetime(df['start_time'])
df['alive_time'] = (df['end_time'] - df['start_time']).values/np.timedelta64(1, 'h')

# Calendar-day keys as 'YYYY-MM-DD' strings. The vectorised .dt accessor replaces
# the per-row lambda and yields a missing value instead of raising on NaT.
df['start_mnth_day'] = df['start_time'].dt.strftime('%Y-%m-%d')
df['end_mnth_day'] = df['end_time'].dt.strftime('%Y-%m-%d')

# Keep, for every topic, the row with the largest search count, then restore
# the original row order. After this step 'content' is unique, so no further
# de-duplication is needed.
df = df.sort_values('count', ascending=False).drop_duplicates('content').sort_index()

# Longest-lived topics first; this frame is persisted and reused by later cells.
result_df = df.sort_values(by=['alive_time'], ascending=False)
result_df.to_csv('hot_with_live_time.csv')
result_df

Unnamed: 0,content,end_time,start_time,count,alive_time,start_mnth_day,end_mnth_day
6401,惊蛰,2020-03-05 15:40:01,2019-10-26 20:08:02,444127,3139.533056,2019-10-26,2020-03-05
6036,天天向上,2020-03-02 10:38:02,2019-10-27 23:08:02,1750818,3035.500000,2019-10-27,2020-03-02
5240,快乐大本营,2020-02-29 22:22:01,2019-10-26 20:08:02,1655857,3026.233056,2019-10-26,2020-02-29
6121,詹姆斯三双,2020-03-02 12:38:02,2019-11-02 12:28:02,250208,2904.166667,2019-11-02,2020-03-02
5996,粉笔模考,2020-03-01 14:00:02,2019-11-03 11:28:01,1122536,2858.533611,2019-11-03,2020-03-01
...,...,...,...,...,...,...,...
5031,多家车企转产医疗物资,2020-02-20 12:18:02,2020-02-20 12:00:01,253686,0.300278,2020-02-20,2020-02-20
5233,疫情期间你的改变,2020-02-22 12:16:02,2020-02-22 11:58:01,255777,0.300278,2020-02-22,2020-02-22
281,税务师成绩,2020-01-03 12:08:01,2020-01-03 11:50:02,144489,0.299722,2020-01-03,2020-01-03
6514,陕西抗疫补贴超一线医护院长被免职,2020-03-06 08:50:01,2020-03-06 08:42:02,159699,0.133056,2020-03-06,2020-03-06


## Top 20 hot topics

In [288]:
# The 20 most-searched topics. .copy() makes the slice an independent frame so
# the column assignments below cannot trigger chained-assignment warnings.
top_20_df = result_df.sort_values(by=['count'], ascending=False).head(20).copy()

# Rank labels are sized from the rows actually present, so the assignment
# still works when fewer than 20 topics are available.
rank_list = ["NO." + str(num) for num in range(1, len(top_20_df) + 1)]
top_20_df['rank'] = rank_list

# Label drawn inside each bar of the chart: "<topic>   Search count:<count>".
inside_text = ["{}   Search count:{}".format(topic, searches)
               for topic, searches in zip(top_20_df['content'], top_20_df['count'])]
top_20_df['inside_text'] = inside_text

top_20_df.to_csv('top20_hot_topic.csv')
top_20_df

Unnamed: 0,content,end_time,start_time,count,alive_time,start_mnth_day,end_mnth_day,rank,inside_text
2599,安徽出现聚集性疫情,2020-01-27 19:18:01,2020-01-27 10:42:02,16360270,8.599722,2020-01-27,2020-01-27,NO.1,安徽出现聚集性疫情 Search count:16360270
3048,双黄连可抑制新型冠状病毒,2020-02-01 14:22:02,2020-01-31 23:10:02,16134192,15.2,2020-01-31,2020-02-01,NO.2,双黄连可抑制新型冠状病毒 Search count:16134192
3645,湖北副省长回应武汉市民网络求助,2020-02-07 14:44:01,2020-02-06 22:28:02,15248291,16.266389,2020-02-06,2020-02-07,NO.3,湖北副省长回应武汉市民网络求助 Search count:15248291
2351,全国确诊新型肺炎病例,2020-01-30 14:02:01,2020-01-22 23:26:02,14972975,182.599722,2020-01-22,2020-01-30,NO.4,全国确诊新型肺炎病例 Search count:14972975
2400,新加坡抵杭州一架航班所有乘客隔离,2020-01-25 18:00:02,2020-01-25 10:30:01,13061134,7.500278,2020-01-25,2020-01-25,NO.5,新加坡抵杭州一架航班所有乘客隔离 Search count:13061134
2549,500万人离开武汉,2020-01-27 12:48:01,2020-01-26 22:44:02,11412070,14.066389,2020-01-26,2020-01-27,NO.6,500万人离开武汉 Search count:11412070
3147,湖南禽流感,2020-02-02 13:06:01,2020-02-01 22:26:01,10822673,14.666667,2020-02-01,2020-02-02,NO.7,湖南禽流感 Search count:10822673
2301,湖北一家三口去山东过年被举报,2020-01-24 20:20:02,2020-01-24 10:54:02,10554971,9.433333,2020-01-24,2020-01-24,NO.8,湖北一家三口去山东过年被举报 Search count:10554971
2105,新型肺炎确诊440例死亡9例,2020-01-22 18:48:02,2020-01-22 10:30:01,10231631,8.300278,2020-01-22,2020-01-22,NO.9,新型肺炎确诊440例死亡9例 Search count:10231631
2600,浙江成功分离到新型冠状病毒毒株,2020-01-27 18:14:02,2020-01-27 09:22:01,10025927,8.866944,2020-01-27,2020-01-27,NO.10,浙江成功分离到新型冠状病毒毒株 Search count:10025927


In [223]:
import plotly as py
import plotly.graph_objs as go
import plotly.express as px
import plotly.offline as py

pyplt = py.offline.plot

top_20_df = pd.read_csv("top20_hot_topic.csv", sep=',')

# Reverse the row order of every series that feeds the chart.
# NOTE(review): presumably so that NO.1 is drawn at the top of the horizontal
# bar chart — confirm against the rendered figure.
x_count = top_20_df['count'].iloc[::-1]
inside_text = top_20_df['inside_text'].iloc[::-1]
rank = top_20_df['rank'].iloc[::-1]

# One orange shade per bar: lightness starts at 30% and grows by 3 points per bar.
color_list = ['hsl(33, 100%, {}%)'.format(30 + 3 * step)
              for step in range(len(x_count))]

bar_trace = go.Bar(
    x=x_count,
    y=rank,
    orientation='h',
    text=inside_text,
    textposition='inside',
    insidetextanchor='middle',
    textfont=dict(family='Arial', size=20, color='#262626'),
    marker=dict(color=color_list),
)
data = [bar_trace]

layout = go.Layout(
    title=dict(text="Top 20 Hot Topic", y=0.92, x=0.5,
               xanchor='center', yanchor='top'),
    font=dict(family="Arial", size=18, color="#262626"),
    plot_bgcolor='#ffffff',
    autosize=False,
    width=1300,
    height=800,
    # The counts are already printed inside the bars, so x tick labels are hidden.
    xaxis=dict(showticklabels=False),
    yaxis=dict(tickfont=dict(color='#262626', size=18)),
)

figure = go.Figure(data=data, layout=layout)
figure.show()
# pyplt(figure, filename='top20.html')
# pyplt(figure, filename='top20.html', image='png')

### Pick the day with the highest total search count

In [257]:
# Total search count and total alive time of the topics that started on each day.
# The numeric columns are selected explicitly: summing the string and datetime
# columns of result_df is either meaningless or raises on recent pandas
# versions, which no longer drop such columns silently.
group_df = (
    result_df
    .groupby('start_mnth_day', as_index=False)[['count', 'alive_time']]
    .sum()
)
sort_df = group_df.sort_values(by=['start_mnth_day'], ascending=True)

# 'YYYY-MM-DD' strings sort chronologically, so a plain string range check
# is enough to restrict the window to 2020-01-01 .. 2020-03-06 (inclusive).
sort_df = sort_df[sort_df.start_mnth_day.between('2020-01-01', '2020-03-06')]
sort_df

Unnamed: 0,start_mnth_day,count,alive_time
42,2020-01-01,62688746,5503.365833
43,2020-01-02,57096178,2052.200000
44,2020-01-03,39724062,854.832500
45,2020-01-04,41372963,2061.235278
46,2020-01-05,38648558,2769.266389
...,...,...,...
103,2020-03-02,69375035,731.796944
104,2020-03-03,58606329,734.235000
105,2020-03-04,49711651,800.698056
106,2020-03-05,56261281,803.263056


In [258]:
import datetime

# def dateRange(start, end, step=1, format="%Y-%m-%d"):
#     strptime, strftime = datetime.datetime.strptime, datetime.datetime.strftime
#     days = (strptime(end, format) - strptime(start, format)).days
#     return [strftime(strptime(start, format) + datetime.timedelta(i), format) for i in range(0, days, step)]

def normalize(df, cols, scale=10):
    """Min-max rescale the selected columns of ``df`` onto ``[0, scale]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cols : iterable of str
        Names of the columns to rescale. Other columns are copied unchanged.
    scale : number, default 10
        Upper bound of the output range (the column maximum maps to it).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` whose ``cols`` hold
        ``(value - min) / (max - min) * scale``. A column whose values are all
        equal maps to 0.0 instead of the NaN a 0/0 division would produce;
        missing values stay missing.
    """
    result = df.copy()
    for feature_name in cols:
        column = df[feature_name]
        min_value = column.min()
        span = column.max() - min_value
        if span == 0:
            # Constant column: multiplying by 0.0 gives float zeros while
            # keeping any NaN entries as NaN.
            result[feature_name] = (column - min_value) * 0.0
        else:
            result[feature_name] = (column - min_value) / span * scale
    return result

# datelist = dateRange("2019-10-25", "2020-03-06")

# date_df = pd.DataFrame(datelist, columns = ['start_mnth_day']) 
# merge_df = pd.merge(date_df, sort_df, on='start_mnth_day', how='outer')
# merge_df = merge_df.fillna(0)

# Bring both daily totals onto the same scale so they can share one colour
# scale in the heatmap, then persist the result for the plotting cell.
heatmap_metrics = ['count', 'alive_time']
norm_df = normalize(sort_df, heatmap_metrics)
norm_df.to_csv('hot_topic_heatmap.csv')
norm_df

Unnamed: 0,start_mnth_day,count,alive_time
42,2020-01-01,4.147188,10.000000
43,2020-01-02,3.616963,3.699036
44,2020-01-03,1.969931,1.512943
45,2020-01-04,2.126262,3.715532
46,2020-01-05,1.867964,5.008220
...,...,...,...
103,2020-03-02,4.781108,1.288311
104,2020-03-03,3.760139,1.292762
105,2020-03-04,2.916844,1.414107
106,2020-03-05,3.537807,1.418790


In [259]:
import plotly.graph_objects as go
import datetime
from datetime import date, timedelta
import numpy as np

map_df = pd.read_csv("hot_topic_heatmap.csv", sep=',')

# The first two CSV columns are the saved row index and the day; every
# remaining column is a metric and becomes one heatmap row.
y = list(map_df.columns)[2:]

# Take the x axis from the data's own day column instead of a synthetic
# calendar range: z has exactly one value per row of map_df, so a generated
# range misaligns the cells whenever a day is absent from the data or the
# range is longer than the data.
x = pd.to_datetime(map_df['start_mnth_day']).dt.date.tolist()

# One list of values per metric, in the same order as y.
z = [map_df[metric].values.tolist() for metric in y]

fig = go.Figure(data=go.Heatmap(
        z=z,
        x=x,
        y=y,
        colorscale='Blues'))

fig.update_layout(
    title={'text': "Hot topic ",
           'y':0.92,'x':0.5,
           'xanchor': 'center',
           'yanchor': 'top',},
    font=dict(family="Arial",
              color="#262626"),
    xaxis_nticks=36)

fig.show()

Conclusion: The total hot-topic search count appears to peak around Jan 26th. Let's check the exact day.

In [264]:
# Re-order the per-day totals so that the day with the largest total search
# count becomes the first row.
group_df = group_df.sort_values('count', ascending=False)
group_df


Unnamed: 0,start_mnth_day,count,alive_time
67,2020-01-26,124421452,5052.163056
68,2020-01-27,122058289,1935.967500
66,2020-01-25,114015406,1264.163333
65,2020-01-24,108810079,1722.199722
63,2020-01-22,97346382,1281.332500
...,...,...,...
17,2019-11-20,271029,2287.600000
35,2019-12-22,270769,1440.133333
5,2019-10-31,243066,1859.066944
39,2019-12-29,220192,429.300000


### Let's see what happened on that day

In [218]:
# NOTE(review): this takes row 0 of group_df as the busiest day, which holds
# only if group_df is still sorted by 'count' descending — confirm the cell
# order when re-running.
hottest_day = group_df['start_mnth_day'].iloc[0]

# Every topic that started on that day, most-searched first.
highest_day_df = (
    result_df[result_df['start_mnth_day'] == hottest_day]
    .sort_values(by=['count'], ascending=False)
)

# len() counts rows directly; the previous count()[0] counted non-null cells
# of the first column and relied on a deprecated positional Series lookup.
length = len(highest_day_df)
rank_list = ["NO." + str(num) for num in range(1, length + 1)]
highest_day_df['rank'] = rank_list

# Label drawn inside each bar of the chart: "<topic>   Search count:<count>".
inside_text = ["{}   Search count:{}".format(topic, searches)
               for topic, searches in zip(highest_day_df['content'],
                                          highest_day_df['count'])]

highest_day_df['inside_text'] = inside_text
highest_day_df.to_csv("highest_day.csv")
highest_day_df.head(20)


Unnamed: 0,content,end_time,start_time,count,alive_time,start_mnth_day,end_mnth_day,rank,inside_text
2549,500万人离开武汉,2020-01-27 12:48:01,2020-01-26 22:44:02,11412070,14.066389,2020-01-26,2020-01-27,NO.1,500万人离开武汉 Search count:11412070
2499,传染病学专家表示疫情已刻不容缓,2020-01-26 15:38:02,2020-01-26 07:12:01,6317411,8.433611,2020-01-26,2020-01-26,NO.2,传染病学专家表示疫情已刻不容缓 Search count:6317411
2500,大年初一全国票房仅181万,2020-01-26 17:38:02,2020-01-26 11:06:02,5451178,6.533333,2020-01-26,2020-01-26,NO.3,大年初一全国票房仅181万 Search count:5451178
2550,湖北省长表示痛心内疚自责,2020-01-27 12:06:02,2020-01-26 23:06:01,5217833,13.000278,2020-01-26,2020-01-27,NO.4,湖北省长表示痛心内疚自责 Search count:5217833
2501,多所高校延期开学,2020-01-26 22:00:01,2020-01-26 10:02:02,4960564,11.966389,2020-01-26,2020-01-26,NO.5,多所高校延期开学 Search count:4960564
2502,明星捐款,2020-01-26 18:32:01,2020-01-26 10:28:02,4878384,8.066389,2020-01-26,2020-01-26,NO.6,明星捐款 Search count:4878384
2503,武汉现状,2020-01-26 18:16:01,2020-01-26 11:34:02,4179531,6.699722,2020-01-26,2020-01-26,NO.7,武汉现状 Search count:4179531
2551,华南海鲜市场存在大量新冠病毒,2020-01-27 09:54:01,2020-01-26 21:50:01,3843257,12.066667,2020-01-26,2020-01-27,NO.8,华南海鲜市场存在大量新冠病毒 Search count:3843257
2552,武汉确诊病例可能再增加约1000例,2020-01-27 10:22:01,2020-01-26 22:24:02,3711517,11.966389,2020-01-26,2020-01-27,NO.9,武汉确诊病例可能再增加约1000例 Search count:3711517
2553,9个月婴儿新型肺炎病例,2020-01-27 11:04:01,2020-01-26 23:20:02,3485076,11.733056,2020-01-26,2020-01-27,NO.10,9个月婴儿新型肺炎病例 Search count:3485076


In [222]:
pyplt = py.offline.plot

highest_day_df = pd.read_csv("highest_day.csv", sep=',')
highest_day_df = highest_day_df.iloc[:20]

# Reverse the row order of every series that feeds the chart.
x_count = highest_day_df['count'][::-1]
inside_text = highest_day_df['inside_text'][::-1]
rank = highest_day_df['rank'][::-1]

# One orange shade per bar: lightness starts at 30% and grows by 3 points per bar.
color_list = ['hsl(38, 100%, {}%)'.format(30 + 3 * step)
              for step in range(len(x_count))]

# The original title stopped at "... on" without naming the day. All rows of
# highest_day.csv were filtered to a single start day before being saved, so
# the first row supplies the date; an empty file keeps the bare title.
title_text = "Top 20 Hot Topic on"
if not highest_day_df.empty:
    title_text = "{} {}".format(title_text, highest_day_df['start_mnth_day'].iloc[0])

data = [go.Bar(x = x_count, 
               y = rank,
               orientation = 'h',
               text = inside_text,
               textposition='inside',
               insidetextanchor='middle',
               textfont={'family':'Arial', 
                         'size': 20,
                         'color': '#262626'},
               marker={ 'color':  color_list,},)]

layout = go.Layout(title={'text': title_text,
                          'y':0.92,'x':0.5,
                          'xanchor': 'center',
                          'yanchor': 'top',},
                   font=dict(family="Arial",
                             size=18,
                             color="#262626"),
                   plot_bgcolor = '#ffffff',
                   autosize=False,width=1300,height=800,
                   # Counts are already printed inside the bars.
                   xaxis=dict(showticklabels=False),
                   yaxis=dict(tickfont=dict(color='#262626',
                                            size = 18,),),)

figure = go.Figure(data = data, layout = layout)
figure.show()

Conclusion: Of the top 20 hot topics on that day, 18 are related to "Wuhan" or "coronavirus".

## High frequency words

In [286]:
import nltk
import jieba
from itertools import chain


# remove space
def remove_space(col):
    """Drop every item equal to a single space ``' '`` from ``col``.

    The previous version removed items from the list while iterating over it,
    which skips the element following each removal, so consecutive blanks
    survived.

    Parameters
    ----------
    col : list
        Token list; it is filtered in place.

    Returns
    -------
    The same ``col`` object, now without ``' '`` items.
    """
    kept = [item for item in col if item != ' ']
    # Write back only when something was removed, so an input that contains
    # no blanks is returned completely untouched.
    if len(kept) != len(col):
        col[:] = kept
    return col
        
# remove some specific column with nan
def delete_nan(df, col):
    """Return the rows of ``df`` whose ``col`` value is neither blank nor missing.

    Empty or whitespace-only strings in any column are first turned into NaN,
    then rows with NaN in ``col`` are dropped. Unlike the previous version,
    the caller's frame is left unmodified (no ``inplace`` replacement).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; not mutated.
    col : str
        Column that must hold a non-blank value.

    Returns
    -------
    pandas.DataFrame
        A new, filtered frame.
    """
    blanked = df.replace(to_replace=r'^\s*$', value=np.nan, regex=True)
    return blanked.dropna(subset=[col])
   
# remove Chinese stop words
def delete_stopwords(df, col, filepath, encoding='utf-8'):
    """Drop the rows of ``df`` whose ``col`` value is listed in a stop-word file.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; not mutated.
    col : str
        Column holding the words to check.
    filepath : str
        Text file with one stop word per line; surrounding whitespace on each
        line is ignored.
    encoding : str, default 'utf-8'
        Encoding of the stop-word file. It is set explicitly because the
        platform default is not UTF-8 everywhere, which would garble a list
        of Chinese words.

    Returns
    -------
    pandas.DataFrame
        The rows whose ``col`` value is not a stop word.
    """
    # The context manager closes the file handle, which the previous bare
    # open() call never did.
    with open(filepath, encoding=encoding) as handle:
        stop_words = {line.strip() for line in handle}
    return df[~df[col].isin(stop_words)]

# remove single words
def delete_single_words(df, col):
    """Keep only the rows whose ``col`` string is longer than one character."""
    long_enough = df[col].str.len().gt(1)
    return df.loc[long_enough]

# remove numbers
def delete_numbers(col):
    """Drop every item of ``col`` that consists only of digits.

    The previous version removed items from the list while iterating over it,
    which skips the element following each removal, so every second one of
    several consecutive numeric tokens survived.

    Parameters
    ----------
    col : list of str
        Token list; it is filtered in place.

    Returns
    -------
    The same ``col`` object, now without items for which ``str.isdigit()``
    is true.
    """
    kept = [item for item in col if not item.isdigit()]
    # Write back only when something was removed, so an input without
    # numeric tokens is returned completely untouched.
    if len(kept) != len(col):
        col[:] = kept
    return col
   
# segment words
# Segment every topic title into tokens with jieba, then strip blank and
# purely numeric tokens from each token list.
words_df = result_df.copy()
words_df['words'] = [list(jieba.cut(topic)) for topic in words_df['content'].values]
words_df['words'] = words_df['words'].apply(remove_space)
words_df['words'] = words_df['words'].apply(delete_numbers)

# One row per (topic, token): each topic's count, title and alive time are
# repeated once for every token that survived the filtering above.
tokens_per_topic = words_df['words'].str.len()
flatten_df = pd.DataFrame({
    'flatten_words' : list(chain.from_iterable(words_df['words'].tolist())),
    'count' : words_df['count'].values.repeat(tokens_per_topic),
    'content' : words_df['content'].values.repeat(tokens_per_topic),
    'alive_time': words_df['alive_time'].values.repeat(tokens_per_topic)
})

# 'number' is 1 per row, so summing it per word gives the number of topics
# the word appears in.
flatten_df.insert(1, 'number', 1)
flatten_df = delete_nan(flatten_df, 'flatten_words')
# The former column-wise remove_space pass over the whole frame was dropped:
# it returned every column unchanged and could only fail if a ' ' value was
# present.
flatten_df = delete_stopwords(flatten_df, 'flatten_words', 'cn_stopwords.txt')
flatten_df = delete_single_words(flatten_df, 'flatten_words')

# Sum only the numeric columns; the string 'content' column must not take
# part in the aggregation.
group_word_df = (
    flatten_df
    .groupby('flatten_words', as_index=False)[['number', 'count', 'alive_time']]
    .sum()
)

# Top 20 words by each measure. The previous iloc[0:19] kept only 19 rows
# although the charts built from these files are titled "Top 20".
order_count_df = group_word_df.sort_values(by=['count'], ascending=False).head(20)
order_time_df = group_word_df.sort_values(by=['alive_time'], ascending=False).head(20)

order_count_df.to_csv('words_frequency_count.csv')
order_time_df.to_csv('words_frequency_alivetime.csv')
order_time_df

Unnamed: 0,flatten_words,number,count,alive_time
439,下雪,13,4840230,8722.667222
4971,武汉,251,253055420,5469.628333
4713,李佳琦,8,8096933,5138.566667
4976,武磊,3,2363462,4309.0
5839,直播,30,20429343,3653.198889
7478,造型,14,9832063,3394.166111
2399,地震,23,9914062,3283.431667
3677,惊蛰,1,444127,3139.533056
6496,肺炎,334,333333816,3135.83
3598,快乐,11,7744922,3116.532222


In [326]:
import plotly.graph_objects as go

def _top_words_pie(frame, value_col, title_text):
    """Build a pie figure with one slice per word, sized by ``value_col``."""
    pie = go.Pie(labels=frame['flatten_words'].values.tolist(),
                 values=frame[value_col].values.tolist(),
                 textinfo='label+percent')
    pie_layout = go.Layout(title=dict(text=title_text,
                                      y=0.92, x=0.5,
                                      xanchor='center',
                                      yanchor='top'))
    return go.Figure(data=[pie], layout=pie_layout)


# Words ranked by the summed search count of the topics that contain them.
count_df = pd.read_csv("words_frequency_count.csv", sep=',')
fig1 = _top_words_pie(count_df, 'count', "Top 20 Words by Total Visitors")

# Words ranked by the summed alive time of the topics that contain them.
alivetime_df = pd.read_csv("words_frequency_alivetime.csv", sep=',')
fig2 = _top_words_pie(alivetime_df, 'alive_time', "Top 20 Words by Total Alive Time")

fig1.show()
fig2.show()