In [1]:
import json
from os import listdir
from os import walk
from os.path import isfile, join
import datetime as dt

import markdown
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np

In [2]:
rumor_content_dir = "./data/rumor/rumor_weibo"
rumor_comment_dir = "./data/rumor/rumor_forward_comment"

content_files = [f for f in listdir(rumor_content_dir) if isfile(join(rumor_content_dir, f)) and f.endswith('.json')]
comment_files = [f for f in listdir(rumor_comment_dir) if isfile(join(rumor_comment_dir, f)) and f.endswith('.json')]

In [3]:
len(content_files), len(comment_files)

(324, 266)

In [4]:
# initialize a dataframe to store preprocessed information
ids = [i for i in range(max(len(content_files), len(comment_files)))]

rumor_data = pd.DataFrame({'id': ids})
data_num = len(rumor_data)

In [5]:
def get_date_from_file_name(file_name):
    date_from_name = file_name.split('_')[0]
    return dt.datetime.strptime(date_from_name,'%Y-%m-%d').date()

def md_to_text(md):
    """
    Inspired by: https://stackoverflow.com/questions/761824/python-how-to-convert-markdown-formatted-text-to-text
    """
    html = markdown.markdown(md)
    soup = BeautifulSoup(html, features='html.parser')
    return soup.get_text()

def cleanup_text(text):
    # remove space
    text = map(lambda s: s.replace(' ', ''), text)
    return ''.join(list(text))

def remove_user_name(text):
    if '：' not in text:
        return text
    split_list = text.split('：')
    final_text = ''
    for i in range(1, len(split_list)):
        final_text += split_list[i]
    return final_text

In [6]:
# extract rumor file name, rumor content, visit times, and data from the data

rumor_names = [np.nan] * data_num
rumor_dates = [np.nan] * data_num
contents = [np.nan] * data_num
visit_times = [np.nan] * data_num

for i, content_file in enumerate(content_files):

    rumor_names[i] = content_file
    rumor_dates[i] = get_date_from_file_name(content_file)
    print(rumor_dates[i])

    file_path = join(rumor_content_dir, content_file)

    with open(file_path, encoding='utf8') as f:
        data = json.load(f)
        if data['rumorText']:
            contents[i] = cleanup_text(md_to_text(data['rumorText']))
        if data['visitTimes']:
            visit_times[i] = data['visitTimes']

rumor_data['name'] = rumor_names
rumor_data['date'] = rumor_dates
rumor_data['content'] = contents
rumor_data['visit_times'] = visit_times

2020-01-22
2020-01-22
2020-01-23
2020-01-23
2020-01-23
2020-01-23
2020-01-23
2020-01-23
2020-01-23
2020-01-23
2020-01-23
2020-01-23
2020-01-23
2020-01-24
2020-01-24
2020-01-24
2020-01-24
2020-01-24
2020-01-24
2020-01-25
2020-01-25
2020-01-25
2020-01-25
2020-01-25
2020-01-25
2020-01-25
2020-01-25
2020-01-25
2020-01-25
2020-01-25
2020-01-25
2020-01-25
2020-01-25
2020-01-25
2020-01-25
2020-01-25
2020-01-25
2020-01-25
2020-01-25
2020-01-25
2020-01-25
2020-01-25
2020-01-25
2020-01-26
2020-01-26
2020-01-26
2020-01-26
2020-01-27
2020-01-27
2020-01-27
2020-01-28
2020-01-28
2020-01-29
2020-01-29
2020-01-29
2020-01-29
2020-01-29
2020-01-29
2020-01-29
2020-01-29
2020-01-29
2020-01-29
2020-01-29
2020-01-29
2020-01-29
2020-01-29
2020-01-29
2020-01-29
2020-01-29
2020-01-29
2020-01-29
2020-01-29
2020-01-29
2020-01-29
2020-01-29
2020-01-29
2020-01-29
2020-01-29
2020-01-29
2020-01-29
2020-01-29
2020-01-29
2020-01-30
2020-01-30
2020-01-30
2020-01-30
2020-01-30
2020-01-30
2020-01-30
2020-01-30
2020-01-30

In [7]:
# extract comment count, forward count, texts(a list)

comment_counts = [np.nan] * data_num
forward_counts = [np.nan] * data_num
texts = [np.nan] * data_num

for comment_file in comment_files:
    if comment_file not in content_files:
        continue
        
    id = content_files.index(comment_file)

    file_path = join(rumor_comment_dir, comment_file)
    with open(file_path, encoding='utf8') as f:
        data = json.load(f)

    action_texts = []
    comment_count = 0
    forward_count = 0
    # step through the information
    for info in data:
        if info['comment_or_forward'] == 'forward':
            forward_count += 1
        elif info['comment_or_forward'] == 'comment':
            comment_count += 1
        if info['text']:
            content = remove_user_name(md_to_text(info['text']))
            action_texts.append(cleanup_text(content))

    # assign to corresponds global list
    comment_counts[id] = comment_count
    forward_counts[id] = forward_count
    texts[id] = action_texts

# append to dataframe
rumor_data['comment_times'] = comment_counts
rumor_data['forward_times'] = forward_counts
rumor_data['action_texts'] = texts

In [8]:
rumor_data.head(6)

Unnamed: 0,id,name,date,content,visit_times,comment_times,forward_times,action_texts
0,0,2020-01-22_K1CaS7Qth660h.json,2020-01-22,,39,,,
1,1,2020-01-22_K1CaS7Qxd76ol.json,2020-01-22,据最新研究发现，此次新型肺炎病毒传播途径是华南海鲜市场进口的豺—-一种犬科动物携带的病毒，然...,33,1.0,1.0,"[转发微博, @中国政府网]"
2,2,2020-01-23_K1CaS7Q1c768i.json,2020-01-23,据最新研究发现，此次新型肺炎病毒传播途径是华南海鲜市场进口的豺—-一种犬科动物携带的病毒，然...,71,18.0,0.0,"[根据什么，？？中央都没确定，你就确定研究了？？？, @微博辟谣已举报, 央视新闻已辟谣¡评..."
3,3,2020-01-23_K1CaS7Qth7qgi.json,2020-01-23,今天下午五点~九点，大家千万不要出门，全市转运发热病人到定点医院。切记切记。以免造成感染。中...,26,4.0,0.0,"[不会吧, 空军播撒消毒液，第一次听说这种操作, 谣言，请删除, 已经撒了，闻到了]"
4,4,2020-01-23_K1CaS7Qth7qsf.json,2020-01-23,说是今天要在武汉上空撒消毒粉液之后还要运转发热患者到定点医院。​​​​,25,4.0,0.0,"[官方辟谣了, 消毒那个辟谣吗, 确定吗？, 辟谣了]"
5,5,2020-01-23_K1CaS7Qth7qsh.json,2020-01-23,武汉“封城“隔离后并不是什么都不做，现在开始空中撒消毒粉液全城消毒了。老百姓更在意的是实实在...,57,0.0,0.0,[]


In [9]:
# save out the raw data for future reference
rumor_data.to_csv('./data/rumor/analys_data.csv', encoding='utf-8')

In [12]:
###### Testing ######
full_data = pd.read_csv('./data/rumor/analys_data.csv', encoding="utf-8")

In [14]:
full_data

Unnamed: 0.1,Unnamed: 0,id,name,date,content,visit_times,comment_times,forward_times,action_texts
0,0,0,2020-01-22_K1CaS7Qth660h.json,2020-01-22,,39,,,
1,1,1,2020-01-22_K1CaS7Qxd76ol.json,2020-01-22,据最新研究发现，此次新型肺炎病毒传播途径是华南海鲜市场进口的豺—-一种犬科动物携带的病毒，然...,33,1.0,1.0,"['转发微博', '@中国政府网']"
2,2,2,2020-01-23_K1CaS7Q1c768i.json,2020-01-23,据最新研究发现，此次新型肺炎病毒传播途径是华南海鲜市场进口的豺—-一种犬科动物携带的病毒，然...,71,18.0,0.0,"['根据什么，？？中央都没确定，你就确定研究了？？？', '@微博辟谣已举报', '央视新闻..."
3,3,3,2020-01-23_K1CaS7Qth7qgi.json,2020-01-23,今天下午五点~九点，大家千万不要出门，全市转运发热病人到定点医院。切记切记。以免造成感染。中...,26,4.0,0.0,"['不会吧', '空军播撒消毒液，第一次听说这种操作', '谣言，请删除', '已经撒了，闻..."
4,4,4,2020-01-23_K1CaS7Qth7qsf.json,2020-01-23,说是今天要在武汉上空撒消毒粉液之后还要运转发热患者到定点医院。​​​​,25,4.0,0.0,"['官方辟谣了', '消毒那个辟谣吗', '确定吗？', '辟谣了']"
...,...,...,...,...,...,...,...,...,...
319,319,319,2020-03-03_K1CaS8wtj7aoi.json,2020-03-03,"""我不是中国人""，华人女子获美国绿卡，慷慨对美捐赠20万只口罩""我已经获得美国绿卡，即将加入...",108,22.0,4.0,"['转发微博', '//@syll天天天蓝:长大不要娘的人还是少数', '有钱任性', '转..."
320,320,320,2020-03-03_K1CaS8wtk6Kwe.json,2020-03-03,【重大消息】上海市政府会议精神，3月16日居民出行正常化，17日公交正常化，18日逐步企业生...,30,0.0,0.0,[]
321,321,321,2020-03-03_K1CaS8wtl7qcf.json,2020-03-03,测试3月4005​​​​,50,0.0,0.0,[]
322,322,322,2020-03-03_K1CaS8wxc7asi.json,2020-03-03,测试3月4014​​​​,29,0.0,0.0,[]


In [1]:
start_date = '2020-01-01'
end_date = '2020-03-03'
x_index = pd.date_range(start_date, end_date)

x_index

NameError: name 'pd' is not defined