In [1]:
import pandas as pd

In [4]:
# Load the article table used by every cell below; the later cells read its
# 'text', 'date', 'read_num', 'old_like_num' and 'share_num' columns.
df = pd.read_csv('./data/typhoon.csv')

## 阅读、点赞、转发的次数（不考虑T1~T3范围，设置为全局统计）

In [11]:
def get_typhoon_news_all_time(CN_NAME):
    """Return every article whose text mentions the given typhoon name.

    Parameters
    ----------
    CN_NAME : str
        Typhoon name to look for in the ``text`` column of the global ``df``.
        It is matched as a literal substring (``regex=False``), so characters
        such as ``.`` or ``+`` carry no special meaning.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows, without any date filtering.
        Rows whose ``text`` is missing never match (``na=False``).
    """
    mentions_typhoon = df['text'].str.contains(CN_NAME, regex=False, na=False)
    return df[mentions_typhoon].copy()
    
# Total reads per typhoon: the typhoons to summarise, as
# (Chinese name, English name) pairs.
ty_list = [('杜苏芮', 'Doksuri'), ('苏拉', 'Saola'), ('卡努', 'Khanun'), 
           ('海葵', 'Haikui'), ('泰利', 'Talim')]

def get_typhoon_reads_total():
    """Summarise all-time article engagement for every typhoon in ``ty_list``.

    For each (Chinese name, English name) pair the matching articles are
    fetched with ``get_typhoon_news_all_time`` and reduced to the article
    count plus the sum and mean of ``read_num``, ``old_like_num`` and
    ``share_num`` (in that order).

    Side effect: the table is written to ``./result/top5_typhoon.csv``
    without an index column.

    Returns
    -------
    pandas.DataFrame
        One row per typhoon, in ``ty_list`` order.
    """
    # Source columns, in the order their (total, average) pairs appear in the output
    metric_columns = ('read_num', 'old_like_num', 'share_num')

    rows = []
    for cn_name, en_name in ty_list:
        articles = get_typhoon_news_all_time(cn_name)
        row = [cn_name, en_name, len(articles)]
        for column in metric_columns:
            values = articles[column]
            row.append(values.sum())
            row.append(values.mean())
        rows.append(row)

    header = ['name', 'en_name', '文章篇数', '阅读总人次', '篇均阅读人次',
              '总点赞数', '篇均点赞数', '转发次数', '篇均转发数']
    summary = pd.DataFrame(rows, columns=header)
    summary.to_csv('./result/top5_typhoon.csv', index=None)
    return summary

# Build the summary (the function also writes it to CSV); as the last
# expression of the cell, the returned DataFrame is what gets displayed.
get_typhoon_reads_total()

Unnamed: 0,name,en_name,文章篇数,阅读总人次,篇均阅读人次,总点赞数,篇均点赞数,转发次数,篇均转发数
0,杜苏芮,Doksuri,1519,25996539,17114.245556,35877,23.618828,787,0.518104
1,苏拉,Saola,774,21647818,27968.757106,31974,41.310078,472,0.609819
2,卡努,Khanun,647,14719222,22749.956723,22011,34.020093,193,0.2983
3,海葵,Haikui,568,13296423,23409.195423,16459,28.977113,252,0.443662
4,泰利,Talim,404,10075827,24940.165842,13749,34.032178,192,0.475248


## 【T1~T3】文章数、平均阅读数、平均点赞数、平均分享数

In [None]:
def get_typhoon_news(CN_NAME, start_time, end_time):
    """Return the articles that mention a typhoon and fall inside a time window.

    Parameters
    ----------
    CN_NAME : str
        Typhoon name to look for in the ``text`` column of the global ``df``.
        It is matched as a literal substring (``regex=False``), so characters
        such as ``.`` or ``+`` carry no special meaning. Rows whose text is
        missing never match.
    start_time, end_time
        Inclusive bounds of the window. The notebook passes
        'YYYY-MM-DD HH:MM:SS' strings, which pandas compares against the
        parsed ``date`` column.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with ``date`` converted by ``pd.to_datetime``.
        The global ``df`` is left unchanged.
    """
    mentions_typhoon = df['text'].str.contains(CN_NAME, regex=False, na=False)
    # Parse dates only for the rows that mention the typhoon, on a private copy
    matched = df[mentions_typhoon].copy()
    matched['date'] = pd.to_datetime(matched['date'])
    # between() is inclusive on both ends by default: start_time <= date <= end_time
    in_window = matched['date'].between(start_time, end_time)
    return matched[in_window]
    
# Reference pages (presumably where the date ranges below were taken from):
# https://en.wikipedia.org/wiki/Typhoon_Doksuri
# https://en.wikipedia.org/wiki/Typhoon_Saola_(2023)
# https://en.wikipedia.org/wiki/Typhoon_Khanun_(2023)
# https://en.wikipedia.org/wiki/Typhoon_Haikui_(2023)
# https://en.wikipedia.org/wiki/Tropical_Storm_Talim_(2023)
# https://en.wikipedia.org/wiki/Typhoon_Koinu

# https://en.wikipedia.org/wiki/Tropical_Storm_Nalgae
# https://en.wikipedia.org/wiki/Typhoon_Lan_(2023)

# Date range noted for each typhoon (Chinese name: start - end):
# 杜苏芮 (Doksuri): 2023-07-20 - 2023-07-30
# 苏拉 (Saola): 2023-08-22 - 2023-09-04
# 卡努 (Khanun): 2023-07-26 - 2023-08-15
# 海葵 (Haikui): 2023-08-27 - 2023-09-06
# 泰利 (Talim): 2023-07-13 - 2023-07-19
# 小犬 (Koinu): 2023-09-29 - 2023-10-10
# 玛娃 (no reference link listed above): 2023-05-19 - 2023-06-11
# 尼格 (Nalgae): 2022-10-26 - 2022-11-03
# 兰恩 (Lan): 2023-08-06 - 2023-08-19

# Analysis window per typhoon: Chinese name -> (English name, window start, window end).
# For the five typhoons listed here, each window is the date range noted above
# widened by 5 days on both sides (e.g. 07-20..07-30 becomes 07-15..08-04).
typhoon_time_TT = {
    '杜苏芮': ('Doksuri', '2023-07-15 00:00:00', '2023-08-04 23:59:59'),
    '苏拉': ('Saola', '2023-08-17 00:00:00', '2023-09-09 23:59:59'),
    '卡努': ('Khanun', '2023-07-21 00:00:00', '2023-08-20 23:59:59'),
    '海葵': ('Haikui', '2023-08-22 00:00:00', '2023-09-11 23:59:59'),
    '泰利': ('Talim', '2023-07-08 00:00:00', '2023-07-24 23:59:59'),
}

In [225]:
# Totals of the T-5, T and T+5 segments of a daily series
def get_data_ratio(data):
    """Split a daily series into first-5 / middle / last-5 parts and sum each.

    Parameters
    ----------
    data : list of numbers
        One value per day, in calendar order. The first five entries form the
        front segment, the last five the back segment, and everything in
        between the middle segment (empty when ``len(data) == 10``).

    Returns
    -------
    dict
        ``{"sums": (sum_front, sum_middle, sum_back)}``. Only the sums are
        returned; no percentages are computed, so a series whose values are
        all 0 is handled without any division.

    Raises
    ------
    ValueError
        If ``data`` has fewer than 10 elements.
    """
    if len(data) < 10:
        raise ValueError("The list must have at least 10 elements to split into three segments.")
    front = data[:5]     # first 5 elements
    middle = data[5:-5]  # everything between the two 5-element edges
    back = data[-5:]     # last 5 elements
    return {
        "sums": (sum(front), sum(middle), sum(back))
    }

# Segment averages of a daily likes / reads series
def get_data_ratio_avg(data):
    """Average a daily series over its first-5 / middle / last-5 segments.

    Parameters
    ----------
    data : list of numbers
        One value per day, in calendar order.

    Returns
    -------
    tuple
        ``(avg_front, avg_middle, avg_back)``: the mean of the first five
        entries, of the entries in between, and of the last five entries.
        With exactly 10 elements the middle segment is empty and its average
        is reported as 0.0 instead of dividing by zero.

    Raises
    ------
    ValueError
        If ``data`` has fewer than 10 elements.
    """
    if len(data) < 10:
        raise ValueError("The list must have at least 10 elements to split into three segments.")

    def _mean(segment):
        # An empty segment (only possible for the middle one) averages to 0.0
        return sum(segment) / len(segment) if len(segment) else 0.0

    return (_mean(data[:5]), _mean(data[5:-5]), _mean(data[-5:]))

# Number of articles per day
def get_news_count(CN_NAME, start_time, end_time):
    """Count a typhoon's articles for every calendar day of a window.

    The articles come from ``get_typhoon_news``. Days of the window on which
    no article was found are kept with a count of 0.

    Side effects: prints the typhoon name with the list of daily counts,
    prints the result of ``get_data_ratio`` for that list, and writes the
    table to ``./result/<CN_NAME>_文章数量.csv`` without an index column.

    Returns
    -------
    pandas.DataFrame
        Columns ``formatted_date`` ('MM-DD') and ``news_count``, one row per
        day from ``start_time`` to ``end_time``.
    """
    articles = get_typhoon_news(CN_NAME, start_time, end_time)
    articles['date'] = pd.to_datetime(articles['date'])
    articles['date_only'] = articles['date'].dt.date

    # Counts for the days that actually have articles
    counts_by_day = articles.groupby('date_only').size().reset_index(name='news_count')

    # One row per calendar day, so that days without articles appear as 0
    calendar_days = pd.date_range(start=start_time, end=end_time).date
    daily_counts = pd.DataFrame({'date_only': calendar_days}).merge(
        counts_by_day, on='date_only', how='left'
    )
    daily_counts['news_count'] = daily_counts['news_count'].fillna(0).astype(int)
    daily_counts['formatted_date'] = [
        day.strftime('%m-%d') for day in daily_counts['date_only']
    ]

    counts = daily_counts['news_count'].tolist()
    print(CN_NAME, counts)
    print(get_data_ratio(counts))

    output_columns = ['formatted_date', 'news_count']
    daily_counts[output_columns].to_csv('./result/' + CN_NAME + '_文章数量.csv', index=None)

    return daily_counts[output_columns]

# Daily average of one indicator (mean per article)
def get_avg_by_day(CN_NAME, start_time, end_time, key):
    """Average one indicator over a typhoon's articles for each day of a window.

    The articles come from ``get_typhoon_news``. Days of the window without
    any article get an average of 0.

    Side effect: prints ``<key>_avg`` followed by the result of
    ``get_data_ratio_avg`` for the list of daily averages.

    Parameters
    ----------
    key : str
        Column to average, e.g. 'read_num'.

    Returns
    -------
    pandas.DataFrame
        Columns ``formatted_date`` ('MM-DD') and ``<key>_avg``, one row per
        day from ``start_time`` to ``end_time``.
    """
    avg_column = key + '_avg'

    articles = get_typhoon_news(CN_NAME, start_time, end_time)
    articles['date'] = pd.to_datetime(articles['date'])
    articles['day'] = articles['date'].dt.date

    # Mean of the indicator over the articles of each day
    day_means = articles.groupby('day')[key].mean().reset_index()
    day_means.columns = ['date', avg_column]

    # Align on the full calendar so that days without articles are present
    calendar = pd.DataFrame({'date': pd.date_range(start=start_time, end=end_time).date})
    per_day = calendar.merge(day_means, on='date', how='left')
    per_day[avg_column] = per_day[avg_column].fillna(0)
    per_day['formatted_date'] = [day.strftime('%m-%d') for day in per_day['date']]

    averages = per_day[avg_column].tolist()
    print(avg_column, get_data_ratio_avg(averages))
    return per_day[['formatted_date', avg_column]]

In [226]:
# For every typhoon window, gather the per-day averages of the three
# indicators into one table and save it as
# './result/<name>_avg_read_like_share.csv'.
for name, (en_name, start_time, end_time) in typhoon_time_TT.items():
    typhoon_name = name
    print('=============' + name + '_' + en_name + '=======================')
    indicators = ['read_num', 'old_like_num', 'share_num']
    ins = []  # NOTE(review): never used in this cell
    temp = pd.DataFrame()
    dates = []
    for i in indicators:
        NEW_KEY = i + '_avg'
        d = get_avg_by_day(typhoon_name, start_time, end_time, i)
        v = d[NEW_KEY].tolist()
        # All indicators are computed over the same window, so the date labels
        # are identical on every pass; the last assignment is the one kept.
        dates = d['formatted_date'].tolist()
        # The column keeps the raw indicator name although it holds daily averages
        temp[i] = v
    temp['name'] = typhoon_name
    temp['en_name'] = en_name
    temp['date'] = dates
    temp.to_csv('./result/' + typhoon_name + '_avg_read_like_share.csv', index=None)

read_num_avg (17248.25, 23185.19532350294, 5766.533002693824)
old_like_num_avg (15.6, 28.902793232207454, 18.410221867684555)
share_num_avg (0.0, 0.35143905161940664, 0.5969084398188876)
read_num_avg (6452.9, 24247.939817346607, 8582.704761904763)
old_like_num_avg (7.3, 35.21351538117492, 10.952380952380953)
share_num_avg (0.3, 0.4105322153657313, 0.7952380952380953)
read_num_avg (4592.0, 24660.897857865555, 530.0)
old_like_num_avg (3.2, 33.85855547710836, 0.8666666666666666)
share_num_avg (0.0, 0.3020299676681046, 0.2)
read_num_avg (17068.63333333333, 25965.146472005243, 18413.6272377372)
old_like_num_avg (35.425, 29.568973915362406, 28.701351593268885)
share_num_avg (0.025, 0.2880936855794952, 1.253938417472252)
read_num_avg (0.0, 26959.80640932068, 25436.524761904762)
old_like_num_avg (0.0, 37.349565554873315, 23.715238095238096)
share_num_avg (0.0, 0.34858119875961086, 0.19428571428571428)


In [227]:
# For every typhoon window, print the daily article counts and their segment
# sums; get_news_count also writes one '<name>_文章数量.csv' per typhoon.
for name, (en_name, start_time, end_time) in typhoon_time_TT.items():
    get_news_count(name, start_time, end_time)

杜苏芮 [0, 0, 0, 1, 4, 0, 14, 15, 43, 70, 65, 133, 214, 264, 140, 83, 67, 66, 56, 36, 39]
{'sums': (5, 1041, 264)}
苏拉 [0, 2, 2, 1, 0, 0, 1, 10, 26, 13, 27, 26, 44, 76, 138, 183, 118, 59, 23, 7, 2, 6, 2, 2]
{'sums': (5, 744, 19)}
卡努 [0, 0, 0, 0, 1, 2, 5, 46, 39, 40, 59, 22, 41, 32, 36, 19, 11, 31, 21, 51, 56, 39, 54, 15, 7, 5, 3, 3, 0, 0, 0]
{'sums': (1, 631, 6)}
海葵 [0, 0, 0, 8, 3, 9, 17, 29, 25, 41, 38, 48, 66, 80, 51, 24, 24, 49, 19, 8, 8]
{'sums': (11, 428, 108)}
泰利 [0, 0, 0, 0, 0, 6, 10, 30, 118, 132, 57, 20, 5, 2, 7, 1, 3]
{'sums': (0, 373, 18)}
