In [1]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

# tiger

In [2]:
file_path = "./data/tiger_summary.csv"

# Read the summary CSV and convert 'Published Date' to datetime in one chain.
df = pd.read_csv(file_path).assign(
    **{'Published Date': lambda frame: pd.to_datetime(frame['Published Date'])}
)

df

Unnamed: 0,Published Date,Label_1,Label_2,Label_3,Label_4,Label_5,Label_6,Label_7,Label_8,sentiment
0,2019-01-03,1,0,0,0,0,0,0,2,0.165967
1,2019-01-04,1,1,1,4,1,1,0,4,0.049865
2,2019-01-06,1,8,3,4,0,4,0,15,0.174241
3,2019-01-07,0,0,0,1,0,0,0,2,-0.415956
4,2019-01-08,0,1,1,4,0,1,3,8,0.229378
...,...,...,...,...,...,...,...,...,...,...
181,2024-10-25,5,1,0,1,1,1,2,11,-0.051465
182,2024-11-09,0,0,0,2,0,0,0,0,-0.772716
183,2024-11-15,1,0,1,0,0,1,0,4,-0.228295
184,2024-11-19,1,0,1,1,0,0,0,1,0.085000


針對每個分類（Label_1 到 Label_8），以前 5 筆紀錄（不含當筆）計算移動平均與標準差，
並判斷當筆數值是否高於平均 + 2 倍標準差；若是，則標記為一筆服務事件（True）。

In [3]:
# Rolling-baseline anomaly detection for Label_1 .. Label_8.
# For every label column three columns are added to df:
#   Label_i_mean / Label_i_std : mean and standard deviation (pandas default, ddof=1)
#                                of the WINDOW rows that precede the current row
#   Event_i                    : True when the current value exceeds mean + N_SIGMA * std
# The window counts rows, not calendar time.
# NOTE(review): the dates in this table are not consecutive, so WINDOW rows can span
# more than WINDOW days - confirm that a row-based baseline is what is intended.
WINDOW = 5    # number of preceding rows used as the baseline
N_SIGMA = 2   # standard deviations above the baseline mean that count as an event

for i in range(1, 9):
    label_col = f'Label_{i}'
    mean_col = f'{label_col}_mean'
    std_col = f'{label_col}_std'
    event_col = f'Event_{i}'

    # shift(1) keeps the current row out of its own baseline.
    baseline = df[label_col].shift(1).rolling(window=WINDOW)
    df[mean_col] = baseline.mean()
    df[std_col] = baseline.std()

    # The first WINDOW rows have a NaN baseline; a comparison against NaN is False,
    # so those rows are never flagged.
    df[event_col] = df[label_col] > (df[mean_col] + N_SIGMA * df[std_col])

# to_csv does not create missing directories, so make sure the target folder exists.
# The window size is part of the file name, so it is derived from WINDOW.
OUT_DIR = Path('./0909')
OUT_DIR.mkdir(parents=True, exist_ok=True)
df.to_csv(OUT_DIR / f'tiger_event_day_{WINDOW}.csv', index=False)

In [4]:
# Display df, which now also carries the Label_i_mean / Label_i_std / Event_i columns.
df

Unnamed: 0,Published Date,Label_1,Label_2,Label_3,Label_4,Label_5,Label_6,Label_7,Label_8,sentiment,...,Event_5,Label_6_mean,Label_6_std,Event_6,Label_7_mean,Label_7_std,Event_7,Label_8_mean,Label_8_std,Event_8
0,2019-01-03,1,0,0,0,0,0,0,2,0.165967,...,False,,,False,,,False,,,False
1,2019-01-04,1,1,1,4,1,1,0,4,0.049865,...,False,,,False,,,False,,,False
2,2019-01-06,1,8,3,4,0,4,0,15,0.174241,...,False,,,False,,,False,,,False
3,2019-01-07,0,0,0,1,0,0,0,2,-0.415956,...,False,,,False,,,False,,,False
4,2019-01-08,0,1,1,4,0,1,3,8,0.229378,...,False,,,False,,,False,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181,2024-10-25,5,1,0,1,1,1,2,11,-0.051465,...,False,0.4,0.894427,False,1.2,1.303840,False,5.8,5.069517,False
182,2024-11-09,0,0,0,2,0,0,0,0,-0.772716,...,False,0.6,0.894427,False,1.6,1.140175,False,5.8,5.069517,False
183,2024-11-15,1,0,1,0,0,1,0,4,-0.228295,...,False,0.6,0.894427,False,1.0,1.000000,False,3.8,4.969909,False
184,2024-11-19,1,0,1,1,0,0,0,1,0.085000,...,False,0.8,0.836660,False,1.0,1.000000,False,4.6,4.505552,False


In [5]:
# Expects df to already contain 'Published Date' and the boolean columns Event_1 .. Event_8.
event_cols = [f'Event_{i}' for i in range(1, 9)]

# Keep only the rows where at least one event flag is set (single .loc, no chained indexing).
has_any_event = df[event_cols].any(axis=1)
df_events = df.loc[has_any_event, ['Published Date'] + event_cols]

# Wide -> long: one row per (date, event column) pair.
df_melted = df_events.melt(
    id_vars='Published Date',
    value_vars=event_cols,
    var_name='Event',
    value_name='IsEvent',
)

# IsEvent is already boolean, so it is used directly as the mask (no `== True`).
df_event_table = df_melted.loc[df_melted['IsEvent'], ['Published Date', 'Event']]

df_events

Unnamed: 0,Published Date,Event_1,Event_2,Event_3,Event_4,Event_5,Event_6,Event_7,Event_8
7,2019-01-18,False,False,False,False,True,False,False,False
12,2019-02-04,False,False,False,True,False,False,False,False
14,2019-02-07,True,True,False,False,False,False,True,True
19,2019-02-19,False,False,False,False,False,True,False,False
20,2019-02-20,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...
173,2024-07-26,False,False,True,False,False,False,True,False
176,2024-08-15,False,False,False,True,False,False,False,True
177,2024-09-06,True,True,False,True,False,False,False,False
179,2024-09-27,False,False,False,False,True,True,False,False


In [6]:
# Write the long-format event list to CSV without the index column, then display it.
df_event_table.to_csv(path_or_buf='./0909/tiger_event_true_5.csv', index=False)
df_event_table

Unnamed: 0,Published Date,Event
2,2019-02-07,Event_1
4,2019-02-20,Event_1
8,2019-03-10,Event_1
9,2019-03-27,Event_1
14,2019-04-22,Event_1
...,...,...
602,2023-10-20,Event_8
603,2023-11-13,Event_8
605,2024-02-24,Event_8
607,2024-04-29,Event_8


# eva

In [7]:
file_path = "./data/eva_summary.csv"

# Read the summary CSV and convert 'Published Date' to datetime in one chain.
df = pd.read_csv(file_path).assign(
    **{'Published Date': lambda frame: pd.to_datetime(frame['Published Date'])}
)

df

Unnamed: 0,Published Date,Label_1,Label_2,Label_3,Label_4,Label_5,Label_6,Label_7,Label_8,sentiment,Total
0,2019-01-01,8,3,1,5,1,3,1,3,0.236611,25
1,2019-01-02,15,5,4,10,1,8,6,18,0.248481,67
2,2019-01-03,0,2,0,0,0,1,0,1,0.617067,4
3,2019-01-04,3,7,4,2,5,3,1,23,-0.003960,48
4,2019-01-05,6,12,7,10,5,5,1,13,0.209656,59
...,...,...,...,...,...,...,...,...,...,...,...
863,2024-12-22,0,0,0,0,0,1,2,0,0.605857,3
864,2024-12-23,0,1,0,0,1,0,2,3,0.141964,7
865,2024-12-24,1,0,0,1,0,0,0,5,-0.071494,7
866,2024-12-25,1,0,1,2,2,0,0,2,0.040805,8


針對每個分類（Label_1 到 Label_8），以前 5 筆紀錄（不含當筆）計算移動平均與標準差，
並判斷當筆數值是否高於平均 + 2 倍標準差；若是，則標記為一筆服務事件（True）。

In [8]:
# Rolling-baseline anomaly detection for Label_1 .. Label_8.
# For every label column three columns are added to df:
#   Label_i_mean / Label_i_std : mean and standard deviation (pandas default, ddof=1)
#                                of the WINDOW rows that precede the current row
#   Event_i                    : True when the current value exceeds mean + N_SIGMA * std
# The window counts rows, not calendar time.
# NOTE(review): the dates in this table are not guaranteed to be consecutive, so WINDOW
# rows can span more than WINDOW days - confirm that a row-based baseline is intended.
WINDOW = 5    # number of preceding rows used as the baseline
N_SIGMA = 2   # standard deviations above the baseline mean that count as an event

for i in range(1, 9):
    label_col = f'Label_{i}'
    mean_col = f'{label_col}_mean'
    std_col = f'{label_col}_std'
    event_col = f'Event_{i}'

    # shift(1) keeps the current row out of its own baseline.
    baseline = df[label_col].shift(1).rolling(window=WINDOW)
    df[mean_col] = baseline.mean()
    df[std_col] = baseline.std()

    # The first WINDOW rows have a NaN baseline; a comparison against NaN is False,
    # so those rows are never flagged.
    df[event_col] = df[label_col] > (df[mean_col] + N_SIGMA * df[std_col])

# to_csv does not create missing directories, so make sure the target folder exists.
# The window size is part of the file name, so it is derived from WINDOW.
OUT_DIR = Path('./0909')
OUT_DIR.mkdir(parents=True, exist_ok=True)
df.to_csv(OUT_DIR / f'eva_event_day_{WINDOW}.csv', index=False)


In [9]:
# Expects df to already contain 'Published Date' and the boolean columns Event_1 .. Event_8.
event_cols = [f'Event_{i}' for i in range(1, 9)]

# Keep only the rows where at least one event flag is set (single .loc, no chained indexing).
has_any_event = df[event_cols].any(axis=1)
df_events = df.loc[has_any_event, ['Published Date'] + event_cols]

# Wide -> long: one row per (date, event column) pair.
df_melted = df_events.melt(
    id_vars='Published Date',
    value_vars=event_cols,
    var_name='Event',
    value_name='IsEvent',
)

# IsEvent is already boolean, so it is used directly as the mask (no `== True`).
df_event_table = df_melted.loc[df_melted['IsEvent'], ['Published Date', 'Event']]

df_events

Unnamed: 0,Published Date,Event_1,Event_2,Event_3,Event_4,Event_5,Event_6,Event_7,Event_8
7,2019-01-08,False,False,False,False,False,True,True,False
10,2019-01-11,True,False,False,False,False,False,False,True
14,2019-01-15,True,True,True,True,False,True,True,False
17,2019-01-18,False,False,False,False,True,False,False,True
20,2019-01-21,True,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...
860,2024-12-16,True,False,True,False,False,False,False,False
861,2024-12-17,False,False,False,False,False,False,False,True
863,2024-12-22,False,False,False,False,False,False,True,False
864,2024-12-23,False,False,False,False,True,False,False,False


In [10]:
# Write the long-format event list to CSV without the index column, then display it.
df_event_table.to_csv(path_or_buf='./0909/eva_event_true_5.csv', index=False)
df_event_table

Unnamed: 0,Published Date,Event
1,2019-01-11,Event_1
2,2019-01-15,Event_1
4,2019-01-21,Event_1
10,2019-02-07,Event_1
14,2019-02-18,Event_1
...,...,...
3115,2024-10-31,Event_8
3117,2024-11-11,Event_8
3120,2024-12-06,Event_8
3121,2024-12-07,Event_8


# china

In [11]:
file_path = "./data/china_summary.csv"

# Read the summary CSV and convert 'Published Date' to datetime in one chain.
df = pd.read_csv(file_path).assign(
    **{'Published Date': lambda frame: pd.to_datetime(frame['Published Date'])}
)

df

Unnamed: 0,Published Date,Label_1,Label_2,Label_3,Label_4,Label_5,Label_6,Label_7,Label_8,sentiment,Total
0,2019-01-01,3,9,8,1,0,0,1,8,0.325910,30
1,2019-01-02,0,0,2,1,0,2,0,5,-0.069213,10
2,2019-01-03,3,4,3,0,1,6,0,6,0.244771,23
3,2019-01-04,3,1,5,1,3,5,4,16,0.064436,38
4,2019-01-05,4,4,7,3,5,4,3,10,0.116174,40
...,...,...,...,...,...,...,...,...,...,...,...
585,2024-12-22,0,2,0,2,4,5,0,9,0.129369,22
586,2024-12-23,1,0,0,0,1,1,0,2,-0.354962,5
587,2024-12-24,2,1,0,1,2,0,3,3,-0.308825,12
588,2024-12-25,0,0,3,0,0,0,0,2,-0.398063,5


針對每個分類（Label_1 到 Label_8），以前 5 筆紀錄（不含當筆）計算移動平均與標準差，
並判斷當筆數值是否高於平均 + 2 倍標準差；若是，則標記為一筆服務事件（True）。

In [12]:
# Rolling-baseline anomaly detection for Label_1 .. Label_8.
# For every label column three columns are added to df:
#   Label_i_mean / Label_i_std : mean and standard deviation (pandas default, ddof=1)
#                                of the WINDOW rows that precede the current row
#   Event_i                    : True when the current value exceeds mean + N_SIGMA * std
# The window counts rows, not calendar time.
# NOTE(review): the dates in this table are not guaranteed to be consecutive, so WINDOW
# rows can span more than WINDOW days - confirm that a row-based baseline is intended.
WINDOW = 5    # number of preceding rows used as the baseline
N_SIGMA = 2   # standard deviations above the baseline mean that count as an event

for i in range(1, 9):
    label_col = f'Label_{i}'
    mean_col = f'{label_col}_mean'
    std_col = f'{label_col}_std'
    event_col = f'Event_{i}'

    # shift(1) keeps the current row out of its own baseline.
    baseline = df[label_col].shift(1).rolling(window=WINDOW)
    df[mean_col] = baseline.mean()
    df[std_col] = baseline.std()

    # The first WINDOW rows have a NaN baseline; a comparison against NaN is False,
    # so those rows are never flagged.
    df[event_col] = df[label_col] > (df[mean_col] + N_SIGMA * df[std_col])

# to_csv does not create missing directories, so make sure the target folder exists.
# The window size is part of the file name, so it is derived from WINDOW.
OUT_DIR = Path('./0909')
OUT_DIR.mkdir(parents=True, exist_ok=True)
df.to_csv(OUT_DIR / f'china_event_day_{WINDOW}.csv', index=False)

In [13]:
# Expects df to already contain 'Published Date' and the boolean columns Event_1 .. Event_8.
event_cols = [f'Event_{i}' for i in range(1, 9)]

# Keep only the rows where at least one event flag is set (single .loc, no chained indexing).
has_any_event = df[event_cols].any(axis=1)
df_events = df.loc[has_any_event, ['Published Date'] + event_cols]

# Wide -> long: one row per (date, event column) pair.
df_melted = df_events.melt(
    id_vars='Published Date',
    value_vars=event_cols,
    var_name='Event',
    value_name='IsEvent',
)

# IsEvent is already boolean, so it is used directly as the mask (no `== True`).
df_event_table = df_melted.loc[df_melted['IsEvent'], ['Published Date', 'Event']]

df_events

Unnamed: 0,Published Date,Event_1,Event_2,Event_3,Event_4,Event_5,Event_6,Event_7,Event_8
5,2019-01-06,False,False,False,True,False,False,False,False
8,2019-01-09,False,False,False,True,False,False,False,False
9,2019-01-10,False,False,False,False,False,False,True,False
12,2019-01-13,False,False,False,False,False,True,False,False
14,2019-01-15,False,False,True,True,True,False,False,False
...,...,...,...,...,...,...,...,...,...
583,2024-12-11,True,False,False,True,False,False,False,False
584,2024-12-16,False,False,False,True,False,False,False,True
585,2024-12-22,False,False,False,False,True,False,False,False
587,2024-12-24,False,False,False,False,False,False,True,False


In [14]:
# Write the long-format event list to CSV without the index column, then display it.
df_event_table.to_csv(path_or_buf='./0909/china_event_true_5.csv', index=False)
df_event_table

Unnamed: 0,Published Date,Event
7,2019-01-18,Event_1
15,2019-02-12,Event_1
19,2019-02-25,Event_1
20,2019-02-26,Event_1
23,2019-03-12,Event_1
...,...,...
2030,2024-08-22,Event_8
2037,2024-10-27,Event_8
2040,2024-11-19,Event_8
2041,2024-11-20,Event_8
