In [1]:
import pandas as pd
import polars as pl
import numpy as np
import pickle, warnings, datetime, pytz
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler, QuantileTransformer, MinMaxScaler
warnings.filterwarnings('ignore')

In [2]:
with open(r'../../data/processed/gdelt_intermediate_cleaned.pkl', 'rb') as f:
    df = pickle.load(f)
with open('../../data/processed/gdelt_llm_sentiment.pkl', 'rb') as f:
    llm_data = pickle.load(f)
df = pd.merge(df, llm_data, on='GKGRECORDID', how='left')

In [3]:
df=df[df['disaster']==True]

In [4]:
df

Unnamed: 0,GKGRECORDID,V2SOURCECOMMONNAME,V2DOCUMENTIDENTIFIER,V1THEMES,datetime,date,airplane,airline,airport,Alaska Airlines,...,v42.4; SCOREDVALUE; loyalty_p,v42.5; SCOREDVALUE; authority_p,v42.6; SCOREDVALUE; sanctity_p,v42.7; SCOREDVALUE; care_sent,v42.8; SCOREDVALUE; fairness_sent,v42.9; SCOREDVALUE; loyalty_sent,v42.10; SCOREDVALUE; authority_sent,v42.11; SCOREDVALUE; sanctity_sent,disaster,llm_sentiment
0,20230221030000-453,aero-news.net,http://aero-news.net/index.cfm?do=main.textpos...,TAX_WORLDREPTILES;TAX_WORLDREPTILES_SIDEWINDER...,2023-02-21 03:00:00,2023-02-21,0,1,0,0,...,0.084485,0.096285,0.065091,-0.117746,-0.047186,-0.009921,-0.062596,-0.075467,True,0.394921
2,20240527093000-676,itechpost.com,http://www.itechpost.com/articles/122528/20240...,TAX_FNCACT;TAX_FNCACT_EMPLOYEE;EPU_POLICY;EPU_...,2024-05-27 09:30:00,2024-05-27,0,1,1,0,...,0.088567,0.090690,0.078992,-0.115803,-0.063174,-0.039071,-0.037974,-0.090853,True,-0.510247
4,20241120131500-415,710keel.com,https://710keel.com/ixp/182/p/spirit-airlines-...,ECON_DEBT;WB_1104_MACROECONOMIC_VULNERABILITY_...,2024-11-20 13:15:00,2024-11-20,0,1,0,0,...,0.100371,0.092308,0.088900,-0.050954,-0.023068,0.021486,0.009874,-0.006657,True,-1.209943
5,20231016110000-1591,abc13.com,https://abc13.com/flight-attendant-grandma-gra...,TAX_FNCACT;TAX_FNCACT_ATTENDANT;TAX_FNCACT_FLI...,2023-10-16 11:00:00,2023-10-16,0,1,0,0,...,0.097128,0.084789,0.079306,-0.026461,0.027215,0.051446,0.025956,0.009132,True,0.601107
7,20231016110000-441,abc30.com,https://abc30.com/flight-attendant-grandma-gra...,TAX_FNCACT;TAX_FNCACT_ATTENDANT;TAX_FNCACT_FLI...,2023-10-16 11:00:00,2023-10-16,0,1,0,0,...,0.097128,0.084789,0.079306,-0.026461,0.027215,0.051446,0.025956,0.009132,True,0.601107
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1121177,20220516161500-1658,wgal.com,https://www.wgal.com/article/jetblue-hostile-t...,EPU_ECONOMY_HISTORIC;ARMEDCONFLICT;USPEC_POLIC...,2022-05-16 16:15:00,2022-05-16,0,1,0,0,...,0.104781,0.095951,0.090180,-0.057734,-0.004453,-0.001611,0.013447,0.014984,True,-0.656124
1121178,20181010101500-143,wpxi.com,https://www.wpxi.com/news/national/hurricane-m...,NATURAL_DISASTER;NATURAL_DISASTER_HURRICANE;CR...,2018-10-10 10:15:00,2018-10-10,0,1,0,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,True,0.724012
1121180,20190719014500-2143,wtol.com,https://www.wtol.com/article/news/nation-world...,MANMADE_DISASTER_IMPLIED;DELAY;USPEC_UNCERTAIN...,2019-07-19 01:45:00,2019-07-19,0,1,0,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,True,-0.774252
1121181,20190719014500-887,wwltv.com,https://www.wwltv.com/article/news/nation-worl...,MANMADE_DISASTER_IMPLIED;DELAY;USPEC_UNCERTAIN...,2019-07-19 01:45:00,2019-07-19,0,1,0,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,True,-0.760821


In [5]:
# Drop duplicate rows
df.drop_duplicates(subset=['GKGRECORDID'], inplace=True)

df.index = df['GKGRECORDID']
df.drop(columns=['GKGRECORDID'], inplace=True)
df.drop(columns=['Positive Score','Activity Reference Density','Self/Group Reference Density'], inplace=True)
df.drop(columns=[i for i in df.columns if 'SCOREDVALUE' in i], inplace=True)
df.drop(columns=[i for i in df.columns if 'WORDCOUNT' in i], inplace=True)

In [6]:
# Convert from UTC to EST, accounting for daylight saving time
df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce', utc=True)
df['datetime'] = df['datetime'].dt.tz_convert('America/New_York')
# Remove the timezone information 
df['datetime'] = df['datetime'].dt.tz_localize(None)
df['date'] = df['datetime'].dt.date

In [7]:
# Create topic-specific metrics columns
df['Article Count'] = 1

topics  = ['Alaska Airlines','American Airlines','Delta Air Lines','JetBlue','Southwest Airlines','United Airlines','Allegiant Air']
metrics = ['Tone','Negative Score','Article Count','llm_sentiment']

for topic in topics:
    for metric in metrics:
        df[f'{metric}_{topic}'] = df[metric] * df[topic]

In [8]:
df

Unnamed: 0_level_0,V2SOURCECOMMONNAME,V2DOCUMENTIDENTIFIER,V1THEMES,datetime,date,airplane,airline,airport,Alaska Airlines,American Airlines,...,Article Count_Southwest Airlines,llm_sentiment_Southwest Airlines,Tone_United Airlines,Negative Score_United Airlines,Article Count_United Airlines,llm_sentiment_United Airlines,Tone_Allegiant Air,Negative Score_Allegiant Air,Article Count_Allegiant Air,llm_sentiment_Allegiant Air
GKGRECORDID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20230221030000-453,aero-news.net,http://aero-news.net/index.cfm?do=main.textpos...,TAX_WORLDREPTILES;TAX_WORLDREPTILES_SIDEWINDER...,2023-02-20 22:00:00,2023-02-20,0,1,0,0,1,...,0,0.000000,-0.000000,0.000000,0,0.000000,-0.0,0.0,0,0.0
20240527093000-676,itechpost.com,http://www.itechpost.com/articles/122528/20240...,TAX_FNCACT;TAX_FNCACT_EMPLOYEE;EPU_POLICY;EPU_...,2024-05-27 05:30:00,2024-05-27,0,1,1,0,0,...,0,-0.000000,-3.271028,7.242990,1,-0.510247,-0.0,0.0,0,-0.0
20241120131500-415,710keel.com,https://710keel.com/ixp/182/p/spirit-airlines-...,ECON_DEBT;WB_1104_MACROECONOMIC_VULNERABILITY_...,2024-11-20 08:15:00,2024-11-20,0,1,0,0,0,...,0,-0.000000,-1.453488,4.651163,1,-1.209943,-0.0,0.0,0,-0.0
20231016110000-1591,abc13.com,https://abc13.com/flight-attendant-grandma-gra...,TAX_FNCACT;TAX_FNCACT_ATTENDANT;TAX_FNCACT_FLI...,2023-10-16 07:00:00,2023-10-16,0,1,0,0,0,...,1,0.601107,0.000000,0.000000,0,0.000000,0.0,0.0,0,0.0
20231016110000-441,abc30.com,https://abc30.com/flight-attendant-grandma-gra...,TAX_FNCACT;TAX_FNCACT_ATTENDANT;TAX_FNCACT_FLI...,2023-10-16 07:00:00,2023-10-16,0,1,0,0,0,...,1,0.601107,0.000000,0.000000,0,0.000000,0.0,0.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20220516161500-1658,wgal.com,https://www.wgal.com/article/jetblue-hostile-t...,EPU_ECONOMY_HISTORIC;ARMEDCONFLICT;USPEC_POLIC...,2022-05-16 12:15:00,2022-05-16,0,1,0,0,1,...,1,-0.656124,-0.000000,0.000000,0,-0.000000,-0.0,0.0,0,-0.0
20181010101500-143,wpxi.com,https://www.wpxi.com/news/national/hurricane-m...,NATURAL_DISASTER;NATURAL_DISASTER_HURRICANE;CR...,2018-10-10 06:15:00,2018-10-10,0,1,0,0,0,...,0,0.000000,-7.062780,7.959641,1,0.724012,-0.0,0.0,0,0.0
20190719014500-2143,wtol.com,https://www.wtol.com/article/news/nation-world...,MANMADE_DISASTER_IMPLIED;DELAY;USPEC_UNCERTAIN...,2019-07-18 21:45:00,2019-07-18,0,1,0,0,1,...,1,-0.774252,-0.000000,0.000000,0,-0.000000,-0.0,0.0,0,-0.0
20190719014500-887,wwltv.com,https://www.wwltv.com/article/news/nation-worl...,MANMADE_DISASTER_IMPLIED;DELAY;USPEC_UNCERTAIN...,2019-07-18 21:45:00,2019-07-18,0,1,0,0,1,...,1,-0.760821,-0.000000,0.000000,0,-0.000000,-0.0,0.0,0,-0.0


In [9]:
# Create a grouped dataframe, grouped by datetime, that creates a sum for each metric
grouped_df = df.groupby('date').agg(
    {f'{metric}_{topic}': ['sum'] for topic in topics for metric in metrics}
)

# Flatten the MultiIndex columns
grouped_df.columns = ['_'.join(col).strip().replace('_sum','') for col in grouped_df.columns.values]

In [10]:
grouped_df

Unnamed: 0_level_0,Tone_Alaska Airlines,Negative Score_Alaska Airlines,Article Count_Alaska Airlines,llm_sentiment_Alaska Airlines,Tone_American Airlines,Negative Score_American Airlines,Article Count_American Airlines,llm_sentiment_American Airlines,Tone_Delta Air Lines,Negative Score_Delta Air Lines,...,Article Count_Southwest Airlines,llm_sentiment_Southwest Airlines,Tone_United Airlines,Negative Score_United Airlines,Article Count_United Airlines,llm_sentiment_United Airlines,Tone_Allegiant Air,Negative Score_Allegiant Air,Article Count_Allegiant Air,llm_sentiment_Allegiant Air
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-12-31,-13.723021,16.915142,4,-3.055726,-17.687618,23.026848,3,-0.505849,-3.059830,8.949744,...,1,0.179780,-9.148665,10.101652,1,0.179780,0.000000,0.000000,0,0.000000
2018-01-01,-15.633385,47.521145,20,-5.901770,-27.072172,73.017075,19,0.246700,-28.031559,59.403725,...,1,0.433177,-6.420059,41.747913,14,-1.647160,0.000000,0.000000,0,0.000000
2018-01-02,-105.864479,165.528015,45,-22.704805,-123.854057,245.065643,45,8.159196,-14.118258,45.579170,...,25,14.548387,-107.266792,182.757645,30,-2.442148,0.000000,0.000000,0,0.000000
2018-01-03,-500.023834,811.859863,176,-76.734140,-673.057556,1063.114990,268,-33.989357,-134.827072,195.123764,...,51,25.461855,-16.488083,50.042549,15,4.056584,0.000000,0.000000,0,0.000000
2018-01-04,-120.757065,285.700623,66,0.050666,-1430.101196,2124.122314,409,38.236032,-83.134491,137.559448,...,36,9.901328,-70.990990,122.932465,25,-2.043692,-3.051303,7.219255,2,-0.011941
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-27,-37.277554,52.337234,11,-6.548611,-6.697656,50.121933,15,-0.597500,-64.418037,276.576935,...,54,-3.274660,-45.955013,96.995590,31,-8.336025,0.956284,3.278688,1,-0.304090
2025-05-28,-47.601429,73.333099,12,-10.100034,-69.797081,103.881607,18,-10.861085,-64.594460,161.139008,...,21,2.036721,-178.856918,268.377991,58,-3.504886,-23.022478,27.695900,4,-3.168934
2025-05-29,-8.838427,59.826740,17,3.187258,-92.699425,214.919006,46,-12.311865,-33.974358,73.439308,...,46,-6.887891,-9.813432,101.852852,31,9.521830,-21.589024,25.492680,4,-2.453441
2025-05-30,10.235211,20.994715,8,1.716890,-134.835907,187.995438,30,-19.807893,-20.749403,52.618099,...,8,-0.386213,-20.771137,43.358574,12,3.748829,-1.320755,1.509434,1,-0.024907


In [11]:
# Calculate the 1-period difference for each metric
for topic in topics:
    for metric in metrics:
        grouped_df[f'Change_{metric}_{topic}'] = grouped_df[f'{metric}_{topic}'].diff().fillna(0)

In [12]:
# For each topic, we identify events by metric
for topic in topics:
    for metric in metrics:
        asc = True if metric in ['Tone','llm_sentiment'] else False
        grouped_df[f'event_rank_{metric}_{topic}'] = grouped_df[f'Change_{metric}_{topic}'].rank(method='min', ascending=asc)
    
rank_df = grouped_df[[f'event_rank_{metric}_{topic}' for topic in topics for metric in metrics]].copy()
rank_df['date'] = rank_df.index
rank_df.reset_index(drop=True, inplace=True)

In [13]:
# Average the rankings across all metrics for each topic
for topic in topics:
    rank_df[f'event_rank_{topic}'] = rank_df[[f'event_rank_{metric}_{topic}' for metric in metrics]].mean(axis=1)
    # Now relabel the rankings to be 1 for the highest rank, 2 for the second highest, etc.
    rank_df[f'event_rank_{topic}'] = rank_df[f'event_rank_{topic}'].rank(method='first', ascending=True)
    # drop the individual metric rankings
    rank_df.drop(columns=[f'event_rank_{metric}_{topic}' for metric in metrics], inplace=True)

In [14]:
rank_df

Unnamed: 0,date,event_rank_Alaska Airlines,event_rank_American Airlines,event_rank_Delta Air Lines,event_rank_JetBlue,event_rank_Southwest Airlines,event_rank_United Airlines,event_rank_Allegiant Air
0,2017-12-31,1240.0,1281.0,1250.0,1237.0,1229.0,1270.0,548.0
1,2018-01-01,381.0,965.0,614.0,1010.0,1472.0,912.0,549.0
2,2018-01-02,186.0,748.0,1885.0,716.0,892.0,518.0,550.0
3,2018-01-03,53.0,126.0,244.0,382.0,828.0,2211.0,551.0
4,2018-01-04,2651.0,526.0,2064.0,2608.0,1700.0,539.0,214.0
...,...,...,...,...,...,...,...,...
2704,2025-05-27,282.0,1593.0,155.0,885.0,2595.0,2218.0,518.0
2705,2025-05-28,447.0,711.0,2344.0,1499.0,2329.0,610.0,59.0
2706,2025-05-29,2077.0,706.0,2431.0,1230.0,242.0,2364.0,2227.0
2707,2025-05-30,2096.0,1178.0,2006.0,1459.0,2443.0,1467.0,2643.0


In [15]:
df=df[['date','datetime', 'article_title',
 'Alaska Airlines','American Airlines','Delta Air Lines','JetBlue','Southwest Airlines','United Airlines','Allegiant Air','Tone'
 ]].dropna()

In [16]:
# now merge the rank_df with the original df
df = pd.merge(df, rank_df, on='date', how='left')

In [17]:
df

Unnamed: 0,date,datetime,article_title,Alaska Airlines,American Airlines,Delta Air Lines,JetBlue,Southwest Airlines,United Airlines,Allegiant Air,Tone,event_rank_Alaska Airlines,event_rank_American Airlines,event_rank_Delta Air Lines,event_rank_JetBlue,event_rank_Southwest Airlines,event_rank_United Airlines,event_rank_Allegiant Air
0,2023-02-20,2023-02-20 22:00:00,airborne 02 20 23: hobby balloon shot down per...,0,1,0,0,0,0,0,-1.228501,815.0,1137.0,2016.0,985.0,1947.0,1428.0,1037.0
1,2024-05-27,2024-05-27 05:30:00,boeing safety product quality concerns surge a...,0,0,0,0,0,1,0,-3.271028,2198.0,989.0,1408.0,910.0,1304.0,399.0,2596.0
2,2024-11-20,2024-11-20 08:15:00,will spirit airlines bankruptcy ruin louisiana...,0,0,0,0,0,1,0,-1.453488,1716.0,2316.0,758.0,2664.0,587.0,1203.0,2031.0
3,2023-10-16,2023-10-16 07:00:00,granddaughter follows in grandma s footsteps t...,0,0,0,0,1,0,0,3.417722,857.0,873.0,536.0,248.0,1534.0,1113.0,275.0
4,2023-10-16,2023-10-16 07:00:00,granddaughter follows in grandma s footsteps t...,0,0,0,0,1,0,0,3.417722,857.0,873.0,536.0,248.0,1534.0,1113.0,275.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
652643,2022-05-16,2022-05-16 12:15:00,jetblue launches hostile takeover for spirit,0,1,0,1,1,0,0,-1.436782,2400.0,574.0,1881.0,57.0,360.0,1600.0,918.0
652644,2018-10-10,2018-10-10 06:15:00,hurricane michael track storm strengthens to c...,0,0,0,0,0,1,0,-7.062780,2642.0,313.0,712.0,2119.0,146.0,585.0,2677.0
652645,2019-07-18,2019-07-18 21:45:00,nation southwest cancellations will rise due t...,0,1,0,0,1,0,0,-1.813472,417.0,541.0,467.0,403.0,51.0,227.0,502.0
652646,2019-07-18,2019-07-18 21:45:00,nation southwest cancellations will rise due t...,0,1,0,0,1,0,0,-1.813472,417.0,541.0,467.0,403.0,51.0,227.0,502.0


In [18]:
df_filter=df[
    (df['American Airlines']==1) &
    (df['datetime'] >=datetime.datetime(2025, 1, 30, 15, 30, 0)) &
    (df['datetime']<=datetime.datetime(2025, 1, 31, 9, 15, 0))
    ]

df_filter.sort_values(by='datetime', inplace=True)
df_filter

Unnamed: 0,date,datetime,article_title,Alaska Airlines,American Airlines,Delta Air Lines,JetBlue,Southwest Airlines,United Airlines,Allegiant Air,Tone,event_rank_Alaska Airlines,event_rank_American Airlines,event_rank_Delta Air Lines,event_rank_JetBlue,event_rank_Southwest Airlines,event_rank_United Airlines,event_rank_Allegiant Air
31786,2025-01-30,2025-01-30 15:30:00,ground stop lifted at reagan national airport ...,0,1,0,0,0,0,0,-2.358490,15.0,1.0,341.0,43.0,48.0,20.0,1709.0
31868,2025-01-30,2025-01-30 15:30:00,world news: trump to appoint acting faa admini...,0,1,0,0,0,0,0,-4.166667,15.0,1.0,341.0,43.0,48.0,20.0,1709.0
31883,2025-01-30,2025-01-30 15:30:00,14 from figure skating community killed in pla...,0,1,0,0,0,0,0,-1.739927,15.0,1.0,341.0,43.0,48.0,20.0,1709.0
31884,2025-01-30,2025-01-30 15:30:00,kansas couple among 60 passengers on flight fr...,0,1,0,0,0,0,0,-3.510867,15.0,1.0,341.0,43.0,48.0,20.0,1709.0
31885,2025-01-30,2025-01-30 15:30:00,here are some of the deadliest plane crashes i...,1,1,0,0,0,1,0,-9.320695,15.0,1.0,341.0,43.0,48.0,20.0,1709.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147923,2025-01-31,2025-01-31 09:15:00,what to know about ntsb and investigation into...,0,1,0,0,0,0,0,-4.270463,2689.0,2709.0,2334.0,2673.0,2667.0,2640.0,131.0
147934,2025-01-31,2025-01-31 09:15:00,trump advocates for enhanced air safety and pr...,0,1,0,0,0,0,0,-1.571429,2689.0,2709.0,2334.0,2673.0,2667.0,2640.0,131.0
147947,2025-01-31,2025-01-31 09:15:00,what caused the midair collision over dc | fir...,0,1,0,0,0,0,0,-3.618818,2689.0,2709.0,2334.0,2673.0,2667.0,2640.0,131.0
147901,2025-01-31,2025-01-31 09:15:00,president trump confirms there were no survivo...,0,1,0,0,0,0,0,-3.272727,2689.0,2709.0,2334.0,2673.0,2667.0,2640.0,131.0


In [19]:
df_filter=df[
    (df['Southwest Airlines']==1) &
    (df['datetime'] >=datetime.datetime(2018, 4, 16)) &
    (df['datetime']<=datetime.datetime(2018, 4, 18))
    ]

df_filter.sort_values(by='datetime', inplace=True)
df_filter

Unnamed: 0,date,datetime,article_title,Alaska Airlines,American Airlines,Delta Air Lines,JetBlue,Southwest Airlines,United Airlines,Allegiant Air,Tone,event_rank_Alaska Airlines,event_rank_American Airlines,event_rank_Delta Air Lines,event_rank_JetBlue,event_rank_Southwest Airlines,event_rank_United Airlines,event_rank_Allegiant Air
342955,2018-04-16,2018-04-16 00:30:00,travel advice flights things you didnt know ab...,0,1,0,0,1,1,0,-0.530223,813.0,916.0,591.0,387.0,996.0,740.0,322.0
484531,2018-04-16,2018-04-16 02:15:00,things you didnt know about your flight number...,0,1,0,0,1,1,0,-0.710227,813.0,916.0,591.0,387.0,996.0,740.0,322.0
484658,2018-04-16,2018-04-16 02:15:00,travel advice things you didnt know about your...,0,1,0,0,1,1,0,-0.710227,813.0,916.0,591.0,387.0,996.0,740.0,322.0
366156,2018-04-16,2018-04-16 03:15:00,things you didnt know about your flight number...,0,1,0,0,1,1,0,-0.710227,813.0,916.0,591.0,387.0,996.0,740.0,322.0
583694,2018-04-16,2018-04-16 03:30:00,travel advice things you didnt know about your...,0,1,0,0,1,1,0,-0.710227,813.0,916.0,591.0,387.0,996.0,740.0,322.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
350239,2018-04-18,2018-04-18 00:00:00,opinion article 42a0 11e8 b606,0,0,0,0,1,0,0,-2.223634,1540.0,55.0,166.0,1494.0,453.0,2127.0,2688.0
350238,2018-04-18,2018-04-18 00:00:00,opinion article 42b5 11e8 8823,0,0,0,0,1,0,0,-1.470588,1540.0,55.0,166.0,1494.0,453.0,2127.0,2688.0
350237,2018-04-18,2018-04-18 00:00:00,world article 11f4 5817 8c3b,0,0,0,0,1,0,0,-0.475436,1540.0,55.0,166.0,1494.0,453.0,2127.0,2688.0
350235,2018-04-18,2018-04-18 00:00:00,article 42b5 11e8 980c bfda4,0,0,0,0,1,0,0,-2.872063,1540.0,55.0,166.0,1494.0,453.0,2127.0,2688.0


In [20]:
topics = ['Alaska Airlines','American Airlines','Delta Air Lines','JetBlue','Southwest Airlines','United Airlines','Allegiant Air']

for topic in topics:
    df[f'event_rank_{topic}'] = df[f'event_rank_{topic}'] * df[topic]
    df[f'event_rank_{topic}'] = np.where(df[f'event_rank_{topic}'] == 0, np.nan, df[f'event_rank_{topic}'])

In [None]:
titles = df['article_title'].value_counts()
titles.to_csv(r'../../Data/Processed/gdelt_article_titles.csv', index=True)
titles

article_title
sheriff naked man threatened to bomb florida airport                                  349
debris falls from plane during emergency landing near denver                          289
trump administration moves to block chinese airlines from us                          272
on the road again: travelers emerge in time for thanksgiving                          268
plane stolen by suicidal employee crashes near seattle                                257
                                                                                     ... 
article united passengers forced slide plane emergency exit smoke reported cabin        1
2018 jul 01 smoke in cabin forces evacuation of united plane                            1
one last storm for dallas county emergency management coordinator                       1
channellisting asp ch=205&ctime=3 6 2018%2010:00:00%20am&thistime=&thisday=             1
mechanical issues prompt 2 delta air lines flights to divert return to airport com    

In [22]:
# Reshape longer by topic
df_long = df.melt(id_vars=['date', 'article_title'], 
                  value_vars=[f'event_rank_{topic}' for topic in topics],
                  var_name='topic', value_name='event_rank')
df_long['article_title'].replace('', np.nan, inplace=True)
df_long.dropna(inplace=True)

df_long=df_long[df_long['event_rank']<=10]

# Group by date, article title, topic, and event rank, counting the number of articles
df_long = df_long.groupby(['topic', 'event_rank', 'date', 'article_title']).size().reset_index(name='article_count')

In [23]:
df_long.sort_values(by=['topic', 'event_rank', 'date', 'article_count'], ascending=[True,True,True,False], inplace=True)

# For each ['date', 'topic', 'event_rank'], keep the first 5 rows
df_long = df_long.groupby(['topic', 'event_rank', 'date']).head(5).reset_index(drop=True)
df_long['top_article_'] = df_long.groupby(['topic', 'event_rank', 'date']).cumcount() + 1

df_long

Unnamed: 0,topic,event_rank,date,article_title,article_count,top_article_
0,event_rank_Alaska Airlines,1.0,2018-08-11,plane stolen by suicidal employee crashes near...,257,1
1,event_rank_Alaska Airlines,1.0,2018-08-11,probe under way after suicidal employee steals...,140,2
2,event_rank_Alaska Airlines,1.0,2018-08-11,update plane stolen by suicidal employee crash...,122,3
3,event_rank_Alaska Airlines,1.0,2018-08-11,suicidal mechanic steals plane from seattle ai...,106,4
4,event_rank_Alaska Airlines,1.0,2018-08-11,plane crashes after unauthorized takeoff seatt...,64,5
...,...,...,...,...,...,...
341,event_rank_United Airlines,10.0,2018-03-14,united airlines could be forced to pay 25k in ...,103,1
342,event_rank_United Airlines,10.0,2018-03-14,offbeat united mistakenly flies kansas bound d...,78,2
343,event_rank_United Airlines,10.0,2018-03-14,dog death on united raises questions about its...,57,3
344,event_rank_United Airlines,10.0,2018-03-14,united mistakenly flies kansas bound dog to japan,30,4


In [24]:
# Reshape article titles to wide format using top_article_
df_wide = df_long.pivot_table(index=['topic', 'event_rank', 'date'], 
                               columns='top_article_', 
                               values='article_title', 
                               aggfunc='first').reset_index()

df_wide['topic'].replace('event_rank_','', inplace=True)
df_wide.rename(columns={
    'topic': 'airline',
    1: 'article_title_1', 
    2: 'article_title_2', 
    3: 'article_title_3', 
    4: 'article_title_4', 
    5: 'article_title_5'
    }, inplace=True)

df_wide

top_article_,airline,event_rank,date,article_title_1,article_title_2,article_title_3,article_title_4,article_title_5
0,event_rank_Alaska Airlines,1.0,2018-08-11,plane stolen by suicidal employee crashes near...,probe under way after suicidal employee steals...,update plane stolen by suicidal employee crash...,suicidal mechanic steals plane from seattle ai...,plane crashes after unauthorized takeoff seatt...
1,event_rank_Alaska Airlines,2.0,2024-01-06,faa orders grounding of certain boeing 737 max...,alaska airlines grounds 737 max 9 fleet after ...,us officials order grounding of boeing 737 9 m...,boeing faces new questions about the 737 max a...,alaska airlines flight 1282 returns to pdx aft...
2,event_rank_Alaska Airlines,3.0,2024-01-08,boeing jetliner that suffered inflight blowout...,before a door plug flew off a boeing plane an ...,united airlines finds loose bolts other proble...,investigators found the door plug that blew of...,united airlines found loose bolts on part of g...
3,event_rank_Alaska Airlines,4.0,2024-03-25,boeing ceo dave calhoun to step down part of a...,boeing chief executive dave calhoun to step down,boeing ceo to exit in broad management shakeup...,boeing ceo announces plan to step down,boeing ceo dave calhoun to step down after a y...
4,event_rank_Alaska Airlines,5.0,2023-10-23,an off duty pilot is accused of trying to shut...,an off duty pilot attempted to shut off the en...,off duty pilot charged with 83 counts of attem...,off duty pilot charged with 83 counts of attem...,off duty pilot attempted to take over alaska a...
...,...,...,...,...,...,...,...,...
65,event_rank_United Airlines,6.0,2025-03-21,heathrow airport flights resume as boss apolog...,heathrow airport closes after fire knocks out ...,global flight turmoil as fire closes london s ...,fire at london s heathrow airport knocks out p...,experts say trump official broke law by saying...
66,event_rank_United Airlines,7.0,2018-04-19,faa orders fan blade inspections after jet eng...,nation faa orders fan blade inspections after ...,southwest airlines sought more time for engine...,southwest airlines sought more time for engine...,texas faa orders fan blade inspections after j...
67,event_rank_United Airlines,8.0,2020-03-16,us airlines seek billions in aid as outbreak c...,stocks plunge as wall street white house see r...,asian stock markets us futures fall after fed ...,the latest: peru honduras enacting new contain...,the latest: peace corps evacuating volunteers ...
68,event_rank_United Airlines,9.0,2024-01-08,boeing jetliner that suffered inflight blowout...,united airlines finds loose bolts other proble...,united airlines found loose bolts on part of g...,mid air blowout jetliner had been restricted o...,united airlines found loose bolts other issues...


In [25]:
df_wide.to_csv('../../output/gdelt_top_events.csv', index=False)