In [1]:
import pandas as pd
import polars as pl
import numpy as np
import pickle, warnings, datetime, pytz
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler, QuantileTransformer, MinMaxScaler
warnings.filterwarnings('ignore')

In [2]:
with open('../../data/processed/gdelt_pca.pkl', 'rb') as f:
    pca_data = pickle.load(f)

In [3]:
# Determine which times to keep based on the stock data
with open(r"../../Data/Processed/stock_data_simple.pkl", 'rb') as f:
    stock_data = pickle.load(f)

stock_data['date'] = stock_data.index.date
days = list(stock_data['date'].unique())

In [4]:
with open(r'../../data/processed/gdelt_intermediate_cleaned.pkl', 'rb') as f:
    df = pickle.load(f)

In [5]:
# Drop duplicate rows
df.drop_duplicates(subset=['GKGRECORDID'], inplace=True)

df.index = df['GKGRECORDID']
df.drop(columns=['GKGRECORDID'], inplace=True)
df.drop(columns=['Positive Score','Negative Score','Activity Reference Density','Self/Group Reference Density'], inplace=True)
df.drop(columns=[i for i in df.columns if 'SCOREDVALUE' in i], inplace=True)
df.drop(columns=[i for i in df.columns if 'WORDCOUNT' in i], inplace=True)

In [6]:
df = pd.merge(df, pca_data, on='GKGRECORDID', how='inner')

In [7]:
#list(df.columns)

In [8]:
# Create topic-specific metrics columns
df['Article Count'] = 1

df['general'] = 1

topics  = ['general','Alaska Airlines','American Airlines','Delta Air Lines','JetBlue','Southwest Airlines','United Airlines','Allegiant Air']
metrics = ['Tone','Polarity','Word Count',
           'PCA_GKG1_0','PCA_GKG1_1','PCA_GKG1_2','PCA_GKG1_3','PCA_GKG1_4',
           'PCA_Scored_0','PCA_Scored_1','PCA_Scored_2','PCA_Scored_3','PCA_Scored_4',
           'PCA_Word_0','PCA_Word_1','PCA_Word_2','PCA_Word_3','PCA_Word_4',
           'Article Count']

for topic in topics:
    for metric in metrics:
        df[f'{metric}_{topic}'] = df[metric] * df[topic]

In [9]:
# Create a grouped dataframe, grouped by datetime, that creates a sum for each metric
df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')

grouped_df = df.groupby('datetime').agg(
    {f'{metric}_{topic}': ['sum'] for topic in topics for metric in metrics}
)

# Flatten the MultiIndex columns
grouped_df.columns = ['_'.join(col).strip().replace('_sum','') for col in grouped_df.columns.values]

In [10]:
# Fill in all missing times
start = datetime.datetime(2018, 1, 1, 0, 15, 0)
end   = datetime.datetime(2025, 5, 31, 23, 45, 0)
dates = pd.date_range(start=start, end=end, freq='15min')
grouped_df = grouped_df.reindex(dates).reset_index()
grouped_df = grouped_df.fillna(0)

grouped_df['datetime'] = grouped_df['index']
grouped_df.drop(columns=['index'], inplace=True)

In [11]:
# Convert from UTC to EST, accounting for daylight saving time
grouped_df['datetime'] = pd.to_datetime(grouped_df['datetime'], utc=True)
grouped_df['datetime_EST'] = grouped_df['datetime'].dt.tz_convert('America/New_York')
grouped_df['time'] = grouped_df['datetime_EST'].dt.time
grouped_df['date'] = grouped_df['datetime_EST'].dt.date

In [12]:
##### Handling after-hours articles #####

# Join with stock data
grouped_df['stock_time'] = np.where(grouped_df['date'].isin(days), grouped_df['datetime_EST'], pd.NaT)
# Limit times after 15:45 and before 9:15
grouped_df['stock_time'] = np.where(grouped_df['time'] > datetime.time(15,45,0), pd.NaT, grouped_df['stock_time'])
grouped_df['stock_time'] = np.where(grouped_df['time'] < datetime.time(9,30,0) , pd.NaT, grouped_df['stock_time'])
# format the stock_time column
grouped_df['stock_time'] = pd.to_datetime(grouped_df['stock_time'])
grouped_df = grouped_df.sort_values(by='datetime')
# Backfill the stock_time2 column
grouped_df['stock_time'] = grouped_df['stock_time'].ffill().bfill()
# Remove the timezone information 
grouped_df['stock_time'] = grouped_df['stock_time'].dt.tz_localize(None)
#grouped_df.drop(columns=['datetime', 'datetime_EST', 'time', 'date'], inplace=True)

In [13]:
# The choice of window here ultimately affects how much after-hours time should be counted towards market open
# For example, a 4-period window would mean that articles from 8:15 to 9:15 are counted towards the 9:30 period
windows = [4, 8, 16, 48, 96]

for window in windows:
    for topic in topics:
        for metric in metrics:
            grouped_df[f'{metric}_{topic}_{window}'] = grouped_df[f'{metric}_{topic}'].rolling(window, min_periods=1).sum()

In [14]:
# Now we keep the last of each stock time to remove duplicate values.
grouped_df = grouped_df.drop_duplicates(subset=['stock_time'], keep='last')
grouped_df.index = grouped_df['stock_time']
grouped_df.drop(columns=['stock_time'], inplace=True)
grouped_df.sort_index(inplace=True)
grouped_df.drop(columns=['datetime','datetime_EST','time','date'], inplace=True)

In [15]:
# Calculate the 1-period difference for each metric
for topic in topics:
    for metric in metrics:
        grouped_df[f'Change_{metric}_{topic}'] = grouped_df[f'{metric}_{topic}'].diff()

        for window in windows:
            grouped_df[f'Change_{metric}_{topic}_{window}'] = grouped_df[f'{metric}_{topic}_{window}'].diff()

In [16]:
# Export to pickle object
with open(r"../../Data/Processed/GDELT_Clean_202506221107.pkl", 'wb') as f:
    pickle.dump(grouped_df, f)

In [17]:
grouped_df

Unnamed: 0_level_0,Tone_general,Polarity_general,Word Count_general,PCA_GKG1_0_general,PCA_GKG1_1_general,PCA_GKG1_2_general,PCA_GKG1_3_general,PCA_GKG1_4_general,PCA_Scored_0_general,PCA_Scored_1_general,...,Change_PCA_Word_4_Allegiant Air_8,Change_PCA_Word_4_Allegiant Air_16,Change_PCA_Word_4_Allegiant Air_48,Change_PCA_Word_4_Allegiant Air_96,Change_Article Count_Allegiant Air,Change_Article Count_Allegiant Air_4,Change_Article Count_Allegiant Air_8,Change_Article Count_Allegiant Air_16,Change_Article Count_Allegiant Air_48,Change_Article Count_Allegiant Air_96
stock_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-02 09:30:00,-1.817716,20.837173,2207.0,22.468518,317.126348,175.023124,2161.893194,-265.476615,29.755018,24.701666,...,,,,,,,,,,
2018-01-02 09:45:00,-2.788845,2.788845,216.0,6.517353,34.632124,27.454499,209.678872,-33.632883,9.946373,7.431420,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2018-01-02 10:00:00,4.677223,12.279291,3476.0,12.908321,473.354621,259.780797,3411.741278,-394.964833,31.313982,25.871579,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2018-01-02 10:15:00,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2018-01-02 10:30:00,-3.405895,15.378354,1988.0,22.242029,287.764566,175.716664,1944.633961,-252.000665,41.053478,33.013992,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-30 14:45:00,-2.396514,14.379085,815.0,13.046407,122.728928,60.645547,798.200751,-94.780253,10.230589,8.212349,...,0.0,0.0,0.0,-24.205773,0.0,0.0,0.0,0.0,0.0,-1.0
2025-05-30 15:00:00,-8.199908,26.186365,2067.0,35.615020,318.460931,206.679338,2016.129717,-279.317178,50.278723,41.604274,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2025-05-30 15:15:00,6.850257,14.534555,1887.0,10.685230,279.945448,171.675645,1844.087711,-244.921550,43.975354,34.334142,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2025-05-30 15:30:00,-1.743739,11.682915,875.0,13.185398,135.403889,84.922880,853.746172,-116.384345,20.158243,16.422492,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# Daily data to identify events
grouped_df = df.groupby('datetime').agg(
    {f'{metric}_{topic}': ['sum'] for topic in topics for metric in metrics}
)

# Flatten the MultiIndex columns
grouped_df.columns = ['_'.join(col).strip().replace('_sum','') for col in grouped_df.columns.values]
# Fill in all missing times
start = datetime.datetime(2018, 1, 1, 0, 15, 0)
end   = datetime.datetime(2025, 5, 31, 23, 45, 0)
dates = pd.date_range(start=start, end=end, freq='15min')
grouped_df = grouped_df.reindex(dates).reset_index()
grouped_df = grouped_df.fillna(0)

grouped_df['datetime'] = grouped_df['index']
grouped_df.drop(columns=['index'], inplace=True)
grouped_df['datetime'] = pd.to_datetime(grouped_df['datetime'], utc=True)
grouped_df['datetime_EST'] = grouped_df['datetime'].dt.tz_convert('America/New_York')

grouped_df['date'] = grouped_df['datetime_EST'].dt.date

grouped_df = df.groupby('date').agg(
    {f'{metric}_{topic}': ['sum'] for topic in topics for metric in metrics}
)

# Flatten the MultiIndex columns
grouped_df.columns = ['_'.join(col).strip().replace('_sum','') for col in grouped_df.columns.values]

In [52]:
# Calculate the 1-period difference for each metric
for topic in topics:
    for metric in metrics:
        grouped_df[f'Change_{metric}_{topic}'] = grouped_df[f'{metric}_{topic}'].diff().fillna(0)

In [61]:
# For each topic, we identify events by lowest Change_Tone
for topic in topics:
    grouped_df[f'event_rank_{topic}'] = grouped_df[f'Change_Tone_{topic}'].rank(method='min')
    
rank_df = grouped_df[[f'event_rank_{topic}' for topic in topics]].copy()
rank_df['date'] = rank_df.index
rank_df.reset_index(drop=True, inplace=True)

In [62]:
rank_df

Unnamed: 0,event_rank_general,event_rank_Alaska Airlines,event_rank_American Airlines,event_rank_Delta Air Lines,event_rank_JetBlue,event_rank_Southwest Airlines,event_rank_United Airlines,event_rank_Allegiant Air,date
0,1324.0,1321.0,1315.0,1321.0,1307.0,1296.0,1297.0,1309.0,2018-01-01
1,1428.0,2197.0,758.0,1272.0,2236.0,2103.0,1089.0,2192.0,2018-01-02
2,118.0,20.0,478.0,369.0,248.0,599.0,1196.0,1220.0,2018-01-03
3,71.0,2478.0,32.0,405.0,84.0,505.0,1144.0,2565.0,2018-01-04
4,2636.0,2685.0,2685.0,2139.0,2636.0,1965.0,101.0,158.0,2018-01-05
...,...,...,...,...,...,...,...,...,...
2703,2089.0,2056.0,1832.0,729.0,1417.0,2379.0,2211.0,2124.0,2025-05-27
2704,797.0,348.0,754.0,545.0,916.0,2423.0,465.0,194.0,2025-05-28
2705,1821.0,2194.0,1701.0,2244.0,2586.0,195.0,2443.0,2132.0,2025-05-29
2706,1216.0,2036.0,688.0,688.0,190.0,2345.0,466.0,2363.0,2025-05-30


In [65]:
with open(r'../../data/processed/gdelt_intermediate_cleaned.pkl', 'rb') as f:
    df = pickle.load(f)

In [66]:
list(df.columns)

['GKGRECORDID',
 'V2SOURCECOMMONNAME',
 'V2DOCUMENTIDENTIFIER',
 'V1THEMES',
 'datetime',
 'date',
 'airplane',
 'airline',
 'airport',
 'Alaska Airlines',
 'American Airlines',
 'Delta Air Lines',
 'Frontier Airlines',
 'Hawaiian Airlines',
 'JetBlue',
 'Southwest Airlines',
 'Spirit Airlines',
 'Sun Country Airlines',
 'United Airlines',
 'Allegiant Air',
 'article_title',
 'Tone',
 'Positive Score',
 'Negative Score',
 'Polarity',
 'Activity Reference Density',
 'Self/Group Reference Density',
 'Word Count',
 'c1.1; WORDCOUNT; AESTHETIC',
 'c1.2; WORDCOUNT; ECONOMIC/UTILITARIAN',
 'c1.3; WORDCOUNT; LIFE SUPPORT',
 'c1.4; WORDCOUNT; MORAL/SPIRITUAL',
 'c2.1; WORDCOUNT; ABS',
 'c2.2; WORDCOUNT; AFFGAIN',
 'c2.3; WORDCOUNT; AFFLOSS',
 'c2.4; WORDCOUNT; AFFOTH',
 'c2.5; WORDCOUNT; AFFPT',
 'c2.6; WORDCOUNT; AFFTOT',
 'c2.7; WORDCOUNT; ANI',
 'c2.8; WORDCOUNT; ANOMIE',
 'c2.9; WORDCOUNT; ARENAS',
 'c2.10; WORDCOUNT; AUD',
 'c2.11; WORDCOUNT; AbsOther',
 'c2.12; WORDCOUNT; Acad',
 'c2.13;

In [None]:
df=df[['date', 'article_title', 
 'American Airlines',
 'Delta Air Lines',
 'Frontier Airlines',
 'JetBlue',
 'Southwest Airlines',
 'United Airlines',
 'Allegiant Air']].dropna()

In [67]:
# now merge the rank_df with the original df
df = pd.merge(df, rank_df, on='date', how='left')

In [68]:
topics = ['American Airlines','Delta Air Lines','Frontier Airlines','JetBlue','Southwest Airlines','United Airlines','Allegiant Air']

for topic in topics:
    df[f'event_rank_{topic}'] = df[f'event_rank_{topic}'] * df[topic]
    df[f'event_rank_{topic}'] = np.where(df[f'event_rank_{topic}'] == 0, np.nan, df[f'event_rank_{topic}'])

KeyError: 'event_rank_Frontier Airlines'