In [12]:
import pandas as pd
import polars as pl
import numpy as np
import pickle, warnings, datetime, pytz
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler, QuantileTransformer, MinMaxScaler
warnings.filterwarnings('ignore')

In [13]:
# Load the per-article LLM sentiment scores that are merged onto the GDELT frame below.
# NOTE(review): other cells spell this directory '../../Data/Processed' -- confirm the
# casing, since the two spellings are different paths on a case-sensitive filesystem.
with open('../../data/processed/gdelt_llm_sentiment.pkl', 'rb') as llm_file:
    llm_data = pickle.load(llm_file)

In [14]:
# The stock data decides which calendar dates are later treated as having stock bars.
with open(r"../../Data/Processed/stock_data_simple.pkl", 'rb') as stock_file:
    stock_data = pickle.load(stock_file)

# `.date` is read off the index, so the index is presumably a DatetimeIndex -- TODO confirm.
stock_data['date'] = stock_data.index.date
days = [day for day in stock_data['date'].unique()]

In [15]:
# Load the intermediate cleaned GDELT article frame.
# NOTE(review): the stock-data cell reads from '../../Data/Processed' -- confirm the casing.
with open(r'../../data/processed/gdelt_intermediate_cleaned.pkl', 'rb') as gdelt_file:
    df = pickle.load(gdelt_file)

In [16]:
# List every column of the loaded GDELT frame.
df.columns.tolist()

['GKGRECORDID',
 'V2SOURCECOMMONNAME',
 'V2DOCUMENTIDENTIFIER',
 'V1THEMES',
 'datetime',
 'date',
 'airplane',
 'airline',
 'airport',
 'Alaska Airlines',
 'American Airlines',
 'Delta Air Lines',
 'Frontier Airlines',
 'Hawaiian Airlines',
 'JetBlue',
 'Southwest Airlines',
 'Spirit Airlines',
 'Sun Country Airlines',
 'United Airlines',
 'Allegiant Air',
 'article_title',
 'Tone',
 'Positive Score',
 'Negative Score',
 'Polarity',
 'Activity Reference Density',
 'Self/Group Reference Density',
 'Word Count',
 'c1.1; WORDCOUNT; AESTHETIC',
 'c1.2; WORDCOUNT; ECONOMIC/UTILITARIAN',
 'c1.3; WORDCOUNT; LIFE SUPPORT',
 'c1.4; WORDCOUNT; MORAL/SPIRITUAL',
 'c2.1; WORDCOUNT; ABS',
 'c2.2; WORDCOUNT; AFFGAIN',
 'c2.3; WORDCOUNT; AFFLOSS',
 'c2.4; WORDCOUNT; AFFOTH',
 'c2.5; WORDCOUNT; AFFPT',
 'c2.6; WORDCOUNT; AFFTOT',
 'c2.7; WORDCOUNT; ANI',
 'c2.8; WORDCOUNT; ANOMIE',
 'c2.9; WORDCOUNT; ARENAS',
 'c2.10; WORDCOUNT; AUD',
 'c2.11; WORDCOUNT; AbsOther',
 'c2.12; WORDCOUNT; Acad',
 'c2.13;

In [17]:
# Move GKGRECORDID from a column into the index (the index keeps the name 'GKGRECORDID').
df = df.set_index('GKGRECORDID')

In [18]:
# Left-join the LLM sentiment onto every article by record id, then discard the join key.
df = (
    df.merge(llm_data, on='GKGRECORDID', how='left')
      .drop(columns=['GKGRECORDID'])
)

In [19]:
# Rename GDELT columns from airline name to stock ticker.
# The replacement is substring-based and applied in this order to every column name.
ticker_by_airline = {
    'Allegiant Air': 'ALGT',
    'Alaska Airlines': 'ALK',
    'United Airlines': 'UAL',
    'Delta Air Lines': 'DAL',
    'JetBlue': 'JBLU',
    'Southwest Airlines': 'LUV',
    'American Airlines': 'AAL',
}

renamed_columns = []
for column_name in df.columns:
    for airline_name, ticker in ticker_by_airline.items():
        column_name = column_name.replace(airline_name, ticker)
    renamed_columns.append(column_name)

df.columns = renamed_columns

In [20]:
# Remove the generic keyword flags and the airlines that were not renamed to a ticker above.
unused_columns = [
    'airplane', 'airline', 'airport',
    'Frontier Airlines', 'Hawaiian Airlines', 'Spirit Airlines', 'Sun Country Airlines',
]
df = df.drop(columns=unused_columns)

In [21]:
# Display the frame after the merge, rename and drop steps.
df

Unnamed: 0,V2SOURCECOMMONNAME,V2DOCUMENTIDENTIFIER,V1THEMES,datetime,date,ALK,AAL,DAL,JBLU,LUV,...,v42.3; SCOREDVALUE; fairness_p,v42.4; SCOREDVALUE; loyalty_p,v42.5; SCOREDVALUE; authority_p,v42.6; SCOREDVALUE; sanctity_p,v42.7; SCOREDVALUE; care_sent,v42.8; SCOREDVALUE; fairness_sent,v42.9; SCOREDVALUE; loyalty_sent,v42.10; SCOREDVALUE; authority_sent,v42.11; SCOREDVALUE; sanctity_sent,llm_sentiment
0,aero-news.net,http://aero-news.net/index.cfm?do=main.textpos...,TAX_WORLDREPTILES;TAX_WORLDREPTILES_SIDEWINDER...,2023-02-21 03:00:00,2023-02-21,0,1,0,0,0,...,0.094112,0.084485,0.096285,0.065091,-0.117746,-0.047186,-0.009921,-0.062596,-0.075467,0.394921
1,houstonchronicle.com,http://www.houstonchronicle.com/news/politics/...,TAX_FNCACT;TAX_FNCACT_OFFICIALS;LEADER;USPEC_P...,2025-01-17 23:45:00,2025-01-17,0,1,0,0,1,...,0.075865,0.081348,0.085938,0.067908,-0.102076,-0.046182,-0.011943,-0.043249,-0.057695,-0.113300
2,itechpost.com,http://www.itechpost.com/articles/122528/20240...,TAX_FNCACT;TAX_FNCACT_EMPLOYEE;EPU_POLICY;EPU_...,2024-05-27 09:30:00,2024-05-27,0,0,0,0,0,...,0.101007,0.088567,0.090690,0.078992,-0.115803,-0.063174,-0.039071,-0.037974,-0.090853,-0.510247
3,1025kiss.com,https://1025kiss.com/ixp/175/p/lukas-first-gam...,,2025-04-10 17:15:00,2025-04-10,0,1,0,0,0,...,0.092551,0.084106,0.084379,0.078339,-0.094771,-0.025998,0.001789,-0.025604,-0.013816,0.216887
4,710keel.com,https://710keel.com/ixp/182/p/spirit-airlines-...,ECON_DEBT;WB_1104_MACROECONOMIC_VULNERABILITY_...,2024-11-20 13:15:00,2024-11-20,0,0,0,0,0,...,0.106656,0.100371,0.092308,0.088900,-0.050954,-0.023068,0.021486,0.009874,-0.006657,-1.209943
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1121051,wsiu.org,https://www.wsiu.org/2022-12-16/they-flew-to-n...,TAX_WORLDMAMMALS;TAX_WORLDMAMMALS_DOG;TAX_ETHN...,2022-12-16 14:00:00,2022-12-16,0,0,0,0,0,...,0.093640,0.083392,0.071799,0.066122,-0.100614,-0.043287,-0.001136,-0.007722,-0.044429,0.296710
1121052,wtol.com,https://www.wtol.com/article/news/nation-world...,MANMADE_DISASTER_IMPLIED;DELAY;USPEC_UNCERTAIN...,2019-07-19 01:45:00,2019-07-19,0,1,0,0,1,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.774252
1121053,wwltv.com,https://www.wwltv.com/article/news/nation-worl...,MANMADE_DISASTER_IMPLIED;DELAY;USPEC_UNCERTAIN...,2019-07-19 01:45:00,2019-07-19,0,1,0,0,1,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.760821
1121054,yahoo.com,https://www.yahoo.com/news/delta-apos-cracking...,UNREST_CRACKDOWN;TAX_WORLDBIRDS;TAX_WORLDBIRDS...,2018-01-20 06:30:00,2018-01-20,0,1,1,0,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.396778


In [None]:
# Build one column per (topic, metric) pair: the metric multiplied by the topic column.
df['Article Count'] = 1   # summing this later gives the number of articles
df['general'] = 1         # all-ones topic, so general_<metric> equals the raw metric

topics  = ['general','AAL','ALGT','ALK','DAL','JBLU','LUV','UAL']

# Every remaining column that is neither a topic nor an identifier/text field is a metric.
non_metric_columns = set(topics) | {
    'V2SOURCECOMMONNAME', 'V2DOCUMENTIDENTIFIER', 'V1THEMES', 'datetime', 'date', 'article_title',
}
metrics = [column_name for column_name in df.columns if column_name not in non_metric_columns]

# NOTE(review): the topic columns look like 0/1 mention flags in the frame preview -- confirm.
for topic, metric in [(t, m) for t in topics for m in metrics]:
    df[f'{topic}_{metric}'] = df[metric] * df[topic]

In [56]:
# Interpret the timestamps as UTC, convert them to US Eastern time (the tz database
# handles daylight saving), then strip the timezone so the values are naive local times.
df['datetime'] = (
    pd.to_datetime(df['datetime'], utc=True)
      .dt.tz_convert('America/New_York')
      .dt.tz_localize(None)
)

In [57]:
# Check the range of the converted timestamps.
df['datetime'].describe()

count                          1121056
mean     2021-05-16 18:00:53.125446400
min                2017-12-31 19:15:00
25%                2019-04-10 18:15:00
50%                2021-01-29 12:30:00
75%                2023-07-24 21:30:00
max                2025-05-31 19:45:00
Name: datetime, dtype: object

In [58]:
# Create a grouped dataframe with one row per distinct datetime, holding the sum of
# every topic-specific metric over the articles that share that timestamp.
#
# The columns are selected and summed directly, so the result keeps the original
# '<topic>_<metric>' names. The previous version aggregated with ['sum'], flattened the
# resulting MultiIndex to '<name>_sum' and then called .replace('_sum', ''), which
# removes *every* '_sum' occurrence and would mangle any metric whose own name
# contains that text.
topic_metric_columns = [f'{topic}_{metric}' for topic in topics for metric in metrics]
grouped_df = df.groupby('datetime')[topic_metric_columns].sum()

In [59]:
# Rows = distinct timestamps that had at least one article; columns = topic x metric sums.
grouped_df.shape

(214303, 144)

In [60]:
# Put the data on a complete 15-minute grid between the first and last observed timestamp.
# Slots with no articles become rows of zeros; the index ends up named 'index'.
start = grouped_df.index.min()
end   = grouped_df.index.max()
dates = pd.date_range(start=start, end=end, freq='15min')
grouped_df = (
    grouped_df.reindex(dates)
              .reset_index()
              .fillna(0)
              .set_index('index')
)

In [61]:
grouped_df.shape # filling in added 45,668 rows (259,971 - 214,303), or 21.3%.
# Hard to say whether these times were missing because GDELT was down, or I didn't scrape them, or there were just no articles.

(259971, 144)

In [62]:
# Save the full 15-minute grid (all hours, before any mapping to stock bars) as a pickle.
with open(r"../../Data/Processed/GDELT_all_hours_20250706.pkl", 'wb') as out_file:
    pickle.dump(grouped_df, out_file)

In [63]:
# Expose the index as a column, plus its time-of-day and calendar-date parts,
# which the after-hours handling filters on.
grouped_df = grouped_df.assign(
    datetime=grouped_df.index,
    time=lambda frame: frame['datetime'].dt.time,
    date=lambda frame: frame['datetime'].dt.date,
)

In [64]:
##### Handling after-hours articles #####

# A row counts as a stock bar when its date appears in the stock data and its
# time of day lies between 09:30 and 15:45 inclusive.
FIRST_BAR = datetime.time(9, 30, 0)
LAST_BAR = datetime.time(15, 45, 0)

# The fill below is order-dependent, so sort chronologically first.
grouped_df = grouped_df.sort_values(by='datetime')

on_trading_day = grouped_df['date'].isin(days)
in_session = (grouped_df['time'] >= FIRST_BAR) & (grouped_df['time'] <= LAST_BAR)

# In-session rows keep their own timestamp; every other row starts as NaT.
# Series.where keeps the datetime64 dtype, so no np.where/pd.to_datetime round-trip is needed.
grouped_df['stock_time'] = grouped_df['datetime'].where(on_trading_day & in_session)

# Backfill: after-hours, weekend and holiday rows are assigned to the NEXT bar.
# The previous `.ffill().bfill()` assigned them to the PREVIOUS 15:45 bar instead; the
# later `drop_duplicates(subset=['stock_time'], keep='last')` then kept the next
# morning's pre-open row under the 15:45 label, i.e. values (and rolling sums) built
# from articles published after that bar -- look-ahead leakage.
grouped_df['stock_time'] = grouped_df['stock_time'].bfill()

# Rows after the final bar have no later bar to belong to; drop them rather than
# folding future articles into the last bar.
grouped_df = grouped_df.dropna(subset=['stock_time'])

In [65]:
# The choice of window decides how much after-hours news is carried into a bar.
# Each window is a trailing sum over consecutive 15-minute rows that INCLUDES the current
# row, so a 4-period window at the 09:30 row sums the rows stamped 08:45, 09:00, 09:15
# and 09:30. min_periods=1 lets the first rows of the series use a shorter window.
windows = [4, 8, 16, 48, 96]

base_columns = [f'{topic}_{metric}' for topic in topics for metric in metrics]

# One frame-level rolling sum per window, concatenated in a single step. Inserting the
# len(windows) * len(base_columns) columns one at a time fragments the DataFrame.
# Column order matches the per-column loop: window, then topic, then metric.
rolling_sums = [
    grouped_df[base_columns]
    .rolling(window, min_periods=1)
    .sum()
    .add_suffix(f'_cum{window:02d}')
    for window in windows
]
grouped_df = pd.concat([grouped_df, *rolling_sums], axis=1)

In [66]:
# Collapse to one row per stock_time by keeping the last row assigned to it, then use
# stock_time as the (sorted) index and discard the helper columns.
grouped_df = (
    grouped_df.drop_duplicates(subset=['stock_time'], keep='last')
              .set_index('stock_time')
              .sort_index()
              .drop(columns=['datetime', 'time', 'date'])
)

In [67]:
# Display the frame now indexed by stock_time, with the rolling-sum columns attached.
grouped_df

Unnamed: 0_level_0,general_Tone,general_llm_sentiment,general_Article Count,general_PCA_GKG1_0,general_PCA_GKG1_1,general_PCA_GKG1_2,general_PCA_GKG1_3,general_PCA_GKG1_4,general_PCA_Scored_0,general_PCA_Scored_1,...,UAL_PCA_Scored_0_cum96,UAL_PCA_Scored_1_cum96,UAL_PCA_Scored_2_cum96,UAL_PCA_Scored_3_cum96,UAL_PCA_Scored_4_cum96,UAL_PCA_Word_0_cum96,UAL_PCA_Word_1_cum96,UAL_PCA_Word_2_cum96,UAL_PCA_Word_3_cum96,UAL_PCA_Word_4_cum96
stock_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-02 09:30:00,-1.817716,-0.574090,3.0,20.928461,314.198528,184.263505,2161.958451,-262.280734,30.240983,24.168861,...,296.644358,230.776833,19.770266,255.457946,145.257647,31878.437401,5395.433711,3170.691516,-218.338234,3443.242317
2018-01-02 09:45:00,-2.788845,-1.150778,1.0,6.438574,34.459601,28.269778,209.648900,-33.335329,10.090419,7.275720,...,296.644358,230.776833,19.770266,255.457946,145.257647,31878.437401,5395.433711,3170.691516,-218.338234,3443.242317
2018-01-02 10:00:00,4.677223,1.067656,3.0,10.364527,468.495226,274.775018,3411.814875,-390.020538,31.814957,25.317333,...,317.620089,247.421772,20.449476,273.896623,154.999165,33733.821722,5674.056970,3401.595421,-149.834376,3624.465761
2018-01-02 10:15:00,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,317.620089,247.421772,20.449476,273.896623,154.999165,33733.821722,5674.056970,3401.595421,-149.834376,3624.465761
2018-01-02 10:30:00,-3.405895,1.328670,4.0,20.986363,285.313838,183.948113,1944.611213,-249.180762,41.692714,32.305938,...,327.457368,255.247980,20.900756,282.679744,160.624964,34049.990947,5721.248323,3421.839964,-170.672868,3650.625422
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-30 14:45:00,-2.396514,0.743808,1.0,12.435765,121.617276,64.003250,798.263500,-93.551333,10.387594,8.042936,...,871.031328,682.294241,87.268641,748.704280,434.186372,25734.952716,4670.984966,3640.028807,-1111.674059,2683.027911
2025-05-30 15:00:00,-8.199908,-0.808221,5.0,34.454763,316.187074,214.925591,2016.049915,-276.383253,51.085058,40.749595,...,881.556164,690.972975,87.490736,756.582410,439.080441,26261.705192,4790.516343,3716.155846,-1134.817867,2748.263207
2025-05-30 15:15:00,6.850257,0.920541,4.0,9.526906,277.663575,179.446893,1844.047179,-242.288157,44.629078,33.619337,...,879.581953,690.025637,87.159848,755.407498,440.098110,26556.478060,4836.904590,3764.062970,-1131.967260,2769.251344
2025-05-30 15:30:00,-1.743739,0.065327,2.0,12.673848,134.412796,88.426032,853.723909,-115.137996,20.480206,16.079135,...,838.819349,657.724551,83.897721,720.019353,419.724746,25943.030603,4726.544504,3684.352318,-1109.281325,2704.605049


In [68]:
# First difference (current row minus previous row) of every raw metric and of every
# rolling-sum variant; the first row of each new column is NaN.
level_suffixes = [''] + [f'_cum{window:02d}' for window in windows]

for topic in topics:
    for metric in metrics:
        for suffix in level_suffixes:
            source_column = f'{topic}_{metric}{suffix}'
            grouped_df[f'{topic}_Change_{metric}{suffix}'] = grouped_df[source_column].diff()

In [69]:
# Reorder the columns by Python string sort order (uppercase sorts before lowercase).
ordered_columns = sorted(grouped_df.columns)
grouped_df = grouped_df.loc[:, ordered_columns]

In [70]:
# List the column names after sorting.
grouped_df.columns.to_list()

['AAL_Article Count',
 'AAL_Article Count_cum04',
 'AAL_Article Count_cum08',
 'AAL_Article Count_cum16',
 'AAL_Article Count_cum48',
 'AAL_Article Count_cum96',
 'AAL_Change_Article Count',
 'AAL_Change_Article Count_cum04',
 'AAL_Change_Article Count_cum08',
 'AAL_Change_Article Count_cum16',
 'AAL_Change_Article Count_cum48',
 'AAL_Change_Article Count_cum96',
 'AAL_Change_PCA_GKG1_0',
 'AAL_Change_PCA_GKG1_0_cum04',
 'AAL_Change_PCA_GKG1_0_cum08',
 'AAL_Change_PCA_GKG1_0_cum16',
 'AAL_Change_PCA_GKG1_0_cum48',
 'AAL_Change_PCA_GKG1_0_cum96',
 'AAL_Change_PCA_GKG1_1',
 'AAL_Change_PCA_GKG1_1_cum04',
 'AAL_Change_PCA_GKG1_1_cum08',
 'AAL_Change_PCA_GKG1_1_cum16',
 'AAL_Change_PCA_GKG1_1_cum48',
 'AAL_Change_PCA_GKG1_1_cum96',
 'AAL_Change_PCA_GKG1_2',
 'AAL_Change_PCA_GKG1_2_cum04',
 'AAL_Change_PCA_GKG1_2_cum08',
 'AAL_Change_PCA_GKG1_2_cum16',
 'AAL_Change_PCA_GKG1_2_cum48',
 'AAL_Change_PCA_GKG1_2_cum96',
 'AAL_Change_PCA_GKG1_3',
 'AAL_Change_PCA_GKG1_3_cum04',
 'AAL_Change_PCA_G

In [72]:
# Calculate lags 1..N_LAGS of the raw change and of the 16-period rolling-sum change
# for every topic/metric pair.
# NOTE(review): 26 is presumably the number of 15-minute bars from 09:30 to 15:45
# inclusive, i.e. one full session of history -- confirm.
N_LAGS = 26

# Collect the shifted series first and attach them with a single concat. Assigning the
# len(topics) * len(metrics) * N_LAGS * 2 columns one at a time fragments the DataFrame.
# Insertion order matches the per-column loop, so the resulting column order is unchanged.
lagged_columns = {}
for topic in topics:
    for metric in metrics:
        change = grouped_df[f'{topic}_Change_{metric}']
        change_cum16 = grouped_df[f'{topic}_Change_{metric}_cum16']
        for i in range(1, N_LAGS + 1):
            lagged_columns[f'{topic}_Change_{metric}_lag{i:02d}'] = change.shift(i)
            lagged_columns[f'{topic}_Change_{metric}_cum16_lag{i:02d}'] = change_cum16.shift(i)

grouped_df = pd.concat([grouped_df, pd.concat(lagged_columns, axis=1)], axis=1)

In [73]:
# Final size: one row per stock_time; columns = levels, rolling sums, changes and lags.
grouped_df.shape

(48438, 9216)

In [74]:
# Save the finished feature frame (indexed by stock_time) as a pickle.
with open(r"../../Data/Processed/GDELT_Clean_202507062100.pkl", 'wb') as out_file:
    pickle.dump(grouped_df, out_file)