In [5]:
import pandas as pd
import numpy as np
import pickle, warnings, datetime, gc, ctypes
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
warnings.filterwarnings('ignore')

In [6]:
def downcast(df, verbose = True):
    """Shrink the numeric columns of ``df`` to the smallest dtype that holds their values.

    The frame is modified in place and the same object is returned, so both
    ``downcast(df)`` and ``df = downcast(df)`` work.

    Rules applied per column:
      * ``bool``                          -> ``int8``
      * integer dtypes, or floats whose
        values are all whole numbers      -> smallest signed integer dtype
      * remaining numeric columns         -> smallest float dtype
      * anything that is not numeric
        (object/str, datetime, timedelta,
        category, ...)                    -> left untouched

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to compress (mutated in place).
    verbose : bool, default True
        When true, print the percentage of memory saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with downcast columns.
    """
    start_memory = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        column = df[col]
        dtype = column.dtype
        if dtype.name == "bool":
            df[col] = column.astype("int8")
        elif pd.api.types.is_bool_dtype(dtype) or not pd.api.types.is_numeric_dtype(dtype):
            # Only plain numeric data is safe to hand to pd.to_numeric: timedelta values
            # would be silently converted to integers, and strings / categories raise.
            continue
        elif pd.api.types.is_integer_dtype(dtype) or (column.round() == column).all():
            # Whole-valued floats are stored as integers too; columns containing NaN never
            # take this branch because NaN != NaN.
            df[col] = pd.to_numeric(column, downcast = "integer")
        else:
            df[col] = pd.to_numeric(column, downcast = "float")
    end_memory = df.memory_usage().sum() / 1024 ** 2

    if verbose:
        # Guard the division so a frame that occupies no memory reports 0% instead of nan.
        saved = 100 * (start_memory - end_memory) / start_memory if start_memory else 0.0
        print("{:.1f}% compressed".format(saved))

    return df

In [7]:
# Load the pickled LLM sentiment table and shrink its numeric dtypes straight away.
llm_pickle_path = '../../data/processed/gdelt_llm_sentiment_finance.pkl'
with open(llm_pickle_path, 'rb') as handle:
    llm_data = downcast(pickle.load(handle), verbose = True)

0.0% compressed


In [8]:
# The stock data decides which timestamps (and which calendar days) are kept later on.
# NOTE(review): this path is spelled `Data/Processed`, while the GDELT pickles are read from
# `data/processed`; on a case-sensitive filesystem only one spelling can exist — confirm.
with open(r"../../Data/Processed/stock_data.pkl", 'rb') as handle:
    stock_data = pickle.load(handle)

# Calendar day of every stock row, taken from the (datetime) index.
stock_data['date'] = stock_data.index.date

# Distinct trading days and distinct stock timestamps, both as plain Python lists.
days = stock_data['date'].unique().tolist()
stock_times = stock_data.index.unique().tolist()

In [9]:
# Load the cleaned intermediate GDELT table and shrink its numeric dtypes straight away.
gdelt_pickle_path = r'../../data/processed/gdelt_intermediate_cleaned_finance.pkl'
with open(gdelt_pickle_path, 'rb') as handle:
    df = downcast(pickle.load(handle), verbose = True)

1.5% compressed


In [10]:
# Key both tables on the GDELT record id so they can be joined on their index;
# the id column itself is removed from the data columns.
for frame in (df, llm_data):
    frame.set_index('GKGRECORDID', inplace=True)

In [11]:
# Renaming GDELT columns from airline name to stock ticker.
# The replacements are applied in this order to every column label.
airline_to_ticker = {
    'Allegiant Air': 'ALGT',
    'Alaska Airlines': 'ALK',
    'United Airlines': 'UAL',
    'Delta Air Lines': 'DAL',
    'JetBlue': 'JBLU',
    'Southwest Airlines': 'LUV',
    'American Airlines': 'AAL',
}

renamed_columns = []
for label in df.columns:
    for airline_name, ticker in airline_to_ticker.items():
        label = label.replace(airline_name, ticker)
    renamed_columns.append(label)
df.columns = renamed_columns

In [12]:
# Columns that are not used further down: the airplane/airline/airport columns, the airlines
# that were not mapped to a ticker, and the article title / source / theme fields.
unused_columns = [
    'airplane', 'airline', 'airport',
    'Frontier Airlines', 'Hawaiian Airlines', 'Spirit Airlines', 'Sun Country Airlines',
    'article_title', 'V2SOURCECOMMONNAME', 'V2DOCUMENTIDENTIFIER', 'V1THEMES',
]
df.drop(columns=unused_columns, inplace=True)

In [13]:
# Attach the LLM columns to every GDELT record (left join on the shared GKGRECORDID index);
# records without a match in llm_data keep NaN in the added columns.
df = df.merge(llm_data, how='left', left_index=True, right_index=True)

In [14]:
# Preview the merged article-level frame (GDELT fields plus the joined llm_data columns).
df

Unnamed: 0_level_0,datetime,date,ALK,AAL,DAL,JBLU,LUV,UAL,ALGT,Tone,...,llm_dimension_22,llm_dimension_23,llm_dimension_24,llm_dimension_25,llm_dimension_26,llm_dimension_27,llm_dimension_28,llm_dimension_29,llm_dimension_30,llm_dimension_31
GKGRECORDID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20250410171500-1578,2025-04-10 17:15:00,2025-04-10,0,0,0,0,0,0,0,-1.745636,...,-0.010216,0.135376,0.033295,-0.012817,-0.023117,0.036926,0.038605,0.010941,0.068909,-0.042053
20231120234500-1109,2023-11-20 23:45:00,2023-11-20,0,1,0,0,1,0,0,-1.364366,...,-0.067932,0.133667,0.050476,-0.038666,-0.074524,0.066895,-0.009491,-0.007118,0.009331,-0.028763
20230421174500-864,2023-04-21 17:45:00,2023-04-21,0,0,1,0,0,0,0,1.260193,...,-0.054932,0.145386,0.051971,-0.069824,-0.060211,0.079590,0.012947,-0.017487,-0.023911,-0.008911
20230828170000-1294,2023-08-28 17:00:00,2023-08-28,0,1,0,0,0,0,0,-4.290429,...,-0.050171,0.096069,0.057007,-0.040405,-0.008644,0.082397,0.024139,-0.025940,0.007011,-0.032532
20240423193000-1659,2024-04-23 19:30:00,2024-04-23,0,0,0,1,0,0,0,2.833827,...,-0.084656,0.098572,0.055573,-0.102661,-0.059967,0.073792,-0.020508,-0.039124,0.009949,-0.056366
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20200302183000-410,2020-03-02 18:30:00,2020-03-02,0,0,1,0,0,0,0,3.413401,...,-0.052216,0.153198,0.021896,-0.069946,-0.074707,0.061127,0.015854,-0.015793,0.012238,-0.003254
20220817191500-372,2022-08-17 19:15:00,2022-08-17,0,1,0,0,0,0,0,0.675676,...,-0.066467,0.128906,-0.020569,-0.058014,-0.037811,0.070862,0.001449,0.022766,-0.002035,-0.002810
20220516161500-649,2022-05-16 16:15:00,2022-05-16,0,0,0,1,0,0,0,-4.444445,...,-0.058075,0.083252,0.049774,-0.085938,-0.044006,0.065308,0.007671,-0.006226,0.030716,-0.014923
20220516161500-83,2022-05-16 16:15:00,2022-05-16,0,0,0,1,0,0,0,-4.302926,...,-0.058075,0.083252,0.049774,-0.085938,-0.044006,0.065308,0.007671,-0.006226,0.030716,-0.014923


In [15]:
# Create topic-specific metrics columns
# Every row counts as one article; summing this column later yields article counts.
df['Article Count'] = 1

topics  = ['AAL','ALGT','ALK','DAL','JBLU','LUV','UAL']
# Metrics = the llm_* columns, 'Article Count', 'Tone', and every column starting with "v" or "c".
metrics = [i for i in df.columns if 'llm' in i or i in ['Article Count','Tone'] or i.startswith("v") or i.startswith("c")]

# Multiply each metric by each ticker column (the ticker columns appear to be 0/1 flags in the
# preview above), giving one '<ticker>_<metric>' column per pair.
# All products are collected first and attached with a single concat: inserting thousands of
# columns one at a time fragments the frame and gets slower with every insert.
topic_metric_columns = {
    f'{topic}_{metric}': df[metric] * df[topic]
    for topic in topics
    for metric in metrics
}
df = pd.concat([df, pd.DataFrame(topic_metric_columns)], axis=1)

In [16]:
# Interpret the timestamps as UTC and express them in US Eastern wall-clock time
# ('America/New_York' handles the switch between standard and daylight saving time).
eastern_time = pd.to_datetime(df['datetime'], utc=True).dt.tz_convert('America/New_York')

# Store the result as naive (timezone-free) timestamps.
df['datetime'] = eastern_time.dt.tz_localize(None)

In [17]:
# Sanity check: count and range of the converted article timestamps.
df['datetime'].describe()

count                           121655
mean     2021-07-04 18:20:29.342813696
min                2017-12-31 21:00:00
25%                2019-10-31 09:37:30
50%                2021-03-17 12:45:00
75%                2023-06-01 19:45:00
max                2025-05-31 14:15:00
Name: datetime, dtype: object

In [18]:
# Create a grouped dataframe, grouped by datetime, that holds the sum of every
# '<ticker>_<metric>' column per timestamp.
topic_metric_names = [f'{topic}_{metric}' for topic in topics for metric in metrics]

# Selecting the columns and calling .sum() keeps the original flat column names. The previous
# approach aggregated into ('name', 'sum') pairs and then stripped '_sum' with str.replace,
# which would also delete any '_sum' that is part of a metric name.
grouped_df = df.groupby('datetime')[topic_metric_names].sum()

In [19]:
# Fill in all missing times
current_times = grouped_df.index.tolist()
to_add = list(set(stock_times) - set(current_times)) # fast way to figure out which stock times are missing

# Stock timestamps without any article get an all-zero row. The zeros are created directly
# instead of building an empty object frame and calling fillna(0): that relied on fillna
# downcasting object columns to numbers, which pandas 2.2 deprecates.
df_to_add = pd.DataFrame(0, index=to_add, columns=grouped_df.columns)

# sort=True orders the columns alphabetically; the appended rows land at the end, so the
# frame is not in chronological order after this step.
grouped_df = pd.concat([grouped_df, df_to_add], sort=True)

In [20]:
##### Handling after-hours articles #####
grouped_df['datetime'] = grouped_df.index
grouped_df['time'] = grouped_df['datetime'].dt.time
grouped_df['date'] = grouped_df['datetime'].dt.date

# A row maps onto itself only if it falls on a day present in the stock data and its
# time of day lies between 09:30 and 15:45 (both inclusive).
first_bar = datetime.time(9, 30, 0)
last_bar  = datetime.time(15, 45, 0)
is_stock_bar = (
    grouped_df['date'].isin(days)
    & (grouped_df['time'] >= first_bar)
    & (grouped_df['time'] <= last_bar)
)

# Every other row starts out as NaT; Series.where keeps the column datetime64 throughout.
grouped_df['stock_time'] = grouped_df['datetime'].where(is_stock_bar)

grouped_df = grouped_df.sort_values(by='datetime')

# Forward-fill: rows outside those bars inherit the most recent stock_time before them.
# Back-fill afterwards only affects rows that precede the very first stock_time.
grouped_df['stock_time'] = grouped_df['stock_time'].ffill().bfill()

In [21]:
# Preview the per-timestamp sums together with the stock_time each row was mapped to.
grouped_df

Unnamed: 0,AAL_Article Count,AAL_Tone,AAL_c16.60; WORDCOUNT; finance,AAL_c18.121; WORDCOUNT; UNSAFE_WORK_ENVIRONMENT,AAL_c18.137; WORDCOUNT; TRIAL,AAL_c18.154; WORDCOUNT; ECON_MONOPOLY,AAL_c18.157; WORDCOUNT; AVIATION_INCIDENT,AAL_c18.164; WORDCOUNT; CORRUPTION,AAL_c18.178; WORDCOUNT; ECON_ENTREPRENEURSHIP,AAL_c18.187; WORDCOUNT; ECON_SUBSIDIES,...,UAL_v42.4; SCOREDVALUE; loyalty_p,UAL_v42.5; SCOREDVALUE; authority_p,UAL_v42.6; SCOREDVALUE; sanctity_p,UAL_v42.7; SCOREDVALUE; care_sent,UAL_v42.8; SCOREDVALUE; fairness_sent,UAL_v42.9; SCOREDVALUE; loyalty_sent,datetime,time,date,stock_time
2017-12-31 21:00:00,0,0.000000,0,0,0,0,0,0,0,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2017-12-31 21:00:00,21:00:00,2017-12-31,2018-01-02 09:30:00
2018-01-01 01:00:00,0,0.000000,0,0,0,0,0,0,0,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2018-01-01 01:00:00,01:00:00,2018-01-01,2018-01-02 09:30:00
2018-01-01 02:30:00,0,0.000000,0,0,0,0,0,0,0,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2018-01-01 02:30:00,02:30:00,2018-01-01,2018-01-02 09:30:00
2018-01-01 03:00:00,1,0.503778,32,0,0,0,0,0,0,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2018-01-01 03:00:00,03:00:00,2018-01-01,2018-01-02 09:30:00
2018-01-01 04:15:00,0,0.000000,0,0,0,0,0,0,0,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2018-01-01 04:15:00,04:15:00,2018-01-01,2018-01-02 09:30:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-31 09:15:00,0,0.000000,0,0,0,0,0,0,0,0,...,0.083855,0.101553,0.066787,-0.183178,-0.065557,-0.078441,2025-05-31 09:15:00,09:15:00,2025-05-31,2025-05-30 15:45:00
2025-05-31 10:00:00,0,0.000000,0,0,0,0,0,0,0,0,...,0.077963,0.080744,0.059449,-0.069236,0.023888,0.031877,2025-05-31 10:00:00,10:00:00,2025-05-31,2025-05-30 15:45:00
2025-05-31 10:15:00,0,0.000000,0,0,0,0,0,0,0,0,...,0.158487,0.161498,0.121096,-0.144227,0.049961,0.066167,2025-05-31 10:15:00,10:15:00,2025-05-31,2025-05-30 15:45:00
2025-05-31 12:15:00,0,0.000000,0,0,0,0,0,0,0,0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2025-05-31 12:15:00,12:15:00,2025-05-31,2025-05-30 15:45:00


In [22]:
# The choice of window here ultimately affects how much after-hours time should be counted towards market open
# For example, a 4-period window would mean that articles from 8:15 to 9:15 are counted towards the 9:30 period
# NOTE(review): rolling(window) counts ROWS, not minutes. Rows outside stock times exist only
# for timestamps that have articles, so a 4-row window can span more than one hour there.
# Behaviour is left unchanged; confirm whether a time-based window was intended.
windows = [4, 16, 48, 96]

helper_columns  = ['datetime', 'date', 'time', 'stock_time']
feature_columns = [j for j in grouped_df.columns if j not in helper_columns]

# One frame-level rolling sum per window instead of one column insert per (column, window)
# pair: same values, without fragmenting the frame through thousands of inserts.
rolled = pd.concat(
    [
        grouped_df[feature_columns]
        .rolling(window, min_periods=1)
        .sum()
        .add_suffix(f'_cum{window:02d}')
        for window in windows
    ],
    axis=1,
)

# Keep the column order of the original loop: for each feature, all of its windows.
cum_order = [f'{i}_cum{window:02d}' for i in feature_columns for window in windows]
grouped_df = pd.concat([grouped_df, rolled[cum_order]], axis=1)

In [23]:
# Collapse to one row per stock_time: of all rows mapped to the same stock_time only the
# last one is kept. The result is indexed and sorted by stock_time, and the helper
# columns are removed.
grouped_df = (
    grouped_df
    .drop_duplicates(subset=['stock_time'], keep='last')
    .set_index('stock_time')
    .sort_index()
    .drop(columns=['datetime', 'time', 'date'])
)

In [24]:
# Calculate lags
# Each '<column>_lag01' holds the value of the previous row (the preceding stock_time), so
# the first row has no lagged value. Shifting the whole frame once and attaching it with a
# single concat avoids inserting the lag columns one by one into an already wide frame.
lagged = grouped_df.shift(1).add_suffix('_lag01')
grouped_df = pd.concat([grouped_df, lagged], axis=1)

In [25]:
# Keep only the lagged columns
# Match on the '_lag01' suffix rather than the substring 'lag': an un-lagged column whose
# own name happens to contain "lag" would otherwise be kept as well.
lag_columns = [col for col in grouped_df.columns if col.endswith('_lag01')]
grouped_df = grouped_df[lag_columns]

In [26]:
# Remove rows that contain any missing value (the lag leaves the first row empty).
complete_rows = grouped_df.dropna()

# Order the columns alphabetically
alphabetical_columns = sorted(complete_rows.columns)
grouped_df = complete_rows.reindex(columns=alphabetical_columns)

In [27]:
# Split columns into MultiIndex (ticker, metric). Only the first underscore is used as the
# separator because the metric part contains underscores of its own.
grouped_df.columns = grouped_df.columns.str.split('_', n=1, expand=True)

# Move the ticker level from the columns into the rows: one row per (timestamp, ticker),
# with the two index levels named 'date' and 'ticker'.
grouped_df = (
    grouped_df
    .stack(level=0)
    .rename_axis(index=['date', 'ticker'])
)

In [28]:
# Shrink the numeric dtypes of the long-format frame (downcast modifies and returns it).
grouped_df = downcast(grouped_df, verbose = True)

68.6% compressed


In [29]:
# Expose both index levels as ordinary columns too; the MultiIndex itself is kept.
for level_name in ('date', 'ticker'):
    grouped_df[level_name] = grouped_df.index.get_level_values(level_name)

In [30]:
# Export the final (date, ticker) feature table as a pickle.
output_path = r"../../Data/Processed/GDELT_Clean_finance.pkl"
with open(output_path, 'wb') as handle:
    pickle.dump(grouped_df, handle)

In [31]:
# Preview the exported frame: lagged features indexed by (date, ticker).
grouped_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Article Count_cum04_lag01,Article Count_cum16_lag01,Article Count_cum48_lag01,Article Count_cum96_lag01,Article Count_lag01,Tone_cum04_lag01,Tone_cum16_lag01,Tone_cum48_lag01,Tone_cum96_lag01,Tone_lag01,...,v42.8; SCOREDVALUE; fairness_sent_cum48_lag01,v42.8; SCOREDVALUE; fairness_sent_cum96_lag01,v42.8; SCOREDVALUE; fairness_sent_lag01,v42.9; SCOREDVALUE; loyalty_sent_cum04_lag01,v42.9; SCOREDVALUE; loyalty_sent_cum16_lag01,v42.9; SCOREDVALUE; loyalty_sent_cum48_lag01,v42.9; SCOREDVALUE; loyalty_sent_cum96_lag01,v42.9; SCOREDVALUE; loyalty_sent_lag01,date,ticker
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2018-01-02 09:45:00,AAL,1,6,6,6,0,0.00000,-12.074511,-12.074511,-12.074511,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,2018-01-02 09:45:00,AAL
2018-01-02 09:45:00,ALGT,0,0,0,0,0,0.00000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,2018-01-02 09:45:00,ALGT
2018-01-02 09:45:00,ALK,0,3,5,5,0,0.00000,4.502559,1.760040,1.760040,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,2018-01-02 09:45:00,ALK
2018-01-02 09:45:00,DAL,0,1,1,1,0,0.00000,0.503778,0.503778,0.503778,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,2018-01-02 09:45:00,DAL
2018-01-02 09:45:00,JBLU,1,3,3,3,0,-2.48062,1.518160,1.518160,1.518160,0.0,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,2018-01-02 09:45:00,JBLU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-30 15:45:00,ALK,0,0,4,13,0,0.00000,0.000000,-6.000360,-11.154161,0.0,...,-0.139706,-0.133636,0.0,0.0,0.000000,-0.067318,0.075841,0.0,2025-05-30 15:45:00,ALK
2025-05-30 15:45:00,DAL,0,4,7,11,0,0.00000,-5.082380,0.260769,-2.574365,0.0,...,0.056338,-0.185764,0.0,0.0,0.065506,0.186295,0.109351,0.0,2025-05-30 15:45:00,DAL
2025-05-30 15:45:00,JBLU,0,2,5,20,0,0.00000,4.190516,12.510170,52.351353,0.0,...,0.021646,0.389782,0.0,0.0,0.052000,0.107777,0.615597,0.0,2025-05-30 15:45:00,JBLU
2025-05-30 15:45:00,LUV,0,0,1,11,0,0.00000,0.000000,-0.803213,-8.950488,0.0,...,0.028658,-0.189179,0.0,0.0,0.000000,0.042602,0.175897,0.0,2025-05-30 15:45:00,LUV
