In [1]:
import pandas as pd
import polars as pl
import numpy as np
import pickle, warnings, datetime, pytz
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler, QuantileTransformer, MinMaxScaler
warnings.filterwarnings('ignore')

In [2]:
with open('../../data/processed/gdelt_pca.pkl', 'rb') as f:
    pca_data = pickle.load(f)
with open('../../data/processed/gdelt_llm_sentiment.pkl', 'rb') as f:
    llm_data = pickle.load(f)

In [3]:
# Determine which times to keep based on the stock data
with open(r"../../Data/Processed/stock_data_simple.pkl", 'rb') as f:
    stock_data = pickle.load(f)

stock_data['date'] = stock_data.index.date
days = list(stock_data['date'].unique())

In [4]:
with open(r'../../data/processed/gdelt_intermediate_cleaned.pkl', 'rb') as f:
    df = pickle.load(f)

In [5]:
# Drop duplicate rows
df.drop_duplicates(subset=['GKGRECORDID'], inplace=True)

df.index = df['GKGRECORDID']
df.drop(columns=['GKGRECORDID'], inplace=True)
df.drop(columns=['Positive Score','Activity Reference Density','Self/Group Reference Density'], inplace=True)
df.drop(columns=[i for i in df.columns if 'SCOREDVALUE' in i], inplace=True)
df.drop(columns=[i for i in df.columns if 'WORDCOUNT' in i], inplace=True)

In [6]:
df.shape

(1122584, 25)

In [7]:
df = pd.merge(df, pca_data, on='GKGRECORDID', how='left')
df = pd.merge(df, llm_data, on='GKGRECORDID', how='left')

In [8]:
df.shape

(1122968, 124)

In [9]:
# Create topic-specific metrics columns
df['Article Count'] = 1
df['general'] = 1

topics  = ['general','Alaska Airlines','American Airlines','Delta Air Lines','JetBlue','Southwest Airlines','United Airlines','Allegiant Air']
metrics = ['Tone','Negative Score','Polarity','Word Count','llm_sentiment',
           'PCA_GKG1_0','PCA_GKG1_1','PCA_GKG1_2','PCA_GKG1_3','PCA_GKG1_4',
           'PCA_Scored_0','PCA_Scored_1','PCA_Scored_2','PCA_Scored_3','PCA_Scored_4',
           'PCA_Word_0','PCA_Word_1','PCA_Word_2','PCA_Word_3','PCA_Word_4',
           'Article Count']

for topic in topics:
    for metric in metrics:
        df[f'{metric}_{topic}'] = df[metric] * df[topic]

In [10]:
# Create a grouped dataframe, grouped by datetime, that creates a sum for each metric
df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')

grouped_df = df.groupby('datetime').agg(
    {f'{metric}_{topic}': ['sum'] for topic in topics for metric in metrics}
)

# Flatten the MultiIndex columns
grouped_df.columns = ['_'.join(col).strip().replace('_sum','') for col in grouped_df.columns.values]

In [11]:
grouped_df.shape

(214358, 168)

In [12]:
# Fill in all missing times
start = datetime.datetime(2018, 1, 1, 0, 15, 0)
end   = datetime.datetime(2025, 5, 31, 23, 45, 0)
dates = pd.date_range(start=start, end=end, freq='15min')
grouped_df = grouped_df.reindex(dates).reset_index()
grouped_df = grouped_df.fillna(0)

grouped_df['datetime'] = grouped_df['index']
grouped_df.drop(columns=['index'], inplace=True)

In [13]:
grouped_df.shape # filling in added 45,609 rows or 21.3%.
# Hard to say whether these times were missing because GDELT was down, or I didn't scrape them, or there were just no articles.

(259967, 169)

In [14]:
grouped_df

Unnamed: 0,Tone_general,Negative Score_general,Polarity_general,Word Count_general,llm_sentiment_general,PCA_GKG1_0_general,PCA_GKG1_1_general,PCA_GKG1_2_general,PCA_GKG1_3_general,PCA_GKG1_4_general,...,PCA_Scored_2_Allegiant Air,PCA_Scored_3_Allegiant Air,PCA_Scored_4_Allegiant Air,PCA_Word_0_Allegiant Air,PCA_Word_1_Allegiant Air,PCA_Word_2_Allegiant Air,PCA_Word_3_Allegiant Air,PCA_Word_4_Allegiant Air,Article Count_Allegiant Air,datetime
0,-7.430732,10.656754,13.882776,1009.0,-0.635670,20.172760,150.433197,93.124840,986.216799,-127.486955,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-01-01 00:15:00
1,-1.109570,3.328710,5.547850,671.0,0.333184,6.726464,95.039790,55.744641,657.340733,-80.391601,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-01-01 00:30:00
2,-2.904094,6.729040,10.553987,1499.0,-0.669239,13.936986,209.096803,122.202156,1469.431723,-175.827114,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-01-01 00:45:00
3,1.287554,0.085837,1.459227,1076.0,-0.411190,2.633621,142.522341,82.893606,1056.476859,-122.111386,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-01-01 01:00:00
4,-2.259993,5.192379,8.124766,5067.0,-0.205076,14.254049,647.217386,335.349129,4987.039517,-522.434936,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018-01-01 01:15:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259962,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025-05-31 22:45:00
259963,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025-05-31 23:00:00
259964,-1.392758,3.528319,5.663881,991.0,-0.081255,7.242003,134.544898,74.772458,972.830049,-111.179105,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025-05-31 23:15:00
259965,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025-05-31 23:30:00


In [15]:
# Convert from UTC to EST, accounting for daylight saving time
grouped_df['datetime'] = pd.to_datetime(grouped_df['datetime'], utc=True)
grouped_df['datetime_EST'] = grouped_df['datetime'].dt.tz_convert('America/New_York')
grouped_df['time'] = grouped_df['datetime_EST'].dt.time
grouped_df['date'] = grouped_df['datetime_EST'].dt.date

In [16]:
# Export to pickle object
with open(r"../../Data/Processed/GDELT_all_hours_202506241203.pkl", 'wb') as f:
    pickle.dump(grouped_df, f)

In [17]:
##### Handling after-hours articles #####

# Join with stock data
grouped_df['stock_time'] = np.where(grouped_df['date'].isin(days), grouped_df['datetime_EST'], pd.NaT)
# Limit times after 15:45 and before 9:15
grouped_df['stock_time'] = np.where(grouped_df['time'] > datetime.time(15,45,0), pd.NaT, grouped_df['stock_time'])
grouped_df['stock_time'] = np.where(grouped_df['time'] < datetime.time(9,30,0) , pd.NaT, grouped_df['stock_time'])
# format the stock_time column
grouped_df['stock_time'] = pd.to_datetime(grouped_df['stock_time'])
grouped_df = grouped_df.sort_values(by='datetime')
# Backfill the stock_time2 column
grouped_df['stock_time'] = grouped_df['stock_time'].ffill().bfill()
# Remove the timezone information 
grouped_df['stock_time'] = grouped_df['stock_time'].dt.tz_localize(None)
#grouped_df.drop(columns=['datetime', 'datetime_EST', 'time', 'date'], inplace=True)

In [18]:
# The choice of window here ultimately affects how much after-hours time should be counted towards market open
# For example, a 4-period window would mean that articles from 8:15 to 9:15 are counted towards the 9:30 period
windows = [4, 8, 16, 48, 96]

for window in windows:
    for topic in topics:
        for metric in metrics:
            grouped_df[f'{metric}_{topic}_{window}'] = grouped_df[f'{metric}_{topic}'].rolling(window, min_periods=1).sum()

In [19]:
# Now we keep the last of each stock time to remove duplicate values.
grouped_df = grouped_df.drop_duplicates(subset=['stock_time'], keep='last')
grouped_df.index = grouped_df['stock_time']
grouped_df.drop(columns=['stock_time'], inplace=True)
grouped_df.sort_index(inplace=True)
grouped_df.drop(columns=['datetime','datetime_EST','time','date'], inplace=True)

In [20]:
# Calculate the 1-period difference for each metric
for topic in topics:
    for metric in metrics:
        grouped_df[f'Change_{metric}_{topic}'] = grouped_df[f'{metric}_{topic}'].diff()

        for window in windows:
            grouped_df[f'Change_{metric}_{topic}_{window}'] = grouped_df[f'{metric}_{topic}_{window}'].diff()

In [21]:
# Export to pickle object
with open(r"../../Data/Processed/GDELT_Clean_202506241200.pkl", 'wb') as f:
    pickle.dump(grouped_df, f)