In [209]:
import pandas as pd
import polars as pl
import numpy as np
import pickle, warnings, datetime, pytz
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler, QuantileTransformer, MinMaxScaler
warnings.filterwarnings('ignore')

In [210]:
# Load the pickled PCA table; it is merged onto the article frame by
# GKGRECORDID further down.
# NOTE(review): pickle.load can execute arbitrary code - only point this at
# files produced by this project.
with open('../../data/processed/gdelt_pca.pkl', 'rb') as pca_file:
    pca_data = pickle.load(pca_file)

In [211]:
# Read the stock data and collect the calendar dates it covers; those dates
# decide which article timestamps are kept as-is when stock_time is built.
# NOTE(review): this path is spelled "Data/Processed" while the GDELT loads use
# "data/processed" - on a case-sensitive filesystem only one spelling can
# resolve; confirm the real directory name.
with open(r"../../Data/Processed/stock_data_simple.pkl", 'rb') as stock_file:
    stock_data = pickle.load(stock_file)

# Adds a 'date' column to stock_data (assumes a DatetimeIndex) and keeps the
# distinct dates as a plain list.
stock_data['date'] = stock_data.index.date
days = stock_data['date'].unique().tolist()

In [212]:
# Load the intermediate cleaned GDELT frame (converted with .to_pandas() in
# the next cell).
# NOTE(review): pickle.load can execute arbitrary code - trusted files only.
with open(r'../../data/processed/gdelt_intermediate_cleaned.pkl', 'rb') as gdelt_file:
    df = pickle.load(gdelt_file)

In [213]:
# Convert to pandas, keep the first row per GKGRECORDID, and make that id the
# index (set_index also removes it from the columns).
df = (
    df.to_pandas()
      .drop_duplicates(subset=['GKGRECORDID'])
      .set_index('GKGRECORDID')
)

# Columns removed here: the document-level tone fields plus every column whose
# name contains SCOREDVALUE or WORDCOUNT.
tone_columns = [
    'Tone', 'Positive Score', 'Negative Score', 'Polarity',
    'Activity Reference Density', 'Self/Group Reference Density', 'Word Count',
]
raw_gkg_columns = [
    name for name in df.columns
    if 'SCOREDVALUE' in name or 'WORDCOUNT' in name
]
df = df.drop(columns=tone_columns + raw_gkg_columns)

In [214]:
# Attach the PCA components; the inner join keeps only records present in both
# tables. GKGRECORDID is the index name of df at this point.
# NOTE(review): assumes pca_data exposes GKGRECORDID as a column or index level.
df = df.merge(pca_data, on='GKGRECORDID', how='inner')

In [215]:
# Show every column name of the merged frame as a plain list.
df.columns.tolist()

['V2SOURCECOMMONNAME',
 'V2DOCUMENTIDENTIFIER',
 'V1THEMES',
 'datetime',
 'date',
 'airplane',
 'airline',
 'airport',
 'Alaska Airlines',
 'American Airlines',
 'Delta Air Lines',
 'Frontier Airlines',
 'Hawaiian Airlines',
 'JetBlue',
 'Southwest Airlines',
 'Spirit Airlines',
 'Sun Country Airlines',
 'United Airlines',
 'Allegiant Air',
 'article_title',
 'PCA_GKG1_0',
 'PCA_GKG1_1',
 'PCA_GKG1_2',
 'PCA_GKG1_3',
 'PCA_GKG1_4',
 'PCA_GKG1_5',
 'PCA_GKG1_6',
 'PCA_Scored_0',
 'PCA_Scored_1',
 'PCA_Scored_2',
 'PCA_Scored_3',
 'PCA_Scored_4',
 'PCA_Scored_5',
 'PCA_Scored_6',
 'PCA_Scored_7',
 'PCA_Scored_8',
 'PCA_Scored_9',
 'PCA_Scored_10',
 'PCA_Scored_11',
 'PCA_Scored_12',
 'PCA_Scored_13',
 'PCA_Scored_14',
 'PCA_Scored_15',
 'PCA_Scored_16',
 'PCA_Scored_17',
 'PCA_Scored_18',
 'PCA_Scored_19',
 'PCA_Scored_20',
 'PCA_Scored_21',
 'PCA_Scored_22',
 'PCA_Scored_23',
 'PCA_Scored_24',
 'PCA_Scored_25',
 'PCA_Scored_26',
 'PCA_Scored_27',
 'PCA_Scored_28',
 'PCA_Scored_29',
 'P

In [216]:
# Create topic-specific metric columns.
# Each article row gets one '<metric>_<topic>' column per pair, equal to the
# metric value multiplied by the topic column, so summing over a time bucket
# later yields per-topic totals.
df['Article Count'] = 1   # each row contributes 1 to the article count
df['general'] = 1         # pseudo-topic that is set on every row

topics  = ['general','Alaska Airlines','American Airlines','Delta Air Lines','JetBlue','Southwest Airlines','United Airlines','Allegiant Air']
metrics = ['PCA_GKG1_0','PCA_GKG1_1','PCA_GKG1_2','PCA_GKG1_3','PCA_GKG1_4',
           'PCA_Scored_0','PCA_Scored_1','PCA_Scored_2','PCA_Scored_3','PCA_Scored_4',
           'PCA_Word_0','PCA_Word_1','PCA_Word_2','PCA_Word_3','PCA_Word_4',
           'Article Count']

# Build all len(topics) * len(metrics) columns first and attach them with one
# concat. Inserting them one by one fragments the frame's internal blocks
# (pandas emits a PerformanceWarning, hidden here by the global warnings filter).
topic_metric_columns = {
    f'{metric}_{topic}': df[metric] * df[topic]
    for topic in topics
    for metric in metrics
}
df = pd.concat(
    [
        # Dropping any earlier copies keeps the cell safe to re-run.
        df.drop(columns=list(topic_metric_columns), errors='ignore'),
        pd.DataFrame(topic_metric_columns, index=df.index),
    ],
    axis=1,
)

In [217]:
# Sum every topic/metric column per article timestamp.
# Unparseable timestamps become NaT (errors='coerce'); groupby drops NaT keys
# by default, so those rows do not contribute to any bucket.
df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')

# Selecting the columns and calling .sum() keeps the original column names.
# This avoids building a (column, 'sum') MultiIndex and stripping '_sum' back
# out of the joined names, which would also alter any feature name that
# happened to contain that substring.
topic_metric_names = [f'{metric}_{topic}' for topic in topics for metric in metrics]
grouped_df = df.groupby('datetime')[topic_metric_names].sum()

In [218]:
# Put the per-timestamp sums on a complete 15-minute grid so that every slot
# exists; slots without any article get 0 for every metric.
# Timestamps in grouped_df that are not on this grid are not carried over by
# reindex.
start = datetime.datetime(2018, 1, 1, 0, 15, 0)
end   = datetime.datetime(2025, 5, 31, 23, 45, 0)
dates = pd.date_range(start=start, end=end, freq='15min')

grouped_df = grouped_df.reindex(dates).fillna(0)
# Keep the grid timestamp as a regular trailing column and fall back to a
# plain integer index.
grouped_df['datetime'] = grouped_df.index
grouped_df = grouped_df.reset_index(drop=True)

In [219]:
# Interpret the grid timestamps as UTC and express them in New York local time.
# The 'America/New_York' zone handles the EST/EDT switch, so the column named
# datetime_EST is Eastern wall-clock time all year round.
utc_stamps = pd.to_datetime(grouped_df['datetime'], utc=True)
eastern_stamps = utc_stamps.dt.tz_convert('America/New_York')

grouped_df['datetime'] = utc_stamps
grouped_df['datetime_EST'] = eastern_stamps
# Local clock time and calendar date, used to decide whether a slot falls
# inside a trading session.
grouped_df['time'] = eastern_stamps.dt.time
grouped_df['date'] = eastern_stamps.dt.date

In [220]:
##### Handling after-hours articles #####

# A 15-minute slot keeps its own timestamp as stock_time only when
#   * its local date appears in the stock data (`days`), and
#   * its local clock time lies between 09:30 and 15:45 inclusive.
# Every other slot starts as NaT and is filled from a neighbouring slot below.
local_time = grouped_df['time']
in_session = (
    grouped_df['date'].isin(days)
    & (local_time >= datetime.time(9, 30, 0))
    & (local_time <= datetime.time(15, 45, 0))
)
# Series.where keeps the tz-aware datetime dtype, so no object-array round
# trip or re-parsing with pd.to_datetime is needed.
grouped_df['stock_time'] = grouped_df['datetime_EST'].where(in_session)

grouped_df = grouped_df.sort_values(by='datetime')

# Forward-fill first: out-of-session slots take the most recent in-session
# timestamp. The trailing bfill only affects slots before the very first
# in-session timestamp, which have nothing earlier to copy from.
# NOTE(review): with ffill, news published after the close is attributed to
# the preceding 15:45 slot. If overnight news is meant to count toward the
# next open instead, this should be a bfill - confirm the intended direction.
# Finally drop the timezone so stock_time holds naive local wall-clock times.
grouped_df['stock_time'] = (
    grouped_df['stock_time'].ffill().bfill().dt.tz_localize(None)
)
# The helper columns datetime, datetime_EST, time and date are left in place.

In [221]:
# Replace every topic/metric series by its trailing sum over 8 consecutive
# 15-minute slots (2 hours); min_periods=1 lets the first rows use the shorter
# window available to them.
windowed_cols = [f'{metric}_{topic}' for topic in topics for metric in metrics]
grouped_df[windowed_cols] = (
    grouped_df[windowed_cols].rolling(window=8, min_periods=1).sum()
)

In [222]:
# Inspect the columns: the topic/metric features followed by the helper
# columns datetime, datetime_EST, time, date and stock_time.
grouped_df.columns

Index(['PCA_GKG1_0_general', 'PCA_GKG1_1_general', 'PCA_GKG1_2_general',
       'PCA_GKG1_3_general', 'PCA_GKG1_4_general', 'PCA_Scored_0_general',
       'PCA_Scored_1_general', 'PCA_Scored_2_general', 'PCA_Scored_3_general',
       'PCA_Scored_4_general',
       ...
       'PCA_Word_1_Allegiant Air', 'PCA_Word_2_Allegiant Air',
       'PCA_Word_3_Allegiant Air', 'PCA_Word_4_Allegiant Air',
       'Article Count_Allegiant Air', 'datetime', 'datetime_EST', 'time',
       'date', 'stock_time'],
      dtype='object', length=133)

In [223]:
# Collapse to one row per stock_time. Many 15-minute slots share a stock_time
# after the fill step above, so each topic/metric column is averaged over the
# slots mapped to the same stock_time (swap .mean() for .sum() to total them
# instead).
# Selecting the columns and calling .mean() keeps the original column names,
# so no MultiIndex flattening or '_mean' suffix stripping is required.
session_feature_cols = [f'{metric}_{topic}' for topic in topics for metric in metrics]
grouped_df = (
    grouped_df.groupby('stock_time')[session_feature_cols]
    .mean()
    .sort_index()
)

In [224]:
# Display the frame after aggregation: one row per stock_time.
grouped_df

Unnamed: 0_level_0,PCA_GKG1_0_general,PCA_GKG1_1_general,PCA_GKG1_2_general,PCA_GKG1_3_general,PCA_GKG1_4_general,PCA_Scored_0_general,PCA_Scored_1_general,PCA_Scored_2_general,PCA_Scored_3_general,PCA_Scored_4_general,...,PCA_Scored_1_Allegiant Air,PCA_Scored_2_Allegiant Air,PCA_Scored_3_Allegiant Air,PCA_Scored_4_Allegiant Air,PCA_Word_0_Allegiant Air,PCA_Word_1_Allegiant Air,PCA_Word_2_Allegiant Air,PCA_Word_3_Allegiant Air,PCA_Word_4_Allegiant Air,Article Count_Allegiant Air
stock_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-02 09:30:00,-52.842769,2775.639558,575.589494,20016.210047,-2287.294826,231.010874,192.626324,19.727399,223.131267,124.005501,...,1.288413,0.06664,1.388000,0.784610,123.380557,24.447099,14.437908,-7.979912,17.510292,0.155844
2018-01-02 09:45:00,-8.471699,3773.169061,588.682290,28239.596732,-3031.544862,198.132144,164.864028,17.004056,189.001946,104.271142,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2018-01-02 10:00:00,-3.342084,4213.460083,661.055668,31482.958640,-3380.836844,218.681965,183.185765,16.268369,209.567198,114.309314,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2018-01-02 10:15:00,-3.342084,4213.460083,661.055668,31482.958640,-3380.836844,218.681965,183.185765,16.268369,209.567198,114.309314,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2018-01-02 10:30:00,4.137832,4221.408374,655.103216,31577.895425,-3384.029447,219.962245,183.999386,16.691910,208.748544,114.340911,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-30 14:45:00,-77.818154,2183.204777,518.283508,14752.777560,-1772.038581,248.445283,212.286697,22.293669,224.976740,131.154627,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2025-05-30 15:00:00,-113.732720,2284.081906,585.336830,15189.953604,-1866.501920,285.487114,244.676409,25.433502,262.523889,152.348486,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2025-05-30 15:15:00,-116.563075,2194.959756,590.898839,14492.895052,-1808.016414,297.479037,253.021133,30.111710,271.378100,159.202758,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2025-05-30 15:30:00,-98.911397,1989.727788,533.109071,13216.535835,-1647.230637,267.467024,226.914280,28.500863,243.858233,143.783768,...,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [225]:
# Derive three extra features from every topic/metric series:
#   <col>_diff            first difference of the series
#   <col>_rolling26       trailing 26-row sum (26 = # of 15-min periods in 1 trading day)
#   <col>_rolling26_diff  first difference of that trailing sum
# The undefined difference on the first row is filled with 0.
derived_features = {}
for topic in topics:
    for metric in metrics:
        base_name = f'{metric}_{topic}'
        base_series = grouped_df[base_name]
        trailing_day_sum = base_series.rolling(window=26, min_periods=1).sum()

        derived_features[f'{base_name}_diff'] = base_series.diff().fillna(0)
        derived_features[f'{base_name}_rolling26'] = trailing_day_sum
        derived_features[f'{base_name}_rolling26_diff'] = trailing_day_sum.diff().fillna(0)

# Attach all new columns with one concat. Inserting several hundred columns
# one at a time fragments the frame (pandas PerformanceWarning, hidden here by
# the global warnings filter). Dropping earlier copies first keeps the cell
# safe to re-run.
grouped_df = pd.concat(
    [
        grouped_df.drop(columns=list(derived_features), errors='ignore'),
        pd.DataFrame(derived_features, index=grouped_df.index),
    ],
    axis=1,
)

In [226]:
# Rescale each topic/metric series and its first difference without centering,
# so that zeros stay zeros. RobustScaler divides by a spread estimate based on
# the interquartile range (not the standard deviation); unit_variance=True
# adjusts that estimate so a normally distributed feature ends up with
# variance 1.
scaler  = RobustScaler(with_centering=False, unit_variance=True)
# scaler2 is defined but not applied in this cell; the *_rolling26 and
# *_rolling26_diff columns are left unscaled.
scaler2 = MinMaxScaler(feature_range=(0, 1))

level_cols = [f'{metric}_{topic}' for topic in topics for metric in metrics]
for level_col in level_cols:
    # The scaler is refit for every column, so each one gets its own scale.
    for target_col in (level_col, f'{level_col}_diff'):
        grouped_df[target_col] = scaler.fit_transform(grouped_df[[target_col]])

In [227]:
# Display the final feature frame, including the scaled columns and the
# derived *_diff / *_rolling26 / *_rolling26_diff columns.
grouped_df

Unnamed: 0_level_0,PCA_GKG1_0_general,PCA_GKG1_1_general,PCA_GKG1_2_general,PCA_GKG1_3_general,PCA_GKG1_4_general,PCA_Scored_0_general,PCA_Scored_1_general,PCA_Scored_2_general,PCA_Scored_3_general,PCA_Scored_4_general,...,PCA_Word_2_Allegiant Air_rolling26_diff,PCA_Word_3_Allegiant Air_diff,PCA_Word_3_Allegiant Air_rolling26,PCA_Word_3_Allegiant Air_rolling26_diff,PCA_Word_4_Allegiant Air_diff,PCA_Word_4_Allegiant Air_rolling26,PCA_Word_4_Allegiant Air_rolling26_diff,Article Count_Allegiant Air_diff,Article Count_Allegiant Air_rolling26,Article Count_Allegiant Air_rolling26_diff
stock_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-02 09:30:00,-0.315296,0.836815,0.768722,0.871473,-0.845677,0.713437,0.697089,0.709520,0.713676,0.696622,...,0.000000,0.000000,-7.979912,0.000000,0.000000,17.510292,0.000000,0.000000,0.155844,0.00000
2018-01-02 09:45:00,-0.050548,1.137556,0.786208,1.229506,-1.120847,0.611897,0.596621,0.611572,0.604515,0.585761,...,0.000000,10.764737,-7.979912,0.000000,-23.621024,17.510292,0.000000,-0.210231,0.155844,0.00000
2018-01-02 10:00:00,-0.019941,1.270297,0.882866,1.370717,-1.249990,0.675362,0.662924,0.585112,0.670292,0.642152,...,0.000000,0.000000,-7.979912,0.000000,0.000000,17.510292,0.000000,0.000000,0.155844,0.00000
2018-01-02 10:15:00,-0.019941,1.270297,0.882866,1.370717,-1.249990,0.675362,0.662924,0.585112,0.670292,0.642152,...,0.000000,0.000000,-7.979912,0.000000,0.000000,17.510292,0.000000,0.000000,0.155844,0.00000
2018-01-02 10:30:00,0.024689,1.272693,0.874916,1.374850,-1.251171,0.679316,0.665869,0.600345,0.667673,0.642329,...,0.000000,0.000000,-7.979912,0.000000,0.000000,17.510292,0.000000,0.000000,0.155844,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-30 14:45:00,-0.464317,0.658204,0.692188,0.642312,-0.655172,0.767281,0.768237,0.801819,0.719579,0.736783,...,-20.745107,0.000000,-37.356611,10.903539,0.000000,81.073279,-24.117033,0.000000,3.394366,-1.00000
2025-05-30 15:00:00,-0.678608,0.688617,0.781740,0.661346,-0.690098,0.881678,0.885451,0.914747,0.839672,0.855843,...,-20.745107,0.000000,-26.453072,10.903539,0.000000,56.956247,-24.117033,0.000000,2.394366,-1.00000
2025-05-30 15:15:00,-0.695495,0.661748,0.789168,0.630997,-0.668474,0.918713,0.915649,1.083005,0.867992,0.894348,...,-20.745107,0.000000,-15.549533,10.903539,0.000000,32.839214,-24.117033,0.000000,1.394366,-1.00000
2025-05-30 15:30:00,-0.590173,0.599874,0.711988,0.575426,-0.609027,0.826026,0.821172,1.025069,0.779970,0.807730,...,-20.745107,0.000000,-4.645993,10.903539,0.000000,8.722182,-24.117033,0.000000,0.394366,-1.00000


In [228]:
# Persist the final feature frame as a pickle.
# NOTE(review): the file name carries a hard-coded timestamp, and the path is
# spelled "Data/Processed" while the GDELT loads use "data/processed" -
# confirm the directory name on case-sensitive filesystems.
with open(r"../../Data/Processed/GDELT_Clean_202506141555.pkl", 'wb') as out_file:
    pickle.dump(grouped_df, out_file)