In [101]:
import pandas as pd
import polars as pl
import numpy as np
import pickle, warnings, datetime, pytz
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler, QuantileTransformer
warnings.filterwarnings('ignore')

In [102]:
# Load the pickled PCA features into `pca_data`.
# NOTE(review): pickle.load can execute arbitrary code; only use it on trusted files.
pca_path = '../../data/processed/gdelt_pca.pkl'
with open(pca_path, 'rb') as pca_file:
    pca_data = pickle.load(pca_file)

In [103]:
# Load the stock data and collect the calendar dates it covers; later cells use
# `days` to decide which article timestamps fall on a day with stock data.
# NOTE(review): this path is spelled 'Data/Processed' while the GDELT inputs are
# read from 'data/processed'; on a case-sensitive filesystem only one spelling
# can resolve - confirm the real directory name.
stock_path = r"../../Data/Processed/stock_data_simple.pkl"
with open(stock_path, 'rb') as stock_file:
    stock_data = pickle.load(stock_file)

# Assumes stock_data has a DatetimeIndex (it must expose `.date`) - TODO confirm.
stock_data['date'] = stock_data.index.date
days = stock_data['date'].unique().tolist()

In [104]:
# Load the intermediate cleaned GDELT table into `df`.
gdelt_path = r'../../data/processed/gdelt_intermediate_cleaned.pkl'
with open(gdelt_path, 'rb') as gdelt_file:
    df = pickle.load(gdelt_file)

In [105]:
# Convert the loaded frame to pandas (it exposes `.to_pandas()`, so presumably
# it is a polars DataFrame - TODO confirm).
df = df.to_pandas()

# Keep the first row per GKGRECORDID and move the id from a column to the index.
df = df.drop_duplicates(subset=['GKGRECORDID']).set_index('GKGRECORDID')

# Remove the raw tone/word-count fields and every SCOREDVALUE / WORDCOUNT column.
tone_columns = [
    'Tone',
    'Positive Score',
    'Negative Score',
    'Polarity',
    'Activity Reference Density',
    'Self/Group Reference Density',
    'Word Count',
]
scored_or_wordcount_columns = [
    name for name in df.columns
    if 'SCOREDVALUE' in name or 'WORDCOUNT' in name
]
df = df.drop(columns=tone_columns + scored_or_wordcount_columns)

In [106]:
# Inner-join the PCA features onto the articles by GKGRECORDID. In `df` the id
# is the index name at this point; pandas lets `on` refer to an index level name.
df = df.merge(pca_data, how='inner', on='GKGRECORDID')

In [107]:
# Show the column names available after the merge.
df.columns.tolist()

['V2SOURCECOMMONNAME',
 'V2DOCUMENTIDENTIFIER',
 'V1THEMES',
 'datetime',
 'date',
 'airplane',
 'airline',
 'airport',
 'Alaska Airlines',
 'American Airlines',
 'Delta Air Lines',
 'Frontier Airlines',
 'Hawaiian Airlines',
 'JetBlue',
 'Southwest Airlines',
 'Spirit Airlines',
 'Sun Country Airlines',
 'United Airlines',
 'Allegiant Air',
 'article_title',
 'PCA_GKG1_0',
 'PCA_GKG1_1',
 'PCA_GKG1_2',
 'PCA_GKG1_3',
 'PCA_GKG1_4',
 'PCA_GKG1_5',
 'PCA_GKG1_6',
 'PCA_Scored_0',
 'PCA_Scored_1',
 'PCA_Scored_2',
 'PCA_Scored_3',
 'PCA_Scored_4',
 'PCA_Scored_5',
 'PCA_Scored_6',
 'PCA_Scored_7',
 'PCA_Scored_8',
 'PCA_Scored_9',
 'PCA_Scored_10',
 'PCA_Scored_11',
 'PCA_Scored_12',
 'PCA_Scored_13',
 'PCA_Scored_14',
 'PCA_Scored_15',
 'PCA_Scored_16',
 'PCA_Scored_17',
 'PCA_Scored_18',
 'PCA_Scored_19',
 'PCA_Scored_20',
 'PCA_Scored_21',
 'PCA_Scored_22',
 'PCA_Scored_23',
 'PCA_Scored_24',
 'PCA_Scored_25',
 'PCA_Scored_26',
 'PCA_Scored_27',
 'PCA_Scored_28',
 'PCA_Scored_29',
 'P

In [108]:
# Build one column per (metric, topic) pair: the metric value multiplied by the
# topic column, named '<metric>_<topic>'.
# Assumes the airline topic columns are 0/1 indicators - TODO confirm.

# Constant helper columns: a per-article counter, and a 'general' topic that is
# 1 for every row so it acts as the all-articles topic.
df['Article Count'] = 1
df['general'] = 1

topics = [
    'general',
    'Alaska Airlines',
    'American Airlines',
    'Delta Air Lines',
    'JetBlue',
    'Southwest Airlines',
    'United Airlines',
    'Allegiant Air',
]
metrics = [
    'PCA_GKG1_0', 'PCA_GKG1_1', 'PCA_GKG1_2', 'PCA_GKG1_3', 'PCA_GKG1_4',
    'PCA_Scored_0', 'PCA_Scored_1', 'PCA_Scored_2', 'PCA_Scored_3', 'PCA_Scored_4',
    'PCA_Word_0', 'PCA_Word_1', 'PCA_Word_2', 'PCA_Word_3', 'PCA_Word_4',
    'Article Count',
]

for topic in topics:
    topic_flag = df[topic]
    for metric in metrics:
        df[f'{metric}_{topic}'] = df[metric].mul(topic_flag)

In [114]:
# Sum every topic-specific metric per article timestamp.
# Unparseable timestamps become NaT (errors='coerce'); groupby leaves NaT keys
# out by default, so those rows do not contribute to any group.
df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')

topic_metric_columns = [f'{metric}_{topic}' for topic in topics for metric in metrics]

# Selecting the columns and calling .sum() keeps the flat '<metric>_<topic>' names,
# so no MultiIndex flattening is needed.
grouped_df = df.groupby('datetime')[topic_metric_columns].sum()

In [115]:
# Put the sums on a complete 15-minute grid; slots with no articles become 0.
# Timestamps in grouped_df that are not on this grid are dropped by the reindex.
start = datetime.datetime(2018, 1, 1, 0, 15, 0)
end   = datetime.datetime(2025, 5, 31, 23, 45, 0)
dates = pd.date_range(start=start, end=end, freq='15min')

grouped_df = grouped_df.reindex(dates).fillna(0)

# Move the grid timestamps from the index into a trailing 'datetime' column and
# fall back to a plain integer index.
grouped_df['datetime'] = grouped_df.index
grouped_df = grouped_df.reset_index(drop=True)

In [116]:
# Interpret the timestamps as UTC and derive US Eastern wall-clock values.
# 'America/New_York' follows daylight saving time, so despite its name the
# 'datetime_EST' column holds EST or EDT depending on the date.
utc_times = pd.to_datetime(grouped_df['datetime'], utc=True)
eastern_times = utc_times.dt.tz_convert('America/New_York')

grouped_df['datetime'] = utc_times
grouped_df['datetime_EST'] = eastern_times
grouped_df['time'] = eastern_times.dt.time
grouped_df['date'] = eastern_times.dt.date

In [None]:
##### Handling after-hours articles #####

# Assign every 15-minute article slot to the stock bar it should feed:
#   1. Slots on a day present in `days` and between 09:15 and 15:45 Eastern
#      (both inclusive) keep their own timestamp.
#   2. Every other slot (overnight, weekends, days without stock data) is
#      back-filled with the next kept timestamp, so that news accumulates into
#      the first kept slot that follows it.
#   3. The result is shifted forward by 15 minutes so the articles of one slot
#      line up with the following bar (a one-slot lag).
# Slots after the last kept timestamp have nothing to back-fill from and stay NaT.
#
# NOTE(review): the inclusive 09:15-15:45 window plus the 15-minute shift yields
# 27 distinct bar times per full day (09:30 ... 16:00). Confirm this matches the
# bars in the stock data and the 26-period rolling window used later.

first_slot = datetime.time(9, 15, 0)
last_slot  = datetime.time(15, 45, 0)

grouped_df = grouped_df.sort_values(by='datetime')

in_session = (
    grouped_df['date'].isin(days)
    & (grouped_df['time'] >= first_slot)
    & (grouped_df['time'] <= last_slot)
)

# Series.where keeps the tz-aware datetime dtype (NaT where the mask is False),
# so no round trip through object arrays / pd.to_datetime is needed.
stock_time = grouped_df['datetime_EST'].where(in_session)

# Backfill the stock_time column. `.bfill()` replaces the deprecated
# `fillna(method='bfill')`.
stock_time = stock_time.bfill()

# Now shift the stock_time column by 15 minutes (essentially lagging the columns)
stock_time = stock_time + pd.Timedelta(minutes=15)

# Remove the timezone information (keeps the Eastern wall-clock time)
grouped_df['stock_time'] = stock_time.dt.tz_localize(None)

grouped_df = grouped_df.drop(columns=['datetime', 'datetime_EST', 'time', 'date'])

In [118]:
# Display the frame to inspect the assigned stock_time values.
grouped_df

Unnamed: 0,PCA_GKG1_0_general,PCA_GKG1_1_general,PCA_GKG1_2_general,PCA_GKG1_3_general,PCA_GKG1_4_general,PCA_Scored_0_general,PCA_Scored_1_general,PCA_Scored_2_general,PCA_Scored_3_general,PCA_Scored_4_general,...,PCA_Word_1_Allegiant Air,PCA_Word_2_Allegiant Air,PCA_Word_3_Allegiant Air,PCA_Word_4_Allegiant Air,Article Count_Allegiant Air,datetime,datetime_EST,time,date,stock_time
0,-15.608664,149.508624,40.509145,990.467900,-123.780687,19.596263,16.069757,2.426014,19.198589,10.432478,...,0.0,0.0,0.0,0.0,0.0,2018-01-01 00:15:00+00:00,2017-12-31 19:15:00-05:00,19:15:00,2017-12-31,2018-01-02 09:30:00
1,-18.227278,322.422107,84.415447,2207.481664,-271.464210,39.921930,34.069068,1.855677,38.350255,21.149482,...,0.0,0.0,0.0,0.0,0.0,2018-01-01 00:30:00+00:00,2017-12-31 19:30:00-05:00,19:30:00,2017-12-31,2018-01-02 09:30:00
2,-33.135903,474.508079,121.104052,3233.180481,-394.529595,59.318687,51.011164,2.369597,56.679236,32.602499,...,0.0,0.0,0.0,0.0,0.0,2018-01-01 00:45:00+00:00,2017-12-31 19:45:00-05:00,19:45:00,2017-12-31,2018-01-02 09:30:00
3,-5.640297,183.383847,45.157943,1309.101697,-155.574892,19.579862,16.116685,1.951089,18.129845,10.021183,...,0.0,0.0,0.0,0.0,0.0,2018-01-01 01:00:00+00:00,2017-12-31 20:00:00-05:00,20:00:00,2017-12-31,2018-01-02 09:30:00
4,7.939434,646.253922,65.711500,4999.901494,-503.997912,9.694964,8.299846,0.440896,9.205581,5.279613,...,0.0,0.0,0.0,0.0,0.0,2018-01-01 01:15:00+00:00,2017-12-31 20:15:00-05:00,20:15:00,2017-12-31,2018-01-02 09:30:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259962,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,2025-05-31 22:45:00+00:00,2025-05-31 18:45:00-04:00,18:45:00,2025-05-31,NaT
259963,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,2025-05-31 23:00:00+00:00,2025-05-31 19:00:00-04:00,19:00:00,2025-05-31,NaT
259964,-2.846281,134.073859,22.446480,975.933760,-107.565659,9.623811,8.599018,0.509076,9.558054,5.314533,...,0.0,0.0,0.0,0.0,0.0,2025-05-31 23:15:00+00:00,2025-05-31 19:15:00-04:00,19:15:00,2025-05-31,NaT
259965,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,2025-05-31 23:30:00+00:00,2025-05-31 19:30:00-04:00,19:30:00,2025-05-31,NaT


In [None]:
# Collapse all 15-minute slots that share a stock_time into a single row by
# summing each topic-specific metric. Rows whose stock_time is NaT are left out,
# because groupby drops NaT keys by default.
topic_metric_columns = [f'{metric}_{topic}' for topic in topics for metric in metrics]

# Column selection + .sum() keeps the flat '<metric>_<topic>' names directly.
grouped_df = grouped_df.groupby('stock_time')[topic_metric_columns].sum()

Unnamed: 0_level_0,PCA_GKG1_0_general,PCA_GKG1_1_general,PCA_GKG1_2_general,PCA_GKG1_3_general,PCA_GKG1_4_general,PCA_Scored_0_general,PCA_Scored_1_general,PCA_Scored_2_general,PCA_Scored_3_general,PCA_Scored_4_general,...,PCA_Scored_1_Allegiant Air,PCA_Scored_2_Allegiant Air,PCA_Scored_3_Allegiant Air,PCA_Scored_4_Allegiant Air,PCA_Word_0_Allegiant Air,PCA_Word_1_Allegiant Air,PCA_Word_2_Allegiant Air,PCA_Word_3_Allegiant Air,PCA_Word_4_Allegiant Air,Article Count_Allegiant Air
stock_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-02 09:15:00-05:00,-1010.415867,55364.615469,11359.180779,399956.295028,-45576.482005,4530.393549,3777.629639,386.551126,4374.753766,2430.583079,...,24.801952,1.282816,26.719006,15.103743,2375.075715,470.606655,277.929737,-153.613302,337.073112,3.0
2018-01-02 09:30:00-05:00,-11.020094,313.841897,64.239984,2169.990105,-253.704281,29.097463,24.931743,1.287862,28.161811,15.861196,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
2018-01-02 09:45:00-05:00,-5.427405,34.124221,16.814741,211.105035,-32.403733,9.694754,7.559250,1.891044,9.369047,5.486320,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
2018-01-02 10:00:00-05:00,4.914528,468.332905,84.398585,3423.382233,-376.383910,30.645763,26.111326,1.078192,28.933382,14.844337,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
2018-01-02 10:15:00-05:00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-30 14:45:00-04:00,-8.706287,121.683527,19.788283,800.969342,-90.511717,9.994784,8.326840,1.464641,8.874929,5.204360,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
2025-05-30 15:00:00-04:00,-36.167956,390.346595,123.913349,2525.067832,-329.083853,57.469924,49.651627,5.117293,55.628016,31.431378,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
2025-05-30 15:15:00-04:00,-0.957924,276.485597,77.525142,1852.306857,-234.749348,42.969660,34.846304,6.871954,37.112381,22.071743,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
2025-05-30 15:30:00-04:00,-4.885094,356.999922,75.106290,2514.343336,-289.559817,29.965012,25.449019,3.203333,27.154316,16.567194,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0


In [48]:
# Add a trailing 26-row rolling sum and rolling mean for every topic-specific
# metric. min_periods=1 means the first rows use however many rows exist so far.
# The window counts rows of grouped_df (one per stock_time), not clock time.
# NOTE(review): 26 is meant to be the number of 15-minute periods in one trading
# day - confirm it matches the number of distinct stock_time bars per day.
rolling_window = 26

for topic in topics:
    for metric in metrics:
        base_column = f'{metric}_{topic}'
        trailing = grouped_df[base_column].rolling(window=rolling_window, min_periods=1)
        grouped_df[f'{base_column}_rolling_sum26']  = trailing.sum()
        grouped_df[f'{base_column}_rolling_mean26'] = trailing.mean()

In [49]:
# Display the frame to inspect the new rolling sum / mean columns.
grouped_df

Unnamed: 0_level_0,PCA_GKG1_0_general,PCA_GKG1_1_general,PCA_GKG1_2_general,PCA_GKG1_3_general,PCA_GKG1_4_general,PCA_Scored_0_general,PCA_Scored_1_general,PCA_Scored_2_general,PCA_Scored_3_general,PCA_Scored_4_general,...,PCA_Word_1_Allegiant Air_rolling_sum26,PCA_Word_1_Allegiant Air_rolling_mean26,PCA_Word_2_Allegiant Air_rolling_sum26,PCA_Word_2_Allegiant Air_rolling_mean26,PCA_Word_3_Allegiant Air_rolling_sum26,PCA_Word_3_Allegiant Air_rolling_mean26,PCA_Word_4_Allegiant Air_rolling_sum26,PCA_Word_4_Allegiant Air_rolling_mean26,Article Count_Allegiant Air_rolling_sum26,Article Count_Allegiant Air_rolling_mean26
stock_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-02 09:30:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,470.606655,470.606655,277.929737,277.929737,-153.613302,-153.613302,337.073112,337.073112,3.0,3.000000
2018-01-02 09:45:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,470.606655,235.303328,277.929737,138.964869,-153.613302,-76.806651,337.073112,168.536556,3.0,1.500000
2018-01-02 10:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,470.606655,156.868885,277.929737,92.643246,-153.613302,-51.204434,337.073112,112.357704,3.0,1.000000
2018-01-02 10:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,470.606655,117.651664,277.929737,69.482434,-153.613302,-38.403325,337.073112,84.268278,3.0,0.750000
2018-01-02 10:30:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,470.606655,94.121331,277.929737,55.585947,-153.613302,-30.722660,337.073112,67.414622,3.0,0.600000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-30 14:45:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,97.790666,3.761179,59.942073,2.305464,-35.781422,-1.376209,65.350848,2.513494,3.0,0.115385
2025-05-30 15:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,97.790666,3.761179,59.942073,2.305464,-35.781422,-1.376209,65.350848,2.513494,3.0,0.115385
2025-05-30 15:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,97.790666,3.761179,59.942073,2.305464,-35.781422,-1.376209,65.350848,2.513494,3.0,0.115385
2025-05-30 15:30:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,97.790666,3.761179,59.942073,2.305464,-35.781422,-1.376209,65.350848,2.513494,3.0,0.115385


In [50]:
# Rescale every feature column in place.
# - Per-bar sums: RobustScaler without centering, so a value of 0 stays 0.
#   RobustScaler divides by the interquartile range (adjusted by
#   unit_variance=True), not by the standard deviation.
# - Rolling sums / means: QuantileTransformer mapped to a normal output
#   distribution; subsample=None disables subsampling when estimating quantiles.
# Each fit_transform call refits the scaler on that single column.
# NOTE(review): every scaler is fit on the full history of the column; if the
# data is later split into train/test periods this uses future information -
# confirm that is acceptable.
scaler  = RobustScaler(with_centering=False, unit_variance=True)
scaler2 = QuantileTransformer(output_distribution='normal', subsample=None)

rolling_suffixes = ('_rolling_sum26', '_rolling_mean26')

for topic in topics:
    for metric in metrics:
        base_column = f'{metric}_{topic}'
        grouped_df[base_column] = scaler.fit_transform(grouped_df[[base_column]])
        for suffix in rolling_suffixes:
            rolling_column = f'{base_column}{suffix}'
            grouped_df[rolling_column] = scaler2.fit_transform(grouped_df[[rolling_column]])

In [51]:
# Display the frame to inspect the scaled feature values.
grouped_df

Unnamed: 0_level_0,PCA_GKG1_0_general,PCA_GKG1_1_general,PCA_GKG1_2_general,PCA_GKG1_3_general,PCA_GKG1_4_general,PCA_Scored_0_general,PCA_Scored_1_general,PCA_Scored_2_general,PCA_Scored_3_general,PCA_Scored_4_general,...,PCA_Word_1_Allegiant Air_rolling_sum26,PCA_Word_1_Allegiant Air_rolling_mean26,PCA_Word_2_Allegiant Air_rolling_sum26,PCA_Word_2_Allegiant Air_rolling_mean26,PCA_Word_3_Allegiant Air_rolling_sum26,PCA_Word_3_Allegiant Air_rolling_mean26,PCA_Word_4_Allegiant Air_rolling_sum26,PCA_Word_4_Allegiant Air_rolling_mean26,Article Count_Allegiant Air_rolling_sum26,Article Count_Allegiant Air_rolling_mean26
stock_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-02 09:30:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.475214,2.537485,0.343306,2.613062,-0.844485,-2.736450,0.483658,2.577807,-0.399276,2.248811
2018-01-02 09:45:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.475214,2.142862,0.343306,2.159706,-0.844485,-2.383705,0.483658,2.114574,-0.399276,1.786156
2018-01-02 10:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.475214,1.940986,0.343306,1.928193,-0.844485,-2.137069,0.483658,1.961331,-0.399276,1.571286
2018-01-02 10:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.475214,1.778032,0.343306,1.769419,-0.844485,-2.027489,0.483658,1.782098,-0.399276,1.337127
2018-01-02 10:30:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.475214,1.640883,0.343306,1.591507,-0.844485,-1.912889,0.483658,1.644701,-0.399276,1.177392
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-30 14:45:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.809269,-0.809269,-0.820585,-0.820585,0.232502,0.232502,-0.805580,-0.805580,-0.399276,-0.399276
2025-05-30 15:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.809269,-0.809269,-0.820585,-0.820585,0.232502,0.232502,-0.805580,-0.805580,-0.399276,-0.399276
2025-05-30 15:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.809269,-0.809269,-0.820585,-0.820585,0.232502,0.232502,-0.805580,-0.805580,-0.399276,-0.399276
2025-05-30 15:30:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.809269,-0.809269,-0.820585,-0.820585,0.232502,0.232502,-0.805580,-0.805580,-0.399276,-0.399276


In [52]:
# Persist the final feature table (indexed by stock_time) as a pickle file.
export_path = r"../../Data/Processed/GDELT_Clean_202506131403.pkl"
with open(export_path, 'wb') as export_file:
    pickle.dump(grouped_df, export_file)