In [27]:
import pandas as pd
import polars as pl
import numpy as np
import pickle, warnings, datetime
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler, QuantileTransformer
warnings.filterwarnings('ignore')

In [28]:
with open(r'../../Data/Processed/pca_fit_1.pkl', 'rb') as f:
    pca_fit_1 = pickle.load(f)

In [29]:
# Determine which times to keep based on the stock data
with open(r"../../Data/Processed/stock_data_simple.pkl", 'rb') as f:
    stock_data = pickle.load(f)
times = list(stock_data.index.unique())

In [30]:
with open(r'../../data/processed/gdelt_intermediate_cleaned.pkl', 'rb') as f:
    df = pickle.load(f)

In [31]:
df = df.to_pandas()

# Drop duplicate rows
df = df.drop_duplicates()

In [32]:
pca_fit_1.components_[1]

array([0.16050833, 0.61929442, 0.19799623, 0.61506275, 0.25588853,
       0.30481125, 0.12172906])

In [33]:
np.cumsum(pca_fit_1.explained_variance_ratio_)

array([0.37511966, 0.59530869, 0.74866325, 0.89063714, 1.        ,
       1.        , 1.        ])

In [34]:
df_gkg1 = df[['Tone',
 'Positive Score',
 'Negative Score',
 'Polarity',
 'Activity Reference Density',
 'Self/Group Reference Density',
 'Word Count']].copy()

In [35]:
# Use the PCA fit to create components for each row
for i in range(len(pca_fit_1.components_)):
    weights = pca_fit_1.components_[i]
    df[f'PCA_GKG1_{i}'] = np.matmul(df_gkg1, pca_fit_1.components_[i])
    # This does a row-wise dot product between the GKG1 data and the PCA component weights

In [36]:
# Create topic-specific metrics columns
df['Article Count'] = 1

topics = ['airplane','airline','airport','Alaska Airlines','American Airlines','Delta Air Lines','Frontier Airlines','Hawaiian Airlines','JetBlue','Southwest Airlines','Spirit Airlines','Sun Country Airlines','United Airlines','Allegiant Air']
metrics = ['PCA_GKG1_0','PCA_GKG1_1','PCA_GKG1_2','PCA_GKG1_3','Article Count']

for topic in topics:
    for metric in metrics:
        df[f'{metric}_{topic}'] = df[metric] * df[topic]

In [37]:
# Create a grouped dataframe, grouped by datetime, that creates a sum for each metric
grouped_df = df.groupby('datetime').agg(
    {f'{metric}_{topic}': ['sum'] for topic in topics for metric in metrics}
)

# Flatten the MultiIndex columns
grouped_df.columns = ['_'.join(col).strip().replace('_sum','') for col in grouped_df.columns.values]

grouped_df['datetime'] = pd.to_datetime(grouped_df.index, errors='coerce')

In [38]:
# Fill in all missing times
start = datetime.datetime(2018, 1, 1, 0, 15, 0)
end   = datetime.datetime(2025, 5, 31, 23, 45, 0)
dates = pd.date_range(start=start, end=end, freq='15min')
grouped_df = grouped_df.set_index('datetime').reindex(dates).reset_index()
grouped_df = grouped_df.fillna(0)

grouped_df['datetime'] = grouped_df['index']

In [39]:
##### Handling after-hours articles #####
# Convert 'datetime' to datetime type

# Move the time back by 4 hours to account for UTC
grouped_df['datetime_EST'] = grouped_df['datetime'] - pd.Timedelta(hours=4)

# Join with stock data to filter out after-hours articles
grouped_df['stock_time'] = np.where(grouped_df['datetime_EST'].isin(times), grouped_df['datetime_EST'], pd.NaT)

# format the stock_time column
grouped_df['stock_time'] = pd.to_datetime(grouped_df['stock_time'])

grouped_df = grouped_df.sort_values(by='datetime')

# Backfill the stock_time2 column
grouped_df['stock_time'] = grouped_df['stock_time'].fillna(method='bfill')

In [40]:
# Group again to get rid of duplicate stock times
grouped_df = grouped_df.groupby('stock_time').agg(
    {f'{metric}_{topic}': ['sum'] for topic in topics for metric in metrics}
)

# Flatten the MultiIndex columns
grouped_df.columns = ['_'.join(col).strip().replace('_sum','') for col in grouped_df.columns.values]

In [41]:
# For each column, create a 26-period (# of 15-min periods in 1 trading day) rolling sum and rolling mean
for topic in topics:
    for metric in metrics:
        grouped_df[f'{metric}_{topic}_rolling_sum26']  = grouped_df[f'{metric}_{topic}'].rolling(window=26, min_periods=1).sum()
        grouped_df[f'{metric}_{topic}_rolling_mean26'] = grouped_df[f'{metric}_{topic}'].rolling(window=26, min_periods=1).mean()

In [42]:
grouped_df

Unnamed: 0_level_0,PCA_GKG1_0_airplane,PCA_GKG1_1_airplane,PCA_GKG1_2_airplane,PCA_GKG1_3_airplane,Article Count_airplane,PCA_GKG1_0_airline,PCA_GKG1_1_airline,PCA_GKG1_2_airline,PCA_GKG1_3_airline,Article Count_airline,...,PCA_GKG1_0_Allegiant Air_rolling_sum26,PCA_GKG1_0_Allegiant Air_rolling_mean26,PCA_GKG1_1_Allegiant Air_rolling_sum26,PCA_GKG1_1_Allegiant Air_rolling_mean26,PCA_GKG1_2_Allegiant Air_rolling_sum26,PCA_GKG1_2_Allegiant Air_rolling_mean26,PCA_GKG1_3_Allegiant Air_rolling_sum26,PCA_GKG1_3_Allegiant Air_rolling_mean26,Article Count_Allegiant Air_rolling_sum26,Article Count_Allegiant Air_rolling_mean26
stock_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-02 09:30:00,0.0,0.0,0.0,0.0,0.0,-841.230005,45154.527962,9552.451582,333277.209417,396.0,...,-4.107011,-4.107011,592.867980,592.867980,87.498706,87.498706,4553.904146,4553.904146,3.0,3.000000
2018-01-02 09:45:00,0.0,0.0,0.0,0.0,0.0,18.897952,1022.118533,130.498405,8044.811790,3.0,...,-4.107011,-2.053506,592.867980,296.433990,87.498706,43.749353,4553.904146,2276.952073,3.0,1.500000
2018-01-02 10:00:00,0.0,0.0,0.0,0.0,0.0,34.370800,1660.369206,170.509920,13288.977480,3.0,...,-4.107011,-1.369004,592.867980,197.622660,87.498706,29.166235,4553.904146,1517.968049,3.0,1.000000
2018-01-02 10:15:00,0.0,0.0,0.0,0.0,0.0,-18.812391,321.804258,86.778057,2264.398933,4.0,...,-4.107011,-1.026753,592.867980,148.216995,87.498706,21.874677,4553.904146,1138.476036,3.0,0.750000
2018-01-02 10:30:00,0.0,0.0,0.0,0.0,0.0,-10.398101,306.799835,63.485890,2172.180420,3.0,...,-4.107011,-0.821402,592.867980,118.573596,87.498706,17.499741,4553.904146,910.780829,3.0,0.600000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-30 14:45:00,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,...,-4.517552,-0.173752,138.709211,5.334970,46.899356,1.803821,877.952456,33.767402,3.0,0.115385
2025-05-30 15:00:00,0.0,0.0,0.0,0.0,0.0,-35.396409,382.128452,123.098202,2527.885847,6.0,...,-4.517552,-0.173752,138.709211,5.334970,46.899356,1.803821,877.952456,33.767402,3.0,0.115385
2025-05-30 15:15:00,0.0,0.0,0.0,0.0,0.0,-0.398220,270.478169,76.901482,1854.298029,4.0,...,-4.517552,-0.173752,138.709211,5.334970,46.899356,1.803821,877.952456,33.767402,3.0,0.115385
2025-05-30 15:30:00,0.0,0.0,0.0,0.0,0.0,-4.160261,348.857994,74.223793,2516.837176,3.0,...,-4.517552,-0.173752,138.709211,5.334970,46.899356,1.803821,877.952456,33.767402,3.0,0.115385


In [43]:
# standardize the PCA vars, but only divide by SD so that we keep the 0s in
scaler  = RobustScaler(with_centering=False, unit_variance=True)
scaler2 = QuantileTransformer(output_distribution='normal', subsample=None)

for topic in topics:
    for metric in metrics:
        grouped_df[f'{metric}_{topic}']                =  scaler.fit_transform(grouped_df[[f'{metric}_{topic}']])
        grouped_df[f'{metric}_{topic}_rolling_sum26']  = scaler2.fit_transform(grouped_df[[f'{metric}_{topic}_rolling_sum26']])
        grouped_df[f'{metric}_{topic}_rolling_mean26'] = scaler2.fit_transform(grouped_df[[f'{metric}_{topic}_rolling_mean26']])

In [44]:
grouped_df

Unnamed: 0_level_0,PCA_GKG1_0_airplane,PCA_GKG1_1_airplane,PCA_GKG1_2_airplane,PCA_GKG1_3_airplane,Article Count_airplane,PCA_GKG1_0_airline,PCA_GKG1_1_airline,PCA_GKG1_2_airline,PCA_GKG1_3_airline,Article Count_airline,...,PCA_GKG1_0_Allegiant Air_rolling_sum26,PCA_GKG1_0_Allegiant Air_rolling_mean26,PCA_GKG1_1_Allegiant Air_rolling_sum26,PCA_GKG1_1_Allegiant Air_rolling_mean26,PCA_GKG1_2_Allegiant Air_rolling_sum26,PCA_GKG1_2_Allegiant Air_rolling_mean26,PCA_GKG1_3_Allegiant Air_rolling_sum26,PCA_GKG1_3_Allegiant Air_rolling_mean26,Article Count_Allegiant Air_rolling_sum26,Article Count_Allegiant Air_rolling_mean26
stock_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-02 09:30:00,0.0,0.0,0.0,0.0,0.0,-32.202504,87.968435,82.823131,91.042500,76.313697,...,-0.123041,-1.759320,0.333524,2.578277,-0.113625,2.429153,0.415134,2.596582,-0.399276,2.248811
2018-01-02 09:45:00,0.0,0.0,0.0,0.0,0.0,0.723419,1.991255,1.131467,2.197629,0.578134,...,-0.123041,-1.470150,0.333524,2.129706,-0.113625,1.980879,0.415134,2.147974,-0.399276,1.786156
2018-01-02 10:00:00,0.0,0.0,0.0,0.0,0.0,1.315723,3.234672,1.478381,3.630196,0.578134,...,-0.123041,-1.278959,0.333524,1.928815,-0.113625,1.708514,0.415134,1.939647,-0.399276,1.571286
2018-01-02 10:15:00,0.0,0.0,0.0,0.0,0.0,-0.720143,0.626928,0.752396,0.618574,0.770845,...,-0.123041,-1.106956,0.333524,1.786712,-0.113625,1.546698,0.415134,1.821490,-0.399276,1.337127
2018-01-02 10:30:00,0.0,0.0,0.0,0.0,0.0,-0.398042,0.597696,0.550445,0.593382,0.578134,...,-0.123041,-0.999170,0.333524,1.622691,-0.113625,1.379667,0.415134,1.640711,-0.399276,1.177392
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-30 14:45:00,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,-0.159406,-0.158136,-0.821509,-0.821509,-0.642125,-0.642125,-0.868378,-0.868378,-0.399276,-0.399276
2025-05-30 15:00:00,0.0,0.0,0.0,0.0,0.0,-1.354984,0.744449,1.067305,0.690551,1.156268,...,-0.159406,-0.158136,-0.821509,-0.821509,-0.642125,-0.642125,-0.868378,-0.868378,-0.399276,-0.399276
2025-05-30 15:15:00,0.0,0.0,0.0,0.0,0.0,-0.015244,0.526936,0.666763,0.506545,0.770845,...,-0.159406,-0.158136,-0.821509,-0.821509,-0.642125,-0.642125,-0.868378,-0.868378,-0.399276,-0.399276
2025-05-30 15:30:00,0.0,0.0,0.0,0.0,0.0,-0.159256,0.679633,0.643547,0.687533,0.578134,...,-0.159406,-0.158136,-0.821509,-0.821509,-0.642125,-0.642125,-0.868378,-0.868378,-0.399276,-0.399276


In [45]:
# Export to pickle object
with open(r"../../Data/Processed/GDELT_Clean_202506101513.pkl", 'wb') as f:
    pickle.dump(grouped_df, f)