In [14]:
import pandas as pd
import polars as pl
import numpy as np
import pickle, warnings, datetime
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler, QuantileTransformer
warnings.filterwarnings('ignore')

In [15]:
with open(r'../../Data/Processed/pca_fit_1.pkl', 'rb') as f:
    pca_fit_1 = pickle.load(f)

In [16]:
# Determine which times to keep based on the stock data
with open(r"../../Data/Processed/stock_data_simple.pkl", 'rb') as f:
    stock_data = pickle.load(f)
times = list(stock_data.index.unique())

In [17]:
with open(r'../../data/processed/gdelt_intermediate_cleaned.pkl', 'rb') as f:
    df = pickle.load(f)

In [None]:
df = df.to_pandas()

# Drop duplicate rows
df = df.drop_duplicates()

In [19]:
pca_fit_1.components_[1]

array([0.16050833, 0.61929442, 0.19799623, 0.61506275, 0.25588853,
       0.30481125, 0.12172906])

In [20]:
np.cumsum(pca_fit_1.explained_variance_ratio_)

array([0.37511966, 0.59530869, 0.74866325, 0.89063714, 1.        ,
       1.        , 1.        ])

In [21]:
df_gkg1 = df[['Tone',
 'Positive Score',
 'Negative Score',
 'Polarity',
 'Activity Reference Density',
 'Self/Group Reference Density',
 'Word Count']].copy()

In [22]:
# Use the PCA fit to create components for each row
for i in range(len(pca_fit_1.components_)):
    weights = pca_fit_1.components_[i]
    df[f'PCA_GKG1_{i}'] = np.matmul(df_gkg1, pca_fit_1.components_[i])
    # This does a row-wise dot product between the GKG1 data and the PCA component weights

In [23]:
# Create topic-specific metrics columns
df['Article Count'] = 1

topics = ['airplane','airline','airport','Alaska Airlines','American Airlines','Delta Air Lines','Frontier Airlines','Hawaiian Airlines','JetBlue','Southwest Airlines','Spirit Airlines','Sun Country Airlines','United Airlines','Allegiant Air']
metrics = ['PCA_GKG1_0','PCA_GKG1_1','PCA_GKG1_2','PCA_GKG1_3','Article Count']

for topic in topics:
    for metric in metrics:
        df[f'{metric}_{topic}'] = df[metric] * df[topic]

In [11]:
# Create a grouped dataframe, grouped by datetime, that creates a sum for each metric
grouped_df = df.groupby('datetime').agg(
    {f'{metric}_{topic}': ['sum'] for topic in topics for metric in metrics}
)

# Flatten the MultiIndex columns
grouped_df.columns = ['_'.join(col).strip().replace('_sum','') for col in grouped_df.columns.values]

grouped_df['datetime'] = pd.to_datetime(grouped_df.index, errors='coerce')

In [75]:
# Fill in all missing times
start = datetime.datetime(2018, 1, 1, 0, 15, 0)
end   = datetime.datetime(2025, 5, 31, 23, 45, 0)
dates = pd.date_range(start=start, end=end, freq='15min')
grouped_df = grouped_df.set_index('datetime').reindex(dates).reset_index()
grouped_df = grouped_df.fillna(0)

grouped_df['datetime'] = grouped_df['index']

In [76]:
##### Handling after-hours articles #####
# Convert 'datetime' to datetime type

# Move the time back by 4 hours to account for UTC
grouped_df['datetime_EST'] = grouped_df['datetime'] - pd.Timedelta(hours=4)

# Join with stock data to filter out after-hours articles
grouped_df['stock_time'] = np.where(grouped_df['datetime_EST'].isin(times), grouped_df['datetime_EST'], pd.NaT)

# format the stock_time column
grouped_df['stock_time'] = pd.to_datetime(grouped_df['stock_time'])

grouped_df = grouped_df.sort_values(by='datetime')

# Backfill the stock_time2 column
grouped_df['stock_time'] = grouped_df['stock_time'].fillna(method='bfill')

In [77]:
# Group again to get rid of duplicate stock times
grouped_df = grouped_df.groupby('stock_time').agg(
    {f'{metric}_{topic}': ['sum'] for topic in topics for metric in metrics}
)

# Flatten the MultiIndex columns
grouped_df.columns = ['_'.join(col).strip().replace('_sum','') for col in grouped_df.columns.values]

In [78]:
# For each column, create a 26-period (# of 15-min periods in 1 trading day) rolling sum and rolling mean
for topic in topics:
    for metric in metrics:
        grouped_df[f'{metric}_{topic}_rolling_sum26']  = grouped_df[f'{metric}_{topic}'].rolling(window=26, min_periods=1).sum()
        grouped_df[f'{metric}_{topic}_rolling_mean26'] = grouped_df[f'{metric}_{topic}'].rolling(window=26, min_periods=1).mean()

In [79]:
grouped_df

Unnamed: 0_level_0,PCA_GKG1_0_airplane,PCA_GKG1_1_airplane,PCA_GKG1_2_airplane,PCA_GKG1_3_airplane,Article Count_airplane,PCA_GKG1_0_airline,PCA_GKG1_1_airline,PCA_GKG1_2_airline,PCA_GKG1_3_airline,Article Count_airline,...,PCA_GKG1_0_Allegiant Air_rolling_sum26,PCA_GKG1_0_Allegiant Air_rolling_mean26,PCA_GKG1_1_Allegiant Air_rolling_sum26,PCA_GKG1_1_Allegiant Air_rolling_mean26,PCA_GKG1_2_Allegiant Air_rolling_sum26,PCA_GKG1_2_Allegiant Air_rolling_mean26,PCA_GKG1_3_Allegiant Air_rolling_sum26,PCA_GKG1_3_Allegiant Air_rolling_mean26,Article Count_Allegiant Air_rolling_sum26,Article Count_Allegiant Air_rolling_mean26
stock_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-02 09:30:00,0.0,0.0,0.0,0.0,0.0,-357.779842,28535.161374,8742.527012,229202.964605,256.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
2018-01-02 09:45:00,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
2018-01-02 10:00:00,0.0,0.0,0.0,0.0,0.0,43.520679,1526.447575,309.905905,13323.982955,3.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
2018-01-02 10:15:00,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
2018-01-02 10:30:00,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-30 14:45:00,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,...,-3.554305,-0.136704,130.188707,5.007258,55.926626,2.151024,880.817169,33.877583,3.0,0.115385
2025-05-30 15:00:00,0.0,0.0,0.0,0.0,0.0,-32.796213,357.333991,149.419245,2535.596087,6.0,...,-3.554305,-0.136704,130.188707,5.007258,55.926626,2.151024,880.817169,33.877583,3.0,0.115385
2025-05-30 15:15:00,0.0,0.0,0.0,0.0,0.0,1.399769,252.245547,96.077355,1859.869222,4.0,...,-3.554305,-0.136704,130.188707,5.007258,55.926626,2.151024,880.817169,33.877583,3.0,0.115385
2025-05-30 15:30:00,0.0,0.0,0.0,0.0,0.0,-2.027934,323.829997,100.459546,2524.019900,3.0,...,-3.554305,-0.136704,130.188707,5.007258,55.926626,2.151024,880.817169,33.877583,3.0,0.115385


In [80]:
# standardize the PCA vars, but only divide by SD so that we keep the 0s in
scaler  = RobustScaler(with_centering=False, unit_variance=True)
scaler2 = QuantileTransformer(output_distribution='normal', subsample=None)

for topic in topics:
    for metric in metrics:
        grouped_df[f'{metric}_{topic}']                =  scaler.fit_transform(grouped_df[[f'{metric}_{topic}']])
        grouped_df[f'{metric}_{topic}_rolling_sum26']  = scaler2.fit_transform(grouped_df[[f'{metric}_{topic}_rolling_sum26']])
        grouped_df[f'{metric}_{topic}_rolling_mean26'] = scaler2.fit_transform(grouped_df[[f'{metric}_{topic}_rolling_mean26']])

In [81]:
grouped_df

Unnamed: 0_level_0,PCA_GKG1_0_airplane,PCA_GKG1_1_airplane,PCA_GKG1_2_airplane,PCA_GKG1_3_airplane,Article Count_airplane,PCA_GKG1_0_airline,PCA_GKG1_1_airline,PCA_GKG1_2_airline,PCA_GKG1_3_airline,Article Count_airline,...,PCA_GKG1_0_Allegiant Air_rolling_sum26,PCA_GKG1_0_Allegiant Air_rolling_mean26,PCA_GKG1_1_Allegiant Air_rolling_sum26,PCA_GKG1_1_Allegiant Air_rolling_mean26,PCA_GKG1_2_Allegiant Air_rolling_sum26,PCA_GKG1_2_Allegiant Air_rolling_mean26,PCA_GKG1_3_Allegiant Air_rolling_sum26,PCA_GKG1_3_Allegiant Air_rolling_mean26,Article Count_Allegiant Air_rolling_sum26,Article Count_Allegiant Air_rolling_mean26
stock_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-02 09:30:00,0.0,0.0,0.0,0.0,0.0,-18.818718,55.401749,50.850571,58.912623,43.167344,...,0.143512,0.143512,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338
2018-01-02 09:45:00,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.143512,0.143512,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338
2018-01-02 10:00:00,0.0,0.0,0.0,0.0,0.0,2.289127,2.963637,1.802556,3.424697,0.505867,...,0.143512,0.143512,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338
2018-01-02 10:15:00,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.143512,0.143512,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338
2018-01-02 10:30:00,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.143512,0.143512,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-30 14:45:00,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,...,-0.393287,-0.393287,-0.569264,-0.569264,-0.453739,-0.453739,-0.626512,-0.626512,-0.146048,-0.146048
2025-05-30 15:00:00,0.0,0.0,0.0,0.0,0.0,-1.725035,0.693773,0.869091,0.651731,1.011735,...,-0.393287,-0.393287,-0.569264,-0.569264,-0.453739,-0.453739,-0.626512,-0.626512,-0.146048,-0.146048
2025-05-30 15:15:00,0.0,0.0,0.0,0.0,0.0,0.073626,0.489741,0.558830,0.478047,0.674490,...,-0.393287,-0.393287,-0.569264,-0.569264,-0.453739,-0.453739,-0.626512,-0.626512,-0.146048,-0.146048
2025-05-30 15:30:00,0.0,0.0,0.0,0.0,0.0,-0.106666,0.628724,0.584319,0.648755,0.505867,...,-0.393287,-0.393287,-0.569264,-0.569264,-0.453739,-0.453739,-0.626512,-0.626512,-0.146048,-0.146048


In [82]:
# Export to pickle object
with open(r"../../Data/Processed/GDELT_Clean_202506101513.pkl", 'wb') as f:
    pickle.dump(grouped_df, f)