In [39]:
import pandas as pd
import polars as pl
import numpy as np
import pickle, warnings, datetime, pytz
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler, QuantileTransformer, MinMaxScaler
warnings.filterwarnings('ignore')

In [40]:
with open('../../data/processed/gdelt_pca.pkl', 'rb') as f:
    pca_data = pickle.load(f)

In [41]:
# Determine which times to keep based on the stock data
with open(r"../../Data/Processed/stock_data_simple.pkl", 'rb') as f:
    stock_data = pickle.load(f)

stock_data['date'] = stock_data.index.date
days = list(stock_data['date'].unique())

In [42]:
with open(r'../../data/processed/gdelt_intermediate_cleaned.pkl', 'rb') as f:
    df = pickle.load(f)

In [43]:
df = df.to_pandas()

# Drop duplicate rows
df.drop_duplicates(subset=['GKGRECORDID'], inplace=True)

df.index = df['GKGRECORDID']
df.drop(columns=['GKGRECORDID'], inplace=True)
df.drop(columns=['Positive Score','Negative Score','Activity Reference Density','Self/Group Reference Density'], inplace=True)
df.drop(columns=[i for i in df.columns if 'SCOREDVALUE' in i], inplace=True)
df.drop(columns=[i for i in df.columns if 'WORDCOUNT' in i], inplace=True)

In [44]:
df = pd.merge(df, pca_data, on='GKGRECORDID', how='inner')

In [45]:
list(df.columns)

['V2SOURCECOMMONNAME',
 'V2DOCUMENTIDENTIFIER',
 'V1THEMES',
 'datetime',
 'date',
 'airplane',
 'airline',
 'airport',
 'Alaska Airlines',
 'American Airlines',
 'Delta Air Lines',
 'Frontier Airlines',
 'Hawaiian Airlines',
 'JetBlue',
 'Southwest Airlines',
 'Spirit Airlines',
 'Sun Country Airlines',
 'United Airlines',
 'Allegiant Air',
 'article_title',
 'Tone',
 'Polarity',
 'Word Count',
 'PCA_GKG1_0',
 'PCA_GKG1_1',
 'PCA_GKG1_2',
 'PCA_GKG1_3',
 'PCA_GKG1_4',
 'PCA_GKG1_5',
 'PCA_GKG1_6',
 'PCA_Scored_0',
 'PCA_Scored_1',
 'PCA_Scored_2',
 'PCA_Scored_3',
 'PCA_Scored_4',
 'PCA_Scored_5',
 'PCA_Scored_6',
 'PCA_Scored_7',
 'PCA_Scored_8',
 'PCA_Scored_9',
 'PCA_Scored_10',
 'PCA_Scored_11',
 'PCA_Scored_12',
 'PCA_Scored_13',
 'PCA_Scored_14',
 'PCA_Scored_15',
 'PCA_Scored_16',
 'PCA_Scored_17',
 'PCA_Scored_18',
 'PCA_Scored_19',
 'PCA_Scored_20',
 'PCA_Scored_21',
 'PCA_Scored_22',
 'PCA_Scored_23',
 'PCA_Scored_24',
 'PCA_Scored_25',
 'PCA_Scored_26',
 'PCA_Scored_27',
 '

In [46]:
# Create topic-specific metrics columns
df['Article Count'] = 1

df['general'] = 1

topics  = ['general','Alaska Airlines','American Airlines','Delta Air Lines','JetBlue','Southwest Airlines','United Airlines','Allegiant Air']
metrics = ['Tone','Polarity','Word Count',
           'PCA_GKG1_0','PCA_GKG1_1','PCA_GKG1_2','PCA_GKG1_3','PCA_GKG1_4',
           'PCA_Scored_0','PCA_Scored_1','PCA_Scored_2','PCA_Scored_3','PCA_Scored_4',
           'PCA_Word_0','PCA_Word_1','PCA_Word_2','PCA_Word_3','PCA_Word_4',
           'Article Count']

for topic in topics:
    for metric in metrics:
        df[f'{metric}_{topic}'] = df[metric] * df[topic]

In [47]:
# Create a grouped dataframe, grouped by datetime, that creates a sum for each metric
df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')

grouped_df = df.groupby('datetime').agg(
    {f'{metric}_{topic}': ['sum'] for topic in topics for metric in metrics}
)

# Flatten the MultiIndex columns
grouped_df.columns = ['_'.join(col).strip().replace('_sum','') for col in grouped_df.columns.values]

In [48]:
# Fill in all missing times
start = datetime.datetime(2018, 1, 1, 0, 15, 0)
end   = datetime.datetime(2025, 5, 31, 23, 45, 0)
dates = pd.date_range(start=start, end=end, freq='15min')
grouped_df = grouped_df.reindex(dates).reset_index()
grouped_df = grouped_df.fillna(0)

grouped_df['datetime'] = grouped_df['index']
grouped_df.drop(columns=['index'], inplace=True)

In [49]:
# Convert from UTC to EST, accounting for daylight saving time
grouped_df['datetime'] = pd.to_datetime(grouped_df['datetime'], utc=True)
grouped_df['datetime_EST'] = grouped_df['datetime'].dt.tz_convert('America/New_York')
grouped_df['time'] = grouped_df['datetime_EST'].dt.time
grouped_df['date'] = grouped_df['datetime_EST'].dt.date

In [50]:
##### Handling after-hours articles #####

# Join with stock data
grouped_df['stock_time'] = np.where(grouped_df['date'].isin(days), grouped_df['datetime_EST'], pd.NaT)
# Limit times after 15:45 and before 9:15
grouped_df['stock_time'] = np.where(grouped_df['time'] > datetime.time(15,45,0), pd.NaT, grouped_df['stock_time'])
grouped_df['stock_time'] = np.where(grouped_df['time'] < datetime.time(9,30,0) , pd.NaT, grouped_df['stock_time'])
# format the stock_time column
grouped_df['stock_time'] = pd.to_datetime(grouped_df['stock_time'])
grouped_df = grouped_df.sort_values(by='datetime')
# Backfill the stock_time2 column
grouped_df['stock_time'] = grouped_df['stock_time'].ffill().bfill()
# Remove the timezone information 
grouped_df['stock_time'] = grouped_df['stock_time'].dt.tz_localize(None)
#grouped_df.drop(columns=['datetime', 'datetime_EST', 'time', 'date'], inplace=True)

In [51]:
# The choice of window here ultimately affects how much after-hours time should be counted towards market open
# For example, a 4-period window would mean that articles from 8:15 to 9:15 are counted towards the 9:30 period

windows = [4, 8, 16]

for window in windows:
    for topic in topics:
        for metric in metrics:
            grouped_df[f'{metric}_{topic}_{window}'] = grouped_df[f'{metric}_{topic}'].rolling(window, min_periods=1).sum()

In [52]:
grouped_df.columns

Index(['Tone_general', 'Polarity_general', 'Word Count_general',
       'PCA_GKG1_0_general', 'PCA_GKG1_1_general', 'PCA_GKG1_2_general',
       'PCA_GKG1_3_general', 'PCA_GKG1_4_general', 'PCA_Scored_0_general',
       'PCA_Scored_1_general',
       ...
       'PCA_Scored_1_Allegiant Air_16', 'PCA_Scored_2_Allegiant Air_16',
       'PCA_Scored_3_Allegiant Air_16', 'PCA_Scored_4_Allegiant Air_16',
       'PCA_Word_0_Allegiant Air_16', 'PCA_Word_1_Allegiant Air_16',
       'PCA_Word_2_Allegiant Air_16', 'PCA_Word_3_Allegiant Air_16',
       'PCA_Word_4_Allegiant Air_16', 'Article Count_Allegiant Air_16'],
      dtype='object', length=613)

In [None]:
grouped_df

Unnamed: 0,Tone_general,Polarity_general,Word Count_general,PCA_GKG1_0_general,PCA_GKG1_1_general,PCA_GKG1_2_general,PCA_GKG1_3_general,PCA_GKG1_4_general,PCA_Scored_0_general,PCA_Scored_1_general,...,PCA_Scored_1_Allegiant Air_16,PCA_Scored_2_Allegiant Air_16,PCA_Scored_3_Allegiant Air_16,PCA_Scored_4_Allegiant Air_16,PCA_Word_0_Allegiant Air_16,PCA_Word_1_Allegiant Air_16,PCA_Word_2_Allegiant Air_16,PCA_Word_3_Allegiant Air_16,PCA_Word_4_Allegiant Air_16,Article Count_Allegiant Air_16
0,-7.430732,13.882776,1009.0,-15.608664,149.508624,40.509145,990.467900,-123.780687,19.596263,16.069757,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-6.994452,19.982017,2247.0,-18.227278,322.422107,84.415447,2207.481664,-271.464210,39.921930,34.069068,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-14.590960,33.605149,3291.0,-33.135903,474.508079,121.104052,3233.180481,-394.529595,59.318687,51.011164,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-2.680700,6.221132,1331.0,-5.640297,183.383847,45.157943,1309.101697,-155.574892,19.579862,16.116685,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-2.259993,8.124766,5067.0,7.939434,646.253922,65.711500,4999.901494,-503.997912,9.694964,8.299846,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259962,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
259963,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
259964,-1.392758,5.663881,991.0,-2.846281,134.073859,22.446480,975.933760,-107.565659,9.623811,8.599018,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
259965,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [54]:
# Now we keep the last of each stock time to remove duplicate values.
grouped_df = grouped_df.drop_duplicates(subset=['stock_time'], keep='last')
grouped_df.index = grouped_df['stock_time']
grouped_df.drop(columns=['stock_time'], inplace=True)
grouped_df.sort_index(inplace=True)
grouped_df

Unnamed: 0_level_0,Tone_general,Polarity_general,Word Count_general,PCA_GKG1_0_general,PCA_GKG1_1_general,PCA_GKG1_2_general,PCA_GKG1_3_general,PCA_GKG1_4_general,PCA_Scored_0_general,PCA_Scored_1_general,...,PCA_Scored_1_Allegiant Air_16,PCA_Scored_2_Allegiant Air_16,PCA_Scored_3_Allegiant Air_16,PCA_Scored_4_Allegiant Air_16,PCA_Word_0_Allegiant Air_16,PCA_Word_1_Allegiant Air_16,PCA_Word_2_Allegiant Air_16,PCA_Word_3_Allegiant Air_16,PCA_Word_4_Allegiant Air_16,Article Count_Allegiant Air_16
stock_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-02 09:30:00,-1.817716,20.837173,2207.0,-11.020094,313.841897,64.239984,2169.990105,-253.704281,29.097463,24.931743,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-01-02 09:45:00,-2.788845,2.788845,216.0,-5.427405,34.124221,16.814741,211.105035,-32.403733,9.694754,7.559250,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-01-02 10:00:00,4.677223,12.279291,3476.0,4.914528,468.332905,84.398585,3423.382233,-376.383910,30.645763,26.111326,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-01-02 10:15:00,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-01-02 10:30:00,-3.405895,15.378354,1988.0,-12.041481,284.405394,76.200454,1953.011078,-241.236697,40.127573,33.396032,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-30 14:45:00,-2.396514,14.379085,815.0,-8.706287,121.683527,19.788283,800.969342,-90.511717,9.994784,8.326840,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-05-30 15:00:00,-14.529022,34.685461,2575.0,-36.167956,390.346595,123.913349,2525.067832,-329.083853,57.469924,49.651627,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-05-30 15:15:00,6.850257,14.534555,1887.0,-0.957924,276.485597,77.525142,1852.306857,-234.749348,42.969660,34.846304,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2025-05-30 15:30:00,0.998197,16.897969,2556.0,-4.885094,356.999922,75.106290,2514.343336,-289.559817,29.965012,25.449019,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# Group again to get rid of duplicate stock times
grouped_df = grouped_df.groupby('stock_time').agg(
#    {f'{metric}_{topic}': ['sum','mean'] for topic in topics for metric in metrics}
#    {f'{metric}_{topic}': ['sum'] for topic in topics for metric in metrics}
    {f'{metric}_{topic}_{window}': ['mean'] for topic in topics for metric in metrics for window in windows}
)

# Flatten the MultiIndex columns
#grouped_df.columns = ['_'.join(col).strip().replace('_sum','') for col in grouped_df.columns.values]
grouped_df.columns = ['_'.join(col).strip().replace('_mean','') for col in grouped_df.columns.values]
#grouped_df.columns = ['_'.join(col).strip().replace('_sum','').replace('_mean','') for col in grouped_df.columns.values]
#grouped_df.columns = ['_'.join(col).strip() for col in grouped_df.columns.values]

grouped_df.sort_index(inplace=True)

In [37]:
grouped_df

Unnamed: 0_level_0,Tone_general_4,Tone_general_8,Tone_general_16,Polarity_general_4,Polarity_general_8,Polarity_general_16,Word Count_general_4,Word Count_general_8,Word Count_general_16,PCA_GKG1_0_general_4,...,PCA_Word_2_Allegiant Air_16,PCA_Word_3_Allegiant Air_4,PCA_Word_3_Allegiant Air_8,PCA_Word_3_Allegiant Air_16,PCA_Word_4_Allegiant Air_4,PCA_Word_4_Allegiant Air_8,PCA_Word_4_Allegiant Air_16,Article Count_Allegiant Air_4,Article Count_Allegiant Air_8,Article Count_Allegiant Air_16
stock_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-02 09:30:00,-10.834549,-21.399580,-42.592870,50.001946,98.771373,193.515783,10448.409091,20337.532468,39784.759740,-26.260086,...,28.875817,-3.989956,-7.979912,-15.959824,8.755146,17.510292,35.020583,0.077922,0.155844,0.311688
2018-01-02 09:45:00,-14.414311,-19.987304,-13.109345,55.157293,84.623146,164.334796,18428.000000,28654.000000,38487.000000,-5.400037,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2018-01-02 10:00:00,-7.096079,-16.253478,-9.515888,58.169009,95.959041,159.925956,8450.000000,31946.000000,40631.000000,-31.210818,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2018-01-02 10:15:00,0.070662,-16.253478,-10.112649,35.905309,95.959041,155.407627,5899.000000,31946.000000,38493.000000,-11.532971,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2018-01-02 10:30:00,-1.517517,-11.968819,-18.218283,30.446489,92.944787,166.086241,5680.000000,32041.000000,40140.000000,-12.554357,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-30 14:45:00,-8.217379,-5.241426,-1.922811,66.368091,153.519825,276.156318,4978.000000,15016.000000,25819.000000,-43.372442,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2025-05-30 15:00:00,-8.173096,-21.303607,-9.374147,69.905302,178.612743,300.477709,5026.000000,15470.000000,27976.000000,-45.648102,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2025-05-30 15:15:00,-0.396913,-22.325427,-5.888299,79.069486,174.360156,307.407572,6427.000000,14765.000000,29088.000000,-42.797025,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2025-05-30 15:30:00,-9.077083,-17.552863,3.895428,80.497069,150.013318,314.486419,7833.000000,13463.000000,31287.000000,-50.717262,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [225]:
# For each column, create a 26-period (# of 15-min periods in 1 trading day) rolling sum
for topic in topics:
    for metric in metrics:
        # Create a first difference of the rolling sum
        grouped_df[f'{metric}_{topic}_diff'] = grouped_df[f'{metric}_{topic}'].diff().fillna(0)

        grouped_df[f'{metric}_{topic}_rolling26'] = grouped_df[f'{metric}_{topic}'].rolling(window=26, min_periods=1).sum()
        # Create a first difference of the rolling sum
        grouped_df[f'{metric}_{topic}_rolling26_diff'] = grouped_df[f'{metric}_{topic}_rolling26'].diff().fillna(0)

In [226]:
# standardize the PCA vars, but only divide by SD so that we keep the 0s in
scaler  = RobustScaler(with_centering=False, unit_variance=True)
#scaler2 = QuantileTransformer(output_distribution='normal', subsample=None)
scaler2 = MinMaxScaler(feature_range=(0, 1))
for topic in topics:
    for metric in metrics:
        grouped_df[f'{metric}_{topic}']                =  scaler.fit_transform(grouped_df[[f'{metric}_{topic}']])
        grouped_df[f'{metric}_{topic}_diff']           =  scaler.fit_transform(grouped_df[[f'{metric}_{topic}_diff']])
  #      grouped_df[f'{metric}_{topic}_rolling26']      = scaler2.fit_transform(grouped_df[[f'{metric}_{topic}_rolling26']])

In [227]:
grouped_df

Unnamed: 0_level_0,PCA_GKG1_0_general,PCA_GKG1_1_general,PCA_GKG1_2_general,PCA_GKG1_3_general,PCA_GKG1_4_general,PCA_Scored_0_general,PCA_Scored_1_general,PCA_Scored_2_general,PCA_Scored_3_general,PCA_Scored_4_general,...,PCA_Word_2_Allegiant Air_rolling26_diff,PCA_Word_3_Allegiant Air_diff,PCA_Word_3_Allegiant Air_rolling26,PCA_Word_3_Allegiant Air_rolling26_diff,PCA_Word_4_Allegiant Air_diff,PCA_Word_4_Allegiant Air_rolling26,PCA_Word_4_Allegiant Air_rolling26_diff,Article Count_Allegiant Air_diff,Article Count_Allegiant Air_rolling26,Article Count_Allegiant Air_rolling26_diff
stock_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-02 09:30:00,-0.315296,0.836815,0.768722,0.871473,-0.845677,0.713437,0.697089,0.709520,0.713676,0.696622,...,0.000000,0.000000,-7.979912,0.000000,0.000000,17.510292,0.000000,0.000000,0.155844,0.00000
2018-01-02 09:45:00,-0.050548,1.137556,0.786208,1.229506,-1.120847,0.611897,0.596621,0.611572,0.604515,0.585761,...,0.000000,10.764737,-7.979912,0.000000,-23.621024,17.510292,0.000000,-0.210231,0.155844,0.00000
2018-01-02 10:00:00,-0.019941,1.270297,0.882866,1.370717,-1.249990,0.675362,0.662924,0.585112,0.670292,0.642152,...,0.000000,0.000000,-7.979912,0.000000,0.000000,17.510292,0.000000,0.000000,0.155844,0.00000
2018-01-02 10:15:00,-0.019941,1.270297,0.882866,1.370717,-1.249990,0.675362,0.662924,0.585112,0.670292,0.642152,...,0.000000,0.000000,-7.979912,0.000000,0.000000,17.510292,0.000000,0.000000,0.155844,0.00000
2018-01-02 10:30:00,0.024689,1.272693,0.874916,1.374850,-1.251171,0.679316,0.665869,0.600345,0.667673,0.642329,...,0.000000,0.000000,-7.979912,0.000000,0.000000,17.510292,0.000000,0.000000,0.155844,0.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-30 14:45:00,-0.464317,0.658204,0.692188,0.642312,-0.655172,0.767281,0.768237,0.801819,0.719579,0.736783,...,-20.745107,0.000000,-37.356611,10.903539,0.000000,81.073279,-24.117033,0.000000,3.394366,-1.00000
2025-05-30 15:00:00,-0.678608,0.688617,0.781740,0.661346,-0.690098,0.881678,0.885451,0.914747,0.839672,0.855843,...,-20.745107,0.000000,-26.453072,10.903539,0.000000,56.956247,-24.117033,0.000000,2.394366,-1.00000
2025-05-30 15:15:00,-0.695495,0.661748,0.789168,0.630997,-0.668474,0.918713,0.915649,1.083005,0.867992,0.894348,...,-20.745107,0.000000,-15.549533,10.903539,0.000000,32.839214,-24.117033,0.000000,1.394366,-1.00000
2025-05-30 15:30:00,-0.590173,0.599874,0.711988,0.575426,-0.609027,0.826026,0.821172,1.025069,0.779970,0.807730,...,-20.745107,0.000000,-4.645993,10.903539,0.000000,8.722182,-24.117033,0.000000,0.394366,-1.00000


In [228]:
# Export to pickle object
with open(r"../../Data/Processed/GDELT_Clean_202506141555.pkl", 'wb') as f:
    pickle.dump(grouped_df, f)