In [68]:
import pandas as pd
import polars as pl
import numpy as np
import pickle, warnings, datetime, pytz
import matplotlib.pyplot as plt
from sklearn.preprocessing import RobustScaler, QuantileTransformer, MinMaxScaler
warnings.filterwarnings('ignore')

In [69]:
# Load the pickled PCA feature table; a later cell inner-joins it onto the
# article frame on GKGRECORDID.
# NOTE(review): pickle.load can execute arbitrary code, so only load artifacts
# produced by this project.
with open('../../data/processed/gdelt_pca.pkl', 'rb') as f:
    pca_data = pickle.load(f)

In [70]:
# The stock data defines the trading calendar: only dates that appear in its
# index are later treated as valid days for article timestamps.
with open(r"../../Data/Processed/stock_data_simple.pkl", 'rb') as f:
    stock_data = pickle.load(f)

# Calendar date of every stock bar, then the distinct dates in first-seen order
stock_data['date'] = stock_data.index.date
days = stock_data['date'].unique().tolist()

In [71]:
# Load the intermediate cleaned GDELT article table. The next cell calls
# .to_pandas() on it, so the pickled object is presumably a polars DataFrame
# (TODO confirm).
# NOTE(review): pickle.load can execute arbitrary code, so only load artifacts
# produced by this project.
with open(r'../../data/processed/gdelt_intermediate_cleaned.pkl', 'rb') as f:
    df = pickle.load(f)

In [72]:
# Convert to pandas, keep the first row for each GKGRECORDID and use that id
# as the index (the id column itself is consumed by set_index).
df = (
    df.to_pandas()
      .drop_duplicates(subset=['GKGRECORDID'])
      .set_index('GKGRECORDID')
)

# Raw tone / word-count summary columns that are removed outright
raw_tone_cols = [
    'Tone',
    'Positive Score',
    'Negative Score',
    'Polarity',
    'Activity Reference Density',
    'Self/Group Reference Density',
    'Word Count',
]

# Every column whose name contains SCOREDVALUE or WORDCOUNT is removed as well
scored_or_wordcount_cols = [
    col for col in df.columns
    if 'SCOREDVALUE' in col or 'WORDCOUNT' in col
]

df = df.drop(columns=raw_tone_cols + scored_or_wordcount_cols)

In [73]:
# Attach the PCA features; the inner join keeps only records present in both tables
df = df.merge(pca_data, how='inner', on='GKGRECORDID')

In [74]:
df.columns.tolist()

['V2SOURCECOMMONNAME',
 'V2DOCUMENTIDENTIFIER',
 'V1THEMES',
 'datetime',
 'date',
 'airplane',
 'airline',
 'airport',
 'Alaska Airlines',
 'American Airlines',
 'Delta Air Lines',
 'Frontier Airlines',
 'Hawaiian Airlines',
 'JetBlue',
 'Southwest Airlines',
 'Spirit Airlines',
 'Sun Country Airlines',
 'United Airlines',
 'Allegiant Air',
 'article_title',
 'PCA_GKG1_0',
 'PCA_GKG1_1',
 'PCA_GKG1_2',
 'PCA_GKG1_3',
 'PCA_GKG1_4',
 'PCA_GKG1_5',
 'PCA_GKG1_6',
 'PCA_Scored_0',
 'PCA_Scored_1',
 'PCA_Scored_2',
 'PCA_Scored_3',
 'PCA_Scored_4',
 'PCA_Scored_5',
 'PCA_Scored_6',
 'PCA_Scored_7',
 'PCA_Scored_8',
 'PCA_Scored_9',
 'PCA_Scored_10',
 'PCA_Scored_11',
 'PCA_Scored_12',
 'PCA_Scored_13',
 'PCA_Scored_14',
 'PCA_Scored_15',
 'PCA_Scored_16',
 'PCA_Scored_17',
 'PCA_Scored_18',
 'PCA_Scored_19',
 'PCA_Scored_20',
 'PCA_Scored_21',
 'PCA_Scored_22',
 'PCA_Scored_23',
 'PCA_Scored_24',
 'PCA_Scored_25',
 'PCA_Scored_26',
 'PCA_Scored_27',
 'PCA_Scored_28',
 'PCA_Scored_29',
 'P

In [75]:
# Constant helper columns: summing 'Article Count' counts articles, and the
# 'general' topic (always 1) selects every article.
df['Article Count'] = 1
df['general'] = 1

topics = [
    'general',
    'Alaska Airlines',
    'American Airlines',
    'Delta Air Lines',
    'JetBlue',
    'Southwest Airlines',
    'United Airlines',
    'Allegiant Air',
]
metrics = [
    'PCA_GKG1_0', 'PCA_GKG1_1', 'PCA_GKG1_2', 'PCA_GKG1_3', 'PCA_GKG1_4',
    'PCA_Scored_0', 'PCA_Scored_1', 'PCA_Scored_2', 'PCA_Scored_3', 'PCA_Scored_4',
    'PCA_Word_0', 'PCA_Word_1', 'PCA_Word_2', 'PCA_Word_3', 'PCA_Word_4',
    'Article Count',
]

# One '<metric>_<topic>' column per pair: the metric multiplied by the topic column.
# NOTE(review): the topic columns are presumably 0/1 mention flags, which would
# make each product the metric restricted to articles on that topic - confirm.
for topic in topics:
    topic_flag = df[topic]
    for metric in metrics:
        df[f'{metric}_{topic}'] = df[metric] * topic_flag

In [76]:
# Aggregate to one row per timestamp: the sum of every topic-specific metric.
# Unparseable timestamps are coerced to NaT, and groupby drops NaT keys, so
# those rows do not contribute to any bucket.
df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')

# Select the metric columns and sum them directly. This keeps the original
# column names as-is, instead of building ('<col>', 'sum') MultiIndex columns
# and stripping '_sum' from the joined names (which would also mangle any
# column whose own name happens to contain '_sum').
metric_topic_cols = [f'{metric}_{topic}' for topic in topics for metric in metrics]
grouped_df = df.groupby('datetime')[metric_topic_cols].sum()

In [77]:
# Put the aggregates on a complete 15-minute grid; buckets with no articles become 0.
# Timestamps outside [start, end] or off the 15-minute grid are not kept by reindex.
start = datetime.datetime(2018, 1, 1, 0, 15, 0)
end   = datetime.datetime(2025, 5, 31, 23, 45, 0)
dates = pd.date_range(start=start, end=end, freq='15min')
grouped_df = grouped_df.reindex(dates).fillna(0)

# Expose the grid timestamp as a trailing 'datetime' column over a fresh integer index
grouped_df['datetime'] = grouped_df.index
grouped_df = grouped_df.reset_index(drop=True)

In [78]:
# Interpret the grid timestamps as UTC and convert them to New York local time
# (America/New_York, so the EST/EDT switch is handled by the tz database),
# then split the local timestamp into wall-clock time and calendar date.
utc_stamps = pd.to_datetime(grouped_df['datetime'], utc=True)
eastern_stamps = utc_stamps.dt.tz_convert('America/New_York')

grouped_df = grouped_df.assign(
    datetime=utc_stamps,
    datetime_EST=eastern_stamps,
    time=eastern_stamps.dt.time,
    date=eastern_stamps.dt.date,
)

In [79]:
##### Handling after-hours articles #####

# A 15-minute bucket keeps its own (New York local) timestamp only when it
# falls on a date present in the stock data AND its local time is within
# 09:15-15:45, both ends inclusive. Every other bucket gets NaT for now.
# Series.where keeps the tz-aware datetime dtype, so there is no round trip
# through an object array (np.where) and pd.to_datetime.
is_trading_day = grouped_df['date'].isin(days)
in_session = grouped_df['time'].between(datetime.time(9, 15, 0), datetime.time(15, 45, 0))
grouped_df['stock_time'] = grouped_df['datetime_EST'].where(is_trading_day & in_session)

# Backfilling depends on row order, so make the frame chronological first
grouped_df = grouped_df.sort_values(by='datetime')

# Assign each out-of-session bucket to the next in-session timestamp.
# Series.bfill() replaces the deprecated fillna(method='bfill').
# Buckets after the last in-session timestamp stay NaT.
grouped_df['stock_time'] = grouped_df['stock_time'].bfill()

# Remove the timezone information (keeps the New York wall-clock time)
grouped_df['stock_time'] = grouped_df['stock_time'].dt.tz_localize(None)

grouped_df.drop(columns=['datetime', 'datetime_EST', 'time', 'date'], inplace=True)

In [80]:
# Several 15-minute buckets can share one stock_time after the backfill, so
# collapse them: one row per stock_time holding the sum of each metric.
# Rows whose stock_time is NaT are dropped by groupby.
# Summing the selected columns directly keeps the column names untouched,
# instead of flattening ('<col>', 'sum') MultiIndex columns and stripping
# '_sum' (which would also mangle any name that itself contains '_sum').
metric_topic_cols = [f'{metric}_{topic}' for topic in topics for metric in metrics]
grouped_df = grouped_df.groupby('stock_time')[metric_topic_cols].sum().sort_index()

In [81]:
# Rolling features for every '<metric>_<topic>' column:
#   *_rolling26       sum over the current and previous 25 rows (fewer at the start,
#                     since min_periods=1)
#   *_rolling26_diff  row-to-row change of that rolling sum, with the first row set to 0
# NOTE(review): 26 is meant to be the number of 15-minute periods in one trading day;
# confirm it matches the number of bars per day actually kept upstream.
base_cols = [f'{metric}_{topic}' for topic in topics for metric in metrics]
rolling_sums = grouped_df[base_cols].rolling(window=26, min_periods=1).sum()
rolling_diffs = rolling_sums.diff().fillna(0)

# Append in the same interleaved order: rolling sum, then its difference, per base column
for col in base_cols:
    grouped_df[f'{col}_rolling26'] = rolling_sums[col]
    grouped_df[f'{col}_rolling26_diff'] = rolling_diffs[col]

In [82]:
# Scale every feature column independently (each scaler is re-fit per column).
# - Base columns: RobustScaler without centering, i.e. the values are only divided by a
#   quantile-range based scale (not the standard deviation), so zeros stay exactly zero.
# - Rolling sums and their differences: MinMaxScaler to [0, 1]. This shifts the data,
#   so zeros are NOT preserved for these columns.
scaler  = RobustScaler(with_centering=False, unit_variance=True)
#scaler2 = QuantileTransformer(output_distribution='normal', subsample=None)
scaler2 = MinMaxScaler(feature_range=(0, 1))
for topic in topics:
    for metric in metrics:
        base_col = f'{metric}_{topic}'
        grouped_df[base_col] = scaler.fit_transform(grouped_df[[base_col]])
        # Same fit order as before: rolling sum first, then its difference
        for derived_col in (f'{base_col}_rolling26', f'{base_col}_rolling26_diff'):
            grouped_df[derived_col] = scaler2.fit_transform(grouped_df[[derived_col]])

In [83]:
# Display the final feature table (indexed by stock_time) as a visual sanity check
grouped_df

Unnamed: 0_level_0,PCA_GKG1_0_general,PCA_GKG1_1_general,PCA_GKG1_2_general,PCA_GKG1_3_general,PCA_GKG1_4_general,PCA_Scored_0_general,PCA_Scored_1_general,PCA_Scored_2_general,PCA_Scored_3_general,PCA_Scored_4_general,...,PCA_Word_1_Allegiant Air_rolling26,PCA_Word_1_Allegiant Air_rolling26_diff,PCA_Word_2_Allegiant Air_rolling26,PCA_Word_2_Allegiant Air_rolling26_diff,PCA_Word_3_Allegiant Air_rolling26,PCA_Word_3_Allegiant Air_rolling26_diff,PCA_Word_4_Allegiant Air_rolling26,PCA_Word_4_Allegiant Air_rolling26_diff,Article Count_Allegiant Air_rolling26,Article Count_Allegiant Air_rolling26_diff
stock_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-02 09:15:00,-33.720902,95.205623,87.552606,98.310646,-96.136658,78.906686,77.147308,74.122819,79.306570,77.280079,...,0.011835,0.500204,0.029393,0.499786,0.789585,0.500000,0.015975,0.500204,0.0048,0.500000
2018-01-02 09:30:00,-0.367777,0.539686,0.495139,0.533391,-0.535151,0.506796,0.509160,0.246953,0.510524,0.504305,...,0.011835,0.500204,0.029393,0.499786,0.789585,0.500000,0.015975,0.500204,0.0048,0.500000
2018-01-02 09:45:00,-0.181130,0.058680,0.129602,0.051890,-0.068351,0.168855,0.154376,0.362616,0.169844,0.174437,...,0.011835,0.500204,0.029393,0.499786,0.789585,0.500000,0.015975,0.500204,0.0048,0.500000
2018-01-02 10:00:00,0.164014,0.805351,0.650515,0.841479,-0.793925,0.533763,0.533249,0.206748,0.524511,0.471974,...,0.011835,0.500204,0.029393,0.499786,0.789585,0.500000,0.015975,0.500204,0.0048,0.500000
2018-01-02 10:15:00,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.011835,0.500204,0.029393,0.499786,0.789585,0.500000,0.015975,0.500204,0.0048,0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-30 14:45:00,-0.290557,0.209248,0.152521,0.196881,-0.190921,0.174081,0.170052,0.280851,0.160887,0.165472,...,0.002459,0.500204,0.019596,0.499786,0.802160,0.500000,0.004932,0.500204,0.0048,0.500000
2025-05-30 15:00:00,-1.207044,0.671244,0.955081,0.620670,-0.694152,1.000964,1.013993,0.981263,1.008438,0.999357,...,0.002459,0.500204,0.019596,0.499786,0.802160,0.500000,0.004932,0.500204,0.0048,0.500000
2025-05-30 15:15:00,-0.031969,0.475448,0.597537,0.455303,-0.495168,0.748410,0.711636,1.317726,0.672782,0.701768,...,0.002459,0.500204,0.019596,0.499786,0.802160,0.500000,0.004932,0.500204,0.0048,0.500000
2025-05-30 15:30:00,-0.163032,0.613901,0.578893,0.618034,-0.610782,0.521906,0.519724,0.614253,0.492260,0.526752,...,0.001719,0.499723,0.018652,0.499187,0.803322,0.501071,0.003946,0.499565,0.0032,0.498355


In [84]:
# Export the final feature table to a pickle file.
# NOTE(review): the output file name is hard-coded and looks like it embeds a
# timestamp (202506141555); re-running the notebook overwrites that same file.
with open(r"../../Data/Processed/GDELT_Clean_202506141555.pkl", 'wb') as f:
    pickle.dump(grouped_df, f)