In [1]:
import pandas as pd
import numpy as np
import pickle, warnings, datetime, gc, ctypes
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, IncrementalPCA
warnings.filterwarnings('ignore')

In [2]:
def downcast(df, verbose = True):
    """Shrink the numeric columns of ``df`` to the smallest dtype that fits.

    The frame is modified in place and the same object is returned.

    Column handling:
      * ``bool`` columns are cast to ``int8``.
      * Non-numeric columns (object/string, datetime, timedelta, category, ...)
        are left untouched.
      * Integer columns, and float columns whose values are all whole numbers,
        are downcast with ``pd.to_numeric(..., downcast="integer")``.
      * Every other numeric column is downcast with ``downcast="float"``.
        A float column containing NaN ends up here, because NaN never
        compares equal to its rounded value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to compress.
    verbose : bool, default True
        When true, print the percentage of memory saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with compressed dtypes.
    """
    start_memory = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        column = df[col]
        dtype_name = column.dtype.name
        if dtype_name == "bool":
            df[col] = column.astype("int8")
        elif not pd.api.types.is_numeric_dtype(column):
            # round()/to_numeric are meaningless (or raise) for these dtypes.
            continue
        elif dtype_name.startswith("int") or (column.round() == column).all():
            df[col] = pd.to_numeric(column, downcast = "integer")
        else:
            df[col] = pd.to_numeric(column, downcast = "float")
    end_memory = df.memory_usage().sum() / 1024 ** 2

    if verbose:
        # Guard the ratio: a frame can report zero bytes of memory.
        if start_memory:
            saved_pct = 100 * (start_memory - end_memory) / start_memory
        else:
            saved_pct = 0.0
        print("{:.1f}% compressed".format(saved_pct))

    return df

In [3]:
def subPCA(df, n, p):
    """Run an independent PCA on ``n`` consecutive column groups of ``df``.

    The columns are split, in their existing order, into at most ``n`` groups
    of ``ceil(n_columns / n)`` columns. A full PCA is fitted on each group and
    the leading components whose cumulative explained-variance ratio first
    reaches ``p`` are kept. If round-off keeps the cumulative ratio below
    ``p`` (e.g. ``p = 1.0``), all components of that group are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric data; rows are observations.
    n : int
        Number of column groups to form (must be >= 1).
    p : float
        Cumulative explained-variance threshold for each group.

    Returns
    -------
    pandas.DataFrame
        Retained components of every group side by side, indexed like ``df``,
        with columns named ``pca_sub<group>_<component>``.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1 or ``df`` has no columns.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")
    cols = df.columns.tolist()
    if not cols:
        raise ValueError("df must contain at least one column")

    subset_size = int(np.ceil(len(cols) / n))
    pca_results = []
    pca_colnames = []

    for i in range(n):
        subset_cols = cols[i*subset_size:(i+1)*subset_size]
        if not subset_cols:
            # Rounding the group size up can leave trailing groups empty.
            continue
        pca = PCA()
        subset_pca = pca.fit_transform(df[subset_cols])
        cum_var = np.cumsum(pca.explained_variance_ratio_)
        # First position where cum_var >= p. Unlike argmax on a boolean mask,
        # searchsorted returns len(cum_var) when the threshold is never
        # reached, so clip to "keep everything" instead of collapsing to 1.
        first_hit = int(np.searchsorted(cum_var, p, side='left'))
        n_components = min(first_hit + 1, len(cum_var))
        pca_results.append(subset_pca[:, :n_components])
        pca_colnames += [f'pca_sub{i+1}_{j+1:04d}' for j in range(n_components)]
        print(f"Processed subset {i+1}/{n} with {n_components} components.")

    combined = np.concatenate(pca_results, axis=1)
    return pd.DataFrame(combined, index=df.index, columns=pca_colnames)

In [4]:
# Load the pickled LLM sentiment object and compress its numeric dtypes.
with open('../../data/processed/gdelt_llm_sentiment.pkl', 'rb') as handle:
    llm_data = downcast(pickle.load(handle), verbose = True)

0.0% compressed


In [5]:
# Determine which times to keep based on the stock data
# NOTE(review): this path uses "Data/Processed" while other cells use
# "data/processed" — confirm the casing on case-sensitive file systems.
with open(r"../../Data/Processed/stock_data_simple.pkl", 'rb') as handle:
    stock_data = pickle.load(handle)

stock_index = stock_data.index
stock_data['date'] = stock_index.date
# Calendar dates and exact timestamps present in the stock data.
days = stock_data['date'].unique().tolist()
stock_times = stock_index.unique().tolist()

In [6]:
# Load the intermediate cleaned GDELT object and compress its numeric dtypes.
with open(r'../../data/processed/gdelt_intermediate_cleaned.pkl', 'rb') as handle:
    df = downcast(pickle.load(handle), verbose = True)

0.0% compressed


In [7]:
# Index both frames by GKGRECORDID (in place) so they can later be joined on it.
for frame in (df, llm_data):
    frame.index = frame['GKGRECORDID']
    frame.drop(columns=['GKGRECORDID'], inplace=True)

In [8]:
# Perform initial PCA on LLM data
# NOTE(review): unlike the sentiment PCA, the input is not standardised first —
# confirm this is intentional.
index = llm_data.index
pca = PCA()
pca_fit = pca.fit_transform(llm_data)
cum_var = pca.explained_variance_ratio_.cumsum()
# Smallest number of leading components whose cumulative variance reaches 80%.
n_components = (cum_var >= 0.8).argmax() + 1
print(f"Number of components explaining at least 80% of the variance: {n_components}")
llm_component_names = [f'llm_pca_{k:03d}' for k in range(1, n_components + 1)]
llm_data = pd.DataFrame(pca_fit[:, :n_components], index=index, columns=llm_component_names)

Number of components explaining at least 80% of the variance: 120


In [9]:
# Renaming GDELT columns from airline name to stock ticker
airline_to_ticker = {
    'Allegiant Air': 'ALGT',
    'Alaska Airlines': 'ALK',
    'United Airlines': 'UAL',
    'Delta Air Lines': 'DAL',
    'JetBlue': 'JBLU',
    'Southwest Airlines': 'LUV',
    'American Airlines': 'AAL',
}
# Substring replacement, applied in the order listed above.
for airline, ticker in airline_to_ticker.items():
    df.columns = [name.replace(airline, ticker) for name in df.columns]

In [10]:
# Columns removed before modelling: keyword flags, airlines that are not
# renamed to a ticker above, and article text/metadata fields.
unused_columns = [
    'airplane', 'airline', 'airport',
    'Frontier Airlines', 'Hawaiian Airlines', 'Spirit Airlines', 'Sun Country Airlines',
    'article_title', 'V2SOURCECOMMONNAME', 'V2DOCUMENTIDENTIFIER', 'V1THEMES',
]
df.drop(columns=unused_columns, inplace=True)

In [11]:
# Perform initial PCA on sentiment data
index = df.index
# Everything except timestamps, ticker columns and Tone goes through the PCA.
non_pca_columns = {'datetime', 'date', 'ALK', 'AAL', 'DAL', 'JBLU', 'LUV', 'UAL', 'ALGT', 'Tone'}
cols_to_pca = [name for name in df.columns if name not in non_pca_columns]
df_to_pca = df[cols_to_pca]
df.drop(columns=cols_to_pca, inplace=True)
# Standardise, then fit the PCA in batches of 50,000 rows.
scaler = StandardScaler()
pca = IncrementalPCA(batch_size=50000)
df_to_pca = pca.fit_transform(scaler.fit_transform(df_to_pca))
cum_var = pca.explained_variance_ratio_.cumsum()

In [12]:
# Keep the leading components that together reach 80% of the variance and
# attach them to the remaining article columns.
n_components = (cum_var >= 0.8).argmax() + 1
print(f"Number of components explaining at least 80% of the variance: {n_components}")
sentiment_component_names = [f'sentiment_pca_{k:03d}' for k in range(1, n_components + 1)]
df_to_pca = pd.DataFrame(df_to_pca[:, :n_components], index=index, columns=sentiment_component_names)
df = pd.concat([df, df_to_pca], axis=1)

Number of components explaining at least 80% of the variance: 142


In [13]:
# Left-join the LLM components on the shared index; rows of df with no match
# in llm_data get NaN in the LLM columns.
df = df.merge(llm_data, how='left', left_index=True, right_index=True)

In [14]:
# Create topic-specific metrics columns
df['Article Count'] = 1

topics  = ['AAL','ALGT','ALK','DAL','JBLU','LUV','UAL']
metrics = [name for name in df.columns if 'pca' in name or name in ('Article Count', 'Tone')]

# Each new column is a metric multiplied by a ticker column
# (presumably a 0/1 mention flag per article — TODO confirm).
for topic in topics:
    topic_values = df[topic]
    for metric in metrics:
        df[f'{topic}_{metric}'] = df[metric] * topic_values

In [15]:
# Parse as UTC, convert to US Eastern time (America/New_York, which follows
# daylight saving time), then drop the timezone so the values are naive local
# timestamps.
df['datetime'] = (
    pd.to_datetime(df['datetime'], utc=True)
    .dt.tz_convert('America/New_York')
    .dt.tz_localize(None)
)

In [16]:
# Sanity check: summary statistics of the converted article timestamps.
df['datetime'].describe()

count                          1121056
mean     2021-05-16 18:00:53.125446400
min                2017-12-31 19:15:00
25%                2019-04-10 18:15:00
50%                2021-01-29 12:30:00
75%                2023-07-24 21:30:00
max                2025-05-31 19:45:00
Name: datetime, dtype: object

In [17]:
# Create a grouped dataframe, grouped by datetime, that creates a sum for each metric.
# Aggregating with the scalar 'sum' (not the list ['sum']) keeps the original
# flat column names, so no MultiIndex has to be flattened afterwards and no
# '_sum' substring can be stripped out of a column name by accident.
grouped_df = df.groupby('datetime').agg(
    {f'{topic}_{metric}': 'sum' for topic in topics for metric in metrics}
)

In [18]:
# Fill in all missing times: add an all-zero row for every stock timestamp
# that has no articles. DatetimeIndex.difference returns a sorted
# DatetimeIndex (even when empty), so the result is deterministic and the
# concatenated index stays datetime-typed.
missing_times = pd.DatetimeIndex(stock_times).difference(grouped_df.index)
# Build the filler as integer zeros directly, rather than creating an all-NaN
# object frame and relying on fillna() to downcast it.
df_to_add = pd.DataFrame(0, index=missing_times, columns=grouped_df.columns)

grouped_df = pd.concat([grouped_df, df_to_add], copy=False, sort=True)

In [19]:
##### Handling after-hours articles #####
grouped_df['datetime'] = grouped_df.index
grouped_df['time'] = grouped_df['datetime'].dt.time
grouped_df['date'] = grouped_df['datetime'].dt.date

# A row keeps its own timestamp as stock_time only when its date appears in
# the stock data and its time lies between 09:30 and 15:45 inclusive; every
# other row gets NaT. Series.where keeps the column datetime-typed, so no
# object-array round trip through np.where / pd.to_datetime is needed.
in_session = (
    grouped_df['date'].isin(days)
    & (grouped_df['time'] >= datetime.time(9, 30, 0))
    & (grouped_df['time'] <= datetime.time(15, 45, 0))
)
grouped_df['stock_time'] = grouped_df['datetime'].where(in_session)

grouped_df = grouped_df.sort_values(by='datetime')
# Rows outside the session inherit the most recent in-session timestamp
# (forward fill); the trailing bfill only affects rows that precede the first
# in-session timestamp.
grouped_df['stock_time'] = grouped_df['stock_time'].ffill().bfill()

In [20]:
# The choice of window here ultimately affects how much after-hours time should be counted towards market open
# For example, a 4-period window would mean that articles from 8:15 to 9:15 are counted towards the 9:30 period
# NOTE(review): rolling(window) counts rows, not elapsed time, so the example
# holds only if every period has a row in grouped_df — confirm.
windows = [4, 16, 48, 96]

helper_columns = {'datetime', 'date', 'time', 'stock_time'}
value_columns = [name for name in grouped_df.columns if name not in helper_columns]

for name in value_columns:
    values = grouped_df[name]
    for window in windows:
        grouped_df[f'{name}_cum{window:02d}'] = values.rolling(window, min_periods=1).sum()

In [21]:
# Keep only the last row of each stock_time, index the result by stock_time
# in chronological order, and drop the helper columns.
grouped_df = (
    grouped_df
    .drop_duplicates(subset=['stock_time'], keep='last')
    .set_index('stock_time')
    .sort_index()
    .drop(columns=['datetime', 'time', 'date'])
)

In [26]:
# Calculate lags
lag_steps = [1, 2, 4, 16, 26]
cumulative_columns = [name for name in grouped_df.columns if 'cum' in name]
plain_columns = [name for name in grouped_df.columns if 'cum' not in name]

# Non-cumulative columns get every lag in lag_steps (shift is by rows).
for name in plain_columns:
    values = grouped_df[name]
    for step in lag_steps:
        grouped_df[f'{name}_lag{step:02d}'] = values.shift(step)

# Rolling-sum columns only get a one-row lag.
for name in cumulative_columns:
    grouped_df[f'{name}_lag01'] = grouped_df[name].shift(1)



In [27]:
# (rows, columns) after adding the rolling-sum and lag columns.
grouped_df.shape

(48418, 25872)

In [28]:
# Keep only the lagged columns (those whose name contains 'lag')
grouped_df = grouped_df.filter(like='lag')

In [29]:
# Drop rows with any missing value (shift() leaves NaN in the first rows of
# each lag column), then order the columns alphabetically.
grouped_df = grouped_df.dropna()
alphabetical_columns = sorted(grouped_df.columns)
grouped_df = grouped_df.reindex(columns=alphabetical_columns)

In [30]:
# Checkpoint the lagged feature table to disk.
grouped_df.to_pickle('../../data/processed/temp_save.pkl')

In [3]:
# Reload the checkpoint written by the previous cell.
checkpoint_path = '../../data/processed/temp_save.pkl'
grouped_df = pd.read_pickle(checkpoint_path)

In [None]:
# Split columns into MultiIndex (ticker, metric): the text before the first
# underscore becomes level 0, the rest of the name becomes level 1.
grouped_df.columns = grouped_df.columns.str.split('_', n=1, expand=True)
# Move the ticker level from the columns into the rows, giving one row per
# (date, ticker) pair.
grouped_df = (
    grouped_df
    .stack(level=0)
    .rename_axis(['date', 'ticker'])
    .reset_index()
    .set_index(['date', 'ticker'])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Article Count_cum04_lag01,Article Count_cum16_lag01,Article Count_cum48_lag01,Article Count_cum96_lag01,Article Count_lag01,Article Count_lag02,Article Count_lag04,Article Count_lag16,Article Count_lag26,Tone_cum04_lag01,...,sentiment_pca_141_lag26,sentiment_pca_142_cum04_lag01,sentiment_pca_142_cum16_lag01,sentiment_pca_142_cum48_lag01,sentiment_pca_142_cum96_lag01,sentiment_pca_142_lag01,sentiment_pca_142_lag02,sentiment_pca_142_lag04,sentiment_pca_142_lag16,sentiment_pca_142_lag26
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2018-01-03 09:45:00,AAL,4.0,25.0,80.0,143.0,2.0,1.0,2.0,0.0,0.0,-11.367094,...,0.0,0.990666,8.393536,-3.331346,-24.085169,1.162734,0.283201,-0.710801,0.000000,0.0
2018-01-03 09:45:00,ALGT,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.000000,0.000000,-0.602188,-0.602188,0.000000,0.000000,0.000000,0.000000,0.0
2018-01-03 09:45:00,ALK,10.0,24.0,185.0,241.0,0.0,5.0,0.0,2.0,0.0,-36.802987,...,0.0,3.250842,10.537131,65.050272,82.299843,0.000000,1.460186,0.000000,-0.985267,0.0
2018-01-03 09:45:00,DAL,3.0,7.0,24.0,52.0,0.0,1.0,0.0,0.0,0.0,-2.903702,...,0.0,0.974562,2.316525,3.186986,3.871179,0.000000,0.409089,0.000000,0.000000,0.0
2018-01-03 09:45:00,JBLU,2.0,9.0,17.0,40.0,0.0,1.0,0.0,0.0,0.0,-4.124275,...,0.0,4.033086,3.967061,5.669416,14.845301,0.000000,4.196315,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-30 15:45:00,ALK,0.0,3.0,9.0,31.0,0.0,0.0,0.0,1.0,0.0,0.000000,...,0.0,0.000000,1.121771,3.947301,-1.170787,0.000000,0.000000,0.000000,0.782757,0.0
2025-05-30 15:45:00,DAL,3.0,12.0,15.0,43.0,0.0,1.0,1.0,1.0,0.0,-5.105798,...,0.0,-4.030308,-8.087564,-8.116597,-0.956386,0.000000,0.199423,-4.441724,-0.505506,0.0
2025-05-30 15:45:00,JBLU,1.0,4.0,13.0,49.0,0.0,0.0,0.0,0.0,0.0,3.289474,...,0.0,0.177455,4.411203,5.425773,18.937056,0.000000,0.000000,0.000000,0.000000,0.0
2025-05-30 15:45:00,LUV,2.0,3.0,11.0,30.0,1.0,1.0,0.0,0.0,0.0,2.046745,...,0.0,-2.336301,-2.102221,-2.930745,-3.994552,-0.519144,-1.817158,0.000000,0.000000,0.0


In [5]:
# Compress numeric dtypes (verbose defaults to True, so the saving is printed)
# and display the result.
grouped_df = downcast(grouped_df)
grouped_df

49.8% compressed


Unnamed: 0_level_0,Unnamed: 1_level_0,Article Count_cum04_lag01,Article Count_cum16_lag01,Article Count_cum48_lag01,Article Count_cum96_lag01,Article Count_lag01,Article Count_lag02,Article Count_lag04,Article Count_lag16,Article Count_lag26,Tone_cum04_lag01,...,sentiment_pca_141_lag26,sentiment_pca_142_cum04_lag01,sentiment_pca_142_cum16_lag01,sentiment_pca_142_cum48_lag01,sentiment_pca_142_cum96_lag01,sentiment_pca_142_lag01,sentiment_pca_142_lag02,sentiment_pca_142_lag04,sentiment_pca_142_lag16,sentiment_pca_142_lag26
date,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2018-01-03 09:45:00,AAL,4,25,80,143,2,1,2,0,0,-11.367094,...,0.0,0.990666,8.393536,-3.331346,-24.085169,1.162734,0.283201,-0.710801,0.000000,0.0
2018-01-03 09:45:00,ALGT,0,0,2,2,0,0,0,0,0,0.000000,...,0.0,0.000000,0.000000,-0.602188,-0.602188,0.000000,0.000000,0.000000,0.000000,0.0
2018-01-03 09:45:00,ALK,10,24,185,241,0,5,0,2,0,-36.802986,...,0.0,3.250842,10.537130,65.050270,82.299843,0.000000,1.460186,0.000000,-0.985267,0.0
2018-01-03 09:45:00,DAL,3,7,24,52,0,1,0,0,0,-2.903702,...,0.0,0.974562,2.316525,3.186986,3.871179,0.000000,0.409089,0.000000,0.000000,0.0
2018-01-03 09:45:00,JBLU,2,9,17,40,0,1,0,0,0,-4.124275,...,0.0,4.033086,3.967061,5.669415,14.845301,0.000000,4.196315,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-30 15:45:00,ALK,0,3,9,31,0,0,0,1,0,0.000000,...,0.0,0.000000,1.121771,3.947301,-1.170787,0.000000,0.000000,0.000000,0.782757,0.0
2025-05-30 15:45:00,DAL,3,12,15,43,0,1,1,1,0,-5.105798,...,0.0,-4.030308,-8.087564,-8.116597,-0.956386,0.000000,0.199423,-4.441724,-0.505507,0.0
2025-05-30 15:45:00,JBLU,1,4,13,49,0,0,0,0,0,3.289474,...,0.0,0.177455,4.411203,5.425774,18.937056,0.000000,0.000000,0.000000,0.000000,0.0
2025-05-30 15:45:00,LUV,2,3,11,30,1,1,0,0,0,2.046745,...,0.0,-2.336301,-2.102221,-2.930745,-3.994552,-0.519144,-1.817158,0.000000,0.000000,0.0


In [8]:
# Second-stage PCA: for each feature family, standardise its columns, fit a
# PCA, and replace them with the leading components reaching 80% of the variance.
df = grouped_df.copy()
index = grouped_df.index

for metric in ['llm', 'sentiment']:
    # Columns of this family are selected by substring match on the name.
    cols_to_pca = [name for name in df.columns if metric in name]
    scaler = StandardScaler()
    pca = PCA()
    df_to_pca = pca.fit_transform(scaler.fit_transform(df[cols_to_pca]))
    cum_var = pca.explained_variance_ratio_.cumsum()
    n_components = (cum_var >= 0.8).argmax() + 1

    component_names = [f'{metric}_pca_{k:03d}' for k in range(1, n_components + 1)]
    df_to_pca = pd.DataFrame(df_to_pca[:, :n_components], index=index, columns=component_names)
    df = pd.concat([df.drop(columns=cols_to_pca), df_to_pca], axis=1)

df.shape

(338744, 634)

In [11]:
# Expose both index levels as ordinary columns (the index itself is kept).
for level_name in ('date', 'ticker'):
    df[level_name] = df.index.get_level_values(level_name)

In [13]:
# Export to pickle object
# NOTE(review): this path uses "Data/Processed" while other cells use
# "data/processed" — confirm the casing on case-sensitive file systems.
output_path = r"../../Data/Processed/GDELT_Clean_202507141200.pkl"
with open(output_path, 'wb') as handle:
    pickle.dump(df, handle)