In [28]:
import pandas as pd
import numpy as np
import pickle, warnings, datetime

import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

In [29]:
def downcast(df, verbose = True):
    """Shrink the numeric columns of ``df`` to the smallest dtype that fits.

    Columns are converted one at a time and written back into ``df`` itself,
    so the caller's frame is modified in place as well as returned.

    Conversion rules, by column dtype:

    * ``bool``            -> ``int8``
    * integer             -> smallest signed integer dtype that holds the values
    * float, all whole    -> smallest signed integer dtype that holds the values
    * float, otherwise    -> smallest float dtype that holds the values
    * anything else (object/string, datetime, timedelta, category, ...) is
      left untouched

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to compress. Modified in place.
    verbose : bool, default True
        When true, print the percentage of memory saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    start_memory = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        dtype = df[col].dtype
        if dtype.name == "bool":
            df[col] = df[col].astype("int8")
        elif pd.api.types.is_integer_dtype(dtype):
            df[col] = pd.to_numeric(df[col], downcast = "integer")
        elif pd.api.types.is_float_dtype(dtype):
            # A float column holding only whole numbers can be stored as an
            # integer. NaN never equals itself, so any column containing NaN
            # fails this check and stays a float.
            is_whole = (df[col].round() == df[col]).all()
            df[col] = pd.to_numeric(df[col], downcast = "integer" if is_whole else "float")
        # Every other dtype is skipped on purpose: pushing e.g. timedelta or
        # category data through pd.to_numeric would either raise or silently
        # turn it into plain integers.
    end_memory = df.memory_usage().sum() / 1024 ** 2

    if verbose:
        # Guard the ratio so a frame reporting zero bytes does not raise.
        saved_pct = 100 * (start_memory - end_memory) / start_memory if start_memory else 0.0
        print("{:.1f}% compressed".format(saved_pct))

    return df

In [30]:
# SECURITY NOTE: pickle.load executes arbitrary code embedded in the file.
# Only unpickle files produced by this project's own pipeline.

# Stock data in long format.
with open('../../Data/Processed/stock_data_long.pkl', 'rb') as stocks_file:
    stocks = pickle.load(stocks_file)

# Cleaned GDELT features.
with open('../../Data/Processed/GDELT_Clean_202507141200.pkl', 'rb') as gdelt_file:
    gdelt = pickle.load(gdelt_file)

# Clear the names of both index levels before merging.
# NOTE(review): presumably this stops the index level names from clashing with
# the 'date' / 'ticker' merge keys below -- confirm against the GDELT pipeline.
gdelt.index.names = [None, None]

# Inner join on the 'date' and 'ticker' keys: only (date, ticker) pairs
# present in both datasets are kept.
df = pd.merge(
    left=stocks,
    right=gdelt,
    how='inner',
    on=['date', 'ticker'],
)

In [31]:
# Fill gaps from each ticker's own rows first. A plain df.ffill() walks the
# frame row by row regardless of ticker, so with several tickers interleaved
# it would copy one ticker's values into another ticker's rows.
# NOTE(review): assumes rows are already in chronological order within each
# ticker -- confirm, or sort by ['ticker', 'date'] upstream.
# NOTE(review): the backward fill copies later observations into earlier rows
# (leading gaps only); confirm that is acceptable for downstream modelling.
df = df.fillna(df.groupby('ticker', sort=False).ffill())
df = df.fillna(df.groupby('ticker', sort=False).bfill())

# Last resort: a column that is entirely missing for some ticker cannot be
# filled from that ticker alone, so fall back to the frame-wide fill to keep
# the "no missing values" guarantee.
df = df.ffill().bfill()

# Put 'date' and 'ticker' first, followed by all other columns alphabetically
feature_cols = sorted(col for col in df.columns if col not in ('date', 'ticker'))
df = df[['date', 'ticker'] + feature_cols]

# Downcast numeric columns to reduce memory usage
df = downcast(df, verbose = True)

40.3% compressed


In [32]:
# Persist the merged, downcast frame for downstream notebooks
with open('../../Data/Processed/merged_data_202507141200.pkl', 'wb') as out_file:
    pickle.dump(df, out_file)

In [26]:
# Preview the merged frame; the bare expression is rendered as a truncated table.
df

Unnamed: 0,date,ticker,Article Count_cum04_lag01,Article Count_cum16_lag01,Article Count_cum48_lag01,Article Count_cum96_lag01,Article Count_lag01,Article Count_lag02,Article Count_lag04,Article Count_lag16,...,sentiment_pca_293,sentiment_pca_294,sentiment_pca_295,sentiment_pca_296,sentiment_pca_297,sentiment_pca_298,sentiment_pca_299,sentiment_pca_300,sentiment_pca_301,sentiment_pca_302
0,2018-01-03 09:45:00,AAL,4,25,80,143,2,1,2,0,...,-0.071769,0.092828,-0.337849,-0.188729,-0.178771,0.273368,-0.329562,-0.057382,-0.137960,0.025572
1,2018-01-03 09:45:00,ALGT,0,0,2,2,0,0,0,0,...,-0.009447,0.011431,-0.020614,0.010191,-0.002531,-0.019852,0.018064,0.024361,0.020991,-0.017599
2,2018-01-03 09:45:00,ALK,10,24,185,241,0,5,0,2,...,0.404593,-0.898324,0.201081,0.254234,-0.098305,0.409223,-0.220153,0.458351,0.873481,0.123471
3,2018-01-03 09:45:00,DAL,3,7,24,52,0,1,0,0,...,0.031247,-0.045478,-0.097993,-0.026443,0.011369,0.065421,-0.070437,-0.002086,-0.047073,-0.130507
4,2018-01-03 09:45:00,JBLU,2,9,17,40,0,1,0,0,...,0.321627,0.147672,-0.019831,-0.489542,-0.149224,0.110391,-0.112406,-0.536246,-0.022727,-0.315006
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
337601,2025-05-30 15:45:00,ALK,0,3,9,31,0,0,0,1,...,0.069228,0.859418,1.533909,0.130467,1.489660,-0.571697,-0.214990,-0.773403,-0.102158,-0.522079
337602,2025-05-30 15:45:00,DAL,3,12,15,43,0,1,1,1,...,-0.098549,0.869943,-0.262049,-0.578786,1.021411,0.129237,-0.009626,0.281706,-0.049660,-0.664873
337603,2025-05-30 15:45:00,JBLU,1,4,13,49,0,0,0,0,...,0.020847,0.078184,0.042978,-0.019870,0.051009,0.030061,-0.006248,-0.067871,-0.019440,-0.027907
337604,2025-05-30 15:45:00,LUV,2,3,11,30,1,1,0,0,...,0.173591,-0.098528,0.003599,0.030117,0.188478,-0.291041,0.105769,-0.013884,-0.119724,0.156275


In [27]:
# Count rows per ticker to see how evenly the merged panel covers each symbol.
df['ticker'].value_counts()

ticker
DAL     48248
LUV     48247
AAL     48246
JBLU    48246
ALK     48246
UAL     48246
ALGT    48127
Name: count, dtype: int64