In [1]:
import pandas as pd
import numpy as np
import pickle, warnings, datetime

import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

In [2]:
def downcast(df, verbose = True):
    """Shrink the numeric columns of ``df`` to the smallest dtype that fits.

    Column handling:
      * ``bool`` columns are stored as ``int8``.
      * Non-numeric columns (object/string, categorical, datetime, timedelta,
        ...) are left untouched.
      * Integer columns, and float columns whose values are all whole
        numbers, are downcast to the smallest integer dtype.
      * Every other numeric column is downcast to the smallest float dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to compress. Its columns are reassigned in place, so the
        caller's object is modified as well as returned.
    verbose : bool, default True
        When True, print the percentage of memory saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with downcast columns.
    """
    start_memory = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        column = df[col]
        dtype_name = column.dtype.name
        if dtype_name == "bool":
            df[col] = column.astype("int8")
        elif not pd.api.types.is_numeric_dtype(column.dtype):
            # Only numeric data can be downcast. Sending categorical, string
            # or timedelta columns through round()/to_numeric would raise or
            # alter them, so they are skipped along with object/datetime.
            continue
        elif dtype_name.startswith("int") or (column.round() == column).all():
            # A float column containing NaN never satisfies the equality
            # check (NaN != NaN), so it takes the float branch below.
            df[col] = pd.to_numeric(column, downcast = "integer")
        else:
            df[col] = pd.to_numeric(column, downcast = "float")
    end_memory = df.memory_usage().sum() / 1024 ** 2

    if verbose:
        # Guard the ratio so a zero-byte baseline cannot divide by zero.
        if start_memory:
            saved_pct = 100 * (start_memory - end_memory) / start_memory
        else:
            saved_pct = 0.0
        print("{:.1f}% compressed".format(saved_pct))

    return df

In [3]:
# Load the GDELT frame from its pickle.
# NOTE: pickle.load can execute arbitrary code; only load files you produced.
with open('../../Data/Processed/GDELT_Clean_finance.pkl', 'rb') as gdelt_file:
    gdelt = pickle.load(gdelt_file)
# Clear the names of both index levels.
# NOTE(review): presumably this keeps the index level names from clashing with
# the 'date'/'ticker' columns used as merge keys below; confirm.
gdelt.index.names = [None, None]

# Load the long-format stock frame from its pickle.
with open('../../Data/Processed/stock_data_long.pkl', 'rb') as stocks_file:
    stocks = pickle.load(stocks_file)

# Inner-join the two frames on their 'date' and 'ticker' columns, keeping only
# (date, ticker) pairs present in both.
df = stocks.merge(gdelt, how='inner', on=['date', 'ticker'])

In [4]:
# Number of stock columns whose name contains 'lag'.
sum(1 for column_name in stocks.columns if 'lag' in column_name)

800

In [5]:
key_cols = ['date', 'ticker']
feature_cols = [col for col in df.columns if col not in key_cols]

# Fill gaps within each ticker's own rows first. Consecutive rows of the
# merged frame alternate between tickers, so a plain df.ffill() would copy the
# previous row's value from a *different* ticker into the gap.
df[feature_cols] = df.groupby('ticker')[feature_cols].ffill()
# NOTE(review): back-filling copies later observations into earlier rows. It is
# kept so leading gaps are still filled; confirm that is acceptable downstream.
df[feature_cols] = df.groupby('ticker')[feature_cols].bfill()

# Fallback: anything a ticker could not fill from its own rows (for example a
# column that is empty for that ticker) is filled across the whole frame, so
# the frame still ends up with no missing values.
df = df.ffill().bfill()

# Put 'date' and 'ticker' first, followed by the remaining columns in
# alphabetical order.
df = df[key_cols + sorted(feature_cols)]

# Downcast numeric columns to reduce memory usage
df = downcast(df, verbose = True)

35.8% compressed


In [6]:
# Serialise the merged frame to the processed-data folder.
merged_path = '../../Data/Processed/merged_data_finance.pkl'
with open(merged_path, 'wb') as out_file:
    pickle.dump(df, out_file)

In [7]:
# Display the merged frame; the recorded output shows only the first and last
# rows/columns, with the middle elided.
df

Unnamed: 0,date,ticker,Article Count_cum04_lag01,Article Count_cum16_lag01,Article Count_cum48_lag01,Article Count_cum96_lag01,Article Count_lag01,BNO_Change_High-Low,BNO_Change_High-Low_lag01,BNO_Change_High-Low_ma04,...,v42.8; SCOREDVALUE; fairness_sent_cum04_lag01,v42.8; SCOREDVALUE; fairness_sent_cum16_lag01,v42.8; SCOREDVALUE; fairness_sent_cum48_lag01,v42.8; SCOREDVALUE; fairness_sent_cum96_lag01,v42.8; SCOREDVALUE; fairness_sent_lag01,v42.9; SCOREDVALUE; loyalty_sent_cum04_lag01,v42.9; SCOREDVALUE; loyalty_sent_cum16_lag01,v42.9; SCOREDVALUE; loyalty_sent_cum48_lag01,v42.9; SCOREDVALUE; loyalty_sent_cum96_lag01,v42.9; SCOREDVALUE; loyalty_sent_lag01
0,2018-01-02 09:45:00,AAL,1,6,6,6,0,-0.035,0.020,0.01000,...,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0
1,2018-01-02 09:45:00,ALGT,0,0,0,0,0,-0.035,0.020,0.01000,...,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0
2,2018-01-02 09:45:00,ALK,0,3,5,5,0,-0.035,0.020,0.01000,...,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0
3,2018-01-02 09:45:00,DAL,0,1,1,1,0,-0.035,0.020,0.01000,...,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0
4,2018-01-02 09:45:00,JBLU,1,3,3,3,0,-0.035,0.020,0.01000,...,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
337783,2025-05-30 15:45:00,ALK,0,0,4,13,0,-0.020,0.015,0.00875,...,0.0,0.000000,-0.139706,-0.133636,0.0,0.0,0.000000,-0.067318,0.075841,0.0
337784,2025-05-30 15:45:00,DAL,0,4,7,11,0,-0.020,0.015,0.00875,...,0.0,-0.039292,0.056338,-0.185764,0.0,0.0,0.065506,0.186295,0.109351,0.0
337785,2025-05-30 15:45:00,JBLU,0,2,5,20,0,-0.020,0.015,0.00875,...,0.0,0.013177,0.021646,0.389782,0.0,0.0,0.052000,0.107777,0.615597,0.0
337786,2025-05-30 15:45:00,LUV,0,0,1,11,0,-0.020,0.015,0.00875,...,0.0,0.000000,0.028658,-0.189179,0.0,0.0,0.000000,0.042602,0.175897,0.0


In [8]:
# Row count per ticker, to check coverage is balanced. In the recorded output
# ALGT has roughly 120 fewer rows than the other tickers.
df['ticker'].value_counts()

ticker
DAL     48274
LUV     48273
AAL     48272
JBLU    48272
ALK     48272
UAL     48272
ALGT    48153
Name: count, dtype: int64