In [1]:
import pandas as pd
import numpy as np
import pickle, warnings, datetime

import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

In [2]:
def downcast(df, verbose = True):
    """Shrink the numeric columns of ``df`` to the smallest dtype that fits.

    Conversion rules, applied column by column:

    * ``bool`` columns become ``int8``.
    * Integer columns, and float columns whose values are all whole numbers,
      are downcast with ``pd.to_numeric(..., downcast="integer")``.
    * All other numeric columns are downcast with
      ``pd.to_numeric(..., downcast="float")``.
    * Non-numeric columns (object, string, category, datetime, timedelta, ...)
      are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to compress. Its columns are reassigned in place, so the
        caller's object is modified.
    verbose : bool, default True
        When true, print the percentage of memory saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_memory = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        column = df[col]
        dtype = column.dtype
        if dtype.name == "bool":
            df[col] = column.astype("int8")
        elif not pd.api.types.is_numeric_dtype(dtype):
            # Text, categorical and date/time columns cannot be downcast
            # numerically; rounding or parsing them would fail or corrupt them.
            continue
        elif pd.api.types.is_integer_dtype(dtype) or (column.round() == column).all():
            # A NaN never equals itself, so float columns with missing values
            # fail the comparison and fall through to the float branch.
            df[col] = pd.to_numeric(column, downcast = "integer")
        else:
            df[col] = pd.to_numeric(column, downcast = "float")
    end_memory = df.memory_usage().sum() / 1024 ** 2

    if verbose:
        # Guard the ratio so a frame reporting zero bytes does not raise.
        if start_memory > 0:
            saved_pct = 100 * (start_memory - end_memory) / start_memory
        else:
            saved_pct = 0.0
        print("{:.1f}% compressed".format(saved_pct))

    return df

In [3]:
# Load the processed GDELT table. Only unpickle files you trust: pickle can
# execute arbitrary code while loading.
with open('../../Data/Processed/GDELT_Clean_finance.pkl', 'rb') as gdelt_file:
    gdelt = pickle.load(gdelt_file)
# Clear the names of both index levels.
# NOTE(review): presumably done so 'date'/'ticker' resolve to columns rather
# than clashing with same-named index levels in the merge below; confirm.
gdelt.index.names = [None] * 2

# Load the processed stock table.
with open('../../Data/Processed/stock_data_long.pkl', 'rb') as stocks_file:
    stocks = pickle.load(stocks_file)

# Inner-join the two tables on the (date, ticker) key pair, keeping only the
# rows present in both.
df = stocks.merge(gdelt, how='inner', on=['date', 'ticker'])

In [4]:
# Ensure no missing values.
# The frame is a long-format panel with rows of different tickers interleaved,
# so a plain ffill/bfill over the whole frame would copy values from one
# ticker into another. Fill within each ticker first.
key_cols = ['date', 'ticker']
value_cols = [col for col in df.columns if col not in key_cols]
if value_cols:
    filled = df.groupby('ticker', sort=False)[value_cols].ffill()
    # Back-fill covers gaps at the start of a ticker's history. Note that this
    # uses later observations to fill earlier rows.
    filled = filled.groupby(df['ticker'], sort=False).bfill()
    df = pd.concat([df[key_cols], filled], axis=1)
    del filled

# Last resort: a ticker with no observation at all for a column is still
# filled from neighbouring rows, preserving the original no-NaN guarantee.
df = df.ffill().bfill()

# Put 'date' and 'ticker' first, followed by the remaining columns in
# alphabetical order.
df = df[key_cols + sorted(value_cols)]

# Downcast numeric columns to reduce memory usage
df = downcast(df, verbose = True)

33.1% compressed


In [5]:
# Persist the merged, downcast frame for later use.
with open('../../Data/Processed/merged_data_finance.pkl', 'wb') as merged_file:
    pickle.dump(df, merged_file)

In [6]:
# Display the merged frame as a visual sanity check; pandas abbreviates the
# rendering of large frames with "..." for the hidden rows and columns.
df

Unnamed: 0,date,ticker,Article Count_cum04_lag01,Article Count_cum16_lag01,Article Count_cum48_lag01,Article Count_cum96_lag01,Article Count_lag01,Article Count_lag02,Article Count_lag04,Article Count_lag16,...,v42.8; SCOREDVALUE; fairness_sent_lag26,v42.9; SCOREDVALUE; loyalty_sent_cum04_lag01,v42.9; SCOREDVALUE; loyalty_sent_cum16_lag01,v42.9; SCOREDVALUE; loyalty_sent_cum48_lag01,v42.9; SCOREDVALUE; loyalty_sent_cum96_lag01,v42.9; SCOREDVALUE; loyalty_sent_lag01,v42.9; SCOREDVALUE; loyalty_sent_lag02,v42.9; SCOREDVALUE; loyalty_sent_lag04,v42.9; SCOREDVALUE; loyalty_sent_lag16,v42.9; SCOREDVALUE; loyalty_sent_lag26
0,2018-01-03 09:30:00,AAL,4,24,84,142,1,0,2,0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
1,2018-01-03 09:30:00,ALGT,0,0,2,2,0,0,0,0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
2,2018-01-03 09:30:00,ALK,15,26,188,241,5,0,1,0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
3,2018-01-03 09:30:00,DAL,4,8,24,53,1,0,0,0,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
4,2018-01-03 09:30:00,JBLU,6,9,17,42,1,1,0,2,...,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
337608,2025-05-30 15:45:00,ALK,0,3,9,31,0,0,0,1,...,0.0,0.000000,0.105368,0.037671,0.040941,0.000000,0.000000,0.000000,0.031631,0.0
337609,2025-05-30 15:45:00,DAL,3,12,15,43,0,1,1,1,...,0.0,0.016665,0.099109,0.007065,-0.059048,0.000000,-0.000191,0.007384,-0.036876,0.0
337610,2025-05-30 15:45:00,JBLU,1,4,13,49,0,0,0,0,...,0.0,0.010904,0.126156,0.400086,1.505577,0.000000,0.000000,0.000000,0.000000,0.0
337611,2025-05-30 15:45:00,LUV,2,3,11,30,1,1,0,0,...,0.0,0.040624,0.037287,-0.233369,-0.264374,-0.008633,0.049257,0.000000,0.000000,0.0


In [7]:
# Count rows per ticker to check that the panel is roughly balanced.
df['ticker'].value_counts()

ticker
DAL     48249
LUV     48248
AAL     48247
JBLU    48247
ALK     48247
UAL     48247
ALGT    48128
Name: count, dtype: int64