In [1]:
import pandas as pd
import numpy as np
import pickle, warnings, datetime

import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

In [2]:
def downcast(df, verbose = True):
    """Shrink the numeric columns of a DataFrame to the smallest dtype that fits.

    The conversion is applied column by column on ``df`` itself, and the same
    object is returned so the call can be used as ``df = downcast(df)``.

    Rules, applied in this order:

    * ``bool`` columns are cast to ``int8``.
    * Non-numeric columns (object/string, datetime, timedelta, categorical,
      ...) are left untouched.
    * Integer columns, and other numeric columns whose values are all whole
      numbers, go through ``pd.to_numeric(..., downcast="integer")``.
    * Every remaining numeric column goes through
      ``pd.to_numeric(..., downcast="float")``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to compress. It is modified in place.
    verbose : bool, default True
        When true, print the percentage of memory saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with narrower dtypes where possible.
    """
    start_memory = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        series = df[col]
        dtype_name = series.dtype.name
        if dtype_name == "bool":
            df[col] = series.astype("int8")
        elif not pd.api.types.is_numeric_dtype(series.dtype):
            # Skipping by "is not numeric" (rather than by a short list of
            # dtype names) keeps categoricals from raising inside to_numeric
            # and keeps timedeltas from being turned into plain integers.
            continue
        elif dtype_name.startswith("int") or (series.round() == series).all():
            # A column containing NaN never satisfies the whole-number test
            # (NaN != NaN), so it falls through to the float branch below.
            df[col] = pd.to_numeric(series, downcast = "integer")
        else:
            df[col] = pd.to_numeric(series, downcast = "float")
    end_memory = df.memory_usage().sum() / 1024 ** 2

    if verbose:
        # Guard the ratio so a frame reporting zero bytes prints 0.0% instead of nan.
        saved_pct = 100 * (start_memory - end_memory) / start_memory if start_memory else 0.0
        print("{:.1f}% compressed".format(saved_pct))

    return df

In [3]:
# Read the three pre-processed inputs from disk.
# NOTE: pickle.load can execute arbitrary code, so these files must come from a trusted source.
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


gdelt = _load_pickle('../../Data/Processed/GDELT_Clean_202507091200.pkl')
stocks = _load_pickle('../../Data/Processed/stock_data.pkl')
# NOTE(review): stocks_long is not referenced by any cell visible here; confirm it is still needed.
stocks_long = _load_pickle('../../Data/Processed/stock_data_long.pkl')

In [4]:
# Inner-join the stock frame with the GDELT frame on their row index;
# index labels that are missing from either side are dropped.
df = pd.merge(
    left=stocks,
    right=gdelt,
    how='inner',
    left_index=True,
    right_index=True,
)

In [5]:
# Keep only rows whose index timestamp is strictly after the cutoff, then fill gaps.
# Filtering on the index directly avoids the temporary 'Time' column, which would
# overwrite (and then drop) any existing column of that name, and avoids an
# in-place drop on a filtered slice.
df = df[df.index > datetime.datetime(2018, 1, 3, 9, 30, 0)]
# Forward-fill first so gaps take the last known value; the back-fill then only
# covers leading NaNs (using the first later observation). A column that is
# entirely NaN stays NaN.
df = df.ffill().bfill()

In [6]:
# Reorder the columns into sorted (alphabetical) order; the rows are unchanged.
ordered_columns = sorted(df.columns)
df = df.reindex(columns=ordered_columns)

In [7]:
# Shrink numeric dtypes to cut memory use; the compression ratio is printed.
df = downcast(df, verbose=True)

46.6% compressed


In [8]:
# Copy the timestamp index into a 'dates' column and summarise its range.
# The column is left on df, so later cells that use df will see it.
timestamps = df.index
df['dates'] = timestamps
df['dates'].describe()

count                            48254
mean     2021-09-15 03:06:59.094789888
min                2018-01-03 09:45:00
25%                2019-11-07 13:03:45
50%                2021-09-15 09:52:30
75%                2023-07-24 11:26:15
max                2025-05-30 15:45:00
Name: dates, dtype: object

In [None]:
# Persist the merged wide frame so later cells can reload it from disk.
merged_path = '../../Data/Processed/merged_data_202507111200.pkl'
with open(merged_path, 'wb') as out_file:
    pickle.dump(df, out_file)

In [27]:
# Reload the merged wide frame from disk.
merged_path = '../../Data/Processed/merged_data_202507111200.pkl'
df = pd.read_pickle(merged_path)

In [28]:
# Build a long frame: one row per (timestamp, '<TICKER>_<metric>' column).
tickers = ['AAL','ALGT','ALK','DAL','JBLU','LUV','UAL']

# Select on the leading '_'-separated token, because that token is what becomes
# the ticker below. A plain substring test would also pick up any column that
# merely contains 'DAL', 'ALK', 'UAL', ... somewhere in its name and file it
# under a bogus ticker. Columns containing 'pca' are excluded here, before the
# melt, so they are never expanded into rows.
ticker_columns = [
    col for col in df.columns
    if col.split('_')[0] in tickers and 'pca' not in col
]

# ignore_index=False keeps the original row index on every melted row.
df_long = df[ticker_columns].melt(var_name='metric', value_name='value', ignore_index=False)
df_long['ticker'] = df_long['metric'].str.split('_').str[0]

In [29]:
# Split each '<TICKER>_<metric>' label at its first underscore:
# the head becomes the ticker, the tail (possibly empty) becomes the metric.
label_parts = df_long['metric'].str.partition('_')
df_long['ticker'] = label_parts[0]
df_long['metric'] = label_parts[2]

# Expose the row index as a regular 'date' column.
df_long['date'] = df_long.index

df_long

Unnamed: 0,metric,value,ticker,date
2018-01-03 09:45:00,Change_High-Low,-0.100000,AAL,2018-01-03 09:45:00
2018-01-03 10:00:00,Change_High-Low,-0.100000,AAL,2018-01-03 10:00:00
2018-01-03 10:15:00,Change_High-Low,-0.130000,AAL,2018-01-03 10:15:00
2018-01-03 10:30:00,Change_High-Low,0.130000,AAL,2018-01-03 10:30:00
2018-01-03 10:45:00,Change_High-Low,-0.070000,AAL,2018-01-03 10:45:00
...,...,...,...,...
2025-05-30 14:45:00,Volume_ma26_lag01,141133.192308,UAL,2025-05-30 14:45:00
2025-05-30 15:00:00,Volume_ma26_lag01,136067.346154,UAL,2025-05-30 15:00:00
2025-05-30 15:15:00,Volume_ma26_lag01,139252.961538,UAL,2025-05-30 15:15:00
2025-05-30 15:30:00,Volume_ma26_lag01,150593.846154,UAL,2025-05-30 15:30:00


In [30]:
# Reshape back to wide: one row per (date, ticker), one column per metric.
df = df_long.pivot(index=['date', 'ticker'], columns='metric', values='value')

# Copy both index levels into ordinary columns (appended after the metrics),
# then discard the MultiIndex and order the rows by date and ticker.
for level in ('date', 'ticker'):
    df[level] = df.index.get_level_values(level)
df = df.reset_index(drop=True).sort_values(by=['date', 'ticker'])
df

metric,Change_High-Low,Change_High-Low_lag01,Change_High-Low_ma04,Change_High-Low_ma04_lag01,Change_High-Low_ma08,Change_High-Low_ma08_lag01,Change_High-Low_ma16,Change_High-Low_ma16_lag01,Change_High-Low_ma26,Change_High-Low_ma26_lag01,...,Volume_ma04,Volume_ma04_lag01,Volume_ma08,Volume_ma08_lag01,Volume_ma16,Volume_ma16_lag01,Volume_ma26,Volume_ma26_lag01,date,ticker
0,-0.1000,0.2600,0.06000,0.097500,0.026250,3.250000e-02,0.017812,0.013750,-2.732857e-16,-0.000577,...,244616.00,222352.50,152123.125,143299.750,1.220101e+05,116552.5000,1.152820e+05,115171.115385,2018-01-03 09:45:00,AAL
1,0.8750,1.3500,0.58125,0.362500,0.253125,1.625000e-01,0.120312,0.046875,8.365384e-02,-0.159423,...,7546.25,8164.25,5821.125,5628.625,3.933438e+03,3749.6875,4.709308e+03,4588.538462,2018-01-03 09:45:00,ALGT
2,-0.0988,0.3270,0.08830,0.118000,0.041650,5.150000e-02,0.013969,0.021688,2.238462e-03,-0.004154,...,88518.00,87661.25,67783.250,61364.625,4.884350e+04,46884.8750,4.714338e+04,47035.153846,2018-01-03 09:45:00,ALK
3,-0.2200,0.6600,0.11500,0.183475,0.048750,8.000000e-02,0.030631,0.043750,9.230769e-03,0.018846,...,465326.25,347732.50,285456.250,223620.625,1.976236e+05,161854.0000,1.732292e+05,159512.961538,2018-01-03 09:45:00,DAL
4,0.0678,0.2100,0.06945,0.067500,0.039725,3.437500e-02,0.021737,0.014375,7.992308e-03,0.003885,...,286926.50,212064.00,177329.875,137584.000,1.153356e+05,92741.0625,9.307854e+04,82673.807692,2018-01-03 09:45:00,JBLU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
337773,0.2150,-0.0040,0.06500,0.010000,0.022500,6.250000e-03,0.010313,-0.006562,-3.461538e-03,-0.000577,...,171396.50,68213.25,102663.125,52261.625,7.163350e+04,45361.3125,6.364819e+04,68541.153846,2025-05-30 15:45:00,ALK
337774,0.0850,-0.0500,0.04125,0.006250,0.016875,9.587500e-03,0.007812,-0.005313,-1.923077e-03,0.000192,...,523619.00,277616.25,393776.125,250181.500,2.539109e+05,179876.7500,2.110315e+05,201942.115385,2025-05-30 15:45:00,DAL
337775,0.0450,0.0000,0.01000,-0.006250,0.000625,1.110223e-16,0.000288,-0.001250,1.153846e-03,-0.000769,...,2820797.75,918372.50,1688417.875,741431.750,1.140313e+06,683423.1250,1.165446e+06,944848.923077,2025-05-30 15:45:00,JBLU
337776,0.1476,0.0524,0.05625,0.025600,0.026250,1.217500e-02,0.011675,0.001400,3.846154e-03,-0.000669,...,754457.25,290716.75,455407.000,215956.500,2.971371e+05,185166.5625,2.797868e+05,252901.846154,2025-05-30 15:45:00,LUV


In [33]:
# Derive calendar features (day of week, month, hour) from the timestamp.
stamp = df['date'].dt
df = df.assign(
    day_of_week=stamp.dayofweek,
    month_of_year=stamp.month,
    hour_of_day=stamp.hour,
)

# One-hot encode the three calendar features, keeping every level.
calendar_features = ['day_of_week', 'month_of_year', 'hour_of_day']
df = pd.get_dummies(df, columns=calendar_features, drop_first=False)

# Flag rows stamped 09:30 (is_open) and rows stamped 15:45 (is_close).
hour = df['date'].dt.hour
minute = df['date'].dt.minute
df['is_open']  = np.where((hour == 9) & (minute == 30), 1, 0)
df['is_close'] = np.where((hour == 15) & (minute == 45), 1, 0)

In [34]:
# Shrink dtypes once more now that the dummy and flag columns exist.
df = downcast(df, verbose=True)

# Persist the per-(date, ticker) frame.
long_path = '../../Data/Processed/merged_data_202507132335_long.pkl'
with open(long_path, 'wb') as out_file:
    pickle.dump(df, out_file)

1.3% compressed
