In [1]:
import pandas as pd
import numpy as np
import pickle, warnings, datetime

import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

In [2]:
def downcast(df, verbose = True):
    """Shrink the numeric columns of ``df`` to smaller dtypes.

    The frame is modified in place (columns are reassigned on the object that
    was passed in) and the same object is returned.

    Per-column rules:
      * ``bool`` columns are cast to ``int8``;
      * non-numeric columns (object, string, category, datetime, timedelta, ...)
        are left untouched;
      * integer columns, and numeric columns whose values all equal their
        rounded value, go through ``pd.to_numeric(..., downcast="integer")``;
      * every other numeric column goes through
        ``pd.to_numeric(..., downcast="float")``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be downcast.
    verbose : bool, default True
        When true, print the percentage of memory saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    start_memory = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns:
        series = df[col]
        dtype = series.dtype
        if dtype.name == "bool":
            df[col] = series.astype("int8")
        elif not pd.api.types.is_numeric_dtype(dtype):
            # Rounding or numeric conversion of categorical, string, timedelta or
            # datetime data either fails or silently changes what the column means,
            # so such columns are skipped entirely.
            continue
        elif pd.api.types.is_integer_dtype(dtype) or (series.round() == series).all():
            # A NaN never equals itself, so a column containing NaN fails the
            # whole-number check and falls through to the float branch.
            df[col] = pd.to_numeric(series, downcast = "integer")
        else:
            df[col] = pd.to_numeric(series, downcast = "float")
    end_memory = df.memory_usage().sum() / 1024 ** 2

    if verbose:
        # Guard the ratio so a frame that reports zero bytes does not divide by zero.
        if start_memory:
            saved_pct = 100 * (start_memory - end_memory) / start_memory
        else:
            saved_pct = 0.0
        print("{:.1f}% compressed".format(saved_pct))

    return df

In [3]:
# Read the two processed inputs from disk.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
with open('../../Data/Processed/GDELT_Clean_daily.pkl', 'rb') as gdelt_file:
    gdelt = pickle.load(gdelt_file)

with open('../../Data/Processed/stock_data_daily.pkl', 'rb') as stocks_file:
    stocks = pickle.load(stocks_file)

In [4]:
# Join the two tables on their row index; the inner join keeps only the
# index labels present in both `stocks` and `gdelt`.
df = pd.merge(
    left=stocks,
    right=gdelt,
    how='inner',
    left_index=True,
    right_index=True,
)

In [5]:
# Fill gaps from the previous row first; anything still missing at the top of a
# column is then filled from the next available row.
# NOTE(review): a column that is entirely NaN stays NaN after both passes, and the
# backward fill copies later rows into earlier ones — confirm both are acceptable.
df = df.ffill()
df = df.bfill()

In [6]:
# Reorder the columns into ascending (alphabetical) label order
df = df.reindex(columns=sorted(df.columns))

In [10]:
# Show every column label as a plain Python list
df.columns.to_list()

['AAL_Article Count_pca_001',
 'AAL_Article Count_pca_002',
 'AAL_Article Count_pca_003',
 'AAL_Article Count_pca_004',
 'AAL_Article Count_pca_005',
 'AAL_Article Count_pca_006',
 'AAL_Article Count_pca_007',
 'AAL_Article Count_pca_008',
 'AAL_Change_High-Low',
 'AAL_Change_High-Low_lag01',
 'AAL_Change_High-Low_ma04',
 'AAL_Change_High-Low_ma04_lag01',
 'AAL_Change_High-Low_ma08',
 'AAL_Change_High-Low_ma08_lag01',
 'AAL_Change_High-Low_ma16',
 'AAL_Change_High-Low_ma16_lag01',
 'AAL_Change_High-Low_ma26',
 'AAL_Change_High-Low_ma26_lag01',
 'AAL_Change_Last',
 'AAL_Change_Last-Open',
 'AAL_Change_Last-Open_lag01',
 'AAL_Change_Last-Open_ma04',
 'AAL_Change_Last-Open_ma04_lag01',
 'AAL_Change_Last-Open_ma08',
 'AAL_Change_Last-Open_ma08_lag01',
 'AAL_Change_Last-Open_ma16',
 'AAL_Change_Last-Open_ma16_lag01',
 'AAL_Change_Last-Open_ma26',
 'AAL_Change_Last-Open_ma26_lag01',
 'AAL_Change_Last_lag01',
 'AAL_Change_Last_lag02',
 'AAL_Change_Last_lag03',
 'AAL_Change_Last_lag04',
 'AAL_

In [7]:
# Shrink numeric dtypes to cut memory use; the call prints the percentage saved
df = downcast(df=df, verbose=True)

45.7% compressed


In [8]:
# Serialize `df` to the processed-data folder as a pickle
with open('../../Data/Processed/merged_data_daily.pkl', 'wb') as out_file:
    pickle.dump(df, out_file)