In [47]:
import pandas as pd
import numpy as np
import os, pickle
import warnings
import datetime
warnings.filterwarnings('ignore')

In [48]:
# Folder holding the raw stock-price CSV files (relative path, resolved
# against the notebook's working directory).
path_to_data = r"../../data/raw/stock prices"

In [49]:
# Collect the full path of every *.csv file found under path_to_data,
# descending into all of its subfolders.
csv_files = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk(path_to_data)
    for name in names
    if name.endswith('.csv')
]

In [50]:
# Read every CSV, tag its rows with the ticker parsed from the file name and
# stack all of them into one long DataFrame.
if not csv_files:
    # Fail loudly instead of silently reusing a stale df_main from the kernel.
    raise FileNotFoundError(f"No .csv files found under {path_to_data!r}")

frames = []
for file in csv_files:
    # The ticker is the part of the *file name* before the first underscore.
    # os.path.basename strips the directories with the separator of the
    # current OS, so this also works outside Windows and when a folder name
    # itself contains an underscore.
    ticker = os.path.basename(file).split('_')[0].upper()

    df_temp = pd.read_csv(file)
    df_temp['ticker'] = ticker
    frames.append(df_temp)

# A single concat at the end avoids re-copying the accumulated frame on
# every iteration.
df_main = pd.concat(frames, ignore_index=True)
df_main['ticker'].value_counts()

ticker
ALGT    80000
ALK     80000
UAL     80000
DAL     80000
ITA     80000
JBLU    80000
LUV     80000
IYT     79869
AAL     74806
JETS    62745
Name: count, dtype: int64

In [51]:
# Remove exact duplicate rows, then discard every row that still contains
# a missing value in any column.
df_main = df_main.drop_duplicates().dropna()

In [52]:
# Parse the 'Time' strings into datetimes and turn the '%Chg' strings
# (which carry a literal '%' sign) into plain floats.
parsed_time = pd.to_datetime(df_main['Time'], format='%Y-%m-%d %H:%M')
pct_without_sign = df_main['%Chg'].str.replace('%', '')

df_main['Time'] = parsed_time
df_main['%Chg'] = pct_without_sign.astype(float)
df_main

Unnamed: 0,Time,Open,High,Low,Last,Change,%Chg,Volume,ticker
0,2022-05-03 11:15:00,18.6900,18.770,18.6650,18.7450,0.0550,0.29,844892.0,AAL
1,2022-05-03 11:30:00,18.7450,18.855,18.7350,18.8350,0.0900,0.48,790483.0,AAL
2,2022-05-03 11:45:00,18.8400,18.850,18.7400,18.7761,-0.0589,-0.31,600037.0,AAL
3,2022-05-03 12:00:00,18.7703,18.810,18.6540,18.7272,-0.0489,-0.26,738723.0,AAL
4,2022-05-03 12:15:00,18.7300,18.820,18.7300,18.7400,0.0128,0.07,582906.0,AAL
...,...,...,...,...,...,...,...,...,...
777395,2022-05-03 10:00:00,50.5000,50.750,49.8250,49.9700,-0.5300,-1.05,472374.0,UAL
777396,2022-05-03 10:15:00,49.9700,50.270,49.6500,50.2400,0.2700,0.54,388933.0,UAL
777397,2022-05-03 10:30:00,50.2400,50.550,50.0800,50.1600,-0.0800,-0.16,346036.0,UAL
777398,2022-05-03 10:45:00,50.1700,50.520,50.0301,50.4450,0.2850,0.57,319102.0,UAL


## Feature Engineering

### Finance Vars
- High-Low spread
- High-Low % spread
- Last-Open spread
- Last-Open % spread
- Rolling standard deviation (10-period)
- Standard deviation of the rolling standard deviation

### Dummies
- Day of week
- Hour of day
- Month of year
- First observation per day

In [53]:
# Finance variables.
# Naming convention: "<A>-<B>" is A - B and "<A>-<B>%" is A / B - 1, so every
# column matches its label (previously Last-Open was computed as Open - Last,
# Last-Open% as Open / Last - 1, and High-Low% as High / Open - 1).
df_main['High-Low']   =  df_main['High'] - df_main['Low']
df_main['High-Low%']  = (df_main['High'] / df_main['Low']) - 1
df_main['Last-Open']  =  df_main['Last'] - df_main['Open']
df_main['Last-Open%'] = (df_main['Last'] / df_main['Open']) - 1

# diff() and rolling() below work on consecutive rows, so the rows must be in
# chronological order within each ticker.
df_main = df_main.sort_values(by=['ticker', 'Time'])

# Period-over-period change of each raw series, computed per ticker
# (the first row of every ticker is NaN).
for i in ['Open', 'High', 'Low', 'Last', 'Volume']:
    df_main[f'Change_{i}'] = df_main.groupby('ticker')[i].diff()


def _rolling_std_by_ticker(frame, column, window=10):
    """Rolling standard deviation of `column`, computed within each ticker.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame with a 'ticker' column and the column named by `column`.
    column : str
        Name of the column to roll over.
    window : int, default 10
        Number of consecutive rows in each window.

    Returns
    -------
    pd.Series
        Rolling std indexed like `frame` (the ticker level added by the
        groupby is dropped), so it can be assigned straight back.
    """
    rolled = frame.groupby('ticker')[column].rolling(window=window).std()
    return rolled.reset_index(level=0, drop=True)


# 10-period rolling standard deviation (SD) and the rolling standard deviation
# of that rolling SD (SDSD), for price and for volume.
for base in ['Last', 'Volume']:
    df_main[f'Roll_SD_{base}_10'] = _rolling_std_by_ticker(df_main, base)
    df_main[f'Roll_SDSD_{base}_10'] = _rolling_std_by_ticker(df_main, f'Roll_SD_{base}_10')

In [54]:
# Change in volume compared with the previous observation of the same ticker
# at the same time of day (normally the previous trading day; if that bar is
# missing, the most recent earlier day that has it).
df_main = df_main.sort_values(by=['ticker', 'Time'])

# Group on (ticker, clock time) and difference within each group. This replaces
# a Python loop over every distinct time and yields a float column instead of
# an object column seeded with None.
time_of_day = df_main['Time'].dt.time.rename('time_of_day')
df_main['Change_Volume_Day'] = (
    df_main.groupby(['ticker', time_of_day])['Volume']
    .diff()
    .fillna(0)  # first occurrence of each (ticker, time) has no predecessor
)

In [55]:
# Inspect the column names of df_main at this point in the notebook.
df_main.columns

Index(['Time', 'Open', 'High', 'Low', 'Last', 'Change', '%Chg', 'Volume',
       'ticker', 'High-Low', 'High-Low%', 'Last-Open', 'Last-Open%',
       'Change_Open', 'Change_High', 'Change_Low', 'Change_Last',
       'Change_Volume', 'Roll_SD_Last_10', 'Roll_SDSD_Last_10',
       'Roll_SD_Volume_10', 'Roll_SDSD_Volume_10', 'Change_Volume_Day'],
      dtype='object')

In [56]:
windows = [8, 16, 26, 52]
vars = [
    'Open', 'High', 'Low', 'Last', 'Change', '%Chg', 'Volume',
    'High-Low', 'High-Low%', 'Last-Open', 'Last-Open%',
]


def _trailing_mean(series, span):
    """Mean over the last `span` rows of `series` (NaN until the window is full)."""
    return series.rolling(window=span).mean()


# Moving averages: one "<variable>_<window>" column per (window, variable)
# pair, each computed separately within every ticker.
for window in windows:
    for var in vars:
        per_ticker = df_main.groupby('ticker')[var]
        df_main[f'{var}_{window}'] = per_ticker.transform(_trailing_mean, span=window)

In [57]:
# Reshape to wide format: one row per timestamp and one (variable, ticker)
# column for every column other than 'Time' and 'ticker'.
to_reshape = df_main.columns.difference(['Time', 'ticker'])
df_wide = pd.pivot_table(
    df_main,
    values=to_reshape,
    index='Time',
    columns='ticker',
    aggfunc='first',
)
# Carry the last known value forward wherever a timestamp has no observation
df_wide = df_wide.ffill()

In [58]:
# Collapse the two-level (variable, ticker) column labels into single
# "<ticker>_<variable>" strings.
df_wide.columns = [f'{pair[1]}_{pair[0]}' for pair in df_wide.columns]

# Keep only the rows timestamped on or after the cutoff
cutoff = datetime.datetime(2018, 1, 1, 0, 0)
df_wide = df_wide.loc[df_wide.index >= cutoff]

In [59]:
# Save the processed DataFrame to a pickle file
output_path = r"../../data/processed/stock_data.pkl"

# Create the target folder first so a fresh checkout (without data/processed)
# does not fail with FileNotFoundError when the file is opened for writing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, 'wb') as f:
    pickle.dump(df_wide, f)

In [60]:
# List the flattened column names of the wide frame.
df_wide.columns.to_list()

['AAL_%Chg',
 'ALGT_%Chg',
 'ALK_%Chg',
 'DAL_%Chg',
 'ITA_%Chg',
 'IYT_%Chg',
 'JBLU_%Chg',
 'JETS_%Chg',
 'LUV_%Chg',
 'UAL_%Chg',
 'AAL_%Chg_16',
 'ALGT_%Chg_16',
 'ALK_%Chg_16',
 'DAL_%Chg_16',
 'ITA_%Chg_16',
 'IYT_%Chg_16',
 'JBLU_%Chg_16',
 'JETS_%Chg_16',
 'LUV_%Chg_16',
 'UAL_%Chg_16',
 'AAL_%Chg_26',
 'ALGT_%Chg_26',
 'ALK_%Chg_26',
 'DAL_%Chg_26',
 'ITA_%Chg_26',
 'IYT_%Chg_26',
 'JBLU_%Chg_26',
 'JETS_%Chg_26',
 'LUV_%Chg_26',
 'UAL_%Chg_26',
 'AAL_%Chg_52',
 'ALGT_%Chg_52',
 'ALK_%Chg_52',
 'DAL_%Chg_52',
 'ITA_%Chg_52',
 'IYT_%Chg_52',
 'JBLU_%Chg_52',
 'JETS_%Chg_52',
 'LUV_%Chg_52',
 'UAL_%Chg_52',
 'AAL_%Chg_8',
 'ALGT_%Chg_8',
 'ALK_%Chg_8',
 'DAL_%Chg_8',
 'ITA_%Chg_8',
 'IYT_%Chg_8',
 'JBLU_%Chg_8',
 'JETS_%Chg_8',
 'LUV_%Chg_8',
 'UAL_%Chg_8',
 'AAL_Change',
 'ALGT_Change',
 'ALK_Change',
 'DAL_Change',
 'ITA_Change',
 'IYT_Change',
 'JBLU_Change',
 'JETS_Change',
 'LUV_Change',
 'UAL_Change',
 'AAL_Change_16',
 'ALGT_Change_16',
 'ALK_Change_16',
 'DAL_Change_16',