In [31]:
import pandas as pd
import numpy as np
import os, pickle
import warnings
import datetime
warnings.filterwarnings('ignore')

In [32]:
# Root folder of the raw stock-price files; it is walked recursively
# (subfolders included) to build the list of CSV files to load.
path_to_data = r"../../data/raw/stock prices"

In [33]:
# Recursively gather the path of every file whose name ends in '.csv'
# under path_to_data (the top folder and all of its subfolders).
csv_files = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(path_to_data)
    for name in names
    if name.endswith('.csv')
]

In [34]:
# Load every CSV, tag its rows with the ticker parsed from the file name,
# and stack all files into one long DataFrame.
if not csv_files:
    # Fail with a clear message instead of a NameError on df_main further down.
    raise FileNotFoundError("csv_files is empty: no .csv files were found to load")

frames = []
for file in csv_files:
    # The ticker is the part of the file *name* before the first underscore.
    # os.path.basename removes the directory part using the platform's path
    # separator, so this works on both Windows and POSIX and is not confused
    # by underscores in folder names.
    ticker = os.path.basename(file).split('_')[0].upper()

    df_temp = pd.read_csv(file)
    df_temp['ticker'] = ticker
    frames.append(df_temp)

# Concatenate once: calling pd.concat inside the loop re-copies the
# accumulated frame on every iteration (quadratic in the number of rows).
df_main = pd.concat(frames, ignore_index=True)

# Sanity check: number of rows loaded per ticker
df_main['ticker'].value_counts()

ticker
ALGT    80000
ALK     80000
UAL     80000
DAL     80000
ITA     80000
JBLU    80000
LUV     80000
IYT     79869
AAL     74806
JETS    62745
Name: count, dtype: int64

In [35]:
# Remove exact duplicate rows, then discard every row that still
# contains a missing value in any column.
df_main = df_main.drop_duplicates().dropna()

In [36]:
# Parse the 'Time' strings into datetime values using the fixed
# year-month-day hour:minute layout.
time_parsed = pd.to_datetime(df_main['Time'], format='%Y-%m-%d %H:%M')
df_main['Time'] = time_parsed

# '%Chg' is text containing a '%' character; remove it and cast to float.
pct_text = df_main['%Chg'].str.replace('%', '')
df_main['%Chg'] = pct_text.astype(float)

df_main

Unnamed: 0,Time,Open,High,Low,Last,Change,%Chg,Volume,ticker
0,2022-05-03 11:15:00,18.6900,18.770,18.6650,18.7450,0.0550,0.29,844892.0,AAL
1,2022-05-03 11:30:00,18.7450,18.855,18.7350,18.8350,0.0900,0.48,790483.0,AAL
2,2022-05-03 11:45:00,18.8400,18.850,18.7400,18.7761,-0.0589,-0.31,600037.0,AAL
3,2022-05-03 12:00:00,18.7703,18.810,18.6540,18.7272,-0.0489,-0.26,738723.0,AAL
4,2022-05-03 12:15:00,18.7300,18.820,18.7300,18.7400,0.0128,0.07,582906.0,AAL
...,...,...,...,...,...,...,...,...,...
777395,2022-05-03 10:00:00,50.5000,50.750,49.8250,49.9700,-0.5300,-1.05,472374.0,UAL
777396,2022-05-03 10:15:00,49.9700,50.270,49.6500,50.2400,0.2700,0.54,388933.0,UAL
777397,2022-05-03 10:30:00,50.2400,50.550,50.0800,50.1600,-0.0800,-0.16,346036.0,UAL
777398,2022-05-03 10:45:00,50.1700,50.520,50.0301,50.4450,0.2850,0.57,319102.0,UAL


## Feature Engineering

### Finance Vars
- High-Low spread
- High-Low % spread
- Last-Open spread
- Last-Open % spread
- Rolling standard deviation (10-period, per ticker)
- Rolling standard deviation of the rolling standard deviation

### Dummies
- Day of week
- Hour of day
- Month of year
- First observation per day

In [37]:
# Finance variables
# Each column name states its formula: 'A-B' is A minus B, 'A-B%' is A / B - 1.
df_main['High-Low']   =  df_main['High'] - df_main['Low']
df_main['High-Low%']  = (df_main['High'] / df_main['Low']) - 1    # previously divided by Open
df_main['Last-Open']  =  df_main['Last'] - df_main['Open']        # previously Open - Last (sign flipped)
df_main['Last-Open%'] = (df_main['Last'] / df_main['Open']) - 1   # previously Open / Last

# Rolling windows are positional, so rows must be in chronological order
# within each ticker before they are computed.
df_main = df_main.sort_values(by=['ticker', 'Time'])

# 10-period rolling standard deviation per ticker (Roll_SD_*), followed by the
# 10-period rolling standard deviation of that series (Roll_SDSD_*).
for base_col in ('Last', 'Volume'):
    sd_col = f'Roll_SD_{base_col}_10'
    sdsd_col = f'Roll_SDSD_{base_col}_10'
    for src_col, dst_col in ((base_col, sd_col), (sd_col, sdsd_col)):
        df_main[dst_col] = (
            df_main.groupby('ticker')[src_col]
            .rolling(window=10)
            .std()
            # Drop the 'ticker' level added by groupby so that the result
            # aligns with df_main on the original row index.
            .reset_index(level=0, drop=True)
        )

In [38]:
# Clock time of each bar (date part dropped); used to match bars across days.
df_main['ti'] = df_main['Time'].dt.time
df_main = df_main.sort_values(by=['ticker', 'Time'])

# Change in volume versus the previous bar of the same ticker at the same
# clock time. With the rows sorted chronologically this is the most recent
# earlier day that has a bar at that time (normally the previous trading day).
# The first bar of each (ticker, time) group has nothing to compare against
# and is set to 0.
# A single grouped diff replaces the Python loop over every unique time and
# yields a numeric column instead of an object column initialised with None.
df_main['Change_Volume_Day'] = (
    df_main.groupby(['ticker', 'ti'])['Volume'].diff().fillna(0)
)

In [39]:
# Moving averages of 'Last', computed separately for each ticker.
# MA4_Last uses a 4-row window and MA12_Last a 12-row window.
last_by_ticker = df_main.groupby('ticker')['Last']
for span in (4, 12):
    df_main[f'MA{span}_Last'] = last_by_ticker.transform(
        lambda prices, w=span: prices.rolling(window=w).mean()
    )

In [40]:
# Pivot from long to wide: one row per timestamp and one column per
# (variable, ticker) pair. Every column except the two pivot keys is a value.
to_reshape = df_main.columns.difference(['Time', 'ticker'])
df_wide = (
    df_main
    .pivot_table(index='Time', columns='ticker', values=to_reshape, aggfunc='first')
    # Carry the last known observation forward over gaps
    .ffill()
)

In [41]:
# Collapse the two-level (variable, ticker) column labels into single
# strings of the form '<ticker>_<variable>'.
df_wide.columns = [f'{symbol}_{field}' for field, symbol in df_wide.columns]

# Keep only rows timestamped on or after 2018-01-01 00:00.
start_cutoff = datetime.datetime(2018, 1, 1, 0, 0)
df_wide = df_wide[df_wide.index >= start_cutoff]

In [42]:
# Persist the wide feature table as a pickle file
output_path = r"../../data/processed/stock_data.pkl"
with open(output_path, 'wb') as pkl_out:
    pickle.dump(df_wide, pkl_out)

In [43]:
# Show every flattened column name of the wide table
list(df_wide.columns)

['AAL_%Chg',
 'ALGT_%Chg',
 'ALK_%Chg',
 'DAL_%Chg',
 'ITA_%Chg',
 'IYT_%Chg',
 'JBLU_%Chg',
 'JETS_%Chg',
 'LUV_%Chg',
 'UAL_%Chg',
 'AAL_Change',
 'ALGT_Change',
 'ALK_Change',
 'DAL_Change',
 'ITA_Change',
 'IYT_Change',
 'JBLU_Change',
 'JETS_Change',
 'LUV_Change',
 'UAL_Change',
 'AAL_Change_Volume_Day',
 'ALGT_Change_Volume_Day',
 'ALK_Change_Volume_Day',
 'DAL_Change_Volume_Day',
 'ITA_Change_Volume_Day',
 'IYT_Change_Volume_Day',
 'JBLU_Change_Volume_Day',
 'JETS_Change_Volume_Day',
 'LUV_Change_Volume_Day',
 'UAL_Change_Volume_Day',
 'AAL_High',
 'ALGT_High',
 'ALK_High',
 'DAL_High',
 'ITA_High',
 'IYT_High',
 'JBLU_High',
 'JETS_High',
 'LUV_High',
 'UAL_High',
 'AAL_High-Low',
 'ALGT_High-Low',
 'ALK_High-Low',
 'DAL_High-Low',
 'ITA_High-Low',
 'IYT_High-Low',
 'JBLU_High-Low',
 'JETS_High-Low',
 'LUV_High-Low',
 'UAL_High-Low',
 'AAL_High-Low%',
 'ALGT_High-Low%',
 'ALK_High-Low%',
 'DAL_High-Low%',
 'ITA_High-Low%',
 'IYT_High-Low%',
 'JBLU_High-Low%',
 'JETS_High-Low%'

In [44]:
# Simple version: keep only the absolute ('Change') and percentage ('%Chg')
# price-change columns of each ticker.
simple_columns = [
    'AAL_Change', 'ALGT_Change', 'ALK_Change', 'DAL_Change', 'ITA_Change',
    'IYT_Change', 'JBLU_Change', 'JETS_Change', 'LUV_Change', 'UAL_Change',
    'AAL_%Chg', 'ALGT_%Chg', 'ALK_%Chg', 'DAL_%Chg', 'ITA_%Chg',
    'IYT_%Chg', 'JBLU_%Chg', 'JETS_%Chg', 'LUV_%Chg', 'UAL_%Chg',
]
df_wide2 = df_wide[simple_columns]

# Persist the reduced table as its own pickle file
output_path = r"../../data/processed/stock_data_simple.pkl"
with open(output_path, 'wb') as pkl_out:
    pickle.dump(df_wide2, pkl_out)

In [45]:
# Display the reduced table of 'Change' and '%Chg' columns
df_wide2

Unnamed: 0_level_0,AAL_Change,ALGT_Change,ALK_Change,DAL_Change,ITA_Change,IYT_Change,JBLU_Change,JETS_Change,LUV_Change,UAL_Change,AAL_%Chg,ALGT_%Chg,ALK_%Chg,DAL_%Chg,ITA_%Chg,IYT_%Chg,JBLU_%Chg,JETS_%Chg,LUV_%Chg,UAL_%Chg
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2018-01-02 09:30:00,0.0700,0.800,0.9600,0.1750,0.0750,0.3825,0.0300,0.2200,0.3199,0.290,0.13,0.52,1.31,0.31,0.08,0.80,0.13,0.68,0.49,0.43
2018-01-02 09:45:00,0.1000,-1.375,0.0375,-0.0750,-0.1148,0.0325,-0.0561,-0.0120,0.2801,-0.170,0.19,-0.88,0.05,-0.13,-0.12,0.07,-0.25,-0.04,0.43,-0.25
2018-01-02 10:00:00,-0.0400,1.675,0.1325,0.1250,0.1623,0.1125,0.0611,0.1043,-0.1200,0.090,-0.08,1.09,0.18,0.22,0.17,0.23,0.27,0.32,-0.18,0.13
2018-01-02 10:15:00,0.0100,-0.050,0.1500,0.0150,-0.1520,-0.1389,-0.0850,-0.0023,-0.1800,0.190,0.02,-0.03,0.20,0.03,-0.16,-0.29,-0.38,-0.01,-0.27,0.28
2018-01-02 10:30:00,0.0200,-1.150,0.0400,0.1699,-0.1727,-0.0411,-0.0100,-0.0261,0.2000,0.160,0.04,-0.74,0.05,0.30,-0.18,-0.09,-0.04,-0.08,0.30,0.24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-30 14:45:00,0.0250,0.070,0.0450,0.0100,0.1699,0.0800,0.0199,0.0200,0.0400,0.230,0.22,0.13,0.09,0.02,0.10,0.12,0.39,0.09,0.12,0.29
2025-05-30 15:00:00,0.0100,-0.015,-0.0450,0.1000,0.1975,0.0150,-0.0200,0.0150,0.0700,0.040,0.09,-0.03,-0.09,0.21,0.11,0.02,-0.39,0.07,0.21,0.05
2025-05-30 15:15:00,0.0300,0.120,0.1250,0.1700,0.3825,0.1150,0.0150,0.0500,0.0700,0.390,0.26,0.21,0.24,0.35,0.22,0.18,0.29,0.22,0.21,0.49
2025-05-30 15:30:00,-0.0149,-0.060,-0.1600,-0.1175,-0.2000,-0.0900,-0.0250,-0.0200,-0.0500,-0.150,-0.13,-0.11,-0.31,-0.24,-0.11,-0.14,-0.49,-0.09,-0.15,-0.19
