In [98]:
import pandas as pd
import numpy as np
import os, pickle
import warnings
import datetime
warnings.filterwarnings('ignore')

In [99]:
# Root folder that is searched (recursively) for the raw stock-price CSV files.
# The path is relative, so it resolves against the kernel's working directory.
path_to_data = r"../../data/raw/stock prices"

In [100]:
# Recursively collect the path of every '.csv' file found under path_to_data
# (the folder itself plus all of its subfolders).
csv_files = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(path_to_data)
    for name in names
    if name.endswith('.csv')
]

In [101]:
# Read every CSV, tag its rows with the ticker parsed from the file name, and
# stack all files into a single long frame (one row per ticker per timestamp).
if not csv_files:
    # Fail early with a clear message instead of an opaque error further down.
    raise FileNotFoundError("No CSV files found; check path_to_data")

frames = []
for file in csv_files:
    # The ticker is the part of the *file name* before the first underscore.
    # Taking the base name first makes this independent of the OS path
    # separator and of underscores appearing in directory names.
    ticker = os.path.basename(file).split('_')[0].upper()

    df_temp = pd.read_csv(file)
    df_temp['ticker'] = ticker
    frames.append(df_temp)

# Concatenate once at the end; concatenating inside the loop re-copies the
# accumulated frame on every iteration.
df_main = pd.concat(frames, ignore_index=True)

# Sanity check: number of rows loaded per ticker
df_main['ticker'].value_counts()

ticker
ALGT    80000
ALK     80000
DAL     80000
UAL     80000
ITA     80000
LUV     80000
JBLU    80000
IYT     79869
AAL     74806
JETS    62745
BNO     60000
Name: count, dtype: int64

In [102]:
# Remove exact duplicate rows, then discard any row that still contains a missing value
df_main = df_main.drop_duplicates().dropna()

In [103]:
# Parse the 'Time' strings into datetimes
time_parsed = pd.to_datetime(df_main['Time'], format='%Y-%m-%d %H:%M')
# Strip the '%' sign from '%Chg' so the column can be stored as float
pct_change = df_main['%Chg'].str.replace('%', '').astype(float)

df_main['Time'] = time_parsed
df_main['%Chg'] = pct_change
df_main

Unnamed: 0,Time,Open,High,Low,Last,Change,%Chg,Volume,ticker
0,2022-05-03 11:15:00,18.6900,18.770,18.6650,18.7450,0.0550,0.29,844892.0,AAL
1,2022-05-03 11:30:00,18.7450,18.855,18.7350,18.8350,0.0900,0.48,790483.0,AAL
2,2022-05-03 11:45:00,18.8400,18.850,18.7400,18.7761,-0.0589,-0.31,600037.0,AAL
3,2022-05-03 12:00:00,18.7703,18.810,18.6540,18.7272,-0.0489,-0.26,738723.0,AAL
4,2022-05-03 12:15:00,18.7300,18.820,18.7300,18.7400,0.0128,0.07,582906.0,AAL
...,...,...,...,...,...,...,...,...,...
837395,2022-05-03 10:00:00,50.5000,50.750,49.8250,49.9700,-0.5300,-1.05,472374.0,UAL
837396,2022-05-03 10:15:00,49.9700,50.270,49.6500,50.2400,0.2700,0.54,388933.0,UAL
837397,2022-05-03 10:30:00,50.2400,50.550,50.0800,50.1600,-0.0800,-0.16,346036.0,UAL
837398,2022-05-03 10:45:00,50.1700,50.520,50.0301,50.4450,0.2850,0.57,319102.0,UAL


In [104]:
# Finance variables (per bar).
# Naming convention: a column called "A-B" holds A minus B, and "A-B%" holds
# the relative version A / B - 1.
df_main['High-Low']   =  df_main['High'] - df_main['Low']
df_main['High-Low%']  = (df_main['High'] / df_main['Low']) - 1
df_main['Last-Open']  =  df_main['Last'] - df_main['Open']
df_main['Last-Open%'] = (df_main['Last'] / df_main['Open']) - 1

# Rolling / diff features below assume rows are in time order within each ticker
df_main = df_main.sort_values(by=['ticker', 'Time'])

# 10-period rolling standard deviation (Roll_SD_*) and the rolling standard
# deviation of that series (Roll_SDSD_*), computed separately for each ticker.
# groupby().rolling() returns a (ticker, original index) MultiIndex; dropping
# level 0 restores the original index so the result aligns on assignment.
for base in ['Last', 'Volume']:
    df_main[f'Roll_SD_{base}_10'] = (
        df_main.groupby('ticker')[base]
        .rolling(window=10).std()
        .reset_index(level=0, drop=True)
    )
    df_main[f'Roll_SDSD_{base}_10'] = (
        df_main.groupby('ticker')[f'Roll_SD_{base}_10']
        .rolling(window=10).std()
        .reset_index(level=0, drop=True)
    )

# Bar-over-bar change of each series, within ticker
for i in ['Last', 'Volume', 'High-Low', 'Last-Open', 'Roll_SD_Last_10', 'Roll_SDSD_Last_10', 'Roll_SD_Volume_10', 'Roll_SDSD_Volume_10']:
    df_main[f'Change_{i}'] = df_main.groupby('ticker')[i].diff()

In [105]:
# Time of day of each bar; used to pair a bar with the same time slot on
# earlier sessions.
df_main['ti'] = df_main['Time'].dt.time
df_main.sort_values(by=['ticker', 'Time'], inplace=True)

# Change in Volume versus the previous occurrence of the same time slot for
# the same ticker (i.e. previous session, same time of day). Computed once
# and reused for every lag.
slot_keys = [df_main['ticker'], df_main['ti']]
day_change = df_main.groupby(['ticker', 'ti'])['Volume'].diff()

# Lag that change by 1..10 sessions. The shift is done inside each
# (ticker, time slot) group so values never spill from one ticker into the
# next, and the resulting columns are float (NaN where no history exists).
for day_lag in range(1, 11):
    df_main[f'Change_Volume_Day_lag{day_lag:02d}'] = day_change.groupby(slot_keys).shift(day_lag)

In [106]:
# Raw bar fields and intermediate helper columns that are not carried into
# the later feature-building steps
cols_no_longer_needed = [
    'Open', 'High', 'Low', '%Chg', 'High-Low', 'High-Low%', 'Last-Open', 'Last-Open%',
    'Roll_SD_Last_10', 'Roll_SDSD_Last_10', 'Roll_SD_Volume_10', 'Change', 'Roll_SDSD_Volume_10', 'ti',
]
df_main = df_main.drop(columns=cols_no_longer_needed)

In [107]:
# Window lengths (in bars) and the columns that get a moving average
windows = [4, 8, 16, 26]
vars = [
    'Last', 'Volume', 'Change_Last', 'Change_Volume',
    'Change_High-Low', 'Change_Last-Open', 'Change_Roll_SD_Last_10',
    'Change_Roll_SDSD_Last_10', 'Change_Roll_SD_Volume_10',
    'Change_Roll_SDSD_Volume_10',
]

# Trailing moving average of every column for every window, computed within
# each ticker. New columns are named "<column>_ma<window, 2 digits>".
for window in windows:
    for var in vars:
        ma_name = f'{var}_ma{window:02d}'
        per_ticker = df_main.groupby('ticker')[var]
        df_main[ma_name] = per_ticker.transform(lambda s, w=window: s.rolling(window=w).mean())

In [108]:
# One-bar lag (within ticker) of every change series and moving average
vars = [
    'Change_High-Low', 'Change_High-Low_ma04', 'Change_High-Low_ma08', 'Change_High-Low_ma16', 'Change_High-Low_ma26',
    'Change_Last-Open', 'Change_Last-Open_ma04', 'Change_Last-Open_ma08', 'Change_Last-Open_ma16', 'Change_Last-Open_ma26',
    'Change_Last_ma04', 'Change_Last_ma08', 'Change_Last_ma16', 'Change_Last_ma26',
    'Change_Roll_SDSD_Last_10', 'Change_Roll_SDSD_Last_10_ma04', 'Change_Roll_SDSD_Last_10_ma08', 'Change_Roll_SDSD_Last_10_ma16', 'Change_Roll_SDSD_Last_10_ma26',
    'Change_Roll_SDSD_Volume_10', 'Change_Roll_SDSD_Volume_10_ma04', 'Change_Roll_SDSD_Volume_10_ma08', 'Change_Roll_SDSD_Volume_10_ma16', 'Change_Roll_SDSD_Volume_10_ma26',
    'Change_Roll_SD_Last_10', 'Change_Roll_SD_Last_10_ma04', 'Change_Roll_SD_Last_10_ma08', 'Change_Roll_SD_Last_10_ma16', 'Change_Roll_SD_Last_10_ma26',
    'Change_Roll_SD_Volume_10', 'Change_Roll_SD_Volume_10_ma04', 'Change_Roll_SD_Volume_10_ma08', 'Change_Roll_SD_Volume_10_ma16', 'Change_Roll_SD_Volume_10_ma26',
    'Change_Volume_ma04', 'Change_Volume_ma08', 'Change_Volume_ma16', 'Change_Volume_ma26',
    'Last_ma04', 'Last_ma08', 'Last_ma16', 'Last_ma26',
    'Volume_ma04', 'Volume_ma08', 'Volume_ma16', 'Volume_ma26',
]

for var in vars:
    lagged = df_main.groupby('ticker')[var].shift(1)
    df_main[f'{var}_lag01'] = lagged

# Lags 1 through 26 (within ticker) of the level and change series
vars = ['Last', 'Volume', 'Change_Last', 'Change_Volume']

for lag in range(1, 27):
    for var in vars:
        df_main[f'{var}_lag{lag:02d}'] = df_main.groupby('ticker')[var].shift(lag)

In [109]:
# Pivot to wide format: one row per timestamp and one (feature, ticker) column
# for every feature/ticker pair, taking the first value when a
# (Time, ticker) pair occurs more than once. Gaps are then forward-filled
# with the most recent earlier value.
to_reshape = df_main.columns.difference(['Time', 'ticker'])
df_wide = (
    df_main
    .pivot_table(index='Time', columns='ticker', values=to_reshape, aggfunc='first')
    .ffill()
)

In [110]:
# Collapse the (feature, ticker) column MultiIndex into flat "<ticker>_<feature>" names
df_wide.columns = [f'{tkr}_{feature}' for feature, tkr in df_wide.columns]

# Keep only observations from 2018-01-01 00:00 onwards
sample_start = datetime.datetime(2018, 1, 1, 0, 0)
in_sample = df_wide.index >= sample_start
df_wide = df_wide[in_sample]

In [111]:
# Put the columns in alphabetical order
ordered_cols = sorted(df_wide.columns)
df_wide = df_wide.reindex(ordered_cols, axis=1)

In [112]:
# Calendar features taken from the timestamp index
stamps = df_wide.index
df_wide['day_of_week']   = stamps.dayofweek
df_wide['month_of_year'] = stamps.month
df_wide['hour_of_day']   = stamps.hour

# One-hot encode the calendar features, keeping every level (no reference level dropped)
calendar_cols = ['day_of_week', 'month_of_year', 'hour_of_day']
df_wide = pd.get_dummies(df_wide, columns=calendar_cols, drop_first=False)

# 1/0 flags for the 09:30 bar and the 15:45 bar
bar_hour = df_wide.index.hour
bar_minute = df_wide.index.minute
df_wide['is_open']  = np.where((bar_hour == 9)  & (bar_minute == 30), 1, 0)
df_wide['is_close'] = np.where((bar_hour == 15) & (bar_minute == 45), 1, 0)

In [113]:
# Persist the wide feature table to disk as a pickle
output_path = r"../../data/processed/stock_data.pkl"
with open(output_path, 'wb') as pkl_file:
    pickle.dump(df_wide, pkl_file)

In [17]:
# Create the long data for each of the tickers of interest: every ticker gets
# a full copy of the wide feature table plus its own target (y) columns.
tickers = ['AAL', 'ALGT', 'ALK', 'DAL', 'JBLU', 'LUV', 'UAL']

# Stack one copy of df_wide per ticker. `keys`/`names` label the two index
# levels 'ticker' and 'Time', so reset_index() yields columns with exactly
# those names. The copy count follows the ticker list so the two cannot
# drift apart.
df_long = pd.concat([df_wide] * len(tickers), keys=tickers, names=['ticker', 'Time'])
df_long = df_long.reset_index()

# Add overall columns (y variables): for the rows belonging to a ticker,
# copy that ticker's own "<ticker>_<var>" column into the generic "<var>".
vars = ['Last', 'Volume', 'Change_Last', 'Change_Volume']

for var in vars:
    # Start from NaN so the column is numeric rather than object dtype
    df_long[var] = np.nan

    for ticker in tickers:
        mask = df_long['ticker'] == ticker
        df_long.loc[mask, var] = df_long.loc[mask, f'{ticker}_{var}']

# Move the identifier and target columns to the front
front_cols = ['ticker', 'Time'] + vars
cols = front_cols + [col for col in df_long.columns if col not in front_cols]
df_long = df_long[cols]

In [18]:
# Make sure the four target columns are numeric; values that cannot be parsed become NaN
numeric_targets = ('Last', 'Volume', 'Change_Last', 'Change_Volume')
for name in numeric_targets:
    df_long[name] = pd.to_numeric(df_long[name], errors='coerce')

In [22]:
# Drop every X variable built from contemporaneous information. A column is
# kept only if it is an identifier/target/session flag, a calendar dummy, or
# has 'lag' in its name.
always_keep = {'ticker', 'Time', 'Last', 'Volume', 'Change_Last', 'Change_Volume', 'is_open', 'is_close'}
keep_markers = ('lag', 'day_of_week', 'month_of_year', 'hour_of_day')

cols_to_drop = []
for name in df_long.columns:
    if name in always_keep:
        continue
    if any(marker in name for marker in keep_markers):
        continue
    cols_to_drop.append(name)

df_long = df_long.drop(columns=cols_to_drop)

In [23]:
# log(1 + x) transforms of the volume and price targets
log_sources = {'Log_Volume': 'Volume', 'Log_Price': 'Last'}
for new_name, source in log_sources.items():
    df_long[new_name] = np.log1p(df_long[source])

In [24]:
# Persist the long (per-ticker) table to disk as a pickle
output_path = r"../../data/processed/stock_data_long.pkl"
with open(output_path, 'wb') as pkl_file:
    pickle.dump(df_long, pkl_file)