In [125]:
import pandas as pd
import numpy as np
import os, pickle
import warnings
import datetime
warnings.filterwarnings('ignore')

In [126]:
path_to_data = r"../../data/raw/stock prices"

In [127]:
# Get the list of all csv files in path_to_data and all subfolders
csv_files = []
for root, dirs, files in os.walk(path_to_data):
    for f in files:
        if f.endswith('.csv'):
            csv_files.append(os.path.join(root, f))

In [128]:
# Loop through the files, limiting columns and appending airline tickers to a df
for file in csv_files:
    ticker = file.split('_')[0].split('\\')[-1].upper()  # Extract ticker from filename

    if file == csv_files[0]:
        df_main = pd.read_csv(file)
        df_main['ticker'] = ticker
    else:
        df_temp = pd.read_csv(file)
        df_temp['ticker'] = ticker
        df_main = pd.concat([df_main, df_temp], ignore_index=True)
df_main['ticker'].value_counts()

ticker
ALGT    80000
ALK     80000
DAL     80000
UAL     80000
ITA     80000
LUV     80000
JBLU    80000
IYT     79869
AAL     74806
JETS    62745
BNO     60000
Name: count, dtype: int64

In [129]:
# Drop duplicates rows
df_main = df_main.drop_duplicates()
df_main = df_main.dropna()

In [130]:
# Convert 'Time' column to datetime format
df_main['Time'] = pd.to_datetime(df_main['Time'], format='%Y-%m-%d %H:%M')
df_main['%Chg'] = df_main['%Chg'].str.replace('%', '').astype(float)
df_main

Unnamed: 0,Time,Open,High,Low,Last,Change,%Chg,Volume,ticker
0,2022-05-03 11:15:00,18.6900,18.770,18.6650,18.7450,0.0550,0.29,844892.0,AAL
1,2022-05-03 11:30:00,18.7450,18.855,18.7350,18.8350,0.0900,0.48,790483.0,AAL
2,2022-05-03 11:45:00,18.8400,18.850,18.7400,18.7761,-0.0589,-0.31,600037.0,AAL
3,2022-05-03 12:00:00,18.7703,18.810,18.6540,18.7272,-0.0489,-0.26,738723.0,AAL
4,2022-05-03 12:15:00,18.7300,18.820,18.7300,18.7400,0.0128,0.07,582906.0,AAL
...,...,...,...,...,...,...,...,...,...
837395,2022-05-03 10:00:00,50.5000,50.750,49.8250,49.9700,-0.5300,-1.05,472374.0,UAL
837396,2022-05-03 10:15:00,49.9700,50.270,49.6500,50.2400,0.2700,0.54,388933.0,UAL
837397,2022-05-03 10:30:00,50.2400,50.550,50.0800,50.1600,-0.0800,-0.16,346036.0,UAL
837398,2022-05-03 10:45:00,50.1700,50.520,50.0301,50.4450,0.2850,0.57,319102.0,UAL


In [131]:
# Finance variables
df_main['High-Low']   =  df_main['High'] - df_main['Low']
df_main['High-Low%']  = (df_main['High'] / df_main['Open']) - 1
df_main['Last-Open']  =  df_main['Open'] - df_main['Last']
df_main['Last-Open%'] = (df_main['Open'] / df_main['Last']) - 1

df_main.sort_values(by=['ticker', 'Time'], inplace=True)

# 10-period rolling variance
df_main['Roll_SD_Last_10']   = df_main.groupby('ticker')['Last'].rolling(window=10).std().reset_index(level=0, drop=True)
df_main['Roll_SDSD_Last_10'] = df_main.groupby('ticker')['Roll_SD_Last_10'].rolling(window=10).std().reset_index(level=0, drop=True)

df_main['Roll_SD_Volume_10']   = df_main.groupby('ticker')['Volume'].rolling(window=10).std().reset_index(level=0, drop=True)
df_main['Roll_SDSD_Volume_10'] = df_main.groupby('ticker')['Roll_SD_Volume_10'].rolling(window=10).std().reset_index(level=0, drop=True)

for i in ['Last', 'Volume', 'High-Low', 'Last-Open', 'Roll_SD_Last_10', 'Roll_SDSD_Last_10', 'Roll_SD_Volume_10', 'Roll_SDSD_Volume_10']:
    df_main[f'Change_{i}'] = df_main.groupby('ticker')[i].diff()

In [132]:
df_main['ti']=df_main['Time'].dt.time
df_main.sort_values(by=['ticker', 'Time'], inplace=True)

# Change in volume compared to same time on previous trading days
for day_lag in range(1, 11):
    col_name = f'Change_Volume_Day_{day_lag:02d}'
    df_main[col_name] = None
    
    for time_val in df_main['ti'].unique():
        # Get rows with this time value
        same_time_rows = df_main[df_main['ti'] == time_val]
        
        # Create a shifted index for the specified day lag
        vols = same_time_rows.groupby('ticker')['Volume'].shift(day_lag)
        diffs = same_time_rows['Volume'].values - vols
        
        # Assign to the original dataframe
        mask = df_main['ti'] == time_val
        df_main.loc[mask, col_name] = diffs

df_main.drop(columns=['ti'], inplace=True)

In [133]:
df_main.drop(columns=[
    'Open', 'High', 'Low', '%Chg', 'High-Low', 'High-Low%', 'Last-Open', 'Last-Open%',
       'Roll_SD_Last_10', 'Roll_SDSD_Last_10', 'Roll_SD_Volume_10', 'Change', 'Roll_SDSD_Volume_10' 
       ], inplace=True)

In [134]:
windows=[4,8,16,26]
vars=['Last', 'Volume', 'Change_Last', 'Change_Volume',
       'Change_High-Low', 'Change_Last-Open', 'Change_Roll_SD_Last_10',
       'Change_Roll_SDSD_Last_10', 'Change_Roll_SD_Volume_10',
       'Change_Roll_SDSD_Volume_10']

# Moving averages
for window in windows:
    for var in vars:
        df_main[f'{var}_ma{window:02d}'] = df_main.groupby('ticker')[var].transform(lambda x: x.rolling(window=window).mean())

In [135]:
# Calculate lags
vars=['Last', 'Volume', 'Change_Last', 'Change_Volume']

for l in range(1, 27):
    for var in vars:
        df_main[f'{var}_lag{l:02d}'] = df_main.groupby('ticker')[var].shift(l)

In [136]:
# reshape the data to wide format, adding the ticker as a prefix to all columns
to_reshape = df_main.columns.difference(['Time', 'ticker'])
df_wide = df_main.pivot_table(index='Time', columns='ticker', values=to_reshape, aggfunc='first')
# Forward fill missing values
df_wide = df_wide.ffill()

In [137]:
# Flatten the MultiIndex column names
df_wide.columns = [f'{ticker}_{col}' for col, ticker in df_wide.columns]
df_wide = df_wide[df_wide.index >= datetime.datetime(2018, 1, 1, 0, 0)]

In [138]:
# Sort columns alphabetically
df_wide = df_wide.reindex(sorted(df_wide.columns), axis=1)

In [139]:
df_wide

Unnamed: 0_level_0,AAL_Change_High-Low,AAL_Change_High-Low_ma04,AAL_Change_High-Low_ma08,AAL_Change_High-Low_ma16,AAL_Change_High-Low_ma26,AAL_Change_Last,AAL_Change_Last-Open,AAL_Change_Last-Open_ma04,AAL_Change_Last-Open_ma08,AAL_Change_Last-Open_ma16,...,UAL_Volume_lag21,UAL_Volume_lag22,UAL_Volume_lag23,UAL_Volume_lag24,UAL_Volume_lag25,UAL_Volume_lag26,UAL_Volume_ma04,UAL_Volume_ma08,UAL_Volume_ma16,UAL_Volume_ma26
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-02 09:30:00,0.2450,0.098750,0.051875,0.025863,0.002500,0.0700,8.000000e-02,0.045000,0.027500,0.013125,...,44066.0,52148.0,131140.0,166651.0,183662.0,84670.0,216357.50,131468.250,103301.4375,101848.692308
2018-01-02 09:45:00,-0.1150,0.053900,0.035000,0.017875,0.000385,0.1000,-3.300000e-01,-0.037500,-0.013112,-0.003738,...,89848.0,44066.0,52148.0,131140.0,166651.0,183662.0,226471.00,141953.875,107733.5000,99791.038462
2018-01-02 10:00:00,-0.1600,0.025000,0.010000,0.004375,0.000169,-0.0400,1.600000e-01,0.002500,0.016237,-0.002500,...,55212.0,89848.0,44066.0,52148.0,131140.0,166651.0,209684.25,144164.000,107041.9375,95804.846154
2018-01-02 10:15:00,-0.0200,-0.012500,0.011250,0.004375,0.002308,0.0100,-6.000000e-02,-0.037500,0.000000,-0.006162,...,51469.0,55212.0,89848.0,44066.0,52148.0,131140.0,102717.00,151362.750,107658.3750,94670.846154
2018-01-02 10:30:00,0.0800,-0.053750,0.022500,0.010000,0.006923,0.0200,-7.105427e-15,-0.057500,-0.006250,0.001337,...,79705.0,51469.0,55212.0,89848.0,44066.0,52148.0,89550.25,152953.875,102100.1875,95102.538462
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-30 14:45:00,0.0103,0.000500,0.002500,0.000000,0.000192,0.0250,-4.650000e-02,-0.008775,-0.002500,0.000937,...,231179.0,804723.0,117120.0,107823.0,86995.0,276367.0,128346.25,108634.375,102708.8750,136067.346154
2025-05-30 15:00:00,-0.0100,-0.006225,-0.010000,0.000000,-0.000381,0.0100,5.000000e-03,0.003775,0.011250,-0.001875,...,121493.0,231179.0,804723.0,117120.0,107823.0,86995.0,152876.75,109021.875,105855.8750,139252.961538
2025-05-30 15:15:00,0.0100,-0.007175,0.001250,-0.002187,0.000000,0.0300,-1.500000e-02,0.007400,-0.001875,0.000625,...,128007.0,121493.0,231179.0,804723.0,117120.0,107823.0,209132.50,151517.125,123894.4375,150593.846154
2025-05-30 15:30:00,-0.0100,0.000075,0.001250,-0.002500,-0.000385,-0.0149,3.990000e-02,-0.004150,-0.000012,-0.001256,...,125545.0,128007.0,121493.0,231179.0,804723.0,117120.0,237376.50,172509.375,127066.8750,155025.538462


In [140]:
# Save the processed DataFrame to a pickle file
output_path = r"../../data/processed/stock_data.pkl"
with open(output_path, 'wb') as f:
    pickle.dump(df_wide, f)

In [141]:
df_wide.columns.to_list()

['AAL_Change_High-Low',
 'AAL_Change_High-Low_ma04',
 'AAL_Change_High-Low_ma08',
 'AAL_Change_High-Low_ma16',
 'AAL_Change_High-Low_ma26',
 'AAL_Change_Last',
 'AAL_Change_Last-Open',
 'AAL_Change_Last-Open_ma04',
 'AAL_Change_Last-Open_ma08',
 'AAL_Change_Last-Open_ma16',
 'AAL_Change_Last-Open_ma26',
 'AAL_Change_Last_lag01',
 'AAL_Change_Last_lag02',
 'AAL_Change_Last_lag03',
 'AAL_Change_Last_lag04',
 'AAL_Change_Last_lag05',
 'AAL_Change_Last_lag06',
 'AAL_Change_Last_lag07',
 'AAL_Change_Last_lag08',
 'AAL_Change_Last_lag09',
 'AAL_Change_Last_lag10',
 'AAL_Change_Last_lag11',
 'AAL_Change_Last_lag12',
 'AAL_Change_Last_lag13',
 'AAL_Change_Last_lag14',
 'AAL_Change_Last_lag15',
 'AAL_Change_Last_lag16',
 'AAL_Change_Last_lag17',
 'AAL_Change_Last_lag18',
 'AAL_Change_Last_lag19',
 'AAL_Change_Last_lag20',
 'AAL_Change_Last_lag21',
 'AAL_Change_Last_lag22',
 'AAL_Change_Last_lag23',
 'AAL_Change_Last_lag24',
 'AAL_Change_Last_lag25',
 'AAL_Change_Last_lag26',
 'AAL_Change_Last_m