In [1]:
import pandas as pd
import numpy as np
import os, pickle
import warnings, datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
# NOTE: installs a global "ignore" filter, so every warning (including pandas
# and scikit-learn deprecation/dtype warnings) is hidden for the rest of the run.
warnings.filterwarnings('ignore')

In [2]:
# Root folder that is walked recursively for .csv files. The path is relative,
# so it is resolved against the kernel's current working directory.
path_to_data = r"../../data/raw/stock prices"

In [3]:
# Recursively collect the full path of every .csv file found under path_to_data
csv_files = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(path_to_data)
    for name in names
    if name.endswith('.csv')
]

In [4]:
# Load every CSV, tag its rows with the ticker parsed from the file name, and
# stack all of them into a single frame `df`.
#
# The ticker is the part of the file's base name before the first '_',
# upper-cased. os.path.basename is used so the result does not depend on the
# OS path separator or on underscores that appear in directory names.
if not csv_files:
    raise FileNotFoundError("csv_files is empty: no .csv files were found to load")

frames = []
for file in csv_files:
    ticker = os.path.basename(file).split('_')[0].upper()  # Extract ticker from filename
    frames.append(pd.read_csv(file).assign(ticker=ticker))

# Concatenate once at the end instead of re-copying the growing frame per file
df = pd.concat(frames, ignore_index=True)

In [5]:
# Remove exact duplicate rows, then discard any row that has a missing value
df = df.drop_duplicates().dropna()

In [6]:
# Parse 'Time' into datetimes and keep the time-of-day part in 'ti'
df['Time'] = pd.to_datetime(df['Time'], format='%Y-%m-%d %H:%M')
df['ti'] = df['Time'].dt.time

# Rows must be in chronological order within each ticker so that shift(k)
# below means "k observations earlier".
df = df.sort_values(by=['ticker', 'Time'])

# Lagged volume features.
# Volume_Day_lagNN is the Volume of the NN-th previous row that has the same
# ticker and the same time of day (i.e. NN sessions back, provided no bars are
# missing). Grouping by (ticker, time-of-day) and shifting does this in one
# vectorised pass and yields float columns with NaN where no history exists.
same_slot_volume = df.groupby(['ticker', 'ti'], sort=False)['Volume']

lag_cols = []
for day_lag in range(1, 6):
    lag_col = f'Volume_Day_lag{day_lag:02d}'
    df[lag_col] = same_slot_volume.shift(day_lag)
    lag_cols.append(lag_col)

# Row-wise mean of the five lag columns (NaNs are skipped by DataFrame.mean)
df['Volume_Day_lagma5'] = df[lag_cols].mean(axis=1)

In [7]:
# Drop every row that has a missing value in any column, then order the rows
# chronologically (ties broken by ticker).
df = df.dropna().sort_values(by=['Time', 'ticker'])

# Index the rows by (Time, ticker). A DataFrame cannot be assigned as an index
# directly (Index data must be 1-dimensional), so build a two-level MultiIndex.
# The levels are left unnamed on purpose: 'Time' and 'ticker' remain regular
# columns, and reusing their names for index levels would make label lookups
# such as sort_values/groupby ambiguous.
df.index = pd.MultiIndex.from_arrays([df['Time'].values, df['ticker'].values])

# Keep a copy of the timestamp under 'date' and restrict to 2018-01-01 onwards
df['date'] = df['Time']
df = df[df['date'] >= datetime.datetime(2018, 1, 1, 0, 0)]

In [8]:
# Restrict the data to the tickers listed below
selected_tickers = ['AAL', 'ALGT', 'ALK', 'DAL', 'JBLU', 'LUV', 'UAL']
is_selected = df['ticker'].isin(selected_tickers)
df = df[is_selected]

In [13]:
# Target and feature column selections
y_cols = ['Volume']
x_cols = ['Volume_Day_lagma5']

x = df[x_cols]
y = df[y_cols]

# Ordered split by row position: first 80% train, next 10% validation,
# final 10% test (no shuffling).
n_rows = len(y)
split_val  = round(0.8 * n_rows)
split_test = round(0.9 * n_rows)

train_rows = slice(None, split_val)
val_rows   = slice(split_val, split_test)
test_rows  = slice(split_test, None)

y_train, y_val, y_test = y[train_rows], y[val_rows], y[test_rows]
x_train, x_val, x_test = x[train_rows], x[val_rows], x[test_rows]

# Scale the features to [0, 1]; the scaler is fitted on the training rows only
# and then applied unchanged to the validation and test rows.
sc2 = MinMaxScaler(feature_range=(0, 1))

x_train = sc2.fit_transform(x_train)
x_val   = sc2.transform(x_val)
x_test  = sc2.transform(x_test)

In [14]:
# Fit an ordinary least squares model on the training split
ols = LinearRegression()
ols.fit(x_train, y_train)

# Evaluate on the validation and test rows together, i.e. every row from
# split_val onwards; the scaled feature arrays are stacked in the same order.
y_eval = y[split_val:]
x_eval = np.vstack((x_val, x_test))

# score() reports the model's R^2 on the evaluation rows
print(f"OLS: {ols.score(x_eval, y_eval)}")

OLS: 0.5595847093722299
