In [33]:
import pandas as pd
import numpy as np
import os, pickle
import warnings
import datetime
warnings.filterwarnings('ignore')

In [34]:
path_to_data = r"../../data/raw/oil and other controls"

In [35]:
# Get the list of all csv files in path_to_data and all subfolders
csv_files = []
for root, dirs, files in os.walk(path_to_data):
    for f in files:
        if f.endswith('.csv'):
            csv_files.append(os.path.join(root, f))

In [36]:
# Loop through the files, limiting columns and appending airline tickers to a df
for file in csv_files:
    if file == csv_files[0]:
        df_main = pd.read_csv(file)
    else:
        df_temp = pd.read_csv(file)
        df_main = pd.concat([df_main, df_temp], ignore_index=True)

In [37]:
# Drop duplicates rows
df_main = df_main.drop_duplicates()
df_main = df_main.dropna()

In [38]:
# Convert 'Time' column to datetime format
df_main['Time'] = pd.to_datetime(df_main['Time'], format='%Y-%m-%d %H:%M')
df_main['%Chg'] = df_main['%Chg'].str.replace('%', '').astype(float)

df_main.sort_values(by=['Time'], inplace=True)
df_main.index = df_main['Time']
df_main.drop(columns=['Time'], inplace=True)

## Feature Engineering

### Finance Vars
- High-Low spread
- High-Low % spread
- Last-Open spread
- Last-Open % spread
- Rolling variance
- variance of Rolling variance

In [39]:
df_main['Last'].rolling(window=10).std()

Time
2016-02-05 13:45:00         NaN
2016-02-05 14:00:00         NaN
2016-02-05 14:15:00         NaN
2016-02-05 14:30:00         NaN
2016-02-05 14:45:00         NaN
                         ...   
2025-05-30 14:45:00    0.064313
2025-05-30 15:00:00    0.045583
2025-05-30 15:15:00    0.021986
2025-05-30 15:30:00    0.026153
2025-05-30 15:45:00    0.029964
Name: Last, Length: 59965, dtype: float64

In [40]:
# Finance variables
df_main['High-Low']   =  df_main['High'] - df_main['Low']
df_main['High-Low%']  = (df_main['High'] / df_main['Open']) - 1
df_main['Last-Open']  =  df_main['Open'] - df_main['Last']
df_main['Last-Open%'] = (df_main['Open'] / df_main['Last']) - 1

# 10-period rolling variance
df_main['Roll_SD_Last_10']   = df_main['Last'].rolling(window=10).std()
df_main['Roll_SDSD_Last_10'] = df_main['Roll_SD_Last_10'].rolling(window=10).std()
df_main['Roll_SD_Volume_10']   = df_main['Volume'].rolling(window=10).std()
df_main['Roll_SDSD_Volume_10'] = df_main['Roll_SD_Volume_10'].rolling(window=10).std()

In [41]:
# Moving averages
# 4-period moving average of 'Last'
df_main['MA4_Last']  = df_main['Last'].transform(lambda x: x.rolling(window=4).mean())
df_main['MA12_Last'] = df_main['Last'].transform(lambda x: x.rolling(window=12).mean())

In [None]:
# Forward fill missing values
df_wide = df_main.ffill()
df_wide = df_wide[df_wide.index >= datetime.datetime(2018, 1, 1, 0, 0)]

Unnamed: 0_level_0,Open,High,Low,Last,Change,%Chg,Volume,High-Low,High-Low%,Last-Open,Last-Open%,Roll_SD_Last_10,Roll_SDSD_Last_10,Roll_SD_Volume_10,Roll_SDSD_Volume_10,MA4_Last,MA12_Last
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2018-01-02 09:30:00,18.170,18.1800,18.100,18.1061,0.0061,0.03,11817.0,0.0800,0.000550,0.0639,0.003529,0.033187,0.003449,4931.431121,1044.505444,18.106525,18.129258
2018-01-02 09:45:00,18.110,18.1400,18.095,18.1000,-0.0061,-0.03,21330.0,0.0450,0.001657,0.0100,0.000552,0.031982,0.003440,6799.912909,1407.459022,18.106525,18.125925
2018-01-02 10:00:00,18.120,18.1419,18.120,18.1400,0.0400,0.22,21115.0,0.0219,0.001209,-0.0200,-0.001103,0.022526,0.003694,7922.457795,1792.904236,18.111525,18.125925
2018-01-02 10:15:00,18.140,18.1553,18.080,18.0800,-0.0600,-0.33,3478.0,0.0753,0.000843,0.0600,0.003319,0.015480,0.005434,8025.061814,1936.139595,18.106525,18.119258
2018-01-02 10:30:00,18.100,18.1000,18.070,18.0850,0.0050,0.03,9475.0,0.0300,0.000000,0.0150,0.000829,0.016914,0.006495,7987.519591,2046.916279,18.101250,18.110508
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-30 14:45:00,26.880,26.8850,26.870,26.8700,0.0100,0.04,8001.0,0.0150,0.000186,0.0100,0.000372,0.064313,0.009584,32786.197212,3654.157067,26.880000,26.820208
2025-05-30 15:00:00,26.870,26.8850,26.860,26.8600,-0.0100,-0.04,5610.0,0.0250,0.000558,0.0100,0.000372,0.045583,0.012929,32945.592401,4115.168148,26.870000,26.825208
2025-05-30 15:15:00,26.855,26.8800,26.825,26.8800,0.0200,0.07,22480.0,0.0550,0.000931,-0.0250,-0.000930,0.021986,0.019989,32478.332929,4596.320211,26.867500,26.844375
2025-05-30 15:30:00,26.880,26.9500,26.880,26.9300,0.0500,0.19,11850.0,0.0700,0.002604,-0.0500,-0.001857,0.026153,0.023732,19745.417236,7512.210235,26.885000,26.862708


In [43]:
# add the prefix "Oil_" to all columns
df_wide = df_wide.add_prefix('Oil_')

In [None]:
# Save the processed DataFrame to a pickle file
output_path = r"../data/processed/oil_data.pkl"
with open(output_path, 'wb') as f:
    pickle.dump(df_wide, f)