## Setup

In [1]:
!pip install -q holidays

In [1]:
import pandas as pd
import numpy as np

In [7]:
def load_stock_price_dataset(symbol):
    return pd.read_csv(
            f"/content/{symbol.lower()}.us.txt", 
            index_col='datetime', 
            parse_dates={'datetime': ['<DATE>', '<TIME>']},
            usecols=['<DATE>', '<TIME>', '<CLOSE>'], 
            na_values=['nan']
    ).rename(columns={'<CLOSE>': 'price'})   

In [8]:
import datetime
import holidays

def is_us_holiday(dt):
    return dt.strftime('%Y-%m-%d') in holidays.UnitedStates()

def extract_datetime_features(ds):
    df = pd.DataFrame()
    df.index = ds
    df['year'] = ds.year
    df['month'] = ds.month
    df['day'] = ds.day
    df['hour'] = ds.hour
    df['day_of_year'] = ds.day_of_year
    df['week_of_year'] = ds.weekofyear
    df['month_name'] = ds.month_name()
    df['day_name'] = ds.day_name()
    df['is_weekend'] = ((ds.day_of_week == 5) | (ds.day_of_week == 6))
    df['is_month_start'] = ds.is_month_start
    df['is_quarter_start'] = ds.is_quarter_start
    df['is_month_end'] = ds.is_month_end
    df['is_year_start'] = ds.is_year_start
    # US holidays
    df['is_holiday'] = pd.Series(ds.values).apply(is_us_holiday).values
    df['is_day_before_holiday'] = pd.Series(ds + datetime.timedelta(days=1)).map(is_us_holiday).values
    df['is_day_after_holiday'] = pd.Series(ds - datetime.timedelta(days=1)).map(is_us_holiday).values
    return df

def add_datetime_features(df):
    return pd.concat([extract_datetime_features(df.index), df], axis=1)

In [9]:
from pandas.api.types import CategoricalDtype

ORDINALS_INFO = [
    ('day_of_year', list(range(0, 366))),
    ('week_of_year', list(range(0, 52))),
]
ORDINALS = [feat for feat, _ in ORDINALS_INFO]

NOMINALS = [
    'hour',
    'month_name',
    'day_name',
    "is_weekend",
    "is_month_start",
    "is_quarter_start",
    "is_month_end",
    "is_year_start",
    "is_holiday",
    "is_day_before_holiday",
    "is_day_after_holiday",
]

NUMERICALS = [
    'day_of_year', 
    'week_of_year', 
    'price'
]

UNUSED = []

TARGET_VAR = 'price'

def prepare_dataset(dataf):
    dataf = (dataf
        .drop(columns=UNUSED, errors='ignore')
    )

    for col in NUMERICALS:
        if col not in dataf.columns:
            continue
        dataf[col] = dataf[col].astype('float')
    
    for col, categories in ORDINALS_INFO:
        if col not in dataf.columns:
            continue
        dataf[col] = dataf[col].astype(CategoricalDtype(categories=categories, ordered=True))
    
    for col in NOMINALS:
        if col not in dataf.columns:
            continue
        dataf[col] = dataf[col].astype('category')
    
    existing_cols = set(dataf.columns)
    col_order =  [col for col in NUMERICALS + ORDINALS + NOMINALS
                  if col in existing_cols]
    return dataf[col_order]

In [12]:
SYMBOL = 'META'
df = (
    load_stock_price_dataset(SYMBOL)
    .pipe(add_datetime_features)
    .pipe(prepare_dataset)
)
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1463 entries, 2022-01-25 16:00:00 to 2022-11-21 22:00:00
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   day_of_year            1463 non-null   float64 
 1   week_of_year           1463 non-null   float64 
 2   price                  1463 non-null   float64 
 3   hour                   1463 non-null   category
 4   month_name             1463 non-null   category
 5   day_name               1463 non-null   category
 6   is_weekend             1463 non-null   category
 7   is_month_start         1463 non-null   category
 8   is_quarter_start       1463 non-null   category
 9   is_month_end           1463 non-null   category
 10  is_year_start          1463 non-null   category
 11  is_holiday             1463 non-null   category
 12  is_day_before_holiday  1463 non-null   category
 13  is_day_after_holiday   1463 non-null   category
dtypes: c