## Setup

In [1]:
# %pip (unlike !pip) installs into the environment of the running kernel.
%pip install -q holidays

In [1]:
import pandas as pd
import numpy as np

In [2]:
def load_stock_price_dataset(symbol):
    """Load the closing prices for ``symbol`` as a datetime-indexed frame.

    Reads ``../datasets/<symbol in lower case>.us.txt`` and keeps only the
    ``<DATE>``, ``<TIME>`` and ``<CLOSE>`` columns. The date and time columns
    are combined into a single parsed ``datetime`` column that becomes the
    index, and ``<CLOSE>`` is renamed to ``price``.

    Parameters
    ----------
    symbol : str
        Ticker symbol; it is lower-cased to build the file name.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``datetime`` with a single ``price`` column.
    """
    csv_path = f"../datasets/{symbol.lower()}.us.txt"
    # NOTE(review): newer pandas releases deprecate merging columns through a
    # dict-valued `parse_dates` -- confirm against the pandas version in use.
    raw_prices = pd.read_csv(
        csv_path,
        usecols=['<DATE>', '<TIME>', '<CLOSE>'],
        parse_dates={'datetime': ['<DATE>', '<TIME>']},
        index_col='datetime',
        na_values=['nan'],  # the literal string 'nan' marks a missing value
    )
    return raw_prices.rename(columns={'<CLOSE>': 'price'})

In [4]:
import datetime
import holidays

_US_HOLIDAY_CALENDAR = None  # created on first lookup, then reused by every call


def _us_holiday_calendar():
    """Return the shared ``holidays.UnitedStates()`` calendar, building it once."""
    global _US_HOLIDAY_CALENDAR
    if _US_HOLIDAY_CALENDAR is None:
        _US_HOLIDAY_CALENDAR = holidays.UnitedStates()
    return _US_HOLIDAY_CALENDAR


def is_us_holiday(dt):
    """Tell whether ``dt`` falls on a US holiday.

    Parameters
    ----------
    dt : object with ``strftime``
        A date-like value (e.g. ``datetime.date``, ``datetime.datetime`` or
        ``pandas.Timestamp``); only its ``%Y-%m-%d`` date part is used.

    Returns
    -------
    bool
        ``True`` when the date is present in the US holiday calendar.
    """
    # Reuse one calendar: constructing a new one for every call is wasteful
    # when this function is mapped over every row of a dataset.
    return dt.strftime('%Y-%m-%d') in _us_holiday_calendar()

def extract_datetime_features(ds):
    """Build calendar and US-holiday features for every timestamp in ``ds``.

    Parameters
    ----------
    ds : pandas.DatetimeIndex
        Timestamps to describe; it is used as the index of the result.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``ds`` with the columns ``year``, ``month``,
        ``day``, ``hour``, ``day_of_year``, ``week_of_year`` (ISO week),
        ``month_name``, ``day_name``, ``is_weekend``, ``is_month_start``,
        ``is_quarter_start``, ``is_month_end``, ``is_year_start``,
        ``is_holiday``, ``is_day_before_holiday`` and ``is_day_after_holiday``.
    """
    df = pd.DataFrame(index=ds)
    df['year'] = ds.year
    df['month'] = ds.month
    df['day'] = ds.day
    df['hour'] = ds.hour
    df['day_of_year'] = ds.day_of_year
    # `DatetimeIndex.weekofyear` is deprecated; `isocalendar()` yields the same
    # ISO week number. Convert to a plain int64 array so the column is
    # assigned positionally and keeps an ordinary integer dtype.
    df['week_of_year'] = ds.isocalendar()['week'].to_numpy(dtype='int64')
    df['month_name'] = ds.month_name()
    df['day_name'] = ds.day_name()
    df['is_weekend'] = ds.day_of_week >= 5  # Monday is 0, so 5/6 are Sat/Sun
    df['is_month_start'] = ds.is_month_start
    df['is_quarter_start'] = ds.is_quarter_start
    df['is_month_end'] = ds.is_month_end
    df['is_year_start'] = ds.is_year_start

    # US holidays: build the calendar once and share it across all lookups.
    us_holidays = holidays.UnitedStates()

    def holiday_flags(stamps):
        # `.date` gives plain `datetime.date` objects in the index's own
        # (local) time, so all three holiday columns use the same calendar day.
        return np.fromiter(
            (day in us_holidays for day in stamps.date),
            dtype=bool,
            count=len(stamps),
        )

    one_day = datetime.timedelta(days=1)
    df['is_holiday'] = holiday_flags(ds)
    # "Day before a holiday" means that tomorrow is a holiday, and vice versa.
    df['is_day_before_holiday'] = holiday_flags(ds + one_day)
    df['is_day_after_holiday'] = holiday_flags(ds - one_day)
    return df

def add_datetime_features(df):
    """Return ``df`` with datetime features placed in front of its own columns.

    The features are computed from ``df.index`` by
    ``extract_datetime_features`` and joined column-wise, so the existing
    columns of ``df`` follow the feature columns in the result.
    """
    index_features = extract_datetime_features(df.index)
    combined = pd.concat([index_features, df], axis=1)
    return combined

In [5]:
SYMBOL = 'META'

# Load the closing prices for SYMBOL, then put the datetime-derived feature
# columns in front of the price column.
df = add_datetime_features(load_stock_price_dataset(SYMBOL))

# Summarise the columns, dtypes and non-null counts of the combined frame.
df.info()

  df['week_of_year'] = ds.weekofyear


<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1454 entries, 2022-02-16 16:00:00 to 2022-12-13 22:00:00
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   year                   1454 non-null   int64  
 1   month                  1454 non-null   int64  
 2   day                    1454 non-null   int64  
 3   hour                   1454 non-null   int64  
 4   day_of_year            1454 non-null   int64  
 5   week_of_year           1454 non-null   int64  
 6   month_name             1454 non-null   object 
 7   day_name               1454 non-null   object 
 8   is_weekend             1454 non-null   bool   
 9   is_month_start         1454 non-null   bool   
 10  is_quarter_start       1454 non-null   bool   
 11  is_month_end           1454 non-null   bool   
 12  is_year_start          1454 non-null   bool   
 13  is_holiday             1454 non-null   bool   
 14  is_day_before_holida

In [6]:
# Preview the first rows of the feature frame (features first, then price).
df.head()

Unnamed: 0_level_0,year,month,day,hour,day_of_year,week_of_year,month_name,day_name,is_weekend,is_month_start,is_quarter_start,is_month_end,is_year_start,is_holiday,is_day_before_holiday,is_day_after_holiday,price
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2022-02-16 16:00:00,2022,2,16,16,47,7,February,Wednesday,False,False,False,False,False,False,False,False,215.05
2022-02-16 17:00:00,2022,2,16,17,47,7,February,Wednesday,False,False,False,False,False,False,False,False,215.848
2022-02-16 18:00:00,2022,2,16,18,47,7,February,Wednesday,False,False,False,False,False,False,False,False,215.3
2022-02-16 19:00:00,2022,2,16,19,47,7,February,Wednesday,False,False,False,False,False,False,False,False,214.74
2022-02-16 20:00:00,2022,2,16,20,47,7,February,Wednesday,False,False,False,False,False,False,False,False,214.0


In [7]:
# Derive the output name from SYMBOL so the exported file always matches the
# dataset that was loaded (for 'META' this is the same path as before).
df.to_csv(f"../datasets/{SYMBOL.lower()}.us-datetime-features.csv")