In [None]:
import pandas as pd
import datetime
import pytz
from utils.ipynb_helpers import read_data, write_df, convert_tz

##### Read Data From All-Data CSV (Multi Index Columns)

In [None]:
# Load the master dataset through the project helper `read_data`.
# NOTE(review): per the section heading the CSV is expected to carry
# MultiIndex columns; the exact schema is decided by read_data -- confirm there.
df_all = read_data("realdata_alp_1h.csv")
# Last expression of the cell: preview the first rows.
df_all.head()

# Filtering & Processing the Master Dataset

In [None]:
def just_close_data(data):
    """Return only the columns whose second column-index level is 'close'.

    Args:
        data: DataFrame with MultiIndex columns (at least two levels).

    Returns:
        A DataFrame containing the matching columns. Both column levels are
        kept, unlike ``data.xs("close", level=1, axis=1)``, which would drop
        the matched level.
    """
    second_level = data.columns.get_level_values(1)
    is_close = second_level == 'close'
    return data.loc[:, is_close]

def no_premarket_after_hours(data):
    """Keep only rows whose time of day falls between 09:30 and 15:59 US/Eastern.

    The frame is passed through the project helper ``convert_tz`` with
    ``time_zone='US/Eastern'``, filtered with ``between_time`` (both bounds
    inclusive by default), and passed through ``convert_tz`` again with
    ``time_zone='UTC'`` before being returned.

    Args:
        data: Frame to filter. ``between_time`` requires a DatetimeIndex.
            NOTE(review): presumably convert_tz converts the index time zone
            and returns the converted frame -- confirm in utils.ipynb_helpers.

    Returns:
        The filtered frame as returned by the final ``convert_tz`` call.
    """
    eastern = pytz.timezone('US/Eastern')
    # NOTE(review): attaching a pytz zone through `tzinfo=` is a known pitfall
    # for datetimes; for these time-of-day bounds it looks like between_time
    # only compares wall-clock hour/minute -- confirm the tzinfo is needed.
    session_open = datetime.time(hour=9, minute=30, tzinfo=eastern)
    session_close = datetime.time(hour=15, minute=59, tzinfo=eastern)

    in_eastern = convert_tz(data, time_zone='US/Eastern')
    regular_hours = in_eastern.between_time(session_open, session_close)
    return convert_tz(regular_hours, time_zone='UTC')

In [None]:
# Restrict df_all to regular trading hours (09:30-15:59 US/Eastern).
df = no_premarket_after_hours(df_all)

# Keep only the 'close' columns of the regular-hours frame `df`
# (not of df_all, which still contains pre-market/after-hours rows).
df_close = just_close_data(df)

# Last expression of the cell: display the resulting frame.
df_close

##### Fill NaNs

In [None]:
def ffill_nans(data):
    """Forward-fill missing values, then drop every row that still has a NaN.

    After the forward fill, the only NaNs left are those that precede a
    column's first valid observation, so the ``dropna`` removes the leading
    rows until every column has a value.

    Args:
        data: DataFrame (or Series) to clean. It is not modified.

    Returns:
        A new object with gaps forward-filled and incomplete rows removed.
    """
    # DataFrame.ffill() is the supported spelling; fillna(method="ffill") is
    # deprecated since pandas 2.1 and emits a FutureWarning there.
    return data.ffill().dropna()

def del_nans_ffill(data, thresh):
    """Drop sparse rows, then forward-fill what remains via ``ffill_nans``.

    Args:
        data: DataFrame to clean.
        thresh: Minimum number of non-NaN values a row needs in order to be
            kept (forwarded to ``DataFrame.dropna(thresh=...)``).

    Returns:
        The result of ``ffill_nans`` applied to the rows that met the threshold.
    """
    dense_rows = data.dropna(thresh=thresh)
    return ffill_nans(dense_rows)

def percentage_nans(data):
    """Report the percentage of missing values in each column.

    Args:
        data: DataFrame to inspect.

    Returns:
        A one-column DataFrame named 'percent_missing', indexed by the column
        labels of ``data`` and sorted in ascending order of that percentage.
    """
    nan_counts = data.isna().sum()
    # Multiply before dividing, so the floating-point result matches
    # `count * 100 / len(data)` exactly.
    share = nan_counts.mul(100).div(len(data))
    return share.to_frame('percent_missing').sort_values('percent_missing')

def filter_percentage_nans(data, thresh=.1):
    """Keep only the columns whose share of missing values is below ``thresh``.

    Args:
        data: DataFrame to filter.
        thresh: Maximum allowed fraction of NaNs per column (exclusive),
            expressed between 0 and 1; the default .1 means 10 percent.

    Returns:
        ``data`` restricted to the qualifying columns. The columns come back
        in the order produced by ``percentage_nans`` (ascending percentage of
        missing values), not in their original order.
    """
    cutoff_pct = thresh * 100  # percentage_nans reports values on a 0-100 scale
    missing = percentage_nans(data)['percent_missing']
    kept_columns = missing[missing < cutoff_pct].index
    return data[kept_columns]

# Cell output: per-column percentage of missing values in df_close, ascending.
percentage_nans(df_close)

In [None]:
# Drop columns with 10% or more missing values (default thresh=.1).
# Note: the surviving columns are reordered by ascending percentage missing.
df_close = filter_percentage_nans(df_close)
# Last expression of the cell: display the filtered frame.
df_close

In [None]:
# Forward-fill the remaining gaps and drop rows that are still incomplete.
# NOTE(review): the trailing numbers below look like counts recorded from an
# earlier run (rows? cells?) -- TODO confirm what they measure.
df_close = ffill_nans(df_close) # 138,607
# df_processed_holes = del_nans_ffill(df_close, 3) # 91,693
# df_close # 348,724

##### Save Data

In [None]:
# Persist the cleaned close-price frame through the project helper `write_df`.
write_df(df_close, "data/ETT/close_1h.csv")
# write_df(df_processed_holes, "data/ETT/processed_holes.csv")

## Extras

##### Read data and convert to percent delta

In [None]:
# Reload the saved close prices. Note: this rebinds `df`, which earlier in the
# notebook referred to the regular-hours frame.
df = read_data("data/ETT/close_1h.csv")

print("Before:\n", df.head())
# Row-over-row fractional change of every column.
df_new = df.pct_change()
# pct_change leaves the first row as NaN (it has no predecessor); use 0 instead.
df_new.iloc[0] = 0

print("After:\n",df_new.head())
# Persist the percent-change frame through the project helper `write_df`.
write_df(df_new, "data/ETT/close_1h_pct_change.csv")