In [None]:
import pandas as pd
import datetime
import pytz
import os
from dotenv import load_dotenv
load_dotenv()

def convert_tz(data, time_zone='US/Eastern'):
    """Return ``data`` with its timezone-aware index converted to ``time_zone``.

    The conversion is applied to a shallow copy, so the caller's frame keeps
    its original index; only the returned object carries the new timezone.
    The underlying column data is shared, not duplicated.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Object indexed by a timezone-aware ``DatetimeIndex``.
    time_zone : str or tzinfo, default 'US/Eastern'
        Target timezone, passed straight to ``DatetimeIndex.tz_convert``.

    Returns
    -------
    Same type as ``data``
        A new object whose index is expressed in ``time_zone``.

    Raises
    ------
    TypeError
        If the index is timezone-naive (pandas refuses to convert it).
    AttributeError
        If the index is not a ``DatetimeIndex``.
    """
    # DatetimeIndex.tz_convert keeps the index name and avoids the
    # Index.to_series(keep_tz=...) round trip, whose keyword modern pandas
    # no longer accepts.
    converted = data.copy(deep=False)
    converted.index = data.index.tz_convert(time_zone)
    return converted

def write_df(data, out_file, archive_dir="data/old"):
    """Write ``data`` to ``out_file`` as CSV with flattened column names.

    Tuple column labels (e.g. from a MultiIndex) are joined with ``"_"`` so
    they fit on a single CSV header row; non-tuple labels are written as
    their string form. If ``out_file`` already exists it is first moved into
    ``archive_dir`` with a ``_%d_%m_%Y_%H_%M_%S`` timestamp appended to its
    base name. ``data`` itself is never modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to persist.
    out_file : str
        Destination CSV path. Missing parent directories are created.
    archive_dir : str, default "data/old"
        Directory that receives the previous version of ``out_file``.

    Returns
    -------
    None
    """
    # Flatten on a shallow copy so the caller's columns stay intact even if
    # the write below raises.
    flat = data.copy(deep=False)
    flat.columns = pd.Index([
        "_".join(map(str, label)) if isinstance(label, tuple) else str(label)
        for label in data.columns.to_flat_index()
    ])

    out_dir = os.path.dirname(out_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    if os.path.exists(out_file):
        os.makedirs(archive_dir, exist_ok=True)
        # Use only the base name: joining the full relative path onto
        # archive_dir would point into sub-directories that do not exist.
        stem, ext = os.path.splitext(os.path.basename(out_file))
        stamp = datetime.datetime.now().strftime('%d_%m_%Y_%H_%M_%S')
        os.rename(out_file, os.path.join(archive_dir, f"{stem}_{stamp}{ext}"))

    flat.to_csv(out_file)

# write_df(df, "test.csv")
def read_data(out_file="realdata.csv"):
    """Load a CSV with flattened column names back into a tuple-keyed frame.

    The first CSV column becomes the index and is parsed with
    ``pd.to_datetime``. Every column name is split on ``"_"`` and turned
    into a tuple, so ``"AAPL_close"`` becomes ``("AAPL", "close")``. A name
    containing additional underscores yields a longer tuple.

    Parameters
    ----------
    out_file : str, default "realdata.csv"
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        Frame with tuple column labels and a datetime-parsed index.
    """
    frame = pd.read_csv(out_file, index_col=0)
    # Rebuild the tuple labels from the "_"-joined header strings.
    frame.columns = frame.columns.map(lambda name: tuple(name.split("_")))
    frame.index = pd.to_datetime(frame.index)
    return frame

# df = read_data("test2.csv")
# write_df(df2, "test2.csv")

# df = df.fillna(0).round(decimals=4)
# df2 = df2.fillna(0).round(decimals=4)
# print(df.head())
# print(df2.head())
# print(df == df2)
# print(df.equals(df2))

##### Read Data From All-Data CSV (Multi Index Columns)

In [None]:
# Load the master dataset; read_data turns the "_"-joined CSV header back
# into tuple column labels and parses the index as datetimes.
df_all = read_data("realdata.csv")
df_all.head()

# Filtering & Processing the Master Dataset

In [None]:
def just_close_data(data):
    """Return only the columns whose second column-level label is ``'close'``.

    Selection is done positionally with a boolean mask, so both column
    levels are kept in the result (``data.xs("close", level=1, axis=1)``
    would drop the matched level instead).
    """
    second_level = data.columns.get_level_values(1)
    is_close = second_level == 'close'
    return data.iloc[:, is_close]

def no_premarket_after_hours(data, mkt_start=datetime.time(hour=9, minute=30),
                             mkt_end=datetime.time(hour=15, minute=59),
                             market_tz='US/Eastern'):
    """Keep only rows whose timestamp falls inside the regular trading session.

    The index is viewed in ``market_tz`` to compare wall-clock times, rows
    between ``mkt_start`` and ``mkt_end`` (both inclusive) are kept, and the
    result is returned with a UTC index. The input frame is not modified.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Object indexed by a timezone-aware ``DatetimeIndex``.
    mkt_start : datetime.time, default 09:30
        First wall-clock time (in ``market_tz``) to keep.
    mkt_end : datetime.time, default 15:59
        Last wall-clock time (in ``market_tz``) to keep.
    market_tz : str or tzinfo, default 'US/Eastern'
        Timezone in which the session bounds are expressed.

    Returns
    -------
    Same type as ``data``
        The in-session rows, indexed in UTC.

    Raises
    ------
    TypeError
        If the index is timezone-naive.
    AttributeError
        If the index is not a ``DatetimeIndex``.
    """
    # Work on a shallow copy so re-indexing never leaks back into the caller's
    # frame. The bounds are naive times: between_time compares them against
    # the index's local wall-clock time, so no tzinfo is needed on them.
    session = data.copy(deep=False)
    session.index = data.index.tz_convert(market_tz)
    session = session.between_time(mkt_start, mkt_end)
    session.index = session.index.tz_convert('UTC')
    return session

In [None]:
# Filter df_all to normal hours
df = no_premarket_after_hours(df_all)

# Filter df_all to just Close data
df_close = just_close_data(df)

df_close

##### Fill NaNs

In [None]:
def ffill_nans(data):
    """Forward-fill missing values, then drop rows that still contain NaN.

    After the forward fill, the only rows that can still hold NaN are leading
    rows that precede the first observed value of some column; those rows
    are removed. The input is not modified.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series

    Returns
    -------
    Same type as ``data``
        A new object without missing values.
    """
    # DataFrame.ffill() replaces the deprecated fillna(method="ffill").
    return data.ffill().dropna()

def del_nans_ffill(data, thresh):
    """Drop sparse rows, then forward-fill what is left.

    Rows with fewer than ``thresh`` non-missing values are removed first;
    the remaining frame is passed through ``ffill_nans``.

    Parameters
    ----------
    data : pandas.DataFrame
    thresh : int
        Minimum number of non-NaN values a row needs in order to be kept.

    Returns
    -------
    pandas.DataFrame
    """
    dense_rows = data.dropna(thresh=thresh)
    return ffill_nans(dense_rows)

def percentage_nans(data):
    """Report the percentage of missing values in each column.

    Parameters
    ----------
    data : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` (indexed by the column label) with a
        single ``'percent_missing'`` column, sorted in ascending order.
    """
    null_counts = data.isnull().sum()
    share_missing = null_counts * 100 / len(data)
    report = share_missing.to_frame(name='percent_missing')
    return report.sort_values('percent_missing')

def filter_percentage_nans(data, thresh=.1):
    """Keep only the columns whose share of missing values is below ``thresh``.

    Parameters
    ----------
    data : pandas.DataFrame
    thresh : float, default 0.1
        Maximum allowed fraction of missing values (exclusive), e.g. ``.1``
        keeps columns with less than 10% NaN.

    Returns
    -------
    pandas.DataFrame
        The retained columns, ordered by ascending missing percentage
        (the order produced by ``percentage_nans``).
    """
    cutoff_percent = thresh * 100
    missing = percentage_nans(data)
    below_cutoff = missing['percent_missing'] < cutoff_percent
    kept_columns = missing[below_cutoff].index
    return data[kept_columns]



In [None]:
# Drop columns whose missing-value share is not below the default threshold
# (thresh=.1, i.e. 10%).
df_close = filter_percentage_nans(df_close)
df_close

In [None]:
# Forward-fill the remaining gaps and drop rows that still contain NaN.
# NOTE(review): the trailing numbers look like recorded row counts for each
# variant — confirm before relying on them.
df_close = ffill_nans(df_close) # 138,607
# df_processed_holes = del_nans_ffill(df_close, 3) # 91,693
# df_close # 348,724

##### Save Data

In [None]:
# Persist the cleaned close-price frame; write_df moves any existing file at
# this path into data/old (timestamped) before writing the new CSV.
write_df(df_close, "data/ETT/close.csv")
# write_df(df_processed_holes, "data/ETT/processed_holes.csv")