# Imports

In [41]:
import os
import warnings
from datetime import datetime

import pandas as pd

In [82]:


def clean_daily_data(name: str,
                     data: pd.DataFrame,
                     after: datetime = datetime(2016,1,1),
                     before: datetime = datetime(2022,11,1)) -> pd.DataFrame:
    """Aggregate a dated series to monthly percent-change and std features.

    The input frame is NOT modified; all work happens on a private copy of
    its ``date`` and ``value`` columns. Any other columns are ignored.

    Parameters
    ----------
    name : str
        Prefix used for the output column names.
    data : pd.DataFrame
        Frame with a ``date`` column (anything ``pd.to_datetime`` accepts)
        and a ``value`` column convertible to ``float``.
    after : datetime
        Inclusive lower bound; rows with ``date < after`` are dropped.
    before : datetime
        Exclusive upper bound; rows with ``date >= before`` are dropped.

    Returns
    -------
    pd.DataFrame
        Indexed by month string ``"YYYY-MM"`` (index name ``"date"``) with
        two columns, in this order:

        * ``f"{name}_pct_change"`` - monthly mean divided by the previous
          row's monthly mean, minus 1 (NaN for the first row).
        * ``f"{name}_std"`` - standard deviation of ``value`` within the
          month, as computed by pandas ``groupby(...).std()``.

    Raises
    ------
    KeyError
        If ``data`` has no ``date`` or ``value`` column.
    ValueError
        If ``value`` cannot be converted to float or ``date`` cannot be
        parsed.
    """
    # Take only the two columns we need and build a new frame via assign(),
    # so the caller's DataFrame is left untouched and unrelated columns
    # cannot leak into the aggregation below.
    daily = (
        data[["date", "value"]]
        .assign(
            date=lambda frame: pd.to_datetime(frame["date"]),
            value=lambda frame: frame["value"].astype(float),
        )
        .sort_values(by="date")
    )

    # Keep rows inside the half-open window [after, before).
    in_window = (daily["date"] >= after) & (daily["date"] < before)
    daily = daily.loc[in_window]

    # Truncate each date to its month ("YYYY-MM"); the Series keeps the name
    # "date", which becomes the name of the grouped index.
    month_key = daily["date"].dt.strftime("%Y-%m")
    by_month = daily["value"].groupby(month_key)
    monthly_mean = by_month.mean()
    monthly_std = by_month.std()

    # Percent change is relative to the previous row of the monthly series
    # (explicit division rather than Series.pct_change, to avoid its
    # fill-method handling of NaNs).
    return pd.DataFrame({
        f"{name}_pct_change": (monthly_mean / monthly_mean.shift(1)) - 1,
        f"{name}_std": monthly_std,
    })

# Datasets
- Federal funds rate (FFR) time series
- 30-year mortgage rate

In [83]:
# Directory holding the raw per-series CSV files; passed to build_feature_data.
data_dir = "../data/feature/raw/"

In [88]:

def build_feature_data(path: str) -> pd.DataFrame:
    """Build one combined monthly feature table from every CSV in ``path``.

    Each ``*.csv`` file directly inside ``path`` is read with
    ``pd.read_csv`` and passed to ``clean_daily_data``, using the file name
    without its ``.csv`` extension as the feature name. The per-file results
    are outer-joined on their month index.

    Files are processed in sorted name order so the column order of the
    result is reproducible. A dataset that fails to clean is skipped with a
    warning instead of aborting the whole build.

    Parameters
    ----------
    path : str
        Directory to scan for CSV files.

    Returns
    -------
    pd.DataFrame
        One row per month, sorted by a datetime ``date`` column, with the
        first two months removed and every column that still contains a
        missing value dropped.

    Raises
    ------
    ValueError
        If no dataset in ``path`` could be cleaned.
    """
    raw_frames = dict()
    # Scan the directory that was passed in, not a notebook-level global.
    for file in sorted(os.listdir(path)):
        # splitext copes with names that have no dot or several dots.
        name, ext = os.path.splitext(file)
        if ext == ".csv":
            raw_frames[name] = pd.read_csv(os.path.join(path, file))

    datasets = list()
    for key, value in raw_frames.items():
        try:
            datasets.append(clean_daily_data(key, value))
        except Exception as exc:
            # Best effort: keep building from the remaining files, but say
            # which dataset was dropped and why.
            warnings.warn(f"Skipping dataset {key!r}: {exc!r}", stacklevel=2)

    if not datasets:
        raise ValueError(f"No usable CSV datasets found in {path!r}")

    output = pd.concat(datasets, join="outer", axis=1)
    # Make sure the month index is exposed as a column called "date".
    output = output.rename_axis("date").reset_index()
    output["date"] = pd.to_datetime(output["date"])
    output = output.sort_values(by="date").reset_index(drop=True)
    # Drop the first two months, then every column that still has a NaN.
    # NOTE(review): the first row's pct_change is always NaN; the reason for
    # dropping a second row is not visible here - confirm.
    output = output.iloc[2:].dropna(axis=1).reset_index(drop=True)
    return output

In [89]:
# Build the combined monthly feature table from the CSV files in data_dir.
features = build_feature_data(data_dir)


In [91]:
# Write the combined feature table to CSV, omitting the positional row index.
features.to_csv("../data/feature/all.csv", index=False)