# Data preparation

## Setup

In [None]:
from datetime import datetime
import glob
import os
import tarfile

import pandas as pd
import yfinance as yf

In [None]:
# Current date as an ISO-style "YYYY-MM-DD" string
today = f"{datetime.today():%Y-%m-%d}"

In [None]:
# Download, clean, quality-check and export a price dataset from yfinance
def process_yf_dataset(
    yf_ticker: str,
    title: str,
    start_date: str = "2010-01-01",
    end_date: str = today,
) -> pd.DataFrame:
    """Download a price history from yfinance and export it as CSV.

    The raw download is written to ``~/Downloads/{title}_raw.csv``. The cleaned
    frame (index named ``date``, columns renamed to lower-case ``open``,
    ``high``, ``low``, ``close``) is written to ``../data/{title}.csv`` after
    a short data quality report has been printed.

    Args:
        yf_ticker: Ticker symbol passed to ``yf.download``.
        title: Base name of the exported CSV files.
        start_date: Start date passed to ``yf.download``.
        end_date: End date passed to ``yf.download``. Defaults to the value
            ``today`` had when this function was defined.

    Returns:
        The cleaned DataFrame.

    Raises:
        ValueError: If the download contains no rows.
    """
    # Fetch data from yfinance
    df = yf.download(yf_ticker, start=start_date, end=end_date)
    if df.empty:
        # Fail before anything is written, so a failed download cannot
        # overwrite previously exported CSV files with an empty dataset
        raise ValueError(f"No data downloaded for {yf_ticker}")
    print(f"\nDataset for {yf_ticker} downloaded")

    # Export raw dataset
    raw_csv_path = f"~/Downloads/{title}_raw.csv"
    df.to_csv(os.path.expanduser(raw_csv_path), index=True)
    print(f"\nRaw dataset exported to '{raw_csv_path}'")

    # NOTE(review): newer yfinance releases may return (price, ticker)
    # MultiIndex columns even for a single ticker - keep the price level only
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = df.columns.get_level_values(0)

    # Remove irrelevant columns, rename the index and the remaining columns.
    # errors="ignore" because "Adj Close" is absent from some downloads
    # (e.g. when prices are already adjusted)
    df = (
        df.drop(columns=["Adj Close", "Volume"], errors="ignore")
        .rename_axis("date")
        .rename(columns={"Open": "open", "High": "high", "Low": "low", "Close": "close"})
    )
    print("\nDataset columns adjusted")

    # Data quality checks
    print("\nData quality checks:")
    print(f"- Consistent data types: {df.dtypes.nunique() == 1}")
    print(f"- Null values:\n{df.isna().sum()}")
    print(f"- Zero values:\n{(df == 0).sum()}")
    print(f"- Duplicated dates: {df.index.duplicated().sum()}")

    # Export the final processed dataset
    final_csv_path = f"../data/{title}.csv"
    df.to_csv(final_csv_path, index=True)
    print(f"\nFinal dataset exported to '{final_csv_path}'")

    return df

In [None]:
def show_dataset_basic_info(df: pd.DataFrame) -> None:
    """Display the first/last rows, summary statistics and rows per year.

    Args:
        df: Dataset whose dates are either its ``DatetimeIndex`` or stored in
            a ``date`` column.

    Raises:
        KeyError: If the index is not a ``DatetimeIndex`` and there is no
            ``date`` column.
    """
    print("First and last entries of the dataset:")
    display(df.iloc[[0, -1]])

    print("\nBasic statistics of the dataset:")
    display(df.describe().T)

    # Frames with a non-datetime index (e.g. a RangeIndex plus a "date"
    # column) have no ``index.year``, so fall back to the "date" column
    if isinstance(df.index, pd.DatetimeIndex):
        dates = df.index
    else:
        dates = pd.DatetimeIndex(df["date"])

    # Median of the per-year row counts
    median_days = dates.year.value_counts().median()
    print(f"\nTypical number of trading days per year: {int(median_days)}")

## Bitcoin

*Yahoo Finance provides BTC data only from 2014 onward. Therefore, a Coin Codex dataset (starting on 2010-07-22) is used instead, and it requires a different processing approach.*

In [None]:
# Match the path of the downloaded raw CSV
raw_btc_pattern = os.path.expanduser("~/Downloads/bitcoin_2010-07-22*.csv")
raw_btc_matches = glob.glob(raw_btc_pattern)

# Fail with a clear message instead of an IndexError when the file is missing
if not raw_btc_matches:
    raise FileNotFoundError(f"No raw bitcoin CSV found matching '{raw_btc_pattern}'")
raw_btc_path = raw_btc_matches[0]

In [None]:
# Load the raw CSV into a DataFrame and display it
df_btc = pd.read_csv(filepath_or_buffer=raw_btc_path)
df_btc

In [None]:
# Flip the row order (last row first) and detach the result from the original frame
df_btc = df_btc.iloc[::-1].copy()

In [None]:
# Drop the unused columns, then give the remaining ones lower-case names
df_btc = df_btc.drop(columns=["End", "Volume", "Market Cap"]).rename(
    columns={"Start": "date", "Open": "open", "High": "high", "Low": "low", "Close": "close"}
)

In [None]:
# Check the data type (dtype) pandas assigned to each column
df_btc.dtypes

In [None]:
# For each column, check that every value has the same Python type
df_btc.map(type).nunique().eq(1)

In [None]:
# Count the missing values per column
df_btc.isnull().sum()

In [None]:
# Count the values equal to zero per column
df_btc.eq(0).sum()

In [None]:
# Parse the dates, then verify that consecutive rows are exactly one day apart
# (i.e. no skipped or duplicated dates)
df_btc["date"] = pd.to_datetime(df_btc["date"])
date_diff = df_btc["date"].diff().dropna()
date_diff.eq(pd.Timedelta(days=1)).all()

In [None]:
# Replace the index with a fresh 0..n-1 range and display the final frame
df_btc = df_btc.reset_index(drop=True)
df_btc

In [None]:
# Write the final frame to the data dir, without the positional index
df_btc.to_csv(path_or_buf="../data/BTC.csv", index=False)

In [None]:
# df_btc keeps its dates in a "date" column with a plain range index;
# pass them as the index, like the yfinance-based datasets do
show_dataset_basic_info(df_btc.set_index("date"))

## S&P 500

In [None]:
# Download, clean and export the dataset for ticker "^GSPC"
df_sp500 = process_yf_dataset(yf_ticker="^GSPC", title="SP500")

In [None]:
# Summarize the S&P 500 dataset
show_dataset_basic_info(df=df_sp500)

## US 10-year treasury yield

In [None]:
# Download, clean and export the dataset for ticker "^TNX"
df_us10y = process_yf_dataset(yf_ticker="^TNX", title="US10Y")

In [None]:
# Summarize the US 10-year treasury yield dataset
show_dataset_basic_info(df=df_us10y)

## Gold (futures)

In [None]:
# Download, clean and export the dataset for ticker "GC=F"
df_au = process_yf_dataset(yf_ticker="GC=F", title="AU")

In [None]:
# Summarize the gold futures dataset
show_dataset_basic_info(df=df_au)

## Crude oil (futures)

In [None]:
# Download, clean and export the dataset for ticker "CL=F"
df_wti = process_yf_dataset(yf_ticker="CL=F", title="WTI")

In [None]:
# Summarize the crude oil futures dataset
show_dataset_basic_info(df=df_wti)

## USD/CHF

In [None]:
# Download, clean and export the dataset for ticker "CHF=X"
df_chf = process_yf_dataset(yf_ticker="CHF=X", title="CHF")

In [None]:
# Summarize the USD/CHF dataset
show_dataset_basic_info(df=df_chf)

## Archive the raw data

In [None]:
# Titles of the datasets whose raw CSV files were exported to the downloads dir
csv_titles = ["SP500", "US10Y", "AU", "WTI", "CHF"]

# Generate full paths of the raw CSV files in the downloads dir.
# The raw exports are named "{title}_raw.csv", so the suffix must be included here
csv_file_paths = [
    os.path.join(os.path.expanduser("~/Downloads/"), f"{csv_title}_raw.csv") for csv_title in csv_titles
]

# Append the bitcoin raw CSV path to the list
csv_file_paths.append(raw_btc_path)

# Create a compressed tar archive of the raw CSV files, stored under their bare filenames
with tarfile.open("../data/.raw_data.tar.xz", "w:xz") as archive:
    for csv_file_path in csv_file_paths:
        archive.add(csv_file_path, arcname=os.path.basename(csv_file_path))