# Data preparation

## Setup

In [None]:
from datetime import datetime
import glob
import os

import pandas as pd
import yfinance as yf

In [None]:
# Current local date as an ISO "YYYY-MM-DD" string (used as the download end date)
today = datetime.now().date().isoformat()

## Bitcoin

In [None]:
# Locate the manually downloaded raw CSV (yfinance only has BTC data since 2014).
# Several downloads may match the pattern and glob does not guarantee an order,
# so pick the most recently modified file; fail with an explicit message instead
# of a bare IndexError when nothing matches.
btc_pattern = os.path.expanduser("~/Downloads/bitcoin_2010-07-22*.csv")
btc_matches = glob.glob(btc_pattern)
if not btc_matches:
    raise FileNotFoundError(f"No raw Bitcoin CSV found matching {btc_pattern!r}")
raw_btc_path = max(btc_matches, key=os.path.getmtime)

In [None]:
# Load the raw CSV into a DataFrame and display it
df_btc = pd.read_csv(filepath_or_buffer=raw_btc_path)
df_btc

In [None]:
# Flip the row order (last row becomes first) and detach from the original frame
df_btc = df_btc.iloc[::-1].copy()

In [None]:
# Keep only the start date and OHLC columns, renamed to lowercase
df_btc = df_btc.drop(columns=["End", "Volume", "Market Cap"]).rename(
    columns={
        "Start": "date",
        "Open": "open",
        "High": "high",
        "Low": "low",
        "Close": "close",
    }
)

In [None]:
# Check the data type pandas assigned to each column
# (the "date" column is only converted with pd.to_datetime in a later cell)
df_btc.dtypes

In [None]:
# Per column: True when every row holds a value of one and the same Python type
df_btc.apply(lambda column: column.map(type).nunique()) == 1

In [None]:
# Count missing (NaN/None) values per column
df_btc.isnull().sum()

In [None]:
# Count values equal to zero per column
df_btc.eq(0).sum()

In [None]:
# Verify that consecutive rows are exactly one day apart (no gaps, no repeats).
# The "date" column is converted to datetime first; the first diff is NaT and
# is dropped before comparing.
df_btc["date"] = df_btc["date"].pipe(pd.to_datetime)
date_diff = df_btc["date"].diff().dropna()
date_diff.eq(pd.Timedelta(1, unit="D")).all()

In [None]:
# Replace the reversed index with a fresh 0..n-1 index, then display the result
df_btc = df_btc.reset_index(drop=True)
df_btc

In [None]:
# Write the cleaned BTC data to the project's data folder (index not written)
df_btc.to_csv(path_or_buf="../data/BTC.csv", index=False)

### Dataset basic info

In [None]:
# Show the first and the last row side by side
pd.concat([df_btc.iloc[:1], df_btc.iloc[-1:]])

In [None]:
# Summary statistics, one row per column
df_btc.describe().transpose()

In [None]:
# How many trading days per year on average
# The index was reset to a plain RangeIndex earlier in this section, and a
# RangeIndex has no `.year`, so the year is taken from the "date" column
# instead. `pd.to_datetime` leaves an already-converted column unchanged.
# NOTE(review): 2010 and 2024 are presumably excluded because the data only
# covers part of those years; confirm and update when the CSV is refreshed.
btc_years = pd.to_datetime(df_btc["date"]).dt.year
days_per_year = btc_years[~btc_years.isin([2024, 2010])].value_counts()
days_per_year.mean().round(2)

## S&P 500

In [None]:
# Download S&P 500 (^GSPC) price history from 2010-01-01 up to `today` and display it
df_sp500 = yf.download(tickers="^GSPC", start="2010-01-01", end=today)
df_sp500

In [None]:
# Keep an untouched copy of the download as a raw CSV (date index included)
raw_sp500_path = os.path.expanduser("~/Downloads/SP500.csv")
df_sp500.to_csv(raw_sp500_path)

In [None]:
# Keep only the OHLC columns (lowercased) and name the index "date"
# NOTE(review): newer yfinance releases may not return "Adj Close" by default,
# in which case the drop raises KeyError. Confirm against the installed version.
df_sp500 = (
    df_sp500.drop(columns=["Adj Close", "Volume"])
    .rename_axis("date")
    .rename(
        columns={
            "Open": "open",
            "High": "high",
            "Low": "low",
            "Close": "close",
        }
    )
)

In [None]:
# Check the data type pandas assigned to each column
df_sp500.dtypes

In [None]:
# Per column: True when every row holds a value of one and the same Python type
df_sp500.apply(lambda column: column.map(type).nunique()) == 1

In [None]:
# Count missing (NaN/None) values per column
df_sp500.isnull().sum()

In [None]:
# Count values equal to zero per column
df_sp500.eq(0).sum()

In [None]:
# True if any date appears more than once in the index
# (the first occurrence of a date is not counted as a duplicate)
df_sp500.index.duplicated(keep="first").any()

In [None]:
# Show the final df (the state that is exported in the next cell)
df_sp500

In [None]:
# Write the cleaned S&P 500 data to the project's data folder (date index included)
df_sp500.to_csv("../data/SP500.csv")

### Dataset basic info

In [None]:
# Show the first and the last row side by side
pd.concat([df_sp500.iloc[:1], df_sp500.iloc[-1:]])

In [None]:
# Summary statistics, one row per column
df_sp500.describe().transpose()

In [None]:
# Average number of trading days per year (rows dated 2024 are left out)
sp500_years = df_sp500.index.year
days_per_year = sp500_years[sp500_years != 2024].value_counts()
days_per_year.mean().round(2)