In [20]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
import warnings
warnings.filterwarnings("ignore")
import sys
import os

### 1. Load Data

In [21]:
# Define the assets we'll analyze
assets = ['TSLA', 'BND', 'SPY']

# Define the time period
end_date = "2025-7-31"
start_date = "2015-07-01"


# Fetch the data
data = yf.download(assets, start=start_date, end=end_date)
print("\nData shape:", data.shape)

[*********************100%***********************]  3 of 3 completed


Data shape: (2535, 15)





In [22]:
## Save the dataset
data.to_csv('../data/raw/comp_hist_finance_data.csv')

In [23]:
sys.path.append(os.path.abspath('../src/'))

In [24]:
from preprocess import data_loader
df = data_loader('../data/raw/comp_hist_finance_data.csv')

### 2. Data Cleaning

In [None]:
from preprocess import reconstruct_from_two_row_csv, fetch_with_yfinance, prepare_time_index_and_interpolate, compute_returns_and_rolling, detect_outliers, compute_var_sharpe, adf_test, plot_series, plot_returns, plot_correlation

Unnamed: 0,Price,Close,Close.1,Close.2,High,High.1,High.2,Low,Low.1,Low.2,Open,Open.1,Open.2,Volume,Volume.1,Volume.2
0,Ticker,BND,SPY,TSLA,BND,SPY,TSLA,BND,SPY,TSLA,BND,SPY,TSLA,BND,SPY,TSLA
1,Date,,,,,,,,,,,,,,,
2,2015-07-01,60.816768646240234,174.9170684814453,17.94333267211914,60.914494390821815,175.36384357914824,18.174667358398438,60.76414620909392,174.1246710288372,17.856666564941406,60.7942169925226,175.1109488593942,18.073999404907227,5399300,135979900,31518000
3,2015-07-02,60.967098236083984,174.75697326660156,18.667999267578125,61.0272397846012,175.5662341300755,18.829999923706055,60.93702746182537,174.3354861812592,18.220666885375977,60.93702746182537,175.39764186849158,18.68000030517578,1060100,104373700,107458500
4,2015-07-06,61.17759323120117,174.25958251953125,18.648000717163086,61.22270226421179,175.04354214835712,18.779333114624023,61.05731585913941,173.2564414233038,18.420000076293945,61.22270226421179,173.45875980949862,18.591999053955078,2210700,117975400,61828500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2532,2025-07-24,72.69886016845703,634.4199829101562,305.29998779296875,72.79852804402405,636.1500244140625,310.1499938964844,72.57925415527372,633.989990234375,300.4100036621094,72.59918468871852,634.5999755859375,310.0,6139200,71307100,156966000
2533,2025-07-25,72.87826538085938,637.0999755859375,316.05999755859375,72.87826538085938,637.5800170898438,323.6300048828125,72.68889109437573,634.8400268554688,308.010009765625,72.7088216278191,635.0900268554688,308.739990234375,4845900,56865400,148227000
2534,2025-07-28,72.75865936279297,636.9400024414062,325.5899963378906,72.80849710265525,638.0399780273438,330.489990234375,72.71879069173744,635.5399780273438,315.69000244140625,72.778589896235,637.47998046875,318.45001220703125,5806600,54917100,112673800
2535,2025-07-29,73.13739776611328,635.260009765625,321.20001220703125,73.14736683457035,638.6699829101562,326.25,72.83839414994219,634.3400268554688,318.25,72.83839414994219,638.3499755859375,325.54998779296875,7397900,60556300,87358900


In [18]:
#Drop first two rows (ticker labels and NaN row)
df = df.iloc[2:].reset_index(drop=True)

In [19]:
df.head()

Unnamed: 0,Price,Close,Close.1,Close.2,High,High.1,High.2,Low,Low.1,Low.2,Open,Open.1,Open.2,Volume,Volume.1,Volume.2
0,2015-07-01,60.81676864624024,174.9170684814453,17.94333267211914,60.914494390821815,175.36384357914824,18.17466735839844,60.76414620909392,174.1246710288372,17.856666564941406,60.7942169925226,175.1109488593942,18.073999404907227,5399300,135979900,31518000
1,2015-07-02,60.967098236083984,174.75697326660156,18.667999267578125,61.0272397846012,175.5662341300755,18.82999992370605,60.93702746182537,174.3354861812592,18.220666885375977,60.93702746182537,175.39764186849158,18.68000030517578,1060100,104373700,107458500
2,2015-07-06,61.17759323120117,174.25958251953125,18.64800071716309,61.22270226421179,175.04354214835712,18.779333114624023,61.05731585913941,173.2564414233038,18.420000076293945,61.22270226421179,173.45875980949862,18.59199905395508,2210700,117975400,61828500
3,2015-07-07,61.237735748291016,175.3554229736328,17.858667373657227,61.47077710342403,175.48186390537626,18.34666633605957,61.237735748291016,172.0593921500799,17.384666442871094,61.410641284154245,174.4618727552857,18.333332061767575,4183200,173820200,91576500
4,2015-07-08,61.41059875488281,172.4134521484375,16.997333526611328,61.463221144934735,174.2932803592427,17.386667251586914,61.31286736235041,172.17742048352747,16.95400047302246,61.37300887538414,174.00667231796066,17.288000106811523,1967800,164020100,93316500


In [7]:
# Display basic information about the dataset
print("\nData columns:")
print(df.columns.levels[0].tolist())
print("\nFirst few rows of Adj Close prices:")
print(df['Close'].head())


Data columns:


AttributeError: 'Index' object has no attribute 'levels'

In [14]:

df = df.fillna(method='ffill')  # Forward fill for missing values
df[['Open', 'High', 'Low', 'Close', 'Adj Close']] = df[['Open', 'High', 'Low', 'Close', 'Adj Close']].astype(float)
df['Volume'] = df['Volume'].astype(int)
df = df.sort_index()

KeyError: "['Adj Close'] not in index"

### 3. Descriptive Statistics

In [None]:
stats = {ticker: data[ticker][['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']].describe() for ticker in tickers}
for ticker, stat in stats.items():
    print(f"\n{ticker} Statistics:")
    print(stat)

### 4. Calculate daily returns

In [None]:
for ticker in tickers:
    data[ticker]['Daily Return'] = data[ticker]['Adj Close'].pct_change()
    print(f"\n{ticker} Daily Return Statistics:")
    print(data[ticker]['Daily Return'].describe())