In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
from datetime import datetime
from scipy import stats
import yfinance as yf
from statsmodels.tsa.stattools import adfuller
import time

In [2]:
# CSV inputs to load, in the order they will be concatenated.
# Alternative, longer file set (currently disabled):
#     'SPX_1m_2010_2014.csv', 'SPX_1m_2015_2017.csv',
#     'SPX_1m_2018_2020.csv', 'SPX_1m_2021_2023.csv'
files = [
    'SPX_1m_2020Dec.csv',
    'SPX_1m_2021_2023.csv',
]

In [3]:
# Parse each listed CSV into its own frame (kept in `dfs`), then stack them
# row-wise into a single frame with a fresh 0..n-1 integer index.
dfs = list(map(pd.read_csv, files))
df = pd.concat(dfs, ignore_index=True)

print(df)

FileNotFoundError: ignored

# New Section

# Intraday SPX price series: time gaps and log returns

In [None]:
starttime = time.time()

# Promote the 'Time' column to the index (the column itself is removed),
# then parse the index values into datetimes.
df = df.set_index('Time')
df.index = pd.to_datetime(df.index)

# Quick look at every remaining column over time
df.plot()

endtime = time.time()
print('Time spent on historical price plot is:', endtime-starttime)

In [None]:
# Log return between consecutive rows, scaled by 10,000
df['log_ret'] = np.log(df['Price']).diff() * 10000

# Gap to the previous timestamp, expressed in minutes: the whole-day part is
# converted to minutes and the leftover seconds part is converted to minutes.
# fill_value=0 replaces the missing gap of the first row with 0.
ts_gap = df.index.to_series().diff()
df['deltaT'] = (
    ts_gap.dt.days.mul(60*24, fill_value=0)
    + ts_gap.dt.seconds.div(60, fill_value=0)
)

# Cumulative elapsed minutes (disabled)
# df['cumsum_T'] = df['deltaT'].cumsum()

df

In [None]:
# Share of rows by the size of the time gap (in minutes) to the previous row.
# Rows whose gap is below one minute (e.g. a first row with gap 0) fall into
# none of the three buckets, so the shares may sum to slightly less than 1.
gap_minutes = df['deltaT']
n_gaps = gap_minutes.count()  # common denominator, computed once

one_min_pct = (gap_minutes == 1).sum() / n_gaps
less_1day_pct = ((gap_minutes > 1) & (gap_minutes <= (60 * 24))).sum() / n_gaps
over_1day_pct = (gap_minutes > 60 * 24).sum() / n_gaps

labels = 'one_min', 'less_1day', 'over_1day'
sizes = [one_min_pct, less_1day_pct, over_1day_pct]
explode = (0.4, 0.4, 0.4)
colors=['olivedrab', 'rosybrown', 'black']

fig, ax = plt.subplots()
ax.pie(sizes, explode=explode, labels=labels, autopct='%1.2f%%',
        colors=colors, shadow=False, startangle=0)
# Title set on the axes explicitly (spelling of "Availability" corrected)
ax.set_title('Data Availability per Price Time Gaps')
plt.show()

In [None]:
# Keep only rows that follow the previous row by exactly one minute.
# An explicit copy makes df_1min an independent frame, so adding columns to it
# later does not raise SettingWithCopyWarning and never touches `df`.
df_1min = df[df['deltaT'] == 1].copy()
df_1min

In [None]:
# Squared log return per row. `assign` builds a new frame instead of writing
# into the filtered slice, which avoids SettingWithCopyWarning and keeps the
# cell safe to re-run.
df_1min = df_1min.assign(log_ret_sq=np.square(df_1min['log_ret']))
df_1min

In [None]:
starttime = time.time()

# Augmented Dickey-Fuller unit-root test on the squared 1-minute log returns,
# with the lag length chosen by AIC.
result = adfuller(df_1min['log_ret_sq'], autolag='AIC')

# adfuller returns (statistic, p-value, used lag, n obs, critical values, ...):
# the lag count is element 2, not element 1 (which is the p-value).
adf_stat, p_value, used_lag = result[:3]
critical_values = result[4]

print(f'ADF Statistic: {adf_stat}')
print(f'n_lags: {used_lag}')
print(f'p-value: {p_value}')
# Header printed once, followed by one line per significance level
print('Critical Values:')
for key, value in critical_values.items():
    print(f'   {key}, {value}')

endtime = time.time()
print('Time spent on ADF test is:', endtime-starttime)

In [None]:
# Normality test (scipy.stats.normaltest) on the squared 1-minute log returns;
# the cell displays the p-value.
# NOTE(review): the daily analysis tests 'log_ret' rather than a squared
# series - confirm that the squared series is intended here.
sq_ret_1min = df_1min.log_ret_sq
res_1min = stats.normaltest(sq_ret_1min)
res_1min.pvalue

In [None]:
# Sample mean of the squared 1-minute log returns
df_1min.log_ret_sq.mean()

In [None]:
# Sample skewness of the squared 1-minute log returns
df_1min.log_ret_sq.skew()

In [None]:
# Sample kurtosis of the squared 1-minute log returns
# (pandas reports excess kurtosis, i.e. a normal distribution gives 0)
df_1min.log_ret_sq.kurt()

In [None]:
# Download daily '^GSPC' bars from Yahoo Finance for the requested date window
spx_daily = yf.download(tickers='^GSPC', start='2020-12-01', end='2023-06-30')

# Close-to-close log return, scaled by 100
daily_close = spx_daily['Close']
spx_daily['log_ret'] = np.log(daily_close).diff() * 100

# Remove rows containing missing values (at least the first row, whose
# return is undefined), modifying the frame in place
spx_daily.dropna(inplace=True)
spx_daily

In [None]:
# Normality test (scipy.stats.normaltest) on the daily log returns;
# the cell displays the p-value.
daily_returns = spx_daily.log_ret
res_daily = stats.normaltest(daily_returns)
res_daily.pvalue

In [None]:
# Sample mean of the daily log returns
spx_daily.log_ret.mean()

In [None]:
# Sample skewness of the daily log returns
spx_daily.log_ret.skew()

In [None]:
# Sample kurtosis of the daily log returns
# (pandas reports excess kurtosis, i.e. a normal distribution gives 0)
spx_daily.log_ret.kurt()

In [None]:
# Calculate minutes in 1 day, 1 week and 1 month
n_min_daily = 60 * 24
n_min_weekly = n_min_daily * 7
n_min_monthly = n_min_weekly * 30

In [None]:
# Index label (timestamp) of the row with the largest gap to its predecessor
max_deltaT_id = df.deltaT.idxmax()

In [None]:
# df = df.groupby([df.index.dt.date.year, df.index.dt.date.dayofyear])['log_ret'].var().rename_axis(('year','dayofyear')).reset_index()

In [None]:
# df['deltaT'].plot()
# df['log_ret'].plot.bar()

In [None]:
# Download daily '^VIX' bars from Yahoo Finance for the requested date window
VIX = yf.download(tickers='^VIX', start='2018-01-01', end='2022-12-31')

# Daily log change of the index level. 'Close' is used (as in the SPX daily
# cell) because the 'Adj Close' column is absent when yfinance auto-adjusts
# prices, which would raise a KeyError; for an index with no dividends or
# splits the two columns are expected to carry the same values.
VIX['ret'] = np.log(VIX['Close']).diff()

# Remove rows containing missing values (at least the first row, whose
# return is undefined), modifying the frame in place
VIX.dropna(axis=0, inplace=True)
VIX