Homework 4 Group 3 - Matt Benbenek, Ben Teske, Sam Barbel

In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
from datetime import datetime
import statsmodels.api as sm
from statsmodels.tsa.seasonal import STL
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.stats.diagnostic import acorr_ljungbox
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, mean_absolute_error
from statsmodels.tsa.exponential_smoothing.ets import ETSModel
from prophet import Prophet
import matplotlib.pyplot as plt
import itertools

Part 1

In [None]:
# Pull daily NVDA price history from Yahoo Finance for the study window.
# NOTE(review): yfinance's default column layout and price adjustment have
# changed between releases -- confirm the installed version when re-running.
ticker = "NVDA"
date_window = {"start": "2020-01-01", "end": "2025-3-01"}
data = yf.download(ticker, **date_window)

# Preview the first rows to sanity-check the download.
data.head(n=5)

Part 2

In [None]:
# Persist the raw download so it can be reformatted outside the notebook.
data.to_csv(path_or_buf='data.csv')

The exported `data.csv` was reformatted offline and re-uploaded as `NVDA_data.csv`, which the next cell reads.

In [None]:
# Load the manually reformatted price file; every later cell works from this frame.
nvda_csv_path = 'NVDA_data.csv'
NVDA_data = pd.read_csv(nvda_csv_path)

In [None]:
# Show the column labels of the reloaded file (later cells use 'Date' and 'Close').
print(NVDA_data.columns)

In [None]:
# Preview the first five rows as loaded, before any type conversion.
NVDA_data.head(n=5)

In [None]:
# Convert the 'Date' column to datetimes and put the rows in chronological
# order (a no-op reorder if the file was already sorted).
NVDA_data = (
    NVDA_data
    .assign(Date=pd.to_datetime(NVDA_data['Date']))
    .sort_values(by='Date')
)

In [None]:
# Preview the first five rows after date parsing and sorting.
NVDA_data.head(n=5)

In [None]:
# Longer preview: the first fifty rows of the sorted frame.
NVDA_data.head(n=50)

In [None]:
# Line chart of the closing price over the whole sample.
# 'Date' is still a regular column at this point, so it is used as the x-axis.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(NVDA_data['Date'], NVDA_data['Close'], label="Close Price", color='b')

# Axis labels, title, legend and grid so the figure stands on its own.
ax.set_xlabel("Date")
ax.set_ylabel("Close Price")
ax.set_title("NVDA Data Close Price Time Series")
ax.legend()
ax.grid(True)

# Render the figure.
plt.show()

In [None]:
# Index the frame by date. The guard makes the cell safe to re-run: once
# 'Date' has become the index it is no longer a column, and an unconditional
# set_index('Date') would raise KeyError on the second execution.
if 'Date' in NVDA_data.columns:
    NVDA_data = NVDA_data.set_index('Date')

# STL decomposition of the closing price. The period is passed explicitly
# (252 trading days in a year); robust=True selects STL's robust fitting.
stl = STL(NVDA_data['Close'], period=252, robust=True)
result = stl.fit()

# One stacked panel per component, sharing the date axis.
fig, axs = plt.subplots(4, 1, figsize=(12, 8), sharex=True)

components = [
    (NVDA_data['Close'], 'black', "Original", "Original Time Series"),
    (result.trend, 'blue', "Trend", "Trend Component"),
    (result.seasonal, 'green', "Seasonal", "Seasonal Component"),
    (result.resid, 'red', "Residual", "Residual Component"),
]
for ax, (series, color, label, title) in zip(axs, components):
    ax.plot(NVDA_data.index, series, color=color, label=label)
    ax.set_title(title)

# Zero reference line on the residual panel.
axs[3].axhline(0, linestyle='--', color='gray')

# Formatting
plt.tight_layout()
plt.show()

In [None]:
# ADF test function
def adf_test(series, title="", alpha=0.05):
    """Run an Augmented Dickey-Fuller test on `series` and print a summary.

    Parameters
    ----------
    series : pandas.Series
        Series to test. NaN values are dropped first, so a differenced
        series (whose first value is NaN) can be passed directly.
    title : str, optional
        Label printed in the report header.
    alpha : float, optional
        Significance level for the stationarity verdict. Defaults to 0.05,
        the threshold this function previously hard-coded.

    Returns
    -------
    None
        The statistic, p-value, critical values and verdict are printed.
    """
    result = adfuller(series.dropna())  # Drop NaN values for differenced series
    print(f"ADF Test for {title}")
    print(f"ADF Statistic: {result[0]}")
    print(f"p-value: {result[1]}")
    print("Critical Values:", result[4])
    # H0 of the ADF test is a unit root (non-stationary series).
    if result[1] <= alpha:
        print("The series is stationary (reject H0).")
    else:
        print("The series is non-stationary (fail to reject H0).")
    print("-" * 50)

# Stationarity check on the raw closing price.
adf_test(NVDA_data['Close'], title="Original Close Price")

# First-order difference (day-over-day change); the first row is NaN.
NVDA_data['Close_diff'] = NVDA_data['Close'].diff()

# Stationarity check on the differenced series.
adf_test(NVDA_data['Close_diff'], title="Differenced Close Price")

# Stacked comparison of the level series and its first difference.
fig, axs = plt.subplots(2, 1, figsize=(12, 6), sharex=True)

panels = [
    ('Close', 'black', "Original Close Price", "Original Close Price Time Series"),
    ('Close_diff', 'blue', "First-Order Differenced", "Differenced Close Price Time Series"),
]
for ax, (column, color, label, title) in zip(axs, panels):
    ax.plot(NVDA_data.index, NVDA_data[column], color=color, label=label)
    ax.set_title(title)
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# First-order differencing to make the series stationary.
# The column keeps its leading NaN: calling .dropna() before assigning back
# to the frame has no effect, because assignment realigns to the full index.
NVDA_data['Close_diff'] = NVDA_data['Close'].diff()

# NaN-free view of the differenced series, computed once and reused below.
close_diff = NVDA_data['Close_diff'].dropna()
N_LAGS = 50  # number of lags shown in the ACF/PACF values and plots

# Calculate ACF and PACF values
acf_values = acf(close_diff, nlags=N_LAGS)
pacf_values = pacf(close_diff, nlags=N_LAGS)

# Print ACF and PACF values
print("ACF Values:\n", acf_values)
print("\nPACF Values:\n", pacf_values)

# Plot ACF and PACF for the differenced data
fig, axs = plt.subplots(2, 1, figsize=(12, 8))

plot_acf(close_diff, lags=N_LAGS, ax=axs[0])  # Autocorrelation function
axs[0].set_title("Autocorrelation Function (ACF)")

plot_pacf(close_diff, lags=N_LAGS, ax=axs[1])  # Partial autocorrelation function
axs[1].set_title("Partial Autocorrelation Function (PACF)")

plt.tight_layout()
plt.show()

# Run Ljung-Box test on the differenced data (checking for autocorrelation).
# This was previously commented out and pointed at `market_data`, a name this
# notebook never defines; it now runs against the differenced NVDA series.
ljung_box_results = acorr_ljungbox(close_diff, lags=[10, 20, 30], return_df=True)
print(ljung_box_results)

In [None]:
# First day of the hold-out period: rows before it train the model, rows on
# or after it are used for evaluation.
SPLIT_DATE = '2025-01-01'

# Work on a date-sorted copy rather than mutating NVDA_data in place, so
# re-running this cell never changes the frame earlier cells displayed.
nvda_sorted = NVDA_data.sort_index()

# Reindex onto every business day in the sample. Weekdays with no row in the
# data come in as all-NaN rows.
date_range = pd.date_range(start=nvda_sorted.index.min(), end=nvda_sorted.index.max(), freq='B')  # 'B' for business days
NVDA_data_reindexed = nvda_sorted.reindex(date_range)

# Fill the inserted rows' Close by linear interpolation; other columns stay NaN.
# NOTE(review): inserted rows that fall on or after SPLIT_DATE end up in `test`
# and are scored like real closes -- confirm this is intended.
NVDA_data_reindexed['Close'] = NVDA_data_reindexed['Close'].interpolate(method='linear')

# Split the data: train = before 2025, test = 2025 onward
train = NVDA_data_reindexed[NVDA_data_reindexed.index < SPLIT_DATE]
test = NVDA_data_reindexed[NVDA_data_reindexed.index >= SPLIT_DATE]

# The two pieces must partition the reindexed frame, and neither may be empty.
assert len(train) + len(test) == len(NVDA_data_reindexed)
assert not train.empty and not test.empty, "empty train or test split"

In [None]:
# Number of rows (business days) in the hold-out set.
len(test)

Step 6. Prophet model below (using data from 2020 on)

In [None]:
# Prepare the train data for Prophet, which requires columns 'ds' (date) and
# 'y' (value). The frame is built directly from the index and the Close
# values, so it does not depend on reset_index() naming the column 'index'
# (true only while the index is unnamed).
train_prophet = pd.DataFrame({'ds': train.index, 'y': train['Close'].to_numpy()})

# Initialize and fit Prophet model.
# NOTE(review): the data has one observation per day, while Prophet documents
# daily seasonality as an intra-day effect -- confirm daily_seasonality=True
# is wanted. changepoint_prior_scale=0.1 is above Prophet's default of 0.05.
model = Prophet(daily_seasonality=True,
                weekly_seasonality=True,  # Add weekly seasonality
                yearly_seasonality=True,
                changepoint_prior_scale=0.1)
model.fit(train_prophet)

# Forecast exactly the dates in the test set.
future = pd.DataFrame({'ds': test.index})
forecast = model.predict(future)

# Point forecasts as a plain array, in the same order as `test`.
predictions = forecast['yhat'].values

# Plot the actual vs predicted values
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(test.index, test['Close'], label='Actual', color='blue')
ax.plot(test.index, predictions, label='Predicted', color='red')
ax.set_xlabel('Time')
ax.set_ylabel('Price')
ax.set_title('Actual vs Predicted Prices (Prophet Model)')
ax.legend()
plt.show()

# Calculate MSE, MAE, and MAPE on the test period
actual = test['Close']
mse = mean_squared_error(actual, predictions)
mae = mean_absolute_error(actual, predictions)
mape = np.mean(np.abs((actual - predictions) / actual)) * 100

# Show evaluation metrics
print(f'Mean Squared Error (MSE): {mse}')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Absolute Percentage Error (MAPE): {mape:.2f}%')

# Show the fitted components over the forecast dates
model.plot_components(forecast)
plt.show()


Re-ran the Prophet model with training data starting from 2023 to match the Excel model.

In [None]:
# Training window for this rerun: 2023-01-01 up to (not including) the first
# hold-out day. The hold-out period itself is unchanged.
TRAIN_START = '2023-01-01'
SPLIT_DATE = '2025-01-01'

# Work on a date-sorted copy rather than mutating NVDA_data in place.
nvda_sorted = NVDA_data.sort_index()

# Reindex onto every business day in the sample. Weekdays with no row in the
# data come in as all-NaN rows.
date_range = pd.date_range(start=nvda_sorted.index.min(), end=nvda_sorted.index.max(), freq='B')  # 'B' for business days
NVDA_data_reindexed = nvda_sorted.reindex(date_range)

# Fill the inserted rows' Close by linear interpolation; other columns stay NaN.
# NOTE(review): inserted rows that fall on or after SPLIT_DATE end up in `test`
# and are scored like real closes -- confirm this is intended.
NVDA_data_reindexed['Close'] = NVDA_data_reindexed['Close'].interpolate(method='linear')

# Split the data: train = 2023 through 2024, test = 2025 onward
in_train_window = (NVDA_data_reindexed.index >= TRAIN_START) & (NVDA_data_reindexed.index < SPLIT_DATE)
train = NVDA_data_reindexed[in_train_window]
test = NVDA_data_reindexed[NVDA_data_reindexed.index >= SPLIT_DATE]

# Both pieces must contain data for the model fit and the evaluation to run.
assert not train.empty and not test.empty, "empty train or test split"

In [None]:
# Prepare the train data for Prophet, which requires columns 'ds' (date) and
# 'y' (value). The frame is built directly from the index and the Close
# values, so it does not depend on reset_index() naming the column 'index'
# (true only while the index is unnamed).
train_prophet = pd.DataFrame({'ds': train.index, 'y': train['Close'].to_numpy()})

# Initialize and fit Prophet model with the same settings as the earlier
# Prophet run, so only the training window differs between the two.
# NOTE(review): the data has one observation per day, while Prophet documents
# daily seasonality as an intra-day effect -- confirm daily_seasonality=True
# is wanted.
model = Prophet(daily_seasonality=True,
                weekly_seasonality=True,  # Add weekly seasonality
                yearly_seasonality=True,
                changepoint_prior_scale=0.1)
model.fit(train_prophet)

# Forecast exactly the dates in the test set.
future = pd.DataFrame({'ds': test.index})
forecast = model.predict(future)

# Point forecasts as a plain array, in the same order as `test`.
predictions = forecast['yhat'].values

# Plot the actual vs predicted values
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(test.index, test['Close'], label='Actual', color='blue')
ax.plot(test.index, predictions, label='Predicted', color='red')
ax.set_xlabel('Time')
ax.set_ylabel('Price')
ax.set_title('Actual vs Predicted Prices (Prophet Model)')
ax.legend()
plt.show()

# Calculate MSE, MAE, and MAPE on the test period
actual = test['Close']
mse = mean_squared_error(actual, predictions)
mae = mean_absolute_error(actual, predictions)
mape = np.mean(np.abs((actual - predictions) / actual)) * 100

# Show evaluation metrics
print(f'Mean Squared Error (MSE): {mse}')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Absolute Percentage Error (MAPE): {mape:.2f}%')

# Show the fitted components over the forecast dates
model.plot_components(forecast)
plt.show()

Step 7: Our MSE for the Prophet model is 447, which is much worse than the linear forecast model we built in Excel. Again, this model did not account for the DeepSeek announcement in January 2025: every point after the announcement has a larger error in the Prophet model than in the Excel model. The time series decomposition earlier in this notebook shows the residual component growing sharply starting in January 2025, so a high MSE over this period makes sense. January and February 2025 have not followed the trend that NVDA's stock price established over the previous few years.