In [None]:
from prophet import Prophet
from neuralprophet import NeuralProphet
import xgboost as xgb

import sys, os, yaml, ta
import pandas as pd
import numpy as np
from datetime import datetime
from datetime import timedelta
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
import matplotlib.pyplot as plt

import yfinance as yf
from dateutil.relativedelta import relativedelta

from stock_forecast_module import *

# Allow up to 200 rows and 200 columns when pandas renders a DataFrame
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 200)

In [None]:
# Read the run configuration from inputs.yml
with open('inputs.yml', 'r') as file:
    inputs = yaml.safe_load(file)

# Pull the individual settings out of the 'params' section
params = inputs['params']
start_dt = params['start_dt']
split_dt = params['split_dt']
# The real forecast starts today; the configured value is not read:
# inputs['params']['real_fc_start_dt']
real_fc_start_dt = datetime.today().strftime('%Y-%m-%d')
real_fc_end_dt = params['real_fc_end_dt']
stock_sticker = params['stock_sticker']

# Echo the effective settings so the run is self-describing
print(f"Stock sticker: {stock_sticker}")
print(f"Start date: {start_dt}")
print(f"Split date: {split_dt}")
print(f"Real forecast start date: {real_fc_start_dt}")
print(f"Real forecast end date: {real_fc_end_dt}")

In [None]:
# Fetch index history (SP500, NASDAQ, VIX) from start_dt up to today's date
today_date = datetime.today().date()
market_end_dt = today_date.strftime('%Y-%m-%d')

# Tickers are requested in the same order as the names they are unpacked into
sp500_data, nasdaq_data, vix_data = (
    yf.download(index_ticker, start=start_dt, end=market_end_dt)
    for index_ticker in ('^GSPC', '^IXIC', '^VIX')
)

In [None]:
# Fetch AAPL, NVDA and MSFT price history over the same date window
tech_end_dt = today_date.strftime('%Y-%m-%d')
aapl_data, nvda_data, msft_data = (
    yf.download(tech_ticker, start=start_dt, end=tech_end_dt)
    for tech_ticker in ('AAPL', 'NVDA', 'MSFT')
)

# Quick sanity check on the first and last rows of one of the downloads
display(aapl_data.head(), aapl_data.tail())

In [None]:
# Load the data for the configured ticker starting at start_dt; the second return
# value is passed to preprocess_data later as `stocks_additional_features`
stock_data, stocks_indicator_features = import_stock_data(stock_sticker, start_dt)
display(stock_data.head(), stock_data.tail())

In [None]:
### DATA PREPROCESSING & FEATURE ENGINEERING ###

# Base frame for the chosen ticker plus the initial list of exogenous variables
df, exo_vars = preprocess_data(
    stock_data,
    stocks_additional_features=stocks_indicator_features,
    sticker=stock_sticker,
)

# Stock market data (SP500, NASDAQ, VIX): left-join on 'ds', then register every
# column except the join key as an exogenous variable
df_market = clean_market_data(sp500_data, nasdaq_data, vix_data)
df = df.merge(df_market, on='ds', how='left')
df_market_cols = list(df_market.columns)
df_market_cols.remove('ds')  # 'ds' is the join key, not a feature
exo_vars.extend(df_market_cols)

# Tech stocks data (AAPL, NVDA, MSFT): same treatment as the market data
df_tech = clean_tech_stocks_data(aapl_data, nvda_data, msft_data)
df = df.merge(df_tech, on='ds', how='left')
df_tech_cols = list(df_tech.columns)
df_tech_cols.remove('ds')  # 'ds' is the join key, not a feature
exo_vars.extend(df_tech_cols)

# Earnings dates (might be helpful for training/predictions - right now, not used)
df_earnings, eps_cols = get_earnings_dates(sticker=stock_sticker)
df = df.merge(df_earnings, on='ds', how='left')

# Federal reserve data; the helper returns the updated frame and variable list
df, exo_vars = get_federal_reserve_data(df, exo_vars, start_date=start_dt)

# Daily news sentiment data read from the local gdelt_news_sentiment folder
df_news_sentiment, exo_vars = extract_historical_news_sentiment(
    exo_vars, folder_path="gdelt_news_sentiment"
)
df = df.merge(df_news_sentiment, on='ds', how='left')

# Split the overall dataset into train and test sets at split_dt
df, df_train, df_test = create_train_test_sets(
    df, split_date=split_dt, start_date=start_dt
)

print("using exogeneous features:", exo_vars)



### MODEL TRAINING AND FORECASTING ###

# Step 1: fit Prophet on the training set
prophet_model = train_prophet_model(df_train, exo_vars)

# Step 2: run the Prophet forecast over df and derive residuals: r = y - y_hat
forecast = make_prophet_forecast(df, prophet_model)
df_residuals = create_residuals(df_train, forecast, exo_vars, eps_cols)

# Step 3: fit XGBoost on those residuals
xgb_model = train_xgb_model(df_residuals, exo_vars, eps_cols)

# Step 4: combine both models: y_final = y_hat + residuals.
# Here, we assume we have future data for exogeneous variables in df_test
df_results = make_predictions(
    df_residuals, df, df_test,
    prophet_model, xgb_model,
    exo_vars, eps_cols,
    split_date=split_dt,
)

In [None]:
### PLOTTING RESULTS ###

# Upper date bound for the Prophet-only curve shown in the first figure.
# NOTE(review): hard-coded; presumably meant to track split_dt - confirm before changing.
prophet_curve_cutoff = '2025-03-01'
df1 = forecast[forecast['ds'] < prophet_curve_cutoff].copy(deep=True)

# Labels shared by both figures. The Prophet curve and the final prediction used to
# carry the same 'Predicted' label, which made the legend ambiguous.
prophet_label = 'Prophet forecast (yhat)'
final_label = 'Predicted (Prophet + XGBoost residuals)'

# Figure 1: full history with the Prophet-only curve and the final predictions
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(df['ds'],         df['y'],             label='Actual',      color='blue',   marker='o', markersize=3)
ax.plot(df1['ds'],        df1['yhat'],         label=prophet_label, color='black',  marker='v', markersize=3)
ax.plot(df_results['ds'], df_results['preds'], label=final_label,   color='orange', marker='v', markersize=3)
ax.set_title(f'{stock_sticker}: actual vs. predicted (full history)')
ax.legend()
plt.show()

# Figure 2: same actual/predicted curves, zoomed to the window from split_dt to today
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(df['ds'],         df['y'],             label='Actual',    color='blue',   marker='o', markersize=3)
ax.plot(df_results['ds'], df_results['preds'], label=final_label, color='orange', marker='v', markersize=3)
ax.set_xlim(pd.to_datetime(split_dt), pd.to_datetime(today_date))
ax.set_title(f'{stock_sticker}: actual vs. predicted (from split date)')
ax.legend()
plt.show()

In [None]:
# In practice the exogeneous variables are not available at future dates, so they
# are predicted first. This is a mock way to simulate future exogenous variables.
lag_features = ('y_lag1', 'y_lag2')
df_exo_vars_forecast = predict_future_exo_vars(
    df_train, df_test, exo_vars,
    lag_vars=list(lag_features),  # fresh list per call, as in the original literals
)
display(df_exo_vars_forecast.head())

# With the roughly-forecast exogenous variables in hand, predict future stock prices.
# Note: this rebinds `forecast`, which earlier held the Prophet forecast.
forecast = predict_with_unk_future_exo_vars(
    df_exo_vars_forecast, df_train, df_test,
    prophet_model, xgb_model,
    exo_vars, eps_cols,
    lag_vars=list(lag_features),
    split_date=split_dt,
)

In [None]:
### PLOTTING forecast results with future exogenous variables ###
split_date = split_dt
# Forecast rows from the split date onwards
df1 = forecast[forecast['ds'] >= split_date].copy(deep=True)

# 30-row rolling mean of y, added to df in place, then restricted to the same window
df['y_moving_avg'] = df['y'].rolling(window=30).mean()
df2 = df[df['ds'] >= split_date].copy(deep=True)

fig, ax = plt.subplots(figsize=(15, 4))
ax.plot(df2['ds'], df2['y'],            label='Actual',             color='blue',   marker='o', markersize=3)
ax.plot(df2['ds'], df2['y_moving_avg'], label='Moving-Average(30)', color='red',    marker='o', markersize=3)
ax.plot(df1['ds'], df1['yhat'],         label='Predicted',          color='orange', marker='v', markersize=3)
ax.legend()
plt.show()


### Calculate Performance Scores for the test set ###
# NOTE(review): the scores compare df2['y'] with df_results['preds'], while the figure
# above plots forecast['yhat']; this also assumes df2 and df_results have the same
# rows in the same order - confirm both are intended.
actual_values = df2['y'].values
predicted_values = df_results['preds'].values

mape_score, rmse_score, mae_score = calculate_performance_scores(actual_values, predicted_values)
print(f"MAPE Score: {mape_score*100.0:.2f}%")
print(f"RMSE Score: {rmse_score:.2f}")
print(f"MAE  Score: {mae_score:.2f}")

# Plot exogenous variables trends: one figure per variable, values in df against the
# forecasted values. Labels come from the variable name (they were all hard-coded
# to 'CPI' before, and no legend was drawn).
for exo_var_ in exo_vars:
    print(exo_var_)
    fig, ax = plt.subplots(figsize=(9, 3))
    ax.plot(df['ds'], df[exo_var_],
            label=f'{exo_var_} (actual)', color='green', linestyle='--')
    ax.plot(df_exo_vars_forecast['ds'], df_exo_vars_forecast[exo_var_],
            label=f'{exo_var_} (forecast)', color='blue', linestyle='-')
    ax.set_title(exo_var_)
    ax.legend()
    plt.show()

### Make the real forecast for actual future dates/months

In [None]:
# Real predictions for future, not-yet-observed dates.
# The feature frame is rebuilt from scratch, this time without a train/test split.
df, exo_vars = preprocess_data(
    stock_data,
    stocks_additional_features=stocks_indicator_features,
    sticker=stock_sticker,
)

# Market data (SP500, NASDAQ, VIX): left-join on 'ds', then register every column
# except the join key as an exogenous variable
df_market = clean_market_data(sp500_data, nasdaq_data, vix_data)
df = df.merge(df_market, on='ds', how='left')
df_market_cols = list(df_market.columns)
df_market_cols.remove('ds')  # 'ds' is the join key, not a feature
exo_vars.extend(df_market_cols)

# Tech stocks data (AAPL, NVDA, MSFT): same treatment as the market data
df_tech = clean_tech_stocks_data(aapl_data, nvda_data, msft_data)
df = df.merge(df_tech, on='ds', how='left')
df_tech_cols = list(df_tech.columns)
df_tech_cols.remove('ds')  # 'ds' is the join key, not a feature
exo_vars.extend(df_tech_cols)

# Earnings dates (might be helpful for training/predictions - right now, not used)
df_earnings, eps_cols = get_earnings_dates(sticker=stock_sticker)
df = df.merge(df_earnings, on='ds', how='left')

# Federal reserve data; the helper returns the updated frame and variable list
df, exo_vars = get_federal_reserve_data(df, exo_vars, start_date=start_dt)

# Daily news sentiment data read from the local gdelt_news_sentiment folder
df_news_sentiment, exo_vars = extract_historical_news_sentiment(
    exo_vars, folder_path="gdelt_news_sentiment"
)
df = df.merge(df_news_sentiment, on='ds', how='left')

print("using exogeneous features:", exo_vars)

# The training set is the entire dataset of the stock
df_train = df.copy(deep=True)

# Train the Prophet and XGBoost models.
# eps_cols from get_earnings_dates is discarded here and replaced by an empty list.
# NOTE(review): presumably because earnings columns are unknown for future dates - confirm.
eps_cols = []
prophet_model = train_prophet_model(df_train, exo_vars)
forecast = make_prophet_forecast(df, prophet_model)
df_residuals = create_residuals(df_train, forecast, exo_vars, eps_cols)
xgb_model = train_xgb_model(df_residuals, exo_vars, eps_cols)

# Forecast the exogeneous variables for future dates (beyond today's date!)
df_exo_vars_forecast = generate_future_exogeneous_vars_forecasts(
    df_residuals,
    exo_vars,
    lag_vars=['y_lag1', 'y_lag2'],
    start_fc_date=real_fc_start_dt,
    end_fc_date=real_fc_end_dt,
)

# Use the trained models to predict stock prices from 'real_fc_start_dt' to
# 'real_fc_end_dt' (the end date can be specified in inputs.yml)
df_results, forecast_historical = predict_with_forecasted_exo_vars(
    df_exo_vars_forecast,
    prophet_model, xgb_model,
    exo_vars, eps_cols,
    lag_vars=['y_lag1', 'y_lag2'],
    start_fc_dt=real_fc_start_dt,
    end_fc_dt=real_fc_end_dt,
)

In [None]:
### PLOTTING FINAL RESULTS ###
# Historical trend: 'yhat' renamed to 'preds', stacked on top of the forecast rows,
# then restricted to dates before the real forecast start
df1 = pd.concat(
    [forecast_historical.rename(columns={'yhat': 'preds'}), df_results],
    axis=0,
    ignore_index=True,
)
df1 = df1[df1['ds'] < real_fc_start_dt].copy(deep=True)

fig, ax = plt.subplots(figsize=(14, 4))
trend_marker = dict(marker='v', markersize=2)
ax.plot(df['ds'],         df['y'],             label='Actual',           color='blue',   **trend_marker)
ax.plot(df_results['ds'], df_results['preds'], label='Predicted Trend',  color='orange', **trend_marker)
ax.plot(df1['ds'],        df1['preds'],        label='Historical Trend', color='red',    **trend_marker)

# Dashed outlines for the upper and lower prediction bounds
for bound_col in ('preds_upper', 'preds_lower'):
    ax.plot(df_results['ds'], df_results[bound_col], color='green', linestyle='--')

# Shade the band between the two bounds
ax.fill_between(
    df_results['ds'],
    df_results['preds_upper'],
    df_results['preds_lower'],
    color='gray',
    alpha=0.2,
    label='Confidence Interval',
)
ax.set_title(f'{stock_sticker} Stock Price Trend Prediction')
ax.legend()

# Show only the four years leading up to the forecast end date
forecast_end = pd.to_datetime(real_fc_end_dt)
ax.set_xlim(forecast_end - relativedelta(years=4), forecast_end)
# plt.ylim([150, 450])

# Save the plot
# fig.savefig(f'./figures/past_predictions/{stock_sticker}_predictions_on_04_29_2025.png', dpi=300, bbox_inches='tight')