## Package Installation

In [1]:
# Install statsmodels (imported further down as `sm`) with the %pip line magic.
%pip install statsmodels

Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install neuralprophet
%pip install matplotlib
%pip install seaborn
%pip install pickle

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement pickle (from versions: none)
ERROR: No matching distribution found for pickle


## Package Importation

In [88]:
from neuralprophet import NeuralProphet, set_random_seed, set_log_level; set_random_seed(314159); set_log_level("ERROR", "INFO")
from neuralprophet.df_utils import make_future_df, fill_linear_then_rolling_avg, add_missing_dates_nan

import pandas as pd
import numpy as np

import statsmodels.api as sm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import matplotlib.pyplot as plt
import seaborn as sns; sns.set_theme(color_codes=True); sns.set_style("whitegrid")

import time

from collections import defaultdict

import copy

import pickle

## Reading Data

Below we read in the data for train, val and test as well as the associated historical dataframe

In [46]:
# Load it_history.csv; its first column becomes the row index.
IT_sector_historical = pd.read_csv(
    filepath_or_buffer='../data_processing/it_history.csv',
    index_col=0,
)

In [104]:
def _read_daily_split(csv_path, skip_first_row=False):
    """Read one split CSV, transpose it, and expose the former column labels as 'ds'.

    Args:
        csv_path: Location of the CSV; its first column is used as the index.
        skip_first_row: When True, the first row of the transposed frame is
            dropped before the index is turned into the 'ds' column.

    Returns:
        The transposed frame with a leading 'ds' column.
    """
    transposed = pd.read_csv(csv_path, index_col=0).T
    if skip_first_row:
        transposed = transposed.iloc[1:, :]
    return transposed.reset_index(level=0).rename(columns={'index': 'ds'})


train = _read_daily_split('../data_processing/train_daily.csv')
# Only the validation split drops its first transposed row.
val = _read_daily_split('../data_processing/val_daily.csv', skip_first_row=True)
test = _read_daily_split('../data_processing/test_daily.csv')

In [36]:
# Earnings-date table; later cells read one column per lower-case ticker from it.
earnings_10yr_df = pd.read_csv(
    filepath_or_buffer='../data_processing/earnings_10yr_past_future_df.csv',
)

### Creating Valid DataFrame Tickers

In [48]:
# Keep just the sector and ticker columns and write them to the working directory.
industry_symbol = IT_sector_historical.loc[:, ['GICS Sector', 'Symbol']]
industry_symbol.to_csv(path_or_buf='industry_symbol.csv')

In [107]:
# Row label of `industry_symbol` -> ticker symbol; applied as a column rename to every split.
index_to_symbol = industry_symbol['Symbol'].to_dict()
train, val, test = (
    split.rename(columns=index_to_symbol) for split in (train, val, test)
)

## Auxiliary Plotting Functions

In [10]:
def plot_forecast(model, data, periods=0, historic_pred=True, highlight_steps_ahead=None, xlabel='Date', ylabel=None, title=None):
    """Predict with ``model`` on ``data`` and draw the result on a new 14x10 figure.

    Args:
        model: Forecaster providing ``make_future_dataframe``, ``predict``,
            ``plot``, ``plot_last_forecast`` and
            ``highlight_nth_step_ahead_of_each_forecast``.
        data: Frame handed to ``model.make_future_dataframe``.
        periods: Forwarded to ``make_future_dataframe`` as ``periods``.
        historic_pred: Forwarded as ``n_historic_predictions``.
        highlight_steps_ahead: If not ``None``, passed to
            ``highlight_nth_step_ahead_of_each_forecast`` and the figure is
            drawn with ``plot_last_forecast`` instead of ``plot``.
        xlabel: Label forwarded to the plotting call.
        ylabel: Label forwarded to the plotting call.
        title: Text set as the axes title (bold, size 20).

    Returns:
        None. The figure is created as a side effect.
    """
    forecast_df = model.predict(
        model.make_future_dataframe(
            data,
            periods=periods,
            n_historic_predictions=historic_pred,
        )
    )

    _, axis = plt.subplots(figsize=(14, 10))

    if highlight_steps_ahead is None:
        model.plot(forecast_df, ax=axis, xlabel=xlabel, ylabel=ylabel)
    else:
        highlighted = model.highlight_nth_step_ahead_of_each_forecast(highlight_steps_ahead)
        highlighted.plot_last_forecast(forecast_df, ax=axis, xlabel=xlabel, ylabel=ylabel)

    # Both branches title the axes the same way.
    axis.set_title(title, fontsize=20, fontweight="bold")

In [11]:
def plot_validation(metrics_df, metrics = ['SmoothL1Loss', 'MAE', 'RMSE']):
    """Plot training and validation curves side by side, one panel per metric.

    For every name ``m`` in ``metrics`` the columns ``m`` (training) and
    ``f'{m}_val'`` (validation) of ``metrics_df`` are drawn against the frame's
    index on a shared-axis panel.

    Args:
        metrics_df: Frame holding, for each requested metric, a training column
            and a matching ``'<metric>_val'`` column.
            NOTE(review): assumed to hold one row per training epoch (as the
            frame returned by ``model.fit`` in this notebook) - confirm.
        metrics: Names of the metrics to plot. Must not be empty.

    Returns:
        None. The figure is created as a side effect.

    Raises:
        ValueError: If ``metrics`` is empty.
        KeyError: If a requested column is missing from ``metrics_df``.
    """
    met_len = len(metrics)
    if met_len == 0:
        raise ValueError('`metrics` must name at least one metric to plot')

    # squeeze=False always returns a 2-D array of Axes, so a single metric
    # goes through the same loop as several.
    fig, axs = plt.subplots(
        ncols=met_len,
        nrows=1,
        sharex=True,
        sharey=True,
        figsize=(8*met_len, 8),
        squeeze=False,
    )
    fig.suptitle('Validation Metrics', fontsize=20, y=0.95)

    for metric, ax in zip(metrics, axs.ravel()):
        ax.plot(metrics_df[metric], 'ob', linewidth=6, label='Training Loss', markersize=3)
        ax.plot(metrics_df[f'{metric}_val'], 'r', linewidth=2, label='Validation Loss')

        # Both curves are drawn against the frame index, and the y axis carries
        # the metric value; the previous labels ('Training Loss' on x,
        # 'Validation Loss' on y) described neither axis.
        ax.set_xlabel('Epoch')
        ax.set_ylabel('Metric Value')
        ax.set_title(metric)
        ax.legend()

## Base Model (Train)
---

Below we define the base model alongside the addition of country holidays in the US

In [89]:
# Shared NeuralProphet configuration. The per-ticker training loop works on deep
# copies of this object, so the settings below apply to every ticker model.
# NOTE(review): the parameter descriptions below follow the NeuralProphet
# documentation as understood by the reviewer - confirm against the installed version.
base_model = NeuralProphet(
    growth='discontinuous',  # Trend type; one of 'linear', 'discontinuous', 'off'
    changepoints=None, # Explicit list of changepoint dates (None -> selected automatically)
    n_changepoints=52, # Number of candidate trend changepoints (52 in total, not one per calendar month)
    changepoints_range=1, # 1 -> changepoints may be placed across the whole training series
    trend_reg=0, # 0 -> no regularization on the trend (presumably the most flexible setting - confirm)
    trend_reg_threshold=False, # No threshold is applied to the trend regularization
    yearly_seasonality=True, # Always fit yearly seasonality
    weekly_seasonality='auto', # Let the library decide whether to fit weekly seasonality
    daily_seasonality='auto', # Let the library decide whether to fit daily seasonality
    seasonality_mode='additive', # Seasonality is added to the trend instead of multiplying it
    seasonality_reg=0, # 0 -> no regularization on seasonality (an earlier bike-share remark here did not apply to this notebook's ticker data)
    n_forecasts=1, # Forecast one step ahead
    n_lags=7, # Use the previous 7 observations as auto-regression inputs
    num_hidden_layers=8, # 8 hidden layers for the network
    d_hidden=64, # Dimension of hidden layers of AR-Net
    learning_rate=None, # None -> learning rate is determined automatically
    epochs=None, # None -> number of epochs is determined automatically (based on data size)
    loss_func='Huber', # Huber loss for robustness to outliers ### Describe Huber in presentation/write-up
    # normalize='auto',  # Type of normalization ('minmax', 'standardize', 'soft', 'auto', None) ## Why normalize again?
)

base_model.add_country_holidays('USA') # Adds effects of US holidays

<neuralprophet.forecaster.NeuralProphet at 0x21065b312a0>

## Additive Events

In [80]:
# One date per listed CES show day; every row of the frame is tagged "CES Show".
# NOTE(review): 2013 appears twice (01-10 and 01-09) while every other year
# appears once - confirm whether one of these entries is a typo.
ces_show_days = [
    "2012-01-11",
    "2013-01-10",
    "2013-01-09",
    "2014-01-05",
    "2015-01-09",
    "2016-01-07",
    "2017-01-06",
    "2018-01-10",
    "2019-01-09",
    "2020-01-08",
    "2021-01-12",
    "2022-01-05",
    "2023-01-06",
]

CES_dates_df = pd.DataFrame(
    {"event": "CES Show", "ds": pd.to_datetime(ces_show_days)}
)

def _earnings_events_df(ticker):
    """Build the events frame for one ticker's earnings dates.

    Args:
        ticker: Upper-case ticker symbol, e.g. ``'AAPL'``. Its lower-case form
            selects the column of ``earnings_10yr_df`` that holds the dates.

    Returns:
        DataFrame with an ``event`` column set to ``'<TICKER> Earnings'`` on
        every row and a ``ds`` column holding the column's values parsed with
        ``pd.to_datetime``.
    """
    return pd.DataFrame(
        {
            "event": f"{ticker} Earnings",
            "ds": pd.to_datetime(earnings_10yr_df[ticker.lower()]),
        }
    )


# One frame per ticker; the variable names are kept because later cells
# reference them individually.
AAPL_dates_df = _earnings_events_df("AAPL")
META_dates_df = _earnings_events_df("META")
GOOG_dates_df = _earnings_events_df("GOOG")
NFLX_dates_df = _earnings_events_df("NFLX")
AMZN_dates_df = _earnings_events_df("AMZN")
MSFT_dates_df = _earnings_events_df("MSFT")
NVDA_dates_df = _earnings_events_df("NVDA")
AMD_dates_df = _earnings_events_df("AMD")
INTC_dates_df = _earnings_events_df("INTC")

### Create Events Dataframe for IT Sector

In [131]:
# Event labels handed to `add_events`; the same strings are used as the `event`
# values of the CES and per-ticker earnings frames.
_EARNINGS_TICKERS = ('AAPL', 'META', 'GOOG', 'NFLX', 'AMZN', 'MSFT', 'NVDA', 'AMD', 'INTC')

event_names = ['CES Show'] + [f'{ticker} Earnings' for ticker in _EARNINGS_TICKERS]

In [132]:
# Stack the CES frame and the nine earnings frames row-wise. Each frame keeps
# its own row labels, so labels repeat across the combined frame.
event_frames = [
    CES_dates_df,
    AAPL_dates_df,
    META_dates_df,
    GOOG_dates_df,
    NFLX_dates_df,
    AMZN_dates_df,
    MSFT_dates_df,
    NVDA_dates_df,
    AMD_dates_df,
    INTC_dates_df,
]
events_df = pd.concat(event_frames)

In [133]:
# Display the combined events table as this cell's output.
events_df

Unnamed: 0,event,ds
0,CES Show,2012-01-13
1,CES Show,2013-01-11
2,CES Show,2013-01-10
3,CES Show,2014-01-07
4,CES Show,2015-01-11
...,...,...
43,INTC Earnings,2013-01-17
44,INTC Earnings,2012-10-16
45,INTC Earnings,2012-07-17
46,INTC Earnings,2012-04-17


In [None]:
has_historical_df = False
# True  -> reload previously saved model / metrics / frames from ./model/...
# False -> fit a fresh model per ticker and save the artifacts
read_in_model_and_metrics = False

# ticker -> [model, metrics, historical_train_df, historical_val_df]
models_dict = dict()


def _artifact_paths(symbol):
    """Return the on-disk location of every artifact cached for ``symbol``.

    Loading and saving both go through this mapping so that each file is read
    from exactly the path it was written to.
    """
    return {
        'model': f'./model/model_pickles/{symbol}_model.pkl',
        'metrics': f'./model/model_metrics/{symbol}_metrics.csv',
        'train': f'./model/model_historical_train_df/{symbol}_historical_train_df.csv',
        'val': f'./model/model_historical_val_df/{symbol}_historical_val_df.csv',
    }


for symbol in train.columns:
    # 'ds' is the date column, not a ticker; skip it before logging.
    if symbol == 'ds':
        continue
    print(symbol)

    paths = _artifact_paths(symbol)

    if read_in_model_and_metrics:
        # NOTE(security): pickle.load can execute arbitrary code; only load
        # pickles that this notebook wrote itself.
        with open(paths['model'], 'rb') as f:
            model = pickle.load(f)
        metrics = pd.read_csv(paths['metrics'])
        # Parse 'ds' so reloaded frames carry datetimes rather than strings.
        historical_train_df = pd.read_csv(paths['train'], parse_dates=['ds'])
        historical_val_df = pd.read_csv(paths['val'], parse_dates=['ds'])

    else:
        # Work on a copy so events are registered once per ticker model and
        # `base_model` itself is never modified.
        model = copy.deepcopy(base_model)
        model = model.add_events(event_names, lower_window=-1, upper_window=1)

        train_with_dates_df = train[[symbol, 'ds']].rename(columns={symbol: 'y'})
        val_with_dates_df = val[[symbol, 'ds']].rename(columns={symbol: 'y'})

        historical_train_df = model.create_df_with_events(train_with_dates_df, events_df)
        historical_val_df = model.create_df_with_events(val_with_dates_df, events_df)

        metrics = model.fit(
            df=historical_train_df,
            freq='D',
            validation_df=historical_val_df,
            progress='bar'
        )

        # Persist the artifacts. The target directories are expected to exist.
        # The train frame previously went to '..._historical_val_df.csv', a
        # name the load branch never reads.
        with open(paths['model'], 'wb') as f:
            pickle.dump(model, f)
        # index=False keeps a spurious 'Unnamed: 0' column from appearing when
        # the files are read back.
        historical_train_df.to_csv(paths['train'], index=False)
        historical_val_df.to_csv(paths['val'], index=False)
        pd.DataFrame(metrics).to_csv(paths['metrics'], index=False)

    models_dict[symbol] = [model, metrics, historical_train_df, historical_val_df]

## Creating & Predicting Into Future

In [None]:
# Each `models_dict` entry is [model, metrics, historical_train_df, historical_val_df].
model, metrics, historical_train_df, historical_val_df = models_dict['AAPL']

In [None]:
# Build the prediction frame from the selected training frame plus the events,
# then run the model on it.
# NOTE(review): `base_model` is configured with n_forecasts=1 and n_lags=7;
# NeuralProphet may limit `periods` to n_forecasts for auto-regressive models -
# confirm that 365 future periods are actually produced.
future = model.make_future_dataframe(
    df=historical_train_df,
    events_df=events_df,
    periods=365,
    n_historic_predictions=len(train),
)
forecast = model.predict(df=future)

## Deprecated Code with Multiple Industries

In [None]:
# NOTE(review): deprecated cell, kept for reference only. It relies on names
# that are not defined in the visible part of this notebook
# (industry_to_symbol_map, industry_to_folder_map, additive_events,
# symbol_to_index_map) and will not run without them.
has_historical_df = False
read_in_model_and_metrics = False
model_dict = defaultdict()

for industry, symbols in industry_to_symbol_map.items():
    model_dict[industry] = defaultdict()
    for symbol in symbols:
        model_dict[industry][symbol] = {
            'model': None,
            'metrics': None,
        }

        # One set of paths shared by the load and save branches, so a file is
        # read from the same name it was written under (the save paths
        # previously lacked the '.pkl' suffix the load paths expect).
        symbol_dir = './model/' + industry_to_folder_map[industry] + '/' + symbol
        model_path = symbol_dir + '/model.pkl'
        metrics_path = symbol_dir + '/metrics.pkl'
        historical_df_path = symbol_dir + '/historical_df.csv'

        if read_in_model_and_metrics:
            # pickle.load needs an open binary file, not a path string.
            # NOTE(security): only load pickles written by this notebook.
            with open(model_path, 'rb') as f:
                model_dict[industry][symbol]['model'] = pickle.load(f)
            with open(metrics_path, 'rb') as f:
                model_dict[industry][symbol]['metrics'] = pickle.load(f)
        else:
            # Copy the base model: adding events to the shared instance would
            # modify it again on every iteration.
            model = copy.deepcopy(base_model)
            # model.add_seasonality(name='monthly', period=30.5, fourier_order=5, prior_scale=0.1)
            model = model.add_events(additive_events[industry])

            ## Reads in historical_df if has_historical_df else creates & writes historical_df
            if has_historical_df:
                historical_df = pd.read_csv(historical_df_path)
            else:
                # NOTE(review): `train[:, idx]` is not valid indexing for the
                # DataFrame `train` built earlier in this notebook; left
                # unchanged because the intended selection cannot be
                # determined from here.
                historical_df = model.create_df_with_events(train[:, symbol_to_index_map[symbol]], additive_events[industry])
                historical_df.to_csv(historical_df_path, index=False)

            metrics = model.fit(historical_df, freq="D", validation_df=val[symbol_to_index_map[symbol]])

            ## Updating defaultdict of models
            model_dict[industry][symbol]['model'] = model
            model_dict[industry][symbol]['metrics'] = metrics

            ## Pickling model and metrics (pickle.dump needs an open binary file)
            with open(model_path, 'wb') as f:
                pickle.dump(model, f)
            with open(metrics_path, 'wb') as f:
                pickle.dump(metrics, f)