# Nowcasting

In [None]:
# !pip3 install numpy pandas matplotlib statsmodels scipy scikit-learn openpyxl
# !pip3 freeze > requirements.txt

## Packages and Settings

In [2]:
import warnings
import numpy as np
import pandas as pd
import datetime as dt
from dateutil.relativedelta import relativedelta
import matplotlib.pyplot as plt
from functools import reduce
from sklearn.preprocessing import StandardScaler
from statsmodels.tsa.ar_model import AutoReg
from sklearn.linear_model import ElasticNet, Ridge, Lasso
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.ensemble import ExtraTreesRegressor
from xgboost import XGBRegressor
# from ExtendedDynamicFactor import ExtendedDynamicFactor, OptimizeExtendedDynamicFactor
import dynamicfactoranalysis.dynamicfactoranalysis as dfa
from NowcastingPipeline import NowcastingPH

import multiprocessing as mp

# Fork-based multiprocessing context.
# NOTE: the 'fork' start method does not exist on Windows; mp.get_context('fork') raises ValueError there.
mp_fork = mp.get_context('fork')
# Worker count passed as `multiprocess=` by the run cells below: two thirds of the logical CPUs,
# clamped to at least 1 because int() truncation yields 0 on a single-core machine.
processes = max(1, int(mp.cpu_count() * 2/3))

# Draw matplotlib figures inside the notebook output (IPython magic).
%matplotlib inline
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
# Turn off pandas' chained-assignment (SettingWithCopy) check.
pd.options.mode.chained_assignment = None
# Show all columns and do not wrap wide DataFrames when they are displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.expand_frame_repr', False)

## Dynamic Factor Model

In [3]:
class NowcastingDFM(NowcastingPH):
    """Dynamic factor model (DFM) nowcaster built on ``NowcastingPH``.

    ``DFM_order`` is the tuple ``(factor_order, error_order, k_factors, factor_lag)``.
    When ``optimize_order`` is truthy the four values are handed to the optimizer as
    upper bounds (``*_max``) instead of being used directly.
    """

    def set_classname(self, **kwargs):
        """Set ``self.prefix`` from the ``optimize_order`` / ``DFM_order`` entries of ``self.kwargs``."""
        # Override class name
        if self.kwargs.get("optimize_order"):
            self.prefix = 'DFM_Opt'
        else:
            self.prefix = f'DFM{self.kwargs.get("DFM_order")}'

    def fit_model(self, vintage, window, DFM_order, optimize_order, **kwargs):
        """Fit the DFM for one vintage and predict the four quarters of the vintage year.

        :param vintage: vintage date; only ``vintage.year`` is used here, the value itself
            is forwarded to ``self.load_data``
        :param window: forwarded to ``self.load_data`` as ``window``
        :param DFM_order: ``(factor_order, error_order, k_factors, factor_lag)``
        :param optimize_order: truthy to search the order with ``dfa.DynamicFactorModelOptimizer``
        :param kwargs: forwarded unchanged to ``load_data``, the model constructor and ``fit``
        :return: ``(nowcasts, model_desc)`` - list of inverse-scaled 'target' predictions for
            Q1..Q4 of the vintage year and a ``'DFM(...)'`` label with the order actually used
        """
        data, target_scaler, _econ_scaler = self.load_data(vintage, window=window, **kwargs)
        factor_order, error_order, k_factors, factor_lag = DFM_order

        # drop row if not enough non-missing (max safety)
        min_non_missing = k_factors * (1 + factor_lag)
        data = data.dropna(thresh=min_non_missing)

        if optimize_order:
            optimizer = dfa.DynamicFactorModelOptimizer(
                endog=data, k_factors_max=k_factors, factor_lag_max=factor_lag, factor_order_max=factor_order,
                error_order_max=error_order, verbose=True, **kwargs)
            spec = optimizer.fit(**kwargs)
        else:
            spec = dfa.DynamicFactorModel(
                endog=data, k_factors=k_factors, factor_lag=factor_lag, factor_order=factor_order,
                error_order=error_order, **kwargs)

        # Quicker settings for debugging: maxiter=10, ftol=1e-3
        fitted = spec.fit(disp=False, maxiter=1000, method='powell', ftol=1e-5, **kwargs)

        # Report the order stored on the model object, which may differ from the requested bounds
        DFM_order = (spec.factor_order, spec.error_order, spec.k_factors, spec.factor_lag)
        if optimize_order:
            self.prefix = 'DFM_Opt'    # Override class name
        else:
            self.prefix = f'DFM{DFM_order}'    # Override class name

        year = vintage.year
        predicted = fitted.predict(start=f'{year}Q1', end=f'{year}Q4')[['target']]
        unscaled = target_scaler.inverse_transform(predicted[['target']])
        nowcasts = list(unscaled.flatten())
        model_desc = f'DFM{DFM_order}'

        return nowcasts, model_desc

In [None]:
### Sample DFM_Opt(2,2,2,0) E
target = 'GDP'
# target = 'GDPG_Est'
kmpair = {}
window = 1000
# DFM_order = factor_order, error_order, k_factors, factor_lag
model = NowcastingDFM(
    DFM_order=(2,2,2,0),
    target=target,
    optimize_order=True,
    kmpair=kmpair,
    enforce_stationarity=True,
)
summary = model.run(
    window=window,
    save_aggregate=True,
    with_econ=True,
    with_tweets=False,
    multiprocess=1,
    start=pd.to_datetime('2017-01-31'),
)
# To reuse a saved run instead of refitting:
# summary = pd.read_csv('Results/DFM_Opt_W1000_GDP_E_summary.csv', parse_dates=['date'])

# Tweet metrics from 2017 onwards, with the period index converted to timestamps for plotting
tweets = model.load_tweets('2023-01-01', window=window+72, kmpair=kmpair)
tweets = tweets.loc[dt.datetime(2017,1,1):,:]
tweets.index = tweets.index.to_timestamp()

fig, axs = plt.subplots(3, 1, figsize=(10, 10), sharex=True)
# Top two panels: nowcast markers against the actual series
growth_panels = (
    ('Nowcast_A', 'Actual_A', 'Annual GDP Growth'),
    ('Nowcast_Q', 'Actual_Q', 'Quarter GDP Growth'),
)
for ax, (nowcast_col, actual_col, title) in zip(axs, growth_panels):
    ax.plot(summary['date'], summary[nowcast_col], linewidth=0, marker='*', label='Nowcast')
    ax.plot(summary['date'], summary[actual_col], label='Actual')
    ax.legend()
    ax.set_title(title)
# NOTE(review): this panel draws the tweet metrics loaded above although it is titled 'Econ Metrics'.
metrics_ax = axs[2]
for metric in tweets.columns:
    metrics_ax.plot(tweets.index, tweets[metric], label=metric, alpha=0.5)
metrics_ax.legend()
metrics_ax.set_title('Econ Metrics')
fig.show()
summary

## Elastic Net

In [None]:
class NowcastingML(NowcastingPH):
    """Elastic Net nowcaster: fits ``ElasticNet`` on lagged data from ``self.load_data``.

    ``lag_order`` is the tuple ``(target_lag, tweet_lag, econ_lag)``.
    """

    def set_classname(self, **kwargs):
        """Set ``self.prefix`` from the ``lag_order`` entry of ``self.kwargs``."""
        order = self.kwargs.get("lag_order")
        self.prefix = f'ENet{order}'    # Override class name

    def lag_data(self, df, lag_order):
        """Return ``df`` widened with lagged copies of its columns.

        Lags 1..target_lag of ``'target'``, 1..tweet_lag of every column whose name contains
        ``'TWT'`` and 1..econ_lag of every column whose name contains ``'ECN'`` are appended
        with a ``.Q<lag>`` suffix. Columns whose values repeat an earlier column are dropped.
        """
        target_lag, tweet_lag, econ_lag = lag_order
        groups = (
            (['target'], target_lag),
            ([col for col in df.columns if 'TWT' in col], tweet_lag),
            ([col for col in df.columns if 'ECN' in col], econ_lag),
        )

        frames = [df]
        for columns, depth in groups:
            for lag in range(1, depth + 1):
                frames.append(df[columns].shift(lag).add_suffix(f'.Q{lag}'))

        widened = pd.concat(frames, axis=1)
        repeated = widened.T.duplicated(keep='first')
        return widened.loc[:, ~repeated]

    def fit_model(self, vintage, window, lag_order, **kwargs):
        """Fit the model for one vintage and predict every row of the test slice.

        :param vintage: vintage date used to split training and test rows
        :param window: forwarded to ``self.load_data`` as ``window``
        :param lag_order: ``(target_lag, tweet_lag, econ_lag)`` passed to ``lag_data``
        :param kwargs: forwarded unchanged to ``self.load_data``
        :return: ``(nowcasts, model_desc)`` - inverse-scaled predictions (``nan`` for test rows
            with any missing feature) and the label ``'ENet<lag_order>'``
        """
        data, target_scaler, _econ_scaler = self.load_data(vintage, window=window, **kwargs)
        data = self.lag_data(data, lag_order)
        features = data.columns.drop('target')

        # Training sample: complete rows up to three months before the vintage
        train = data.loc[: vintage - relativedelta(months=3), :].dropna()
        X_train = train.loc[:, features]
        y_train = train.loc[:, 'target']

        # NOTE: relativedelta(month=3) is absolute (it sets the month to March), so the test
        # slice starts in March of the vintage year - presumably that year's Q1; confirm.
        X_test = data.loc[vintage + relativedelta(month=3) :, features]

        model = ElasticNet()
        model.fit(X_train, y_train)
        self.prefix = f'ENet{lag_order}'    # Override class name

        predictions = []
        for _, row in X_test.iterrows():
            if row.isnull().values.any():
                predictions.append(np.nan)
            else:
                predictions.append(model.predict(row.to_frame().T)[0])
        scaled = np.array(predictions).reshape(-1,1)
        nowcasts = list(target_scaler.inverse_transform(scaled).flatten())
        model_desc = f'ENet{lag_order}'

        return nowcasts, model_desc

In [None]:
# Sample run: Elastic Net on tweet metrics only
target = 'GDP'
kmpair = {}
window = 37
model = NowcastingML(lag_order=(4,1,0), kmpair=kmpair, target=target) # lag_order = target_lag, tweet_lag, econ_lag
summary = model.run(window=window, save_aggregate=True, with_econ=False, with_tweets=True, multiprocess=processes)
# To reuse a saved run instead of refitting:
# summary = pd.read_csv('Results/ENet(2, 1, 0)_W25_TE_summary.csv', parse_dates=['date'])

# Pass kmpair and window explicitly, as the other run cells in this notebook do; window+72 is
# what those cells use so that the loaded history covers the 2017 start of the slice below.
tweets = model.load_tweets('2023-01-01', kmpair=kmpair, window=window+72).loc[dt.datetime(2017,1,1):,:]
tweets.index = tweets.index.to_timestamp()

fig, axs = plt.subplots(3, 1, figsize=(10, 10), sharex=True)
# Top two panels: nowcast markers against the actual series
growth_panels = (
    ('Nowcast_A', 'Actual_A', 'Annual GDP Growth'),
    ('Nowcast_Q', 'Actual_Q', 'Quarter GDP Growth'),
)
for ax, (nowcast_col, actual_col, title) in zip(axs, growth_panels):
    ax.plot(summary['date'], summary[nowcast_col], linewidth=0, marker='*', label='Nowcast')
    ax.plot(summary['date'], summary[actual_col], label='Actual')
    ax.legend()
    ax.set_title(title)
# Bottom panel: one line per tweet metric
for metric in tweets.columns:
    axs[2].plot(tweets.index, tweets[metric], label=metric, alpha=0.5)
axs[2].legend()
axs[2].set_title('Tweet Metrics')
# plt.show() is the supported way to render with the inline backend; Figure.show() needs a GUI backend
plt.show()
summary

## MLP

In [None]:
class NowcastingMLP(NowcastingPH):
    """Multi-layer perceptron nowcaster: fits ``MLPRegressor`` on lagged data from ``self.load_data``.

    ``lag_order`` is the tuple ``(target_lag, tweet_lag, econ_lag)``.
    """

    def set_classname(self, **kwargs):
        """Set ``self.prefix`` from the ``lag_order`` entry of ``self.kwargs``."""
        order = self.kwargs.get("lag_order")
        self.prefix = f'MLP{order}'    # Override class name

    def lag_data(self, df, lag_order):
        """Return ``df`` widened with lagged copies of its columns.

        Lags 1..target_lag of ``'target'``, 1..tweet_lag of every column whose name contains
        ``'TWT'`` and 1..econ_lag of every column whose name contains ``'ECN'`` are appended
        with a ``.Q<lag>`` suffix. Columns whose values repeat an earlier column are dropped.
        """
        target_lag, tweet_lag, econ_lag = lag_order
        groups = (
            (['target'], target_lag),
            ([col for col in df.columns if 'TWT' in col], tweet_lag),
            ([col for col in df.columns if 'ECN' in col], econ_lag),
        )

        frames = [df]
        for columns, depth in groups:
            for lag in range(1, depth + 1):
                frames.append(df[columns].shift(lag).add_suffix(f'.Q{lag}'))

        widened = pd.concat(frames, axis=1)
        repeated = widened.T.duplicated(keep='first')
        return widened.loc[:, ~repeated]

    def fit_model(self, vintage, window, lag_order, **kwargs):
        """Fit the model for one vintage and predict every row of the test slice.

        :param vintage: vintage date used to split training and test rows
        :param window: forwarded to ``self.load_data`` as ``window``
        :param lag_order: ``(target_lag, tweet_lag, econ_lag)`` passed to ``lag_data``
        :param kwargs: forwarded unchanged to ``self.load_data``
        :return: ``(nowcasts, model_desc)`` - inverse-scaled predictions (``nan`` for test rows
            with any missing feature) and the label ``'MLP<lag_order>'``
        """
        data, target_scaler, _econ_scaler = self.load_data(vintage, window=window, **kwargs)
        data = self.lag_data(data, lag_order)
        features = data.columns.drop('target')

        # Training sample: complete rows up to three months before the vintage
        train = data.loc[: vintage - relativedelta(months=3), :].dropna()
        X_train = train.loc[:, features]
        y_train = train.loc[:, 'target']

        # NOTE: relativedelta(month=3) is absolute (it sets the month to March), so the test
        # slice starts in March of the vintage year - presumably that year's Q1; confirm.
        X_test = data.loc[vintage + relativedelta(month=3) :, features]

        # Fixed seed keeps the fit reproducible
        model = MLPRegressor(random_state=42)
        model.fit(X_train, y_train)
        self.prefix = f'MLP{lag_order}'    # Override class name

        predictions = []
        for _, row in X_test.iterrows():
            if row.isnull().values.any():
                predictions.append(np.nan)
            else:
                predictions.append(model.predict(row.to_frame().T)[0])
        scaled = np.array(predictions).reshape(-1,1)
        nowcasts = list(target_scaler.inverse_transform(scaled).flatten())
        model_desc = f'MLP{lag_order}'

        return nowcasts, model_desc

In [None]:
# Sample run: MLP on tweet metrics only
target = 'GDP'
kmpair = {}
window = 37
# lag_order = target_lag, tweet_lag, econ_lag
model = NowcastingMLP(lag_order=(4,1,0), kmpair=kmpair, target=target)
summary = model.run(
    window=window,
    save_aggregate=True,
    with_econ=False,
    with_tweets=True,
    multiprocess=processes,
)
# To reuse a saved run instead of refitting (path below is an Elastic Net example):
# summary = pd.read_csv('Results/ENet(2, 1, 0)_W25_TE_summary.csv', parse_dates=['date'])

# Tweet metrics from 2017 onwards, with the period index converted to timestamps for plotting
tweets = model.load_tweets('2023-01-01', kmpair=kmpair, window=window+72)
tweets = tweets.loc[dt.datetime(2017,1,1):,:]
tweets.index = tweets.index.to_timestamp()

fig, axs = plt.subplots(3, 1, figsize=(10, 10), sharex=True)
# Top two panels: nowcast markers against the actual series
growth_panels = (
    ('Nowcast_A', 'Actual_A', 'Annual GDP Growth'),
    ('Nowcast_Q', 'Actual_Q', 'Quarter GDP Growth'),
)
for ax, (nowcast_col, actual_col, title) in zip(axs, growth_panels):
    ax.plot(summary['date'], summary[nowcast_col], linewidth=0, marker='*', label='Nowcast')
    ax.plot(summary['date'], summary[actual_col], label='Actual')
    ax.legend()
    ax.set_title(title)
# Bottom panel: one line per tweet metric
metrics_ax = axs[2]
for metric in tweets.columns:
    metrics_ax.plot(tweets.index, tweets[metric], label=metric, alpha=0.5)
metrics_ax.legend()
metrics_ax.set_title('Tweet Metrics')
fig.show()
summary

## SVR

In [None]:
class NowcastingSVR(NowcastingPH):
    """Support vector regression nowcaster: fits ``SVR`` on lagged data from ``self.load_data``.

    ``lag_order`` is the tuple ``(target_lag, tweet_lag, econ_lag)``.
    """

    def set_classname(self, **kwargs):
        """Set ``self.prefix`` from the ``lag_order`` entry of ``self.kwargs``."""
        order = self.kwargs.get("lag_order")
        self.prefix = f'SVR{order}'    # Override class name

    def lag_data(self, df, lag_order):
        """Return ``df`` widened with lagged copies of its columns.

        Lags 1..target_lag of ``'target'``, 1..tweet_lag of every column whose name contains
        ``'TWT'`` and 1..econ_lag of every column whose name contains ``'ECN'`` are appended
        with a ``.Q<lag>`` suffix. Columns whose values repeat an earlier column are dropped.
        """
        target_lag, tweet_lag, econ_lag = lag_order
        groups = (
            (['target'], target_lag),
            ([col for col in df.columns if 'TWT' in col], tweet_lag),
            ([col for col in df.columns if 'ECN' in col], econ_lag),
        )

        frames = [df]
        for columns, depth in groups:
            for lag in range(1, depth + 1):
                frames.append(df[columns].shift(lag).add_suffix(f'.Q{lag}'))

        widened = pd.concat(frames, axis=1)
        repeated = widened.T.duplicated(keep='first')
        return widened.loc[:, ~repeated]

    def fit_model(self, vintage, window, lag_order, **kwargs):
        """Fit the model for one vintage and predict every row of the test slice.

        :param vintage: vintage date used to split training and test rows
        :param window: forwarded to ``self.load_data`` as ``window``
        :param lag_order: ``(target_lag, tweet_lag, econ_lag)`` passed to ``lag_data``
        :param kwargs: forwarded unchanged to ``self.load_data``
        :return: ``(nowcasts, model_desc)`` - inverse-scaled predictions (``nan`` for test rows
            with any missing feature) and the label ``'SVR<lag_order>'``
        """
        data, target_scaler, _econ_scaler = self.load_data(vintage, window=window, **kwargs)
        data = self.lag_data(data, lag_order)
        features = data.columns.drop('target')

        # Training sample: complete rows up to three months before the vintage
        train = data.loc[: vintage - relativedelta(months=3), :].dropna()
        X_train = train.loc[:, features]
        y_train = train.loc[:, 'target']

        # NOTE: relativedelta(month=3) is absolute (it sets the month to March), so the test
        # slice starts in March of the vintage year - presumably that year's Q1; confirm.
        X_test = data.loc[vintage + relativedelta(month=3) :, features]

        model = SVR()
        model.fit(X_train, y_train)
        self.prefix = f'SVR{lag_order}'    # Override class name

        predictions = []
        for _, row in X_test.iterrows():
            if row.isnull().values.any():
                predictions.append(np.nan)
            else:
                predictions.append(model.predict(row.to_frame().T)[0])
        scaled = np.array(predictions).reshape(-1,1)
        nowcasts = list(target_scaler.inverse_transform(scaled).flatten())
        model_desc = f'SVR{lag_order}'

        return nowcasts, model_desc

In [None]:
# Sample run: SVR on tweet metrics only
target = 'GDP'
kmpair = {}
window = 37
# lag_order = target_lag, tweet_lag, econ_lag
model = NowcastingSVR(lag_order=(4,1,0), kmpair=kmpair, target=target)
summary = model.run(
    window=window,
    save_aggregate=True,
    with_econ=False,
    with_tweets=True,
    multiprocess=processes,
)
# To reuse a saved run instead of refitting (path below is an Elastic Net example):
# summary = pd.read_csv('Results/ENet(2, 1, 0)_W25_TE_summary.csv', parse_dates=['date'])

# Tweet metrics from 2017 onwards, with the period index converted to timestamps for plotting
tweets = model.load_tweets('2023-01-01', kmpair=kmpair, window=window+72)
tweets = tweets.loc[dt.datetime(2017,1,1):,:]
tweets.index = tweets.index.to_timestamp()

fig, axs = plt.subplots(3, 1, figsize=(10, 10), sharex=True)
# Top two panels: nowcast markers against the actual series
growth_panels = (
    ('Nowcast_A', 'Actual_A', 'Annual GDP Growth'),
    ('Nowcast_Q', 'Actual_Q', 'Quarter GDP Growth'),
)
for ax, (nowcast_col, actual_col, title) in zip(axs, growth_panels):
    ax.plot(summary['date'], summary[nowcast_col], linewidth=0, marker='*', label='Nowcast')
    ax.plot(summary['date'], summary[actual_col], label='Actual')
    ax.legend()
    ax.set_title(title)
# Bottom panel: one line per tweet metric
metrics_ax = axs[2]
for metric in tweets.columns:
    metrics_ax.plot(tweets.index, tweets[metric], label=metric, alpha=0.5)
metrics_ax.legend()
metrics_ax.set_title('Tweet Metrics')
fig.show()
summary

## Extra Trees

In [None]:
class NowcastingExtraTree(NowcastingPH):
    """Extra-trees nowcaster: fits ``ExtraTreesRegressor`` on lagged data from ``self.load_data``.

    ``lag_order`` is the tuple ``(target_lag, tweet_lag, econ_lag)``.
    """

    def set_classname(self, **kwargs):
        """Set ``self.prefix`` from the ``lag_order`` entry of ``self.kwargs``."""
        order = self.kwargs.get("lag_order")
        self.prefix = f'ExtraTrees{order}'    # Override class name

    def lag_data(self, df, lag_order):
        """Return ``df`` widened with lagged copies of its columns.

        Lags 1..target_lag of ``'target'``, 1..tweet_lag of every column whose name contains
        ``'TWT'`` and 1..econ_lag of every column whose name contains ``'ECN'`` are appended
        with a ``.Q<lag>`` suffix. Columns whose values repeat an earlier column are dropped.
        """
        target_lag, tweet_lag, econ_lag = lag_order
        groups = (
            (['target'], target_lag),
            ([col for col in df.columns if 'TWT' in col], tweet_lag),
            ([col for col in df.columns if 'ECN' in col], econ_lag),
        )

        frames = [df]
        for columns, depth in groups:
            for lag in range(1, depth + 1):
                frames.append(df[columns].shift(lag).add_suffix(f'.Q{lag}'))

        widened = pd.concat(frames, axis=1)
        repeated = widened.T.duplicated(keep='first')
        return widened.loc[:, ~repeated]

    def fit_model(self, vintage, window, lag_order, **kwargs):
        """Fit the model for one vintage and predict every row of the test slice.

        :param vintage: vintage date used to split training and test rows
        :param window: forwarded to ``self.load_data`` as ``window``
        :param lag_order: ``(target_lag, tweet_lag, econ_lag)`` passed to ``lag_data``
        :param kwargs: forwarded unchanged to ``self.load_data``
        :return: ``(nowcasts, model_desc)`` - inverse-scaled predictions (``nan`` for test rows
            with any missing feature) and the label ``'ExtraTrees<lag_order>'``
        """
        data, target_scaler, _econ_scaler = self.load_data(vintage, window=window, **kwargs)
        data = self.lag_data(data, lag_order)
        features = data.columns.drop('target')

        # Training sample: complete rows up to three months before the vintage
        train = data.loc[: vintage - relativedelta(months=3), :].dropna()
        X_train = train.loc[:, features]
        y_train = train.loc[:, 'target']

        # NOTE: relativedelta(month=3) is absolute (it sets the month to March), so the test
        # slice starts in March of the vintage year - presumably that year's Q1; confirm.
        X_test = data.loc[vintage + relativedelta(month=3) :, features]

        # Fixed seed keeps the fit reproducible
        model = ExtraTreesRegressor(random_state=42)
        model.fit(X_train, y_train)
        self.prefix = f'ExtraTrees{lag_order}'    # Override class name

        predictions = []
        for _, row in X_test.iterrows():
            if row.isnull().values.any():
                predictions.append(np.nan)
            else:
                predictions.append(model.predict(row.to_frame().T)[0])
        scaled = np.array(predictions).reshape(-1,1)
        nowcasts = list(target_scaler.inverse_transform(scaled).flatten())
        model_desc = f'ExtraTrees{lag_order}'

        return nowcasts, model_desc

In [None]:
# Sample run: Extra Trees on tweet metrics only
target = 'GDP'
kmpair = {}
window = 37
# lag_order = target_lag, tweet_lag, econ_lag
model = NowcastingExtraTree(lag_order=(4,1,0), kmpair=kmpair, target=target)
summary = model.run(
    window=window,
    save_aggregate=True,
    with_econ=False,
    with_tweets=True,
    multiprocess=processes,
)
# To reuse a saved run instead of refitting (path below is an Elastic Net example):
# summary = pd.read_csv('Results/ENet(2, 1, 0)_W25_TE_summary.csv', parse_dates=['date'])

# Tweet metrics from 2017 onwards, with the period index converted to timestamps for plotting
tweets = model.load_tweets('2023-01-01', kmpair=kmpair, window=window+72)
tweets = tweets.loc[dt.datetime(2017,1,1):,:]
tweets.index = tweets.index.to_timestamp()

fig, axs = plt.subplots(3, 1, figsize=(10, 10), sharex=True)
# Top two panels: nowcast markers against the actual series
growth_panels = (
    ('Nowcast_A', 'Actual_A', 'Annual GDP Growth'),
    ('Nowcast_Q', 'Actual_Q', 'Quarter GDP Growth'),
)
for ax, (nowcast_col, actual_col, title) in zip(axs, growth_panels):
    ax.plot(summary['date'], summary[nowcast_col], linewidth=0, marker='*', label='Nowcast')
    ax.plot(summary['date'], summary[actual_col], label='Actual')
    ax.legend()
    ax.set_title(title)
# Bottom panel: one line per tweet metric
metrics_ax = axs[2]
for metric in tweets.columns:
    metrics_ax.plot(tweets.index, tweets[metric], label=metric, alpha=0.5)
metrics_ax.legend()
metrics_ax.set_title('Tweet Metrics')
fig.show()
summary

## XGBoost

In [12]:
class NowcastingXGBoost(NowcastingPH):
    """XGBoost nowcaster: fits ``XGBRegressor`` on lagged data from ``self.load_data``.

    ``lag_order`` is the tuple ``(target_lag, tweet_lag, econ_lag)``.
    """

    def set_classname(self, **kwargs):
        """Set ``self.prefix`` from the ``lag_order`` entry of ``self.kwargs``."""
        order = self.kwargs.get("lag_order")
        self.prefix = f'XGBoost{order}'    # Override class name

    def lag_data(self, df, lag_order):
        """Return ``df`` widened with lagged copies of its columns.

        Lags 1..target_lag of ``'target'``, 1..tweet_lag of every column whose name contains
        ``'TWT'`` and 1..econ_lag of every column whose name contains ``'ECN'`` are appended
        with a ``.Q<lag>`` suffix. Columns whose values repeat an earlier column are dropped.
        """
        target_lag, tweet_lag, econ_lag = lag_order
        groups = (
            (['target'], target_lag),
            ([col for col in df.columns if 'TWT' in col], tweet_lag),
            ([col for col in df.columns if 'ECN' in col], econ_lag),
        )

        frames = [df]
        for columns, depth in groups:
            for lag in range(1, depth + 1):
                frames.append(df[columns].shift(lag).add_suffix(f'.Q{lag}'))

        widened = pd.concat(frames, axis=1)
        repeated = widened.T.duplicated(keep='first')
        return widened.loc[:, ~repeated]

    def fit_model(self, vintage, window, lag_order, **kwargs):
        """Fit the model for one vintage and predict every row of the test slice.

        :param vintage: vintage date used to split training and test rows
        :param window: forwarded to ``self.load_data`` as ``window``
        :param lag_order: ``(target_lag, tweet_lag, econ_lag)`` passed to ``lag_data``
        :param kwargs: forwarded unchanged to ``self.load_data``
        :return: ``(nowcasts, model_desc)`` - inverse-scaled predictions (``nan`` for test rows
            with any missing feature) and the label ``'XGBoost<lag_order>'``
        """
        data, target_scaler, _econ_scaler = self.load_data(vintage, window=window, **kwargs)
        data = self.lag_data(data, lag_order)
        features = data.columns.drop('target')

        # Training sample: complete rows up to three months before the vintage
        train = data.loc[: vintage - relativedelta(months=3), :].dropna()
        X_train = train.loc[:, features]
        y_train = train.loc[:, 'target']

        # NOTE: relativedelta(month=3) is absolute (it sets the month to March), so the test
        # slice starts in March of the vintage year - presumably that year's Q1; confirm.
        X_test = data.loc[vintage + relativedelta(month=3) :, features]

        model = XGBRegressor(objective='reg:squarederror', n_estimators=1000, nthread=1)
        model.fit(X_train, y_train)
        self.prefix = f'XGBoost{lag_order}'    # Override class name

        predictions = []
        for _, row in X_test.iterrows():
            if row.isnull().values.any():
                predictions.append(np.nan)
            else:
                predictions.append(model.predict(row.to_frame().T)[0])
        scaled = np.array(predictions).reshape(-1,1)
        nowcasts = list(target_scaler.inverse_transform(scaled).flatten())
        model_desc = f'XGBoost{lag_order}'

        return nowcasts, model_desc

In [None]:
# Sample run: XGBoost on tweet metrics only
target='GDP'
kmpair = {}
window = 25
model = NowcastingXGBoost(lag_order=(4,1,0), kmpair=kmpair, target=target) # lag_order = target_lag, tweet_lag, econ_lag
summary = model.run(window=window, save_aggregate=True, with_econ=False, with_tweets=True, multiprocess=processes)
# To reuse a saved run instead of refitting (path below is an Elastic Net example):
# summary = pd.read_csv('Results/ENet(2, 1, 0)_W25_TE_summary.csv', parse_dates=['date'])

# Load window+72 months like the other run cells: with window=25 alone the history loaded for
# 2023-01 cannot reach back to the 2017 start of the slice below, truncating the bottom panel.
tweets = model.load_tweets('2023-01-01', kmpair=kmpair, window=window+72).loc[dt.datetime(2017,1,1):,:]
tweets.index = tweets.index.to_timestamp()

fig, axs = plt.subplots(3, 1, figsize=(10, 10), sharex=True)
# Top two panels: nowcast markers against the actual series
growth_panels = (
    ('Nowcast_A', 'Actual_A', 'Annual GDP Growth'),
    ('Nowcast_Q', 'Actual_Q', 'Quarter GDP Growth'),
)
for ax, (nowcast_col, actual_col, title) in zip(axs, growth_panels):
    ax.plot(summary['date'], summary[nowcast_col], linewidth=0, marker='*', label='Nowcast')
    ax.plot(summary['date'], summary[actual_col], label='Actual')
    ax.legend()
    ax.set_title(title)
# Bottom panel: one line per tweet metric
for metric in tweets.columns:
    axs[2].plot(tweets.index, tweets[metric], label=metric, alpha=0.5)
axs[2].legend()
axs[2].set_title('Tweet Metrics')
# plt.show() is the supported way to render with the inline backend; Figure.show() needs a GUI backend
plt.show()
summary

## 3PRF

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression as LR

class TPRF():
    """Three-pass regression filter (3PRF) forecaster with automatically built proxies."""

    def __init__(self, **kwargs):
        # Stored verbatim; no method of this class reads them.
        self.kwargs = kwargs

    def autoproxy(self, X, y, n_proxy):
        """
        Use the autoproxy algorithm for calculating the proxies,
        given an array of predictors and corresponding target values

        :param X: Array of predictors (Shape: T x N)
        :param y: Array of returns (target values) (Shape: T x 1)
        :param n_proxy: number of proxies to be calculated
        :return r0: Array of proxies (Shape: T x n_proxy for n_proxy >= 1); the newest
        residual proxy is the first column and ``y`` itself the last
        """
        r0 = np.array(y)
        for _ in range(n_proxy - 1):
            # The out-of-sample forecast is not needed while building proxies.
            yhat, _ = self.tprf(X, y, r0, False)
            # The residual of the current in-sample fit becomes the next proxy.
            r0 = np.hstack([y - yhat, r0])
        return r0

    def tprf(self, X, y, Z, oos_present, oos=None):
        """
        Computes the returns (y) based on the set of predictors
        (X) and proxies (Z) using the three pass regression filter method

        :param X: Array of predictors (Shape: T x N, where
        T = number of timestamps in the training set,
        N = number of predictors)
        :param y: Array of returns (Shape: T x 1 or T, where
        T = number of timestamps in the training set)
        :param Z: Array of proxies (Shape: T x L, where
        T = number of timestamps in the training set,
        L = number of proxies)
        :param oos_present: True if out of sample data is present (oos),
        False otherwise
        :param oos: Out of sample data (predictors)(Shape: N); required when
        oos_present is True
        :return yhat: Forecasted returns for in sample data (same leading shape as y)
        :return yhatt: Forecasted return for the out of sample array of predictors,
        or nan when oos_present is False
        :raises ValueError: if oos_present is True but no oos data is given
        """
        if oos_present and oos is None:
            raise ValueError("oos must be provided when oos_present is True")

        n_obs, n_predictors = X.shape
        n_proxies = Z.shape[1]

        # Pass 1 (Dependent - Value of predictor i across given time intervals,
        # Independent - Set of proxies).
        # One multi-target fit gives one coefficient row per predictor (N x L).
        phi = LR().fit(X=Z, y=X).coef_.reshape(n_predictors, n_proxies)

        # Pass 2 (Dependent - Cross section of predictor values at time t,
        # Independent - phi (from Pass 1)).
        # One multi-target fit gives one coefficient row per timestamp (T x L).
        sigma = LR().fit(X=phi, y=X.T).coef_.reshape(n_obs, n_proxies)

        # Pass 3 (Dependent - Array of returns, Independent - sigma (from Pass 2))
        third_pass_model = LR().fit(X=sigma, y=y)
        coeff, intercept = (third_pass_model.coef_, third_pass_model.intercept_)
        yhat = np.dot(sigma, coeff.T) + intercept

        # If out of sample set of predictors is present, compute the forecasted
        # return by running the second pass with out of sample predictors as the
        # dependent variable, and multiplying the resultant sigma with beta (coeff) from
        # the previous third pass and adding the intercept
        yhatt = np.nan
        if oos_present:
            oos_sigma = LR().fit(X=phi, y=oos).coef_.flatten()
            yhatt = np.dot(oos_sigma, coeff.T) + intercept
        return yhat, yhatt

    def predict(self, X_train, y_train, X_test_, n_proxies=3, **kwargs):
        """
        Forecast the target for one out of sample row of predictors

        :param X_train: DataFrame of training predictors (T x N)
        :param y_train: Series of training targets (length T)
        :param X_test_: one-row DataFrame holding the out of sample predictors (1 x N)
        :param n_proxies: number of automatic proxies to build
        :param kwargs: accepted for call compatibility; not used
        :return yhatt: Forecasted target for the out of sample row
        """
        train = X_train.to_numpy()
        # Proxies are built from the unscaled training predictors.
        Z = self.autoproxy(train, y_train.to_numpy().reshape(-1, 1), n_proxies)

        # Scale train and test with the SAME per-predictor training std. Dividing the test
        # row by the std taken across its own feature values would put it on a different scale.
        scale = np.std(train, axis=0)
        # A constant predictor has zero std; leave it unscaled instead of dividing by zero.
        scale = np.where(scale == 0, 1.0, scale)
        train_scaled = train / scale
        test_scaled = X_test_.to_numpy().flatten() / scale

        yhat, yhatt = self.tprf(train_scaled, y_train, Z, True, test_scaled)
        return yhatt

In [19]:
class Nowcasting3PRF(NowcastingPH):
    """Three-pass regression filter nowcaster: runs ``TPRF`` on lagged data from ``self.load_data``.

    ``lag_order`` is the tuple ``(target_lag, tweet_lag, econ_lag)``.
    """

    def set_classname(self, **kwargs):
        """Set ``self.prefix`` from the ``lag_order`` entry of ``self.kwargs``."""
        order = self.kwargs.get("lag_order")
        self.prefix = f'TPRF{order}'    # Override class name

    def lag_data(self, df, lag_order):
        """Return ``df`` widened with lagged copies of its columns.

        Lags 1..target_lag of ``'target'``, 1..tweet_lag of every column whose name contains
        ``'TWT'`` and 1..econ_lag of every column whose name contains ``'ECN'`` are appended
        with a ``.Q<lag>`` suffix. Columns whose values repeat an earlier column are dropped.
        """
        target_lag, tweet_lag, econ_lag = lag_order
        groups = (
            (['target'], target_lag),
            ([col for col in df.columns if 'TWT' in col], tweet_lag),
            ([col for col in df.columns if 'ECN' in col], econ_lag),
        )

        frames = [df]
        for columns, depth in groups:
            for lag in range(1, depth + 1):
                frames.append(df[columns].shift(lag).add_suffix(f'.Q{lag}'))

        widened = pd.concat(frames, axis=1)
        repeated = widened.T.duplicated(keep='first')
        return widened.loc[:, ~repeated]

    def fit_model(self, vintage, window, lag_order, **kwargs):
        """Run the filter for one vintage and predict every row of the test slice.

        :param vintage: vintage date used to split training and test rows
        :param window: forwarded to ``self.load_data`` as ``window``
        :param lag_order: ``(target_lag, tweet_lag, econ_lag)`` passed to ``lag_data``
        :param kwargs: forwarded unchanged to ``self.load_data``
        :return: ``(nowcasts, model_desc)`` - inverse-scaled predictions (``nan`` for test rows
            with any missing feature) and the label ``'TPRF<lag_order>'``
        """
        data, target_scaler, _econ_scaler = self.load_data(vintage, window=window, **kwargs)
        data = self.lag_data(data, lag_order)
        features = data.columns.drop('target')

        # Training sample: complete rows up to three months before the vintage
        train = data.loc[: vintage - relativedelta(months=3), :].dropna()
        X_train = train.loc[:, features]
        y_train = train.loc[:, 'target']

        # NOTE: relativedelta(month=3) is absolute (it sets the month to March), so the test
        # slice starts in March of the vintage year - presumably that year's Q1; confirm.
        X_test = data.loc[vintage + relativedelta(month=3) :, features]

        # TPRF has no separate fit step: the training data is passed with every prediction
        model = TPRF()
        self.prefix = f'TPRF{lag_order}'    # Override class name

        predictions = []
        for _, row in X_test.iterrows():
            if row.isnull().values.any():
                predictions.append(np.nan)
            else:
                predictions.append(model.predict(X_train, y_train, row.to_frame().T))
        scaled = np.array(predictions).reshape(-1,1)
        nowcasts = list(target_scaler.inverse_transform(scaled).flatten())
        model_desc = f'TPRF{lag_order}'

        return nowcasts, model_desc

    def load_tweets(self, vintage, window, kmpair, freq='M', **kwargs):
        """Load, window and standardize the tweet metrics from ``data/PH_Tweets_v4.csv``.

        :param vintage: anything ``pd.to_datetime`` accepts; last date kept
        :param window: months of history kept, counted back from the first month of the
            vintage's quarter
        :param kmpair: mapping keyword -> list of metric columns; when empty, every keyword in
            the file is used with all columns other than ``'keyword'``
        :param freq: frequency of the ``PeriodIndex`` put on the result
        :param kwargs: accepted for call compatibility; not used
        :return: DataFrame of standardized metrics with columns named ``<metric>_<keyword>``
        """
        vintage = pd.to_datetime(vintage)
        raw = pd.read_csv('data/PH_Tweets_v4.csv')
        # Move every date to the end of its month before indexing on it
        raw['date'] = pd.to_datetime(raw['date']) + pd.offsets.MonthEnd(0)
        raw = raw.set_index('date')

        if len(kmpair) == 0:
            kmpair = {keyword: list(raw.columns.drop('keyword')) for keyword in raw['keyword'].unique()}

        # One frame per keyword, then an outer join of all of them on the date index
        per_keyword = []
        for keyword in kmpair.keys():
            rows = raw[raw['keyword'] == keyword]
            per_keyword.append(rows[kmpair[keyword]].add_suffix(f'_{keyword}'))

        def outer_join(left, right):
            return pd.merge(left, right, on='date', how='outer', sort=True)
        tweets = reduce(outer_join, per_keyword)

        # Alternative fixed start: dt.datetime(2010,1,1)
        months_back = (vintage.month - 1) % 3 + window
        tweets = tweets.loc[vintage - relativedelta(months=months_back) : vintage, :]
        tweets.index = pd.PeriodIndex(tweets.index, freq=freq)

        # NOTE(review): count(col) > 1 is only true when a label occurs more than once among the
        # columns, and the labels above carry a '_<keyword>' suffix - confirm this branch is reachable.
        labels = list(tweets.columns)
        for col in ['C_00_PE', 'L_00_PE', 'R_00_PE', 'C_00_PU+', 'L_00_PU+', 'R_00_PU+']:
            if labels.count(col) <= 1:
                continue
            tweets[col] = tweets[col].clip(lower=1)
            tweets[col] = tweets[col].pct_change()
        tweets.loc[:,:] = StandardScaler().fit_transform(tweets)

        ## PCA (disabled)
        # tweets_std = StandardScaler().fit_transform(tweets.values)
        # tweets_pca = PCA(n_components=self.kwargs.get("n_components")).fit_transform(tweets_std)
        # tweets = pd.DataFrame(tweets_pca, index=tweets.index)

        return tweets

In [None]:
# Sample run: 3PRF on tweet metrics only, without lags
target = 'GDP'
kmpair = {}
window=13
# lag_order = target_lag, tweet_lag, econ_lag
model = Nowcasting3PRF(lag_order=(0,0,0), kmpair=kmpair, target=target)
summary = model.run(
    window=window,
    save_aggregate=True,
    with_econ=False,
    with_tweets=True,
    multiprocess=processes,
)
# To reuse a saved run instead of refitting (path below is an Elastic Net example):
# summary = pd.read_csv('Results/ENet(2, 1, 0)_W25_TE_summary.csv', parse_dates=['date'])

# Tweet metrics from 2017 onwards, with the period index converted to timestamps for plotting
tweets = model.load_tweets('2023-01-01', kmpair=kmpair, window=window+72)
tweets = tweets.loc[dt.datetime(2017,1,1):,:]
tweets.index = tweets.index.to_timestamp()

fig, axs = plt.subplots(3, 1, figsize=(10, 10), sharex=True)
# Top two panels: nowcast markers against the actual series
growth_panels = (
    ('Nowcast_A', 'Actual_A', 'Annual GDP Growth'),
    ('Nowcast_Q', 'Actual_Q', 'Quarter GDP Growth'),
)
for ax, (nowcast_col, actual_col, title) in zip(axs, growth_panels):
    ax.plot(summary['date'], summary[nowcast_col], linewidth=0, marker='*', label='Nowcast')
    ax.plot(summary['date'], summary[actual_col], label='Actual')
    ax.legend()
    ax.set_title(title)
# Bottom panel: one line per tweet metric
metrics_ax = axs[2]
for metric in tweets.columns:
    metrics_ax.plot(tweets.index, tweets[metric], label=metric, alpha=0.5)
metrics_ax.legend(loc=1)
metrics_ax.set_title('Tweet Metrics - SD')
fig.show()
summary

## Adaptive Lasso

In [21]:
class NowcastingALAS(NowcastingPH):
    """Adaptive-lasso nowcaster: fits an ``asgl`` model on lagged data from ``self.load_data``.

    ``lag_order`` is the tuple ``(target_lag, tweet_lag, econ_lag)``.
    """

    def set_classname(self, **kwargs):
        """Set ``self.prefix`` from the ``lag_order`` entry of ``self.kwargs``."""
        self.prefix = f'ALAS{self.kwargs.get("lag_order")}'    # Override class name

    def lag_data(self, df, lag_order):
        """Return ``df`` widened with lagged copies of its columns.

        Lags 1..target_lag of ``'target'``, 1..tweet_lag of every column whose name contains
        ``'TWT'`` and 1..econ_lag of every column whose name contains ``'ECN'`` are appended
        with a ``.Q<lag>`` suffix. Columns whose values repeat an earlier column are dropped.
        """
        target_lag, tweet_lag, econ_lag = lag_order
        lagged_df = ([df] + [df[['target']].shift(l).add_suffix(f'.Q{l}') for l in range(1, target_lag + 1)] +
                        [df[[col for col in df.columns if 'TWT' in col]].shift(l).add_suffix(f'.Q{l}') for l in range(1, tweet_lag + 1)] +
                        [df[[col for col in df.columns if 'ECN' in col]].shift(l).add_suffix(f'.Q{l}') for l in range(1, econ_lag + 1)])
        df = pd.concat(lagged_df, axis=1)
        df = df.loc[:, ~df.T.duplicated(keep='first')]

        return df

    def fit_model(self, vintage, window, lag_order, **kwargs):
        """Fit the adaptive lasso for one vintage and predict every row of the test slice.

        :param vintage: vintage date used to split training and test rows
        :param window: forwarded to ``self.load_data`` as ``window``
        :param lag_order: ``(target_lag, tweet_lag, econ_lag)`` passed to ``lag_data``
        :param kwargs: forwarded unchanged to ``self.load_data``
        :return: ``(nowcasts, model_desc)`` - inverse-scaled predictions (``nan`` for test rows
            with any missing feature) and the label ``'ALAS<lag_order>'``
        """
        # No notebook cell imports asgl, so import it here; otherwise this method fails with
        # NameError on a fresh kernel.
        # NOTE(review): WEIGHTS / ASGL look like the asgl 1.x API - confirm the installed version.
        import asgl

        df, target_scaler, econ_scaler = self.load_data(vintage, window=window, **kwargs)
        df = self.lag_data(df, lag_order)

        # NOTE: relativedelta(month=3) is absolute (it sets the month to March), so the test
        # slice starts in March of the vintage year - presumably that year's Q1; confirm.
        X_test = df.loc[vintage + relativedelta(month=3) :, df.columns.drop('target')]
        # Training sample: complete rows up to three months before the vintage
        df_train = df.loc[: vintage - relativedelta(months=3), :].dropna()
        X_train = df_train.loc[:, df_train.columns.drop('target')]
        y_train = df_train.loc[:, 'target']

        # Step 1: compute the adaptive weights; step 2: fit the weighted lasso with them
        weights = asgl.WEIGHTS(penalization='alasso', weight_technique='pls_pct', lasso_power_weight=[1.2],
                     gl_power_weight=[1.2], variability_pct=0.5)
        lasso_weights, gl_weights = weights.fit(x=X_train.to_numpy(), y=y_train.to_numpy())
        model = asgl.ASGL(model='lm', penalization='alasso', lambda1 = [0.1], lasso_weights = lasso_weights)
        model.fit(x=X_train.to_numpy(),y=y_train.to_numpy())
        self.prefix = f'ALAS{lag_order}'    # Override class name

        nowcasts = [(model.predict(x_new=X_test_.to_frame().T.to_numpy()) if not X_test_.isnull().values.any() else np.nan) for _, X_test_ in X_test.iterrows()]
        # Each prediction is un-scaled on its own; rows skipped above stay nan
        nowcasts = [(target_scaler.inverse_transform(nowcast).flatten().item() if not np.isnan(nowcast) else np.nan) for nowcast in nowcasts]
        model_desc = f'ALAS{lag_order}'

        return nowcasts, model_desc

    def load_tweets(self, vintage, window, kmpair, freq='M', **kwargs):
        """Load, window and standardize the tweet metrics from ``data/PH_Tweets_v3.csv``.

        :param vintage: anything ``pd.to_datetime`` accepts; last date kept
        :param window: months of history kept, counted back from the first month of the
            vintage's quarter
        :param kmpair: mapping keyword -> list of metric columns; when empty, every keyword in
            the file is used with all columns other than ``'keyword'``
        :param freq: frequency of the ``PeriodIndex`` put on the result
        :param kwargs: accepted for call compatibility; not used
        :return: DataFrame of standardized metrics with columns named ``<metric>_<keyword>``
        """
        vintage = pd.to_datetime(vintage)
        tweets = pd.read_csv('data/PH_Tweets_v3.csv')
        # Move every date to the end of its month before indexing on it
        tweets['date'] = pd.to_datetime(tweets['date']) + pd.offsets.MonthEnd(0)
        tweets = tweets.set_index('date')

        if len(kmpair) == 0:
            kmpair = {keyword: list(tweets.columns.drop('keyword')) for keyword in tweets['keyword'].unique()}
        # One frame per keyword, then an outer join of all of them on the date index
        data = [tweets[tweets['keyword'] == keyword][kmpair[keyword]].add_suffix(f'_{keyword}') for keyword in kmpair.keys()]
        tweets = reduce(lambda left, right: pd.merge(left, right, on='date', how='outer', sort=True), data)

        # Alternative fixed start: dt.datetime(2010,1,1)
        months_back = (vintage.month - 1) % 3 + window
        tweets = tweets.loc[vintage - relativedelta(months=months_back) : vintage, :]
        tweets.index = pd.PeriodIndex(tweets.index, freq=freq)

        tweets.loc[:,:] = StandardScaler().fit_transform(tweets)

        return tweets

In [None]:
# Sample run: adaptive lasso on tweet metrics only, without lags
target = 'GDP'
kmpair = {}
window=13
# lag_order = target_lag, tweet_lag, econ_lag
model = NowcastingALAS(lag_order=(0,0,0), kmpair=kmpair, target=target)
summary = model.run(
    window=window,
    save_aggregate=True,
    with_econ=False,
    with_tweets=True,
    multiprocess=processes,
)
# To reuse a saved run instead of refitting (path below is an Elastic Net example):
# summary = pd.read_csv('Results/ENet(2, 1, 0)_W25_TE_summary.csv', parse_dates=['date'])

# Tweet metrics from 2017 onwards, with the period index converted to timestamps for plotting
tweets = model.load_tweets('2023-01-01', kmpair=kmpair, window=window+72)
tweets = tweets.loc[dt.datetime(2017,1,1):,:]
tweets.index = tweets.index.to_timestamp()

fig, axs = plt.subplots(3, 1, figsize=(10, 10), sharex=True)
# Top two panels: nowcast markers against the actual series
growth_panels = (
    ('Nowcast_A', 'Actual_A', 'Annual GDP Growth'),
    ('Nowcast_Q', 'Actual_Q', 'Quarter GDP Growth'),
)
for ax, (nowcast_col, actual_col, title) in zip(axs, growth_panels):
    ax.plot(summary['date'], summary[nowcast_col], linewidth=0, marker='*', label='Nowcast')
    ax.plot(summary['date'], summary[actual_col], label='Actual')
    ax.legend()
    ax.set_title(title)
# Bottom panel: one line per tweet metric
metrics_ax = axs[2]
for metric in tweets.columns:
    metrics_ax.plot(tweets.index, tweets[metric], label=metric, alpha=0.5)
metrics_ax.legend(loc=1)
metrics_ax.set_title('Tweet Metrics - SD')
fig.show()
summary

## Run all ML models in a loop

In [None]:
## ML models: run every (window, model class) combination with the same lag order
import itertools
# windows = [37, 43, 49, 55, 61, 67]
windows = [37]
models = [NowcastingML, NowcastingMLP, NowcastingSVR, NowcastingExtraTree]
run_params = list(itertools.product(windows, models))
for param in run_params:
    window_size, model_class = param
    model = model_class(lag_order=(4,1,0), target='GDP', kmpair={})
    summary = model.run(
        window=window_size,
        save_aggregate=True,
        with_econ=False,
        with_tweets=True,
    )

## Autoregression

In [None]:
class NowcastingAR(NowcastingPH):
    """Autoregressive benchmark: nowcasts the target from its own lags only."""

    def fit_model(self, vintage, window, AR_order, **kwargs):
        """Fit an ``AutoReg`` model on the target and predict the vintage year's quarters.

        Parameters
        ----------
        vintage : object exposing ``.year``
            Vintage date; its year selects the Q1..Q4 prediction range.
        window :
            Forwarded to ``self.load_data``.
        AR_order :
            Passed to ``AutoReg`` as ``lags`` and used to build the model label.
        **kwargs :
            Forwarded to ``self.load_data``.

        Returns
        -------
        nowcasts : list
            Predictions for ``{year}Q1`` through ``{year}Q4`` after
            ``target_scaler.inverse_transform``.
        model_desc : str
            Label of the fitted model, e.g. ``'AR(1)'`` for ``AR_order=1``.
        """
        df, target_scaler, _ = self.load_data(vintage, window=window, **kwargs)

        model = AutoReg(df['target'].dropna(), lags=AR_order).fit()
        # Derive the label from the requested order so that runs with a
        # different AR_order are not reported (or saved) under the AR(1) name.
        model_desc = f'AR({AR_order})'
        self.prefix = model_desc    # Override class name

        # inverse_transform expects a 2-D column, hence the reshape.
        nowcasts = model.predict(start=f'{vintage.year}Q1', end=f'{vintage.year}Q4').to_numpy().reshape(-1,1)
        nowcasts = list(target_scaler.inverse_transform(nowcasts).flatten())

        return nowcasts, model_desc

In [None]:
# Run the AR benchmark on GDP (AR_order=1, window=25) and plot it against actuals.
model = NowcastingAR(AR_order=1, kmpair={}, target='GDP')
summary = model.run(window=25, save_aggregate=True)
# Alternative: reload a previously saved summary instead of re-running.
# summary = pd.read_csv('Results/AR(1)_W25_TE_summary.csv', parse_dates=['date'])
tweets = model.load_tweets('2023-01-01')
tweets = tweets.loc[dt.datetime(2017,1,1):,:]
tweets.index = tweets.index.to_timestamp()   # period index -> timestamps for plotting

fig, axs = plt.subplots(3, 1, figsize=(10, 10), sharex=True)

# Top two panels: nowcast markers over the actual series, annual then quarterly.
growth_panels = (
    (axs[0], 'Nowcast_A', 'Actual_A', 'Annual GDP Growth'),
    (axs[1], 'Nowcast_Q', 'Actual_Q', 'Quarter GDP Growth'),
)
for panel, nowcast_col, actual_col, panel_title in growth_panels:
    panel.plot(summary['date'], summary[nowcast_col], linewidth=0, marker='*', label='Nowcast')
    panel.plot(summary['date'], summary[actual_col], label='Actual')
    panel.legend()
    panel.set_title(panel_title)

# Bottom panel: one line per tweet metric column.
tweet_panel = axs[2]
for metric in tweets.columns:
    tweet_panel.plot(tweets.index, tweets[metric], label=metric, alpha=0.5)
tweet_panel.legend()
tweet_panel.set_title('Tweet Metrics')
fig.show()
summary

## Run AR models in a loop

In [None]:
## AR models
import itertools   # not used in this cell
# Window sizes 25, 31, ..., 55 (steps of 6).
windows = list(range(25, 56, 6))
for window in windows:
    # A fresh AR(1) model is built for each window size.
    model = NowcastingAR(AR_order=1, target='GDP', kmpair={})
    summary = model.run(window=window, save_aggregate=True)

## Machine Learning x Dynamic Factor Model (do not use)

In [None]:
class NowcastingMLxDFM(NowcastingPH):
    """ElasticNet nowcaster whose input series can first be extended by a dynamic factor model.

    NOTE(review): ``ExtendedDynamicFactor`` and ``OptimizeExtendedDynamicFactor`` are
    referenced by ``extend_data`` but their import appears to be commented out in the
    imports cell; unless they are defined elsewhere, ``extend_data`` raises ``NameError``.
    """

    def extend_data(self, df, vintage, DFM_order, optimize_order=False, **kwargs):
        """Append factor-model predictions to each column of ``df`` up to the vintage's year end.

        Parameters
        ----------
        df : pandas.DataFrame
            Series to extend; each column is extended from its own last non-missing row.
        vintage :
            Date to which ``pd.offsets.YearEnd(0)`` is added to get the prediction end.
        DFM_order : tuple
            ``(factor_order, error_order, k_factors, factor_lag)``.
        optimize_order : bool
            If true, build the model through ``OptimizeExtendedDynamicFactor(...).optimize``
            with the orders treated as maxima; otherwise use ``ExtendedDynamicFactor``.
        **kwargs :
            Forwarded to the model constructor, to ``optimize`` and to ``fit``.

        Returns
        -------
        pandas.DataFrame
            One column per input column (observed rows followed by predicted rows),
            with the index name copied from ``df``.
        """
        factor_order, error_order, k_factors, factor_lag = DFM_order

        if optimize_order:
            model = OptimizeExtendedDynamicFactor(
                endog=df, k_factors_max=k_factors, factor_lag_max=factor_lag, factor_order_max=factor_order,
                error_order=error_order, **kwargs).optimize(**kwargs)
        else:
            model = ExtendedDynamicFactor(
                endog=df, k_factors=k_factors, factor_lag=factor_lag, factor_order=factor_order,
                error_order=error_order, **kwargs)
        results = model.fit(disp=False, maxiter=1000, method='powell', ftol=1e-5, **kwargs)

        horizon_end = vintage + pd.offsets.YearEnd(0)   # loop-invariant prediction end
        extended_cols = []
        for col in df.columns:
            observed = df[[col]].dropna()
            # Prediction starts at the last observed row; .iloc[1:] drops that row
            # so it is not duplicated after the observed values.
            predicted = results.predict(start=observed.index[-1], end=horizon_end)[[col]].iloc[1:]
            extended_cols.append(pd.concat([observed, predicted]))

        # Concatenate once instead of growing a frame inside the loop.
        df_extended = pd.concat(extended_cols, axis=1) if extended_cols else pd.DataFrame()
        df_extended.index.name = df.index.name

        return df_extended

    def load_econ_m(self, vintage, freq='M', extend=False, **kwargs):
        """Load the parent's monthly economic data, optionally extend it, and set a PeriodIndex of ``freq``.

        The parent loader is always called with ``freq='M'``; ``freq`` only controls the
        final index conversion. When ``extend`` is true, ``kwargs`` must carry the
        arguments ``extend_data`` requires (e.g. ``DFM_order``).
        """
        econ_m = super().load_econ_m(vintage, freq='M', **kwargs)
        econ_m = self.extend_data(econ_m, vintage, **kwargs) if extend else econ_m
        econ_m.index = pd.PeriodIndex(econ_m.index, freq=freq)
        return econ_m

    def load_econ_q(self, vintage, freq='Q', extend=False, **kwargs):
        """Load the parent's quarterly economic data, optionally extend it, and set a PeriodIndex of ``freq``.

        The parent loader is always called with ``freq='Q'``; ``freq`` only controls the
        final index conversion.
        """
        econ_q = super().load_econ_q(vintage, freq='Q', **kwargs)
        econ_q = self.extend_data(econ_q, vintage, **kwargs) if extend else econ_q
        econ_q.index = pd.PeriodIndex(econ_q.index, freq=freq)
        return econ_q

    def load_tweets(self, vintage, freq='M', extend=False, **kwargs):
        """Load the parent's tweet data, optionally extend it, and set a PeriodIndex of ``freq``.

        The parent loader is always called with ``freq='M'``; ``freq`` only controls the
        final index conversion.
        """
        tweets = super().load_tweets(vintage, freq='M', **kwargs)
        tweets = self.extend_data(tweets, vintage, **kwargs) if extend else tweets
        tweets.index = pd.PeriodIndex(tweets.index, freq=freq)
        return tweets

    def lag_data(self, df, lag_order):
        """Return ``df`` with lagged copies of the target, tweet and economic columns appended.

        ``lag_order`` is ``(target_lag, tweet_lag, econ_lag)``. Tweet columns are those whose
        name contains ``'TWT'`` and economic columns those containing ``'ECN'``; a lag of
        ``l`` rows gets the suffix ``.Q{l}``.
        """
        target_lag, tweet_lag, econ_lag = lag_order
        tweet_cols = [col for col in df.columns if 'TWT' in col]
        econ_cols = [col for col in df.columns if 'ECN' in col]
        lagged_df = ([df]
                     + [df[['target']].shift(l).add_suffix(f'.Q{l}') for l in range(1, target_lag + 1)]
                     + [df[tweet_cols].shift(l).add_suffix(f'.Q{l}') for l in range(1, tweet_lag + 1)]
                     + [df[econ_cols].shift(l).add_suffix(f'.Q{l}') for l in range(1, econ_lag + 1)])
        df = pd.concat(lagged_df, axis=1)
        # Drop any column whose values duplicate an earlier column.
        df = df.loc[:, ~df.T.duplicated(keep='first')]

        return df

    def fit_model(self, vintage, window, lag_order, DFM_order, optimize_order=False, **kwargs):
        """Fit an ElasticNet on lagged, non-extended data and predict on the extended data.

        Parameters
        ----------
        vintage, window :
            Forwarded to ``self.load_data``; ``vintage`` also fixes the start of the test rows.
        lag_order : tuple
            ``(target_lag, tweet_lag, econ_lag)`` passed to ``lag_data``.
        DFM_order : tuple
            ``(factor_order, error_order, k_factors, factor_lag)`` forwarded via ``load_data``.
        optimize_order : bool
            Forwarded via ``load_data``; also selects the ``DFM_Opt`` label.
        **kwargs :
            Forwarded to ``self.load_data``.

        Returns
        -------
        nowcasts : list
            One prediction per test row; ``np.nan`` for rows containing any missing feature.
        model_desc : str
            Label combining the ElasticNet lag order and the DFM specification.
        """
        # Training set: no extension, rows with any missing value dropped.
        df_train, _, _ = self.load_data(vintage, window=window, scaled=False, **kwargs)
        df_train = self.lag_data(df_train, lag_order).dropna()
        X_train = df_train.loc[:, df_train.columns.drop('target')]
        y_train = df_train.loc[:, 'target']

        # Test set: same pipeline with extend=True.
        df, _, _ = self.load_data(vintage, window=window, scaled=False, extend=True, DFM_order=DFM_order, optimize_order=optimize_order, **kwargs)
        df = self.lag_data(df, lag_order)
        # relativedelta(month=3) (singular) sets the month to March rather than adding
        # three months, so the test rows start in March of the vintage's year.
        X_test = df.loc[vintage + relativedelta(month=3) :, df.columns.drop('target')]

        model = ElasticNet()
        model.fit(X_train, y_train)
        # Build the label once; it is used both as file prefix and as description.
        model_desc = f'ENet{lag_order} x ' + ('DFM_Opt' if optimize_order else f'DFM{DFM_order}')
        self.prefix = model_desc   # Override class name

        nowcasts = [
            (model.predict(features.to_frame().T)[0] if not features.isnull().values.any() else np.nan)
            for _, features in X_test.iterrows()
        ]

        return nowcasts, model_desc

In [None]:
# lag_order = (target_lag, tweet_lag, econ_lag)
# DFM_order = (factor_order, error_order, k_factors, factor_lag)
model = NowcastingMLxDFM(lag_order=(1,0,0), DFM_order=(1,0,1,0), optimize_order=False)
summary = model.run(window=25, save_aggregate=False)
# Alternative: reload a previously saved summary instead of re-running.
# summary = pd.read_csv('Results/ENet(1, 0, 0) x DFM(1, 0, 1, 0)_W25_TE_summary.csv', parse_dates=['date'])
tweets = model.load_tweets('2023-01-01')
tweets = tweets.loc[dt.datetime(2017,1,1):,:]
tweets.index = tweets.index.to_timestamp()   # period index -> timestamps for plotting

fig, axs = plt.subplots(3, 1, figsize=(10, 10), sharex=True)

# Top two panels: nowcast markers over the actual series, annual then quarterly.
growth_panels = (
    (axs[0], 'Nowcast_A', 'Actual_A', 'Annual GDP Growth'),
    (axs[1], 'Nowcast_Q', 'Actual_Q', 'Quarter GDP Growth'),
)
for panel, nowcast_col, actual_col, panel_title in growth_panels:
    panel.plot(summary['date'], summary[nowcast_col], linewidth=0, marker='*', label='Nowcast')
    panel.plot(summary['date'], summary[actual_col], label='Actual')
    panel.legend()
    panel.set_title(panel_title)

# Bottom panel: one line per tweet metric column.
tweet_panel = axs[2]
for metric in tweets.columns:
    tweet_panel.plot(tweets.index, tweets[metric], label=metric, alpha=0.5)
tweet_panel.legend()
tweet_panel.set_title('Tweet Metrics')
fig.show()
summary