# Time Series Model Validation POC

## Import Libraries 

In [None]:
# System libraries
import glob

# ML libraries
import pickle
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import coint
from arch.unitroot import PhillipsPerron, DFGLS
import xgboost as xgb
from numpy import argmax
from sklearn.metrics import accuracy_score, precision_recall_curve
from sklearn.model_selection import train_test_split

# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Load Time Series Models

In [None]:
# Load the first pickled model object from disk.
# NOTE(review): pickle.load executes arbitrary code embedded in the file;
# only load model files that come from a trusted source.
with open('../models/time_series/fred_loan_rates_model_1.pkl', 'rb') as f:
    model_1 = pickle.load(f)
# Print the model summary, matching the cells that load the other models.
print(model_1.summary())

In [None]:
# Deserialize the second model object and print its summary.
with open('../models/time_series/fred_loan_rates_model_2.pkl', 'rb') as model_file:
    model_2 = pickle.load(model_file)

print(model_2.summary())

In [None]:
# Deserialize the third model object and print its summary.
with open('../models/time_series/fred_loan_rates_model_3.pkl', 'rb') as model_file:
    model_3 = pickle.load(model_file)

print(model_3.summary())

In [None]:
# Deserialize the fourth model object and print its summary.
with open('../models/time_series/fred_loan_rates_model_4.pkl', 'rb') as model_file:
    model_4 = pickle.load(model_file)

print(model_4.summary())

In [None]:
# Deserialize the fifth model object and print its summary.
with open('../models/time_series/fred_loan_rates_model_5.pkl', 'rb') as model_file:
    model_5 = pickle.load(model_file)

print(model_5.summary())

In [None]:
def extract_coef_stats(summary, model_name):
    """Turn the second table of a model summary into a DataFrame.

    The table is read from ``summary.tables[1].data``. Its first row is used
    as the column headers (with the first header renamed to ``'Feature'``)
    and every remaining row becomes one DataFrame row.

    Args:
        summary: Object exposing ``tables``, where ``tables[1].data`` is a
            sequence of rows (presumably a statsmodels summary -- confirm
            against the caller).
        model_name: Label written to the ``'Model'`` column of every row.

    Returns:
        pandas.DataFrame with the table columns followed by ``'Model'``.

    Raises:
        IndexError: If the table has no rows at all.
    """
    # Work on a copy of the rows so the caller's table data is left intact;
    # popping/renaming in place would corrupt it for any later use.
    rows = [list(row) for row in summary.tables[1].data]
    headers = rows[0]
    headers[0] = 'Feature'
    df = pd.DataFrame(rows[1:], columns=headers)
    df['Model'] = model_name
    return df

def extract_coefficients_summary(summaries):
    """Stack the coefficient tables of several model summaries.

    Each summary is converted with ``extract_coef_stats`` and labelled
    ``'Model 1'``, ``'Model 2'``, ... in the order given.

    Args:
        summaries: Iterable of summary objects accepted by
            ``extract_coef_stats``.

    Returns:
        pandas.DataFrame with ``'Model'`` as the first column, the remaining
        table columns after it, and a fresh 0..n-1 index. When ``summaries``
        is empty, an empty DataFrame with only the ``'Model'`` column.
    """
    frames = [
        extract_coef_stats(summary, f'Model {position}')
        for position, summary in enumerate(summaries, start=1)
    ]

    # Nothing to stack: return an empty frame rather than failing on the
    # missing 'Model' column.
    if not frames:
        return pd.DataFrame(columns=['Model'])

    # One concat over all frames instead of re-concatenating inside the loop.
    coef_stats_df = pd.concat(frames, ignore_index=True)

    # Reorder columns so 'Model' comes first.
    ordered_columns = ['Model'] + [col for col in coef_stats_df.columns if col != 'Model']
    return coef_stats_df[ordered_columns]



# Example usage: gather the summaries of the first three loaded models and
# merge their coefficient tables into a single DataFrame.
summaries = [fitted.summary() for fitted in (model_1, model_2, model_3)]
coef_stats_df = extract_coefficients_summary(summaries)


In [None]:
# Display the combined coefficient table as the cell output.
coef_stats_df

## Load Training Datasets

In [None]:
# Rebuild the target (endogenous) column from the data stored on model_1,
# indexed by the row labels the model kept.
target_var_name = model_1.model.endog_names
train_df = pd.Series(
    model_1.model.endog,
    index=model_1.model.data.row_labels,
).to_frame(name=target_var_name)

# Rebuild the explanatory (exogenous) columns on the same row labels.
exog_df = pd.DataFrame(
    model_1.model.exog,
    columns=model_1.model.exog_names,
    index=model_1.model.data.row_labels,
)

# Put target and explanatory variables side by side and preview the top rows.
train_df = pd.concat([train_df, exog_df], axis=1)
train_df.head()

In [None]:
# Preview the last rows of the reconstructed training frame.
train_df.tail()

## Load Test Datasets

Load the raw test dataset.

In [None]:
# Read the raw test CSV, parsing the DATE column and using it as the index.
file = '../datasets/time_series/fred_loan_rates_test_1.csv'
raw_test_df = pd.read_csv(
    file,
    index_col='DATE',
    parse_dates=['DATE'],
)
display(raw_test_df)

Transform the raw test dataset using the same transformation that was applied to the training dataset.

In [None]:
# Name of the transformation to apply to the raw test data.
transform_func = 'diff'
if transform_func == 'diff':
    # First differences; the first row has no predecessor and is dropped.
    test_df = raw_test_df.diff().dropna()
else:
    # Fail here instead of leaving test_df undefined for the cells below.
    raise ValueError(f"Unsupported transform_func: {transform_func!r}")

## Load Predictions

In [None]:
def get_model_prediction(model_fits_dict, df_test):
    """Combine training data, test data and each model's test predictions.

    The training target is taken from the first model fit in
    ``model_fits_dict`` (``model.endog`` indexed by
    ``model.data.row_labels``). The rows of ``df_test`` are appended below
    it, and one column per model holds that model's predictions on the test
    rows (NaN on the training rows).

    Args:
        model_fits_dict: Mapping of model name to a fitted model exposing
            ``model.endog``, ``model.endog_names``, ``model.exog_names``,
            ``model.data.row_labels`` and ``predict(exog)``.
        df_test: Test DataFrame containing the target column and every
            explanatory column the models need. A ``'const'`` column of 1.0
            is supplied automatically when a model expects one and the test
            data lacks it. ``df_test`` itself is not modified.

    Returns:
        pandas.DataFrame with columns ``'<target>_train'``,
        ``'<target>_test'`` and one column per model name, with the training
        rows first followed by the test rows.

    Raises:
        ValueError: If ``model_fits_dict`` is empty.
    """
    if not model_fits_dict:
        raise ValueError("model_fits_dict must contain at least one model fit")

    # Extract the training data from the first model fit
    first_model_fit = next(iter(model_fits_dict.values()))
    target_var_name = first_model_fit.model.endog_names
    train_data = pd.Series(
        first_model_fit.model.endog,
        index=first_model_fit.model.data.row_labels,
    ).to_frame(name=f'{target_var_name}_train')
    n_train = len(train_data)

    # Test target values go straight into their own column; stacking below
    # the training frame leaves NaN in each column on the other block's rows.
    test_data = pd.DataFrame(
        {f'{target_var_name}_test': df_test[target_var_name].to_numpy()},
        index=df_test.index,
    )
    combined_df = pd.concat([train_data, test_data], axis=0)

    for model_name, model_fit in model_fits_dict.items():
        exog_names = model_fit.model.exog_names

        # Add the constant if the model expects it and the test data lacks it;
        # assign() returns a new frame so df_test is left untouched.
        X_test = df_test
        if 'const' in exog_names and 'const' not in X_test.columns:
            X_test = X_test.assign(const=1.0)

        predictions = model_fit.predict(X_test[exog_names])

        # Build the whole column positionally and assign it once. Chained
        # `df[col].iloc[...] = ...` writes to a temporary and is a no-op
        # under pandas Copy-on-Write.
        model_column = np.full(len(combined_df), np.nan)
        model_column[n_train:] = np.asarray(predictions, dtype=float)
        combined_df[model_name] = model_column

    return combined_df



In [None]:
# Map each model name to its fitted model; edit this mapping to choose which
# models are compared on the test data.
model_fits = dict(model_1=model_1, model_3=model_3)

prediction_df = get_model_prediction(model_fits, test_df)
display(prediction_df)

In [None]:
def plot_predictions(prediction_df, subplot=True):
    """Plot the train/test series together with each model's forecast.

    The first two columns of ``prediction_df`` are drawn as grey reference
    lines; every column from the third onward is treated as one model's
    forecast.

    Args:
        prediction_df: DataFrame whose index is used as the x-axis, whose
            first two columns are the reference series and whose remaining
            columns are model forecasts.
        subplot: If True, draw one vertically stacked subplot per model
            column (sharing the x-axis); otherwise overlay all series on a
            single figure.

    Raises:
        ValueError: If ``subplot`` is True and there are no model columns.
    """
    n_models = prediction_df.shape[1] - 2
    x_values = prediction_df.index
    column_names = prediction_df.columns

    def _plot_reference_series(ax):
        # Draw the two leading columns as grey reference lines.
        ax.plot(x_values, prediction_df.iloc[:, 0], label=column_names[0], color='grey')
        ax.plot(x_values, prediction_df.iloc[:, 1], label=column_names[1], color='lightgrey')

    if subplot:
        if n_models < 1:
            raise ValueError("prediction_df has no model columns to plot")

        # squeeze=False always returns a 2-D array of axes, so a single model
        # column works the same way as several.
        _, axes = plt.subplots(
            n_models, 1, figsize=(12, 6 * n_models), sharex=True, squeeze=False
        )

        for column_pos, ax in enumerate(axes[:, 0], start=2):
            _plot_reference_series(ax)
            ax.plot(x_values, prediction_df.iloc[:, column_pos], label=column_names[column_pos], linestyle='-')
            ax.set_ylabel('Target Variable')
            ax.set_title(f'Test Data vs. {column_names[column_pos]}')
            ax.legend()
            ax.grid(True)

        # The x-axis is shared, so only the bottom subplot gets the label.
        axes[-1, 0].set_xlabel('Date')
        plt.tight_layout()
        plt.show()

    else:
        plt.figure(figsize=(12, 6))
        ax = plt.gca()
        _plot_reference_series(ax)

        for column_pos in range(2, prediction_df.shape[1]):
            ax.plot(x_values, prediction_df.iloc[:, column_pos], label=column_names[column_pos], linestyle='-')

        ax.set_xlabel('Date')
        ax.set_ylabel('Target Variable')
        ax.set_title('Test Data vs. Model Forecasts')
        ax.legend()
        ax.grid(True)
        plt.show()

In [None]:
# Draw one subplot per model column of prediction_df.
plot_predictions(prediction_df, subplot=True)