# Loan Rates Forecast Models Comparison test plan

#### Setup

In [None]:
# Load the python-dotenv IPython extension and read variables from the .env
# file (presumably the ValidMind credentials used by vm.init — confirm).
%load_ext dotenv
%dotenv .env

import os
# Move the working directory two levels up so that the relative dataset path
# './notebooks/datasets/...' used when loading the FRED data resolves.
# NOTE(review): the move is relative to the current directory, so re-running
# this cell climbs two more levels each time it is executed.
os.chdir(os.path.join(os.getcwd(), "../.."))

In [None]:
import validmind as vm

# Point the ValidMind client at the production tracking API and at the
# project that results from this notebook are recorded under.
vm.init(
    api_host="https://api.prod.validmind.ai/api/v1/tracking",
    project="clhhzo21s006wl9rl0swhv40h",
)


In [None]:
# System libraries
import glob

# ML libraries
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.tsa.arima.model import ARIMA
from arch.unitroot import PhillipsPerron, DFGLS
import xgboost as xgb
from numpy import argmax
from sklearn.metrics import accuracy_score, precision_recall_curve
from sklearn.model_selection import train_test_split

# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#### Load FRED Data

In [None]:
def merge_fred_csv_files(file_pattern):
    """
    Load every CSV file matching a glob pattern and join them column-wise.

    Each file is read with its ``DATE`` column parsed as datetimes and used as
    the index; the resulting frames are concatenated along the column axis, so
    dates missing from a file show up as NaN in that file's columns.

    :param file_pattern: Glob pattern selecting the CSV files to load
    :return: DataFrame indexed by DATE holding the columns of all matched files,
        ordered by file path
    :raises ValueError: If no file matches ``file_pattern``
    """
    # glob returns matches in arbitrary, OS-dependent order; sorting keeps the
    # column order of the merged frame reproducible between runs and machines.
    file_list = sorted(glob.glob(file_pattern))

    # Fail with an actionable message instead of pandas' generic
    # "No objects to concatenate" (same exception type as before).
    if not file_list:
        raise ValueError(f"No CSV files match pattern: {file_pattern!r}")

    dataframes = [
        pd.read_csv(file, parse_dates=['DATE'], index_col='DATE')
        for file in file_list
    ]

    return pd.concat(dataframes, axis=1)


In [None]:
# Glob pattern covering the raw FRED CSV files (relative to the working directory)
file_pattern = './notebooks/datasets/time_series/raw/fred/*.csv'

# Merge all matching files into one DataFrame and render it in the notebook
df = merge_fred_csv_files(file_pattern=file_pattern)
display(df)

Preselection of variables.

In [None]:
# Series kept for modelling; every other merged column is discarded
selected_cols = ['MORTGAGE30US', 'UNRATE', 'GS10', 'FEDFUNDS']
df = df.loc[:, selected_cols]

Plot time series.

In [None]:
def plot_time_series(df, cols_to_plot=None, title=''):
    """
    Plot multiple time-series in the same axes using seaborn.

    Each selected column is drawn as its own line, labelled with the column
    name, against the DataFrame index. The figure is shown immediately.

    :param df: DataFrame with time-series data
    :param cols_to_plot: List of column names to plot, or a single column name.
        If None, plot all columns in df.
    :param title: Title of the plot, default is ''
    """
    if cols_to_plot is None:
        cols_to_plot = df.columns.tolist()
    elif isinstance(cols_to_plot, str):
        # A bare name would make df[...] return a Series, which has no
        # .columns; wrap it so a single series can be plotted too.
        cols_to_plot = [cols_to_plot]

    # Create a new DataFrame with the columns to plot
    plot_df = df[cols_to_plot]

    # Set seaborn plot style (applies globally, not just to this figure)
    sns.set(style="darkgrid")

    # Plot the time-series data, one line per column on a shared figure
    plt.figure(figsize=(12, 6))
    for col in plot_df.columns:
        sns.lineplot(data=plot_df[col], label=col)

    plt.xlabel('Date')
    plt.ylabel('Value')
    plt.title(title)
    plt.legend()
    plt.show()


In [None]:
# Overlay every selected series on a single chart
plot_time_series(df=df, title='All Variables')

#### Data Description

In [None]:
# Print the index type, column dtypes and non-null counts of the selected series
df.info()

#### Data Quality

In [None]:
# Align all series on a month-start ('MS') grid, keeping the last observation
# seen in each month, then discard months where any series is missing.
df = df.resample('MS').last().dropna()

### Exploratory Data Analysis

In [None]:
# First-difference every series (period-over-period change); the first row has
# no predecessor, becomes NaN and is dropped.
df_diff = df.diff().dropna()

### Training Data

#### Sampling 

**Step 1: Split dataset into Training and Test**

In [None]:
# Chronological split: the first 80% of rows train the models, the rest test them
split_index = int(len(df) * 0.8)
df_train = df.iloc[:split_index]
df_test = df.iloc[split_index:]

**Step 2: Create a Stationary Train and Test Dataset**

In [None]:
# Apply first difference to both training and test df
df_train_diff = df_train.diff().dropna()
df_test_diff = df_test.diff().dropna()


#### Model Training

#### Model 1: Loan Rates and FEDFUNDS

**Step 1: Fit Model**

In [None]:
# Regressor + target frames for Model 1 (differenced data)
m1_training_dataset = df_train_diff.loc[:, ['FEDFUNDS', 'MORTGAGE30US']]
m1_test_dataset = df_test_diff.loc[:, ['FEDFUNDS', 'MORTGAGE30US']]

# Response: change in the 30-year mortgage rate
y_1 = df_train_diff['MORTGAGE30US']

# Single regressor; no constant column is added, so the fit has no intercept
X_1 = df_train_diff['FEDFUNDS']

# Ordinary least squares fit, followed by the regression report
model_1 = sm.OLS(endog=y_1, exog=X_1).fit()
print(model_1.summary())

## Model Methodology

#### Model Training

#### Model 1: Loan Rates and FEDFUNDS

**Step 1: Fit Model**

In [None]:
# NOTE(review): this cell repeats the Model 1 fit that already appears under
# "Training Data"; running it rebuilds the same objects.

# Regressor + target frames for Model 1 (differenced data)
m1_training_dataset = df_train_diff.loc[:, ['FEDFUNDS', 'MORTGAGE30US']]
m1_test_dataset = df_test_diff.loc[:, ['FEDFUNDS', 'MORTGAGE30US']]

# Response: change in the 30-year mortgage rate
y_1 = df_train_diff['MORTGAGE30US']

# Single regressor; no constant column is added, so the fit has no intercept
X_1 = df_train_diff['FEDFUNDS']

# Ordinary least squares fit, followed by the regression report
model_1 = sm.OLS(endog=y_1, exog=X_1).fit()
print(model_1.summary())

#### Model 2: Loan Rates, constant and FEDFUNDS

**Step 1: Fit Model**

In [None]:
# Regressor + target frames for Model 2, each with an explicit 'const' column.
# .copy() gives every dataset its own frame, so adding 'const' does not assign
# into a selection of df_train_diff / df_test_diff (which raises pandas'
# SettingWithCopyWarning and leaves it ambiguous which object is modified).
m2_training_dataset = df_train_diff[['FEDFUNDS','MORTGAGE30US']].copy()
m2_training_dataset['const'] = 1.0
m2_test_dataset = df_test_diff[['FEDFUNDS','MORTGAGE30US']].copy()
m2_test_dataset['const'] = 1.0

# Add a constant to the independent variables for the linear regression model.
# NOTE(review): sm.add_constant prepends 'const', so X_2 is ordered
# [const, FEDFUNDS] while the datasets above carry 'const' as the last column —
# confirm downstream consumers match columns by name rather than by position.
X_2 = sm.add_constant(df_train_diff['FEDFUNDS'])

# Define the dependent variable
y_2 = df_train_diff['MORTGAGE30US']

# Fit the linear regression model
model_2 = sm.OLS(y_2, X_2).fit()

# Display the model summary
print(model_2.summary())

**Step 2: Reasoning**

#### Model 3: Loan Rates and GS10

In [None]:
# Regressor + target frames for Model 3 (differenced data)
m3_training_dataset = df_train_diff.loc[:, ['GS10', 'MORTGAGE30US']]
m3_test_dataset = df_test_diff.loc[:, ['GS10', 'MORTGAGE30US']]

# Response: change in the 30-year mortgage rate
y_3 = df_train_diff['MORTGAGE30US']

# Single regressor; no constant column is added, so the fit has no intercept
X_3 = df_train_diff['GS10']

# Ordinary least squares fit, followed by the regression report
model_3 = sm.OLS(endog=y_3, exog=X_3).fit()
print(model_3.summary())

#### VM model summary test plan

In [None]:
# Show what the ValidMind client offers to run.
# NOTE(review): this calls `list_plans` on the `test_suites` module — confirm
# that name exists in the installed validmind version.
vm.test_suites.list_plans()

#### Model performance Test

In [None]:
# --- Model 1: register datasets and model, then run its description suite ---
# The test dataset is passed as both test_ds and validation_ds.
vm_train_ds_1 = vm.init_dataset(
    dataset=m1_training_dataset,
    type="generic",
    target_column="MORTGAGE30US",
)
vm_test_ds_1 = vm.init_dataset(
    dataset=m1_test_dataset,
    type="generic",
    target_column="MORTGAGE30US",
)
vm_model_1 = vm.init_model(
    model_1,
    train_ds=vm_train_ds_1,
    test_ds=vm_test_ds_1,
    validation_ds=vm_test_ds_1,
)

model_performance_test_suite = vm.run_test_suite(
    "regression_model_description",
    model=vm_model_1,
)

# --- Model 2: register datasets and model ---
vm_train_ds_2 = vm.init_dataset(
    dataset=m2_training_dataset,
    type="generic",
    target_column="MORTGAGE30US",
)
vm_test_ds_2 = vm.init_dataset(
    dataset=m2_test_dataset,
    type="generic",
    target_column="MORTGAGE30US",
)
vm_model_2 = vm.init_model(
    model_2,
    train_ds=vm_train_ds_2,
    test_ds=vm_test_ds_2,
    validation_ds=vm_test_ds_2,
)

# --- Model 3: register datasets and model ---
vm_train_ds_3 = vm.init_dataset(
    dataset=m3_training_dataset,
    type="generic",
    target_column="MORTGAGE30US",
)
vm_test_ds_3 = vm.init_dataset(
    dataset=m3_test_dataset,
    type="generic",
    target_column="MORTGAGE30US",
)
vm_model_3 = vm.init_model(
    model_3,
    train_ds=vm_train_ds_3,
    test_ds=vm_test_ds_3,
    validation_ds=vm_test_ds_3,
)

# --- Evaluation suite: Model 1 as `model`, Models 2 and 3 as `models` ---
model_comparison_test_suite = vm.run_test_suite(
    "regression_models_evaluation",
    model=vm_model_1,
    models=[vm_model_2, vm_model_3],
)

