# Loan Rates Forecast Models Comparison test plan

#### Setup

In [1]:
%load_ext dotenv
%dotenv .env

import os
os.chdir(os.path.join(os.getcwd(), "../.."))

In [2]:
import validmind as vm
vm.init(  api_host = "http://localhost:3000/api/v1/tracking",
  project = "clhhzo21s006wl9rl0swhv40h")


Connected to ValidMind. Project: Stock Price Prediction Model - Initial Validation (clhhzo21s006wl9rl0swhv40h)


In [3]:
# System libraries
import glob

# ML libraries
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.tsa.arima.model import ARIMA
from arch.unitroot import PhillipsPerron, DFGLS
import xgboost as xgb
from numpy import argmax
from sklearn.metrics import accuracy_score, precision_recall_curve
from sklearn.model_selection import train_test_split

# Plotting libraries 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#### Load FRED Data

In [4]:
def merge_fred_csv_files(file_pattern):
    # Use glob to find all files matching the specified pattern
    file_list = glob.glob(file_pattern)

    # Initialize an empty list to store individual DataFrames
    dataframes = []

    # Iterate through each file in the file list
    for file in file_list:
        # Read the CSV file into a DataFrame
        df = pd.read_csv(file, parse_dates=['DATE'], index_col='DATE')

        # Add the DataFrame to the list of DataFrames
        dataframes.append(df)

    # Merge all the DataFrames in the list into a single DataFrame
    merged_df = pd.concat(dataframes, axis=1)

    return merged_df


In [5]:
file_pattern = './notebooks/datasets/time_series/raw/fred/*.csv'
df = merge_fred_csv_files(file_pattern)
display(df)

Unnamed: 0_level_0,GDPC1,GS5,GS10,GS3,MORTGAGE30US,UNRATE,CPIAUCSL,FEDFUNDS,GDP
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1947-01-01,2034.450,,,,,,21.48,,243.164
1947-02-01,,,,,,,21.62,,
1947-03-01,,,,,,,22.00,,
1947-04-01,2029.024,,,,,,22.00,,245.968
1947-05-01,,,,,,,21.95,,
...,...,...,...,...,...,...,...,...,...
2023-04-01,,,3.46,,,,,,
2023-04-06,,,,,6.28,,,,
2023-04-13,,,,,6.27,,,,
2023-04-20,,,,,6.39,,,,


Preselection of variables.

In [6]:
selected_cols = ['MORTGAGE30US', 'UNRATE', 'GS10', 'FEDFUNDS'] 
df = df[selected_cols]

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3551 entries, 1947-01-01 to 2023-04-27
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MORTGAGE30US  2718 non-null   float64
 1   UNRATE        903 non-null    float64
 2   GS10          841 non-null    float64
 3   FEDFUNDS      825 non-null    float64
dtypes: float64(4)
memory usage: 138.7 KB


In [8]:
df = df.resample('MS').last()
df = df.dropna()
df_diff = df.diff().dropna()

## Model Methodology

### Training Data

#### Sampling 

**Step 1: Split dataset into Training and Test**

In [9]:
split_index = int(len(df) * 0.8)   # use 80% of the data for training
df_train, df_test = df[:split_index], df[split_index:]

**Step 2: Create a Stationary Train and Test Dataset**

In [10]:
# Apply first difference to both training and test df
df_train_diff = df_train.diff().dropna()
df_test_diff = df_test.diff().dropna()


#### Model Training

#### Model 1: Loan Rates and FEDFUNDS

**Step 1: Fit Model**

In [11]:
m1_training_dataset = df_train_diff[['FEDFUNDS','MORTGAGE30US']]
m1_test_dataset = df_test_diff[['FEDFUNDS','MORTGAGE30US']]

# Add the independent variables with no intercept
X_1 = df_train_diff['FEDFUNDS']

# Define the dependent variable 
y_1 = df_train_diff['MORTGAGE30US']

# Fit the linear regression model
model_1 = sm.OLS(y_1, X_1).fit()

# Display the model summary
print(model_1.summary())

                                 OLS Regression Results                                
Dep. Variable:           MORTGAGE30US   R-squared (uncentered):                   0.286
Model:                            OLS   Adj. R-squared (uncentered):              0.284
Method:                 Least Squares   F-statistic:                              198.8
Date:                Thu, 11 May 2023   Prob (F-statistic):                    3.22e-38
Time:                        14:49:40   Log-Likelihood:                         -57.220
No. Observations:                 498   AIC:                                      116.4
Df Residuals:                     497   BIC:                                      120.7
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

### Model 2: Loan Rates, constant and FEDFUNDS

**Step 1: Fit Model**

In [12]:
m2_training_dataset = df_train_diff[['FEDFUNDS','MORTGAGE30US']]
m2_training_dataset['const'] = 1.0
m2_test_dataset = df_test_diff[['FEDFUNDS','MORTGAGE30US']]
m2_test_dataset['const'] = 1.0



# Add a constant to the independent variables for the linear regression model
X_2 = sm.add_constant(df_train_diff['FEDFUNDS'])

# Define the dependent variable 
y_2 = df_train_diff['MORTGAGE30US']

# Fit the linear regression model
model_2 = sm.OLS(y_2, X_2).fit()

# Display the model summary
print(model_2.summary())

                            OLS Regression Results                            
Dep. Variable:           MORTGAGE30US   R-squared:                       0.286
Model:                            OLS   Adj. R-squared:                  0.284
Method:                 Least Squares   F-statistic:                     198.3
Date:                Thu, 11 May 2023   Prob (F-statistic):           3.99e-38
Time:                        14:49:40   Log-Likelihood:                -57.120
No. Observations:                 498   AIC:                             118.2
Df Residuals:                     496   BIC:                             126.7
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -0.0055      0.012     -0.448      0.6


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


**Step 2: Reasoning**

### Model 3: Loan Rates and GS10

In [13]:
m3_training_dataset = df_train_diff[['GS10','MORTGAGE30US']]
m3_test_dataset = df_test_diff[['GS10','MORTGAGE30US']]


# Add a constant to the independent variables for the linear regression model
X_3 = df_train_diff['GS10']

# Define the dependent variable 
y_3 = df_train_diff['MORTGAGE30US']

# Fit the linear regression model
model_3 = sm.OLS(y_3, X_3).fit()

# Display the model summary
print(model_3.summary())

                                 OLS Regression Results                                
Dep. Variable:           MORTGAGE30US   R-squared (uncentered):                   0.529
Model:                            OLS   Adj. R-squared (uncentered):              0.528
Method:                 Least Squares   F-statistic:                              558.1
Date:                Thu, 11 May 2023   Prob (F-statistic):                    2.80e-83
Time:                        14:49:40   Log-Likelihood:                          46.439
No. Observations:                 498   AIC:                                     -90.88
Df Residuals:                     497   BIC:                                     -86.67
Df Model:                           1                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------

#### VM model summary test plan

In [14]:
vm.test_plans.list_plans()

ID,Name,Description
binary_classifier_metrics,BinaryClassifierMetrics,Test plan for sklearn classifier metrics
binary_classifier_validation,BinaryClassifierPerformance,Test plan for sklearn classifier models
binary_classifier_model_diagnosis,BinaryClassifierDiagnosis,Test plan for sklearn classifier model diagnosis tests
binary_classifier,BinaryClassifier,Test plan for sklearn classifier models that includes  both metrics and validation tests
tabular_dataset,TabularDataset,Test plan for generic tabular datasets
tabular_dataset_description,TabularDatasetDescription,Test plan to extract metadata and descriptive  statistics from a tabular dataset
tabular_data_quality,TabularDataQuality,Test plan for data quality on tabular datasets
normality_test_plan,NormalityTestPlan,Test plan to perform normality tests.
autocorrelation_test_plan,AutocorrelationTestPlan,Test plan to perform autocorrelation tests.
seasonality_test_plan,SesonalityTestPlan,Test plan to perform seasonality tests.


#### Model perfomance Test

In [15]:
vm_model_1 = vm.init_model(model_1)
vm_train_ds_1 = vm.init_dataset(dataset=m1_training_dataset, type="generic", target_column="MORTGAGE30US")
vm_test_ds_1 = vm.init_dataset(dataset=m1_test_dataset, type="generic", target_column="MORTGAGE30US")

model_performance_test_plan = vm.run_test_plan("regression_model_performance", 
                                             model=vm_model_1
                                            )

vm_train_ds_2 = vm.init_dataset(dataset=m2_training_dataset, type="generic", target_column="MORTGAGE30US")
vm_test_ds_2 = vm.init_dataset(dataset=m2_test_dataset, type="generic", target_column="MORTGAGE30US")
vm_model_2 = vm.init_model(model_2, train_ds=vm_train_ds_2, test_ds=vm_test_ds_2, validation_ds=vm_test_ds_2)


vm_train_ds_3 = vm.init_dataset(dataset=m3_training_dataset, type="generic", target_column="MORTGAGE30US")
vm_test_ds_3 = vm.init_dataset(dataset=m3_test_dataset, type="generic", target_column="MORTGAGE30US")
vm_model_3 = vm.init_model(model_3, train_ds=vm_train_ds_3, test_ds=vm_test_ds_3, validation_ds=vm_test_ds_3)


model_comparison_test_plan = vm.run_test_plan("regression_models_comparison", 
                                             model = vm_model_1,
                                             models= [vm_model_2, vm_model_3],
                                            )



Pandas dataset detected. Initializing VM Dataset instance...
Inferring dataset types...
Pandas dataset detected. Initializing VM Dataset instance...
Inferring dataset types...


                                                                                                                                          

Pandas dataset detected. Initializing VM Dataset instance...
Inferring dataset types...
Pandas dataset detected. Initializing VM Dataset instance...
Inferring dataset types...
Pandas dataset detected. Initializing VM Dataset instance...
Inferring dataset types...
Pandas dataset detected. Initializing VM Dataset instance...
Inferring dataset types...


                                                                                                                                                 