# Forecasting US Interest Rate: Regression with Decision Tree

Keypoints:
The program is organized by following steps:
1. Data import
2. Data processing: Translate Non-stationary data into stationary data using the give transformation code. Select the target column which is 'FEDFUNDS',transform the order of the column. Data cleaning : drop some columns and rows without data.
3. Feature selection: Use Recursive Feature Elimination (RFE) to select the most relevent feature.(Similar to code sample given by Pro.Ng).
4. Optimal hyperparameters: Grid search for optimal features number, module max depth,min split and min leaf number.
5. Moduling：Regression with Decision Tree, feature number is 5, max_depth is 3,min_samples_split is 2,min_samples_leaf is 1.
6. Performance: The mse between predicted value and actual value is 0.42, not too bad. But there are some problems：1. Underfitting. The predicted value doesn't fit the actual value vary well although the mse is small.(Check the table and two pictures) 2.Computationally expensive. The grid search part cost lots of time to building a decision tree, if there are more feature to check, the time will be so long which also limit the selection of parameters.

In [3]:
#Standard libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from pandas import read_csv
import time
sns.set(style='whitegrid', palette='muted')
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 14, 8

#sklearn
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

from sklearn import linear_model
from sklearn.linear_model import LinearRegression

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.feature_selection import RFE

from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree

In [4]:
# Imports data
macro = pd.read_csv('US FRED-MD Macro Dataset.csv' ,header=0, index_col=0, parse_dates=True)
macro.shape

(773, 127)

In [5]:
macro

Unnamed: 0_level_0,RPI,W875RX1,DPCERA3M086SBEA,CMRMTSPLx,RETAILx,INDPRO,IPFPNSS,IPFINAL,IPCONGD,IPDCONGD,...,DNDGRG3M086SBEA,DSERRG3M086SBEA,CES0600000008,CES2000000008,CES3000000008,UMCSENTx,DTCOLNVHFNM,DTCTHFNM,INVEST,VIXCLSx
sasdate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1959-01-01,2442.158,2293.2,17.272,2.922664e+05,18235.77392,21.9665,23.3891,22.2688,31.7011,19.0149,...,17.791,11.326,2.13,2.45,2.04,,6476.00,12298.00,84.2043,
1959-01-02,2451.778,2301.5,17.452,2.944247e+05,18369.56308,22.3966,23.7048,22.4617,31.9337,19.1147,...,17.798,11.343,2.14,2.46,2.05,,6476.00,12298.00,83.5280,
1959-01-03,2467.594,2318.5,17.617,2.934187e+05,18523.05762,22.7193,23.8483,22.5719,31.9337,19.4890,...,17.785,11.363,2.15,2.45,2.07,,6508.00,12349.00,81.6405,
1959-01-04,2483.671,2334.9,17.553,2.993228e+05,18534.46600,23.2032,24.1927,22.9026,32.4374,19.6138,...,17.796,11.403,2.16,2.47,2.08,,6620.00,12484.00,81.8099,
1959-01-05,2498.026,2350.4,17.765,3.013643e+05,18679.66354,23.5528,24.3936,23.1231,32.5925,20.0130,...,17.777,11.421,2.17,2.48,2.08,95.3,6753.00,12646.00,80.7315,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-01-01,17821.000,14623.3,130.028,1.581882e+06,692501.00000,102.5080,101.2863,101.3295,102.0755,104.8646,...,114.885,134.903,28.91,33.44,25.82,64.9,383384.74,720635.04,5505.6964,20.2269
2023-01-02,17839.528,14622.7,129.991,1.575290e+06,687942.00000,102.5023,101.0873,101.1983,102.1753,105.0877,...,115.279,135.397,28.98,33.61,25.81,67.0,385165.01,718917.35,5491.3981,20.2141
2023-01-03,17885.887,14664.3,129.948,1.560472e+06,681673.00000,102.6521,101.0102,101.1678,102.4580,103.9786,...,114.930,135.782,29.12,33.79,25.95,62.0,388912.72,719670.11,5366.2449,22.0446
2023-01-04,17866.773,14659.0,130.255,1.556928e+06,684213.00000,103.1748,101.7373,102.1572,103.4504,108.3743,...,115.341,136.315,29.22,33.92,26.03,63.5,390133.21,720155.25,5250.7099,17.4930


In [6]:
# data processing
tran_code = pd.read_csv('Definition_Transformation Code.csv', header=0, index_col=0, parse_dates=False)
tran_code.shape

(1, 127)

In [7]:
tran_code

Unnamed: 0_level_0,RPI,W875RX1,DPCERA3M086SBEA,CMRMTSPLx,RETAILx,INDPRO,IPFPNSS,IPFINAL,IPCONGD,IPDCONGD,...,DNDGRG3M086SBEA,DSERRG3M086SBEA,CES0600000008,CES2000000008,CES3000000008,UMCSENTx,DTCOLNVHFNM,DTCTHFNM,INVEST,VIXCLSx
fred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tcode,5,5,5,5,5,5,5,5,5,5,...,6,6,6,6,6,2,6,6,6,1


In [8]:
def data_transform(x, tran_code):
    
    if tran_code == 1:
        x_tr = x        
        
    elif tran_code == 2:
        x_tr = x.diff()
            
    elif tran_code == 3:
        x_tr = x.diff().diff()
        
    elif tran_code == 4:
        x_tr = np.log(x)
        
    elif tran_code == 5:
        x_tr = np.log(x).diff()*100
    
    elif tran_code == 6:
        x_tr = np.log(x).diff().diff()*100
    else :
        x_tr = (x.pct_change()-1)*100
    
    return x_tr 

In [9]:
macro_tr=[]

for col in macro.columns:
    tr_code = tran_code[col].values
    data_tr = data_transform(macro[col], tr_code)
    macro_tr.append(data_tr)

macro_tr = pd.DataFrame(macro_tr).T

In [10]:
macro_tr

Unnamed: 0_level_0,RPI,W875RX1,DPCERA3M086SBEA,CMRMTSPLx,RETAILx,INDPRO,IPFPNSS,IPFINAL,IPCONGD,IPDCONGD,...,DNDGRG3M086SBEA,DSERRG3M086SBEA,CES0600000008,CES2000000008,CES3000000008,UMCSENTx,DTCOLNVHFNM,DTCTHFNM,INVEST,VIXCLSx
sasdate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1959-01-01,,,,,,,,,,,...,,,,,,,,,,
1959-01-02,0.393140,0.361286,1.036756,0.735762,0.730985,1.939060,1.340746,0.862504,0.731050,0.523479,...,,,,,,,,,,
1959-01-03,0.643011,0.735934,0.941009,-0.342293,0.832120,1.430562,0.603538,0.489413,0.000000,1.939253,...,-0.112407,0.026180,-0.002184,-0.814665,0.481883,,0.492915,0.413844,-1.479236,
1959-01-04,0.649412,0.704864,-0.363947,1.992210,0.061571,2.107542,1.433800,1.454467,1.565020,0.638320,...,0.134899,0.175237,-0.002163,1.220345,-0.488953,,1.213403,0.673430,2.492924,
1959-01-05,0.576311,0.661646,1.200535,0.679731,0.780340,1.495451,0.826987,0.958168,0.477012,2.014866,...,-0.168653,-0.193673,-0.002143,-0.408972,-0.481929,,0.282830,0.202040,-1.534223,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-01-01,0.087570,0.130699,1.316033,0.190398,2.764501,1.003181,0.343776,-0.014901,-0.334291,0.217564,...,1.425376,0.033243,0.170724,-0.004386,0.309575,5.2,0.071035,0.049043,-0.142513,20.2269
2023-01-02,0.103913,-0.004103,-0.028459,-0.417590,-0.660515,-0.005561,-0.196666,-0.129562,0.097723,0.212525,...,-0.416042,-0.224790,-0.417543,-0.152983,-0.621376,2.1,-0.237727,-0.068985,-0.121427,20.2141
2023-01-03,0.259530,0.284085,-0.033085,-0.945104,-0.915446,0.146036,-0.076300,-0.030143,0.276299,-1.061013,...,-0.645568,-0.081575,0.240091,0.027041,0.579697,-5.0,0.505030,0.343295,-2.045412,22.0446
2023-01-04,-0.106924,-0.036149,0.235970,-0.227369,0.371920,0.507904,0.717250,0.973228,0.963931,4.140587,...,0.660174,0.107827,-0.139110,-0.150135,-0.233149,1.5,-0.654981,-0.037264,0.128939,17.4930


In [11]:
# Show rows and columns
print("Rows, Columns:");print(macro_tr.shape);print("\n")

# Describe DataFrame columns
print("Columns:");print(macro_tr.columns);print("\n")

# Show info on DataFrame
print("Info:");print(macro_tr.info(max_cols=1000)); print("\n")

# Count Non-NA values
print("Non-NA:");print(macro_tr.count()); print("\n")

# Show head
print("Head");print(macro_tr.head()); print("\n")

# Show tail
print("Tail");print(macro_tr.tail());print("\n")

# Show summary statistics
print("Summary statistics:");print(macro_tr.describe());print("\n")

Rows, Columns:
(773, 127)


Columns:
Index(['RPI', 'W875RX1', 'DPCERA3M086SBEA', 'CMRMTSPLx', 'RETAILx', 'INDPRO',
       'IPFPNSS', 'IPFINAL', 'IPCONGD', 'IPDCONGD',
       ...
       'DNDGRG3M086SBEA', 'DSERRG3M086SBEA', 'CES0600000008', 'CES2000000008',
       'CES3000000008', 'UMCSENTx', 'DTCOLNVHFNM', 'DTCTHFNM', 'INVEST',
       'VIXCLSx'],
      dtype='object', length=127)


Info:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 773 entries, 1959-01-01 to 2023-01-05
Data columns (total 127 columns):
 #    Column           Non-Null Count  Dtype  
---   ------           --------------  -----  
 0    RPI              772 non-null    float64
 1    W875RX1          772 non-null    float64
 2    DPCERA3M086SBEA  772 non-null    float64
 3    CMRMTSPLx        771 non-null    float64
 4    RETAILx          772 non-null    float64
 5    INDPRO           772 non-null    float64
 6    IPFPNSS          772 non-null    float64
 7    IPFINAL          772 non-null    float64
 8    IPCONGD 

In [12]:
FED = macro_tr['FEDFUNDS']
macro_tr = macro_tr.drop(columns=['FEDFUNDS'])
macro_tr = pd.concat([macro_tr, FED], axis=1)
macro_tr

Unnamed: 0_level_0,RPI,W875RX1,DPCERA3M086SBEA,CMRMTSPLx,RETAILx,INDPRO,IPFPNSS,IPFINAL,IPCONGD,IPDCONGD,...,DSERRG3M086SBEA,CES0600000008,CES2000000008,CES3000000008,UMCSENTx,DTCOLNVHFNM,DTCTHFNM,INVEST,VIXCLSx,FEDFUNDS
sasdate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1959-01-01,,,,,,,,,,,...,,,,,,,,,,
1959-01-02,0.393140,0.361286,1.036756,0.735762,0.730985,1.939060,1.340746,0.862504,0.731050,0.523479,...,,,,,,,,,,-0.05
1959-01-03,0.643011,0.735934,0.941009,-0.342293,0.832120,1.430562,0.603538,0.489413,0.000000,1.939253,...,0.026180,-0.002184,-0.814665,0.481883,,0.492915,0.413844,-1.479236,,0.37
1959-01-04,0.649412,0.704864,-0.363947,1.992210,0.061571,2.107542,1.433800,1.454467,1.565020,0.638320,...,0.175237,-0.002163,1.220345,-0.488953,,1.213403,0.673430,2.492924,,0.16
1959-01-05,0.576311,0.661646,1.200535,0.679731,0.780340,1.495451,0.826987,0.958168,0.477012,2.014866,...,-0.193673,-0.002143,-0.408972,-0.481929,,0.282830,0.202040,-1.534223,,-0.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-01-01,0.087570,0.130699,1.316033,0.190398,2.764501,1.003181,0.343776,-0.014901,-0.334291,0.217564,...,0.033243,0.170724,-0.004386,0.309575,5.2,0.071035,0.049043,-0.142513,20.2269,0.23
2023-01-02,0.103913,-0.004103,-0.028459,-0.417590,-0.660515,-0.005561,-0.196666,-0.129562,0.097723,0.212525,...,-0.224790,-0.417543,-0.152983,-0.621376,2.1,-0.237727,-0.068985,-0.121427,20.2141,0.24
2023-01-03,0.259530,0.284085,-0.033085,-0.945104,-0.915446,0.146036,-0.076300,-0.030143,0.276299,-1.061013,...,-0.081575,0.240091,0.027041,0.579697,-5.0,0.505030,0.343295,-2.045412,22.0446,0.08
2023-01-04,-0.106924,-0.036149,0.235970,-0.227369,0.371920,0.507904,0.717250,0.973228,0.963931,4.140587,...,0.107827,-0.139110,-0.150135,-0.233149,1.5,-0.654981,-0.037264,0.128939,17.4930,0.18


In [13]:
#Data cleaning
threshold = 0.05
missing_ratio_c = macro_tr.isnull().mean()
columns_to_drop = missing_ratio_c[missing_ratio_c > threshold].index
macro_tr_cleaned_c = macro_tr.drop(columns=columns_to_drop)
print(columns_to_drop)

Index(['ACOGNO', 'ANDENOx', 'TWEXAFEGSMTHx', 'UMCSENTx', 'VIXCLSx'], dtype='object')


In [14]:
macro_tr_cleaned_r = macro_tr_cleaned_c.dropna()
macro_tr_cleaned_r.shape

(757, 122)

In [15]:
 macro_final=macro_tr_cleaned_r
# Show rows and columns
print("Rows, Columns:");print(macro_final.shape);print("\n")

# Describe DataFrame columns
print("Columns:");print(macro_final.columns);print("\n")

# Show info on DataFrame
print("Info:");print(macro_final.info(max_cols=1000)); print("\n")

# Count Non-NA values
print("Non-NA:");print(macro_final.count()); print("\n")

# Show head
print("Head");print(macro_final.head()); print("\n")

# Show tail
print("Tail");print(macro_final.tail());print("\n")

# Show summary statistics
print("Summary statistics:");print(macro_final.describe());print("\n")

Rows, Columns:
(757, 122)


Columns:
Index(['RPI', 'W875RX1', 'DPCERA3M086SBEA', 'CMRMTSPLx', 'RETAILx', 'INDPRO',
       'IPFPNSS', 'IPFINAL', 'IPCONGD', 'IPDCONGD',
       ...
       'DDURRG3M086SBEA', 'DNDGRG3M086SBEA', 'DSERRG3M086SBEA',
       'CES0600000008', 'CES2000000008', 'CES3000000008', 'DTCOLNVHFNM',
       'DTCTHFNM', 'INVEST', 'FEDFUNDS'],
      dtype='object', length=122)


Info:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 757 entries, 1960-01-01 to 2023-01-03
Data columns (total 122 columns):
 #    Column           Non-Null Count  Dtype  
---   ------           --------------  -----  
 0    RPI              757 non-null    float64
 1    W875RX1          757 non-null    float64
 2    DPCERA3M086SBEA  757 non-null    float64
 3    CMRMTSPLx        757 non-null    float64
 4    RETAILx          757 non-null    float64
 5    INDPRO           757 non-null    float64
 6    IPFPNSS          757 non-null    float64
 7    IPFINAL          757 non-null    float64
 8    

In [16]:
#Define Data for Machine Learning
X_start = 0
X_lag = 1

# Features and Target
X, y = macro_final.iloc[X_start:,:-1].shift(X_lag).dropna(), macro_final.iloc[X_start+X_lag:,121]

In [17]:
X

Unnamed: 0_level_0,RPI,W875RX1,DPCERA3M086SBEA,CMRMTSPLx,RETAILx,INDPRO,IPFPNSS,IPFINAL,IPCONGD,IPDCONGD,...,PCEPI,DDURRG3M086SBEA,DNDGRG3M086SBEA,DSERRG3M086SBEA,CES0600000008,CES2000000008,CES3000000008,DTCOLNVHFNM,DTCTHFNM,INVEST
sasdate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1960-01-02,0.319409,0.463825,0.278847,1.695646,2.660551,2.591713,2.409495,2.902087,3.123538,10.383083,...,-0.171620,-0.259079,-0.178584,-0.128845,-0.468036,0.779728,-0.970531,0.429236,-1.173886,-1.333027
1960-01-03,0.114254,0.091674,0.433454,1.437148,0.369645,-0.893885,-0.568540,-0.343503,-1.145653,-1.385485,...,0.159303,0.376462,0.167438,0.085463,-0.455545,0.385334,-0.475018,0.826845,0.538766,-1.894229
1960-01-04,0.190876,0.091590,1.404092,-2.802612,-0.110199,-0.901948,-0.342682,-0.115042,0.115277,-1.996461,...,-0.049056,-0.455511,0.178561,-0.111313,-0.002011,3.013134,-0.468385,0.347209,0.095090,0.348999
1960-01-05,0.341874,0.361379,1.535528,0.982647,2.590339,-0.795880,0.228983,0.115042,0.687998,-0.118913,...,0.311430,0.411911,0.443860,0.145034,-0.894856,-6.807429,0.000000,0.867818,0.545173,2.413598
1960-01-06,0.240054,0.244332,-2.034337,-3.153052,-1.504683,-0.114277,0.568790,0.686244,0.569884,0.945486,...,-0.262920,-0.129782,-0.672338,0.059133,0.894856,4.161967,0.000000,-0.482728,-0.206712,0.379810
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-01-11,0.129404,-0.239825,0.265240,-0.106164,1.039717,-0.112980,0.242440,0.427547,0.515647,1.338680,...,0.075852,-0.838316,1.249500,-0.134901,0.033090,-0.155321,0.037512,0.034115,0.214359,-0.438250
2022-01-12,0.000898,-0.075962,-0.414455,-0.947270,-1.320097,-0.330005,-0.332552,-0.400833,-0.179349,-2.240469,...,-0.248867,-0.174864,-0.884680,-0.056635,0.102595,-0.062401,0.389154,0.162379,-0.007056,0.959749
2023-01-01,-0.013781,-0.019171,-0.193846,1.490076,-0.719328,-1.550613,-1.202485,-0.846966,-0.623823,-1.031771,...,0.030879,0.344041,-0.613838,0.175427,-0.107940,0.300155,-0.590025,-0.283245,-0.123254,0.880929
2023-01-02,0.087570,0.130699,1.316033,0.190398,2.764501,1.003181,0.343776,-0.014901,-0.334291,0.217564,...,0.382455,0.474433,1.425376,0.033243,0.170724,-0.004386,0.309575,0.071035,0.049043,-0.142513


In [18]:
y

sasdate
1960-01-02   -0.02
1960-01-03   -0.13
1960-01-04    0.08
1960-01-05   -0.07
1960-01-06   -0.53
              ... 
2022-01-11    0.70
2022-01-12    0.32
2023-01-01    0.23
2023-01-02    0.24
2023-01-03    0.08
Name: FEDFUNDS, Length: 756, dtype: float64

#Feature Selection: Recursive Feature Elimination

In [20]:
ols = LinearRegression()
model_rfe = RFE(ols, n_features_to_select=10)
model_rfe = model_rfe.fit(X, y)

In [21]:
rfe = model_rfe.support_
rfe

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False,  True,
        True,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False, False, False, False, False, False, False,  True, False,
       False,  True, False, False, False, False,  True,  True, False,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False])

In [22]:
#Identifying the selected features

rfe_features = []

for i in range(0,rfe.shape[0]):
    if rfe[i] == True:
        rfe_feature = X.columns[i]
        rfe_features.append(rfe_feature)
    
rfe_features

['HWIURATIO',
 'MANEMP',
 'DMANEMP',
 'NDMANEMP',
 'ISRATIOx',
 'CONSPI',
 'S&P div yield',
 'GS1',
 'GS5',
 'AAA']

#Define Training,Validation,and Test Sets

In [24]:
train_pct=0.7
valid_pct=0.15
test_pct=0.15

n_train=int(train_pct*len(X))
n_valid=int(valid_pct*len(X))
n_test=len(X)-n_train-n_valid

print(n_train,n_valid,n_test)

529 113 114


#Grid Search for Optimal Hypermeters

In [26]:
# pipeline
def pipeline(config):

    # unpack config
    n_depth, n_split, n_leaf, n_features = config

    # Steps
    steps = [('rfe', RFE(LinearRegression(), n_features_to_select=n_features)),
             ('dtr', DecisionTreeRegressor(max_depth=n_depth,min_samples_split=n_split,min_samples_leaf=n_leaf))
            ]

    pipeline = Pipeline(steps)

    return pipeline

In [27]:
def walk_forward_validation(cfg):
    
    n_trains = n_train

    n_records = n_train + n_valid

    y_test_list = []
    
    y_pred_list = []

    j = 0

    for i in range(n_trains, n_records):
    
        X_train, X_test, y_train, y_test = X[j:i], X[i:i+1], y[j:i], y[i:i+1]
       
        model = pipeline(cfg).fit(X_train, y_train)        
    
        y_pred = model.predict(X_test)
    
        y_pred_list.extend(y_pred)
    
        y_test_list.extend(y_test)

        j += 1
    
    score_rmse = metrics.mean_squared_error(y_test_list, y_pred_list,)
    
    print(' > %.3f' % score_rmse)
    
    return score_rmse 

In [28]:
# score a model, return None on failure
def repeat_evaluate(config, n_repeats=1):
    # convert config to a key
    key = str(config)
    # fit and evaluate the model n times
    scores = [walk_forward_validation(config) for _ in range(n_repeats)]
    # summarize score
    result = np.mean(scores)
    print('> Model[%s] %.3f' % (key, result))
    return (key, result)

In [29]:
# grid search configs
def grid_search(cfg_list):
    # evaluate configs
    scores = [repeat_evaluate(cfg) for cfg in cfg_list]
    # sort configs by score_rmse, asc
    #scores.sort(key=lambda tup: tup[1])
    scores.sort(key=lambda tup: tup[1], reverse=True)
    return scores

In [30]:
# create a list of configs to try
def model_configs():
    # define scope of configs

    n_depth=[3,5]
    n_split=[2,5]
    n_leaf=[1,2]
    n_features = [5,10]

    #n_features = [10]
     
    # create configs
    configs = list()
    
    for k in n_depth:
        for l in n_split:
            for m in n_leaf:
                for n in n_features:
                    cfg = [k,l,m,n]
                    configs.append(cfg)

    print('Total configs: %d' % len(configs))
    return configs

In [31]:
# model configs
cfg_list = model_configs()

Total configs: 16


In [None]:
# grid search
scores = grid_search(cfg_list)
print('done')

 > 0.034
> Model[[3, 2, 1, 5]] 0.034
 > 0.038
> Model[[3, 2, 1, 10]] 0.038
 > 0.036
> Model[[3, 2, 2, 5]] 0.036
 > 0.039
> Model[[3, 2, 2, 10]] 0.039
 > 0.034
> Model[[3, 5, 1, 5]] 0.034
 > 0.038
> Model[[3, 5, 1, 10]] 0.038
 > 0.036
> Model[[3, 5, 2, 5]] 0.036
 > 0.039
> Model[[3, 5, 2, 10]] 0.039
 > 0.062
> Model[[5, 2, 1, 5]] 0.062
 > 0.043
> Model[[5, 2, 1, 10]] 0.043
 > 0.109
> Model[[5, 2, 2, 5]] 0.109
 > 0.099
> Model[[5, 2, 2, 10]] 0.099
 > 0.062
> Model[[5, 5, 1, 5]] 0.062
 > 0.043
> Model[[5, 5, 1, 10]] 0.043
 > 0.116
> Model[[5, 5, 2, 5]] 0.116


In [None]:
scores

In [None]:
# Model Estimation & Evaluation

In [None]:
# Pipeline

# Steps
steps_final = [('rfe', RFE(LinearRegression(), n_features_to_select=5)),
               ('dtr', DecisionTreeRegressor(max_depth=3,min_samples_split=2,min_samples_leaf=1))
            ]

pipeline_final = Pipeline(steps_final)

In [None]:
X_train, X_valid, X_test = X[:n_train], X[n_train:n_train+n_valid], X[n_train+n_valid:n_train+n_valid+n_test]
y_train, y_valid, y_test = y[:n_train], y[n_train:n_train+n_valid], y[n_train+n_valid:n_train+n_valid+n_test]

pipeline_final.fit(X_train, y_train)

#Performance

In [None]:
y_pred = pipeline_final.predict(X_test)

In [None]:
mse = metrics.mean_squared_error(y_test, y_pred)
mse

In [None]:
pd.options.display.max_rows=300
result = pd.DataFrame({'Interest_rate_Actual': y[-n_test:], 'Interest_rate_Predicted': y_pred}, columns=['Interest_rate_Actual', 'Interest_rate_Predicted'])
result

In [None]:
result['Interest_rate_Actual'].plot(legend=True)
result['Interest_rate_Predicted'].plot(legend=True)

In [None]:
plt.scatter(y_test, y_pred)
plt.plot(y_test, y_test,'k-') 

#Reference:
1.Scikit-learn developers. (2023). Decision Tree Regression. In Scikit-learn Documentation. Retrieved from https://scikit-learn.org/stable/modules/tree.html#regression