# Decision Trees

## Read Data

In [1]:
from IPython.display import display_html
import pandas as pd

# a function to display multiple dataframes side by side
def display_side_by_side(*args, titles=()):
    """Render several DataFrames next to each other in one HTML output.

    Parameters
    ----------
    *args
        Objects exposing ``to_html()`` (e.g. DataFrames), shown left to right.
    titles : sequence of str, optional
        Heading for each frame, matched by position. A frame without a
        matching title is labelled ``DF<n>`` where ``n`` is its 1-based
        position.
    """
    blocks = []
    for position, frame in enumerate(args):
        # Fall back to a positional label when fewer titles than frames are given.
        if position < len(titles):
            heading = titles[position]
        else:
            heading = f'DF{position+1}'
        # inline-block keeps the tables on one row while space allows.
        blocks.append(
            '<div style="display:inline-block; margin-right:20px;">'
            f'<h3>{heading}</h3>'
            + frame.to_html()
            + '</div>'
        )
    display_html(''.join(blocks), raw=True)

# Load the merged dataset; the first CSV column becomes the index.
df = pd.read_csv('../data/df_merged.csv', index_col=0)

# Preview the head and the tail, transposed so that each variable is a row.
display_side_by_side(
    *(part.T for part in (df.head(), df.tail())),
    titles=['Data (Head)', 'Data (Tail)'],
)

Unnamed: 0,2006-04-01,2006-05-01,2006-06-01,2006-07-01,2006-08-01
PCEPILFE_YoY,2.362185,2.408566,2.595788,2.545755,2.671352
PCEPILFE_MoM,3.664276,3.008452,3.029509,1.16884,2.449328
UNRATE,4.7,4.6,4.6,4.7,4.7
NROU,5.045673,5.04203,5.011462,5.034624,5.030861
USREC,0.0,0.0,0.0,0.0,0.0
ZLB_dummy,0.0,0.0,0.0,0.0,0.0
COVID_dummy,0.0,0.0,0.0,0.0,0.0
GDPC1,2.98566,2.933985,1.035011,2.436585,2.040797
GDPPOT,2.470276,2.468015,2.063006,2.463421,2.461089
INDPRO,3.552753,0.301573,4.041034,-0.783927,5.208583

Unnamed: 0,2025-02-01,2025-03-01,2025-04-01,2025-05-01,2025-06-01
PCEPILFE_YoY,2.949316,2.699686,2.610938,2.732352,2.774467
PCEPILFE_MoM,5.71029,1.142409,2.02369,2.3937,3.153393
UNRATE,4.1,4.2,4.2,4.2,4.1
NROU,4.310591,4.320248,4.306004,4.303689,4.318228
USREC,0.0,0.0,0.0,0.0,0.0
ZLB_dummy,0.0,0.0,0.0,0.0,0.0
COVID_dummy,0.0,0.0,0.0,0.0,0.0
GDPC1,1.249098,-0.503467,1.598321,2.007021,3.250411
GDPPOT,2.000434,2.247706,1.996844,1.995021,2.286258
INDPRO,11.547378,-2.69759,0.377471,0.912129,4.225222


In [2]:
# a function to generate lag/rolling features
def feature_engineering(df, lags, windows, exclude_cols=None):
    """Append lagged and rolling-window features to a DataFrame.

    For every column of ``df`` that is not excluded, one shifted copy is
    created per entry in ``lags`` (named ``<col>_lag_<lag>``), and one
    rolling mean plus one rolling standard deviation is created per entry
    in ``windows`` (named ``<col>_roll_mean_<window>`` and
    ``<col>_roll_std_<window>``).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is never modified.
    lags : sequence of int
        Shift sizes passed to ``Series.shift``. May be empty.
    windows : sequence of int
        Window sizes passed to ``Series.rolling``. May be empty.
    exclude_cols : iterable of str, optional
        Columns that must not be used as a source for new features.
        Defaults to ``['PCEPILFE_YoY', 'PCEPILFE_MoM', 'UNRATE']``.

    Returns
    -------
    pandas.DataFrame
        If both ``lags`` and ``windows`` are empty, ``df`` itself is returned
        unchanged (same object, no rows dropped). Otherwise a new frame made
        of the original columns followed by the lag features and then the
        rolling features, with every row containing a NaN removed. That
        includes the leading rows left incomplete by ``shift``/``rolling``
        as well as any row that already had a NaN in ``df``.
    """
    # Nothing requested: hand the input back untouched.
    if len(lags) == 0 and len(windows) == 0:
        return df

    if exclude_cols is None:
        exclude_cols = ['PCEPILFE_YoY', 'PCEPILFE_MoM', 'UNRATE']
    excluded = set(exclude_cols)
    source_cols = [col for col in df.columns if col not in excluded]

    # Collect every new Series first and concatenate once at the end; this
    # avoids repeatedly inserting columns into the frame.
    new_features = []

    # Lagged variables (column-major, then lag).
    for col in source_cols:
        for lag in lags:
            new_features.append(df[col].shift(lag).rename(f'{col}_lag_{lag}'))

    # Rolling statistics (column-major, then window; mean before std).
    for col in source_cols:
        for window in windows:
            rolling = df[col].rolling(window=window)
            new_features.append(rolling.mean().rename(f'{col}_roll_mean_{window}'))
            new_features.append(rolling.std().rename(f'{col}_roll_std_{window}'))

    combined = pd.concat([df] + new_features, axis=1)

    # shift() and rolling() leave NaNs in the leading rows; drop incomplete rows.
    return combined.dropna()

## Decision Trees

### Define Regimes (High Inflation & High Unemployment)

In [3]:
# A period counts as "high" when the series is strictly above its median
# over all rows of df.
inflation_median_YoY, inflation_median_MoM, unemployment_median = (
    df[series].median() for series in ('PCEPILFE_YoY', 'PCEPILFE_MoM', 'UNRATE')
)

# Binary regime flags: 1 = above the median, 0 = at or below it.
df['high_inflation_YoY'] = df['PCEPILFE_YoY'].gt(inflation_median_YoY).astype(int)
df['high_inflation_MoM'] = df['PCEPILFE_MoM'].gt(inflation_median_MoM).astype(int)
df['high_unemployment'] = df['UNRATE'].gt(unemployment_median).astype(int)

# Report the thresholds that were used.
print(
    'Data Preparation Complete',
    f'Median Inflation (PCE, YoY): {inflation_median_YoY:.2f}',
    f'Median Inflation (PCE, MoM): {inflation_median_MoM:.2f}',
    f'Median Unemployment (UNRATE): {unemployment_median:.2f}',
    sep='\n',
)

Data Preparation Complete
Median Inflation (PCE, YoY): 1.72
Median Inflation (PCE, MoM): 1.82
Median Unemployment (UNRATE): 5.00


### Analysis for High Inflation

#### YoY

In [4]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [5]:
print('\n--- Analysis 1: Predicting High Inflation Periods (YoY) with Decision Tree ---')

# Modelling frame: 9-period rolling mean/std features, no lags.
# (This setting produced the 0.9254 test accuracy reported below.)
df_inf_YoY = feature_engineering(df, lags=[], windows=[9])

# Any column whose name starts with one of these prefixes is removed from the
# feature matrix: the raw target series, the regime flags, and everything
# derived from them.
prefixes_to_drop = [
    'PCEPILFE_YoY', 'PCEPILFE_MoM', 'UNRATE',
    'high_inflation_YoY', 'high_inflation_MoM', 'high_unemployment',
]
cols_to_drop = [
    col for col in df_inf_YoY.columns
    if col.startswith(tuple(prefixes_to_drop))
]

# Features (X) are what remains; the target (y) is the YoY regime flag.
features = df_inf_YoY.drop(columns=cols_to_drop)
X_inf_YoY = features
y_inf_YoY = df_inf_YoY['high_inflation_YoY']

# 70/30 stratified random split.
# NOTE(review): the split shuffles rows of a time series whose features are
# overlapping rolling windows, so neighbouring months can land on both sides
# of the split. This may inflate the test score; consider a chronological
# split and confirm.
X_inf_YoY_train, X_inf_YoY_test, y_inf_YoY_train, y_inf_YoY_test = train_test_split(
    X_inf_YoY,
    y_inf_YoY,
    test_size=0.3,
    random_state=42,
    stratify=y_inf_YoY,
)

# Preview the training split.
print('', '', 'Training Data', '', 'Features (X)', sep='\n')
display(X_inf_YoY_train.head().T)
print('', 'Target (y)', sep='\n')
display(y_inf_YoY_train.head().T)

# Preview the test split.
print('', '', 'Test Data', '', 'Features (X)', sep='\n')
display(X_inf_YoY_test.head().T)
print('', 'Target (y)', sep='\n')
display(y_inf_YoY_test.head().T)

# Standardise the continuous columns and leave the 0/1 dummies untouched.
# The transformed matrix lists the scaled columns first and the dummies last,
# so its column order differs from X_inf_YoY_train.columns.
dummy_cols = ['USREC', 'ZLB_dummy', 'COVID_dummy']
continuous_cols = [col for col in X_inf_YoY_train.columns if col not in dummy_cols]
preprocessor = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), continuous_cols),
        ('passthrough', 'passthrough', dummy_cols),
    ],
    remainder='passthrough',
)

# Fit the scaler on the training split only, then apply it to the test split.
X_inf_YoY_train_scaled = preprocessor.fit_transform(X_inf_YoY_train)
X_inf_YoY_test_scaled = preprocessor.transform(X_inf_YoY_test)

# Depth-limited decision tree with a fixed seed.
dt_inf_YoY = DecisionTreeClassifier(max_depth=5, random_state=42).fit(
    X_inf_YoY_train_scaled, y_inf_YoY_train
)
y_inf_YoY_pred = dt_inf_YoY.predict(X_inf_YoY_test_scaled)

# Evaluation on the held-out split.
print(f'\nModel Accuracy: {accuracy_score(y_inf_YoY_test, y_inf_YoY_pred):.4f}')
print('\nConfusion Matrix:', confusion_matrix(y_inf_YoY_test, y_inf_YoY_pred), sep='\n')
print(
    '\nClassification Report:',
    classification_report(
        y_inf_YoY_test, y_inf_YoY_pred,
        target_names=['Low Inflation', 'High Inflation'],
    ),
    sep='\n',
)


--- Analysis 1: Predicting High Inflation Periods (YoY) with Decision Tree ---


Training Data

Features (X)


Unnamed: 0,2019-04-01,2008-07-01,2021-08-01,2007-12-01,2011-02-01
NROU,4.502626,4.946077,4.420142,5.016878,4.832980
USREC,0.000000,1.000000,0.000000,0.000000,0.000000
ZLB_dummy,0.000000,0.000000,1.000000,0.000000,1.000000
COVID_dummy,0.000000,0.000000,0.000000,0.000000,0.000000
GDPC1,2.925297,1.910984,3.567698,2.512947,1.135752
...,...,...,...,...,...
T10YIE_roll_std_9,0.126808,0.068259,0.168498,0.066795,0.255336
PX_MD_roll_mean_9,2.711111,4.255556,3.711111,3.288889,2.877778
PX_MD_roll_std_9,0.169148,0.809492,0.822260,0.126930,0.376755
PX5_MD_roll_mean_9,2.477778,3.122222,2.766667,2.977778,2.822222



Target (y)


2019-04-01    0
2008-07-01    1
2021-08-01    1
2007-12-01    1
2011-02-01    0
Name: high_inflation_YoY, dtype: int64



Test Data

Features (X)


Unnamed: 0,2020-01-01,2020-05-01,2008-02-01,2014-12-01,2012-04-01
NROU,4.475293,4.463484,4.964320,4.719510,4.782985
USREC,0.000000,0.000000,1.000000,0.000000,0.000000
ZLB_dummy,0.000000,1.000000,0.000000,1.000000,1.000000
COVID_dummy,0.000000,1.000000,0.000000,0.000000,0.000000
GDPC1,3.584354,-31.252519,-1.033765,2.022933,3.490380
...,...,...,...,...,...
T10YIE_roll_std_9,0.088964,0.292991,0.057794,0.193444,0.145125
PX_MD_roll_mean_9,2.611111,2.500000,3.333333,3.066667,3.333333
PX_MD_roll_std_9,0.183333,0.331662,0.165831,0.200000,0.239792
PX5_MD_roll_mean_9,2.433333,2.411111,2.955556,2.800000,2.811111



Target (y)


2020-01-01    0
2020-05-01    0
2008-02-01    1
2014-12-01    0
2012-04-01    1
Name: high_inflation_YoY, dtype: int64


Model Accuracy: 0.9254

Confusion Matrix:
[[32  3]
 [ 2 30]]

Classification Report:
                precision    recall  f1-score   support

 Low Inflation       0.94      0.91      0.93        35
High Inflation       0.91      0.94      0.92        32

      accuracy                           0.93        67
     macro avg       0.93      0.93      0.93        67
  weighted avg       0.93      0.93      0.93        67



#### MoM

In [12]:
print('\n--- Analysis 2: Predicting High Inflation Periods (MoM) ---')

# Modelling frame: 4-period rolling mean/std features, no lags.
# (This setting produced the 0.6522 test accuracy reported below.)
df_inf_MoM = feature_engineering(df, lags=[], windows=[4])

# Any column whose name starts with one of these prefixes is removed from the
# feature matrix: the raw target series, the regime flags, and everything
# derived from them.
prefixes_to_drop = [
    'PCEPILFE_YoY', 'PCEPILFE_MoM', 'UNRATE',
    'high_inflation_YoY', 'high_inflation_MoM', 'high_unemployment',
]
cols_to_drop = [
    col for col in df_inf_MoM.columns
    if col.startswith(tuple(prefixes_to_drop))
]

# Features (X) are what remains; the target (y) is the MoM regime flag.
features = df_inf_MoM.drop(columns=cols_to_drop)
X_inf_MoM = features
y_inf_MoM = df_inf_MoM['high_inflation_MoM']

# 70/30 stratified random split.
# NOTE(review): the split shuffles rows of a time series whose features are
# overlapping rolling windows, so neighbouring months can land on both sides
# of the split. This may inflate the test score; consider a chronological
# split and confirm.
X_inf_MoM_train, X_inf_MoM_test, y_inf_MoM_train, y_inf_MoM_test = train_test_split(
    X_inf_MoM,
    y_inf_MoM,
    test_size=0.3,
    random_state=42,
    stratify=y_inf_MoM,
)

# Preview the training split.
print('', '', 'Training Data', '', 'Features (X)', sep='\n')
display(X_inf_MoM_train.head().T)
print('', 'Target (y)', sep='\n')
display(y_inf_MoM_train.head().T)

# Preview the test split.
print('', '', 'Test Data', '', 'Features (X)', sep='\n')
display(X_inf_MoM_test.head().T)
print('', 'Target (y)', sep='\n')
display(y_inf_MoM_test.head().T)

# Standardise the continuous columns and leave the 0/1 dummies untouched.
# The transformed matrix lists the scaled columns first and the dummies last,
# so its column order differs from X_inf_MoM_train.columns.
dummy_cols = ['USREC', 'ZLB_dummy', 'COVID_dummy']
continuous_cols = [col for col in X_inf_MoM_train.columns if col not in dummy_cols]
preprocessor = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), continuous_cols),
        ('passthrough', 'passthrough', dummy_cols),
    ],
    remainder='passthrough',
)

# Fit the scaler on the training split only, then apply it to the test split.
X_inf_MoM_train_scaled = preprocessor.fit_transform(X_inf_MoM_train)
X_inf_MoM_test_scaled = preprocessor.transform(X_inf_MoM_test)

# Depth-limited decision tree with a fixed seed.
dt_inf_MoM = DecisionTreeClassifier(max_depth=5, random_state=42).fit(
    X_inf_MoM_train_scaled, y_inf_MoM_train
)
y_inf_MoM_pred = dt_inf_MoM.predict(X_inf_MoM_test_scaled)

# Evaluation on the held-out split.
print(f'\nModel Accuracy: {accuracy_score(y_inf_MoM_test, y_inf_MoM_pred):.4f}')
print('\nConfusion Matrix:', confusion_matrix(y_inf_MoM_test, y_inf_MoM_pred), sep='\n')
print(
    '\nClassification Report:',
    classification_report(
        y_inf_MoM_test, y_inf_MoM_pred,
        target_names=['Low Inflation', 'High Inflation'],
    ),
    sep='\n',
)


--- Analysis 2: Predicting High Inflation Periods (MoM) ---


Training Data

Features (X)


Unnamed: 0,2018-08-01,2007-05-01,2014-12-01,2025-02-01,2009-06-01
NROU,4.527322,4.997741,4.719510,4.310591,5.013500
USREC,0.000000,0.000000,0.000000,0.000000,1.000000
ZLB_dummy,0.000000,0.000000,1.000000,0.000000,1.000000
COVID_dummy,0.000000,0.000000,0.000000,0.000000,0.000000
GDPC1,2.261293,2.473807,2.022933,1.249098,-0.714604
...,...,...,...,...,...
T10YIE_roll_std_4,0.017674,0.033639,0.154817,0.059073,0.318484
PX_MD_roll_mean_4,2.925000,3.150000,2.875000,3.250000,2.675000
PX_MD_roll_std_4,0.095743,0.173205,0.095743,0.759386,0.471699
PX5_MD_roll_mean_4,2.525000,3.000000,2.750000,3.225000,2.825000



Target (y)


2018-08-01    0
2007-05-01    0
2014-12-01    0
2025-02-01    1
2009-06-01    0
Name: high_inflation_MoM, dtype: int64



Test Data

Features (X)


Unnamed: 0,2019-10-01,2020-05-01,2008-07-01,2014-02-01,2013-11-01
NROU,4.484362,4.463484,4.946077,4.705600,4.716082
USREC,0.000000,0.000000,1.000000,0.000000,0.000000
ZLB_dummy,0.000000,1.000000,0.000000,1.000000,1.000000
COVID_dummy,0.000000,1.000000,0.000000,0.000000,0.000000
GDPC1,3.577553,-31.252519,1.910984,-0.562901,4.347356
...,...,...,...,...,...
T10YIE_roll_std_4,0.088160,0.279701,0.068547,0.031684,0.019105
PX_MD_roll_mean_4,2.650000,2.475000,5.050000,3.050000,3.050000
PX_MD_roll_std_4,0.129099,0.499166,0.173205,0.129099,0.173205
PX5_MD_roll_mean_4,2.450000,2.450000,3.300000,2.850000,2.900000



Target (y)


2019-10-01    0
2020-05-01    0
2008-07-01    1
2014-02-01    0
2013-11-01    1
Name: high_inflation_MoM, dtype: int64


Model Accuracy: 0.6522

Confusion Matrix:
[[21 14]
 [10 24]]

Classification Report:
                precision    recall  f1-score   support

 Low Inflation       0.68      0.60      0.64        35
High Inflation       0.63      0.71      0.67        34

      accuracy                           0.65        69
     macro avg       0.65      0.65      0.65        69
  weighted avg       0.65      0.65      0.65        69



### Analysis for High Unemployment

In [7]:
print('\n--- Analysis 3: Predicting High Unemployment Periods ---')

# Modelling frame: 8-period lags plus 3-period rolling mean/std features.
# (This setting produced the 0.9701 test accuracy reported below.)
df_unemp = feature_engineering(df, lags=[8], windows=[3])

# Any column whose name starts with one of these prefixes is removed from the
# feature matrix: the raw target series, the regime flags, and everything
# derived from them.
prefixes_to_drop = [
    'PCEPILFE_YoY', 'PCEPILFE_MoM', 'UNRATE',
    'high_inflation_YoY', 'high_inflation_MoM', 'high_unemployment',
]
cols_to_drop = [
    col for col in df_unemp.columns
    if col.startswith(tuple(prefixes_to_drop))
]

# Features (X) are what remains; the target (y) is the unemployment regime flag.
features = df_unemp.drop(columns=cols_to_drop)
X_unemp = features
y_unemp = df_unemp['high_unemployment']

# 70/30 stratified random split.
# NOTE(review): the split shuffles rows of a time series whose features are
# lags and overlapping rolling windows, so neighbouring months can land on
# both sides of the split. This may inflate the test score; consider a
# chronological split and confirm.
X_unemp_train, X_unemp_test, y_unemp_train, y_unemp_test = train_test_split(
    X_unemp,
    y_unemp,
    test_size=0.3,
    random_state=42,
    stratify=y_unemp,
)

# Preview the training split.
print('', '', 'Training Data', '', 'Features (X)', sep='\n')
display(X_unemp_train.head().T)
print('', 'Target (y)', sep='\n')
display(y_unemp_train.head().T)

# Preview the test split.
print('', '', 'Test Data', '', 'Features (X)', sep='\n')
display(X_unemp_test.head().T)
print('', 'Target (y)', sep='\n')
display(y_unemp_test.head().T)

# Standardise the continuous columns and leave the 0/1 dummies untouched.
# The transformed matrix lists the scaled columns first and the dummies last,
# so its column order differs from X_unemp_train.columns.
dummy_cols = ['USREC', 'ZLB_dummy', 'COVID_dummy']
continuous_cols = [col for col in X_unemp_train.columns if col not in dummy_cols]
preprocessor = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), continuous_cols),
        ('passthrough', 'passthrough', dummy_cols),
    ],
    remainder='passthrough',
)

# Fit the scaler on the training split only, then apply it to the test split.
X_unemp_train_scaled = preprocessor.fit_transform(X_unemp_train)
X_unemp_test_scaled = preprocessor.transform(X_unemp_test)

# Depth-limited decision tree with a fixed seed.
dt_unemp = DecisionTreeClassifier(max_depth=5, random_state=42).fit(
    X_unemp_train_scaled, y_unemp_train
)
y_unemp_pred = dt_unemp.predict(X_unemp_test_scaled)

# Evaluation on the held-out split.
print(f'\nModel Accuracy: {accuracy_score(y_unemp_test, y_unemp_pred):.4f}')
print('\nConfusion Matrix:', confusion_matrix(y_unemp_test, y_unemp_pred), sep='\n')
print(
    '\nClassification Report:',
    classification_report(
        y_unemp_test, y_unemp_pred,
        target_names=['Low Unemployment', 'High Unemployment'],
    ),
    sep='\n',
)


--- Analysis 3: Predicting High Unemployment Periods ---


Training Data

Features (X)


Unnamed: 0,2022-12-01,2013-01-01,2021-02-01,2011-03-01,2023-07-01
NROU,4.353271,4.751018,4.437063,4.973791,4.358013
USREC,0.000000,0.000000,0.000000,0.000000,0.000000
ZLB_dummy,0.000000,1.000000,1.000000,1.000000,0.000000
COVID_dummy,0.000000,0.000000,1.000000,0.000000,0.000000
GDPC1,3.311312,2.068054,1.998151,-0.948820,2.894413
...,...,...,...,...,...
T10YIE_roll_std_3,0.071744,0.048425,0.133465,0.066138,0.052058
PX_MD_roll_mean_3,4.766667,3.200000,2.933333,3.800000,3.633333
PX_MD_roll_std_3,0.404145,0.100000,0.404145,0.692820,0.493288
PX5_MD_roll_mean_3,2.966667,2.866667,2.633333,3.000000,3.033333



Target (y)


2022-12-01    0
2013-01-01    1
2021-02-01    1
2011-03-01    1
2023-07-01    0
Name: high_unemployment, dtype: int64



Test Data

Features (X)


Unnamed: 0,2024-06-01,2024-10-01,2014-10-01,2019-05-01,2010-12-01
NROU,4.321120,4.320291,4.678244,4.499611,4.982664
USREC,0.000000,0.000000,0.000000,0.000000,0.000000
ZLB_dummy,0.000000,0.000000,1.000000,0.000000,1.000000
COVID_dummy,0.000000,0.000000,0.000000,0.000000,0.000000
GDPC1,2.955918,1.813095,4.256934,3.529302,2.100441
...,...,...,...,...,...
T10YIE_roll_std_3,0.069626,0.102290,0.138375,0.058044,0.117551
PX_MD_roll_mean_3,3.166667,2.733333,3.033333,2.633333,2.900000
PX_MD_roll_std_3,0.152753,0.057735,0.152753,0.230940,0.173205
PX5_MD_roll_mean_3,3.000000,3.033333,2.833333,2.466667,2.800000



Target (y)


2024-06-01    0
2024-10-01    0
2014-10-01    1
2019-05-01    0
2010-12-01    1
Name: high_unemployment, dtype: int64


Model Accuracy: 0.9701

Confusion Matrix:
[[35  0]
 [ 2 30]]

Classification Report:
                   precision    recall  f1-score   support

 Low Unemployment       0.95      1.00      0.97        35
High Unemployment       1.00      0.94      0.97        32

         accuracy                           0.97        67
        macro avg       0.97      0.97      0.97        67
     weighted avg       0.97      0.97      0.97        67

