# Naïve Bayes 

## Read Data

In [1]:
from IPython.display import display_html
import pandas as pd

def display_side_by_side(*args, titles=()):
    """Render several DataFrames next to each other as inline HTML blocks.

    Parameters
    ----------
    *args
        Objects exposing ``to_html()`` (e.g. pandas DataFrames), shown left
        to right in the order given.
    titles : sequence of str, optional
        Heading for each frame, matched by position. A frame without a
        matching title is labelled ``DF<n>`` (1-based position).

    Returns
    -------
    None
        The markup is passed to ``display_html(..., raw=True)``.
    """
    panels = []
    for position, frame in enumerate(args):
        # Fall back to a generic label when fewer titles than frames were given.
        heading = titles[position] if position < len(titles) else f'DF{position+1}'
        panels.append(
            '<div style="display:inline-block; margin-right:20px;">'
            f'<h3>{heading}</h3>'
            f'{frame.to_html()}'
            '</div>'
        )
    display_html(''.join(panels), raw=True)

# Load the merged dataset; the first CSV column is used as the index.
df = pd.read_csv('../data/df_merged.csv', index_col=0)

# Preview the first and last five rows, transposed so each variable is a row.
head_preview = df.head().T
tail_preview = df.tail().T
display_side_by_side(head_preview, tail_preview, titles=['Data (Head)', 'Data (Tail)'])

Unnamed: 0,2006-04-01,2006-05-01,2006-06-01,2006-07-01,2006-08-01
PCEPILFE_YoY,2.362185,2.408566,2.595788,2.545755,2.671352
PCEPILFE_MoM,3.664276,3.008452,3.029509,1.16884,2.449328
UNRATE,4.7,4.6,4.6,4.7,4.7
NROU,5.045673,5.04203,5.011462,5.034624,5.030861
USREC,0.0,0.0,0.0,0.0,0.0
ZLB_dummy,0.0,0.0,0.0,0.0,0.0
COVID_dummy,0.0,0.0,0.0,0.0,0.0
GDPC1,2.98566,2.933985,1.035011,2.436585,2.040797
GDPPOT,2.470276,2.468015,2.063006,2.463421,2.461089
INDPRO,3.552753,0.301573,4.041034,-0.783927,5.208583

Unnamed: 0,2025-02-01,2025-03-01,2025-04-01,2025-05-01,2025-06-01
PCEPILFE_YoY,2.949316,2.699686,2.610938,2.732352,2.774467
PCEPILFE_MoM,5.71029,1.142409,2.02369,2.3937,3.153393
UNRATE,4.1,4.2,4.2,4.2,4.1
NROU,4.310591,4.320248,4.306004,4.303689,4.318228
USREC,0.0,0.0,0.0,0.0,0.0
ZLB_dummy,0.0,0.0,0.0,0.0,0.0
COVID_dummy,0.0,0.0,0.0,0.0,0.0
GDPC1,1.249098,-0.503467,1.598321,2.007021,3.250411
GDPPOT,2.000434,2.247706,1.996844,1.995021,2.286258
INDPRO,11.547378,-2.69759,0.377471,0.912129,4.225222


In [2]:
def feature_engineering(df, lags, windows,
                        cols_to_exclude=('PCEPILFE_YoY', 'PCEPILFE_MoM', 'UNRATE')):
    """Append lagged and rolling-window features to ``df``.

    For every column of ``df`` that is not listed in ``cols_to_exclude``:

    * each ``lag`` in ``lags`` adds ``'{col}_lag_{lag}'`` (``Series.shift(lag)``);
    * each ``window`` in ``windows`` adds ``'{col}_roll_mean_{window}'`` and
      ``'{col}_roll_std_{window}'``, computed with ``Series.rolling(window)``
      (a trailing window that includes the current row).

    New columns are appended after the original ones: all lag features first,
    then all rolling features.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is never modified.
    lags : sequence of int
        Shift distances; an empty sequence disables lag features.
    windows : sequence of int
        Rolling window sizes; an empty sequence disables rolling features.
    cols_to_exclude : collection of str, optional
        Columns that must not be used as a source for derived features.
        Defaults to the target series of this notebook.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself (same object) when both ``lags`` and ``windows`` are
        empty. Otherwise a new frame with the extra columns in which every
        row containing a NaN in *any* column has been dropped -- this removes
        the warm-up rows created by ``shift``/``rolling`` and also any row
        that already had missing values in the original columns.
    """
    # Nothing requested: hand back the input untouched.
    if len(lags) == 0 and len(windows) == 0:
        return df

    source_cols = [col for col in df.columns if col not in cols_to_exclude]

    # Lag features, ordered column-major then by lag.
    lagged = [
        df[col].shift(lag).rename(f'{col}_lag_{lag}')
        for col in source_cols
        for lag in lags
    ]

    # Rolling mean/std pairs, ordered column-major then by window.
    rolled = []
    for col in source_cols:
        for window in windows:
            roller = df[col].rolling(window=window)
            rolled.append(roller.mean().rename(f'{col}_roll_mean_{window}'))
            rolled.append(roller.std().rename(f'{col}_roll_std_{window}'))

    # Concatenate once (cheaper than inserting columns one at a time), then
    # drop the incomplete rows without mutating any intermediate in place.
    return pd.concat([df, *lagged, *rolled], axis=1).dropna()

## Naïve Bayes 

### Define Regimes (High Inflation & High Unemployment)

In [3]:
# Regime definition: an observation is "high" when it lies strictly above the
# median of its own series, computed over every row currently in df.
# Each flag is stored as a 0/1 integer column on df.

# Year-over-year core PCE inflation
inflation_median_YoY = df['PCEPILFE_YoY'].median()
df['high_inflation_YoY'] = df['PCEPILFE_YoY'].gt(inflation_median_YoY).astype(int)

# Month-over-month core PCE inflation
inflation_median_MoM = df['PCEPILFE_MoM'].median()
df['high_inflation_MoM'] = df['PCEPILFE_MoM'].gt(inflation_median_MoM).astype(int)

# Unemployment rate
unemployment_median = df['UNRATE'].median()
df['high_unemployment'] = df['UNRATE'].gt(unemployment_median).astype(int)

# Report the thresholds that were applied.
print('Data Preparation Complete')
print(f'Median Inflation (PCE, YoY): {inflation_median_YoY:.2f}')
print(f'Median Inflation (PCE, MoM): {inflation_median_MoM:.2f}')
print(f'Median Unemployment (UNRATE): {unemployment_median:.2f}')

Data Preparation Complete
Median Inflation (PCE, YoY): 1.72
Median Inflation (PCE, MoM): 1.82
Median Unemployment (UNRATE): 5.00


### Analysis for High Inflation

#### YoY (Year-over-Year)

In [4]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.compose import ColumnTransformer
import numpy as np

In [5]:
print('\n--- Analysis 1: Predicting High Inflation Periods (YoY) ---')

# Build the modelling frame: lags 1..13 of every non-target column, no rolling
# statistics. This configuration gave accuracy 0.8636 in the saved run.
df_inf_YoY = feature_engineering(df, lags=list(range(1, 14)), windows=[])

# Target series and regime flags (and anything derived from them, e.g. lagged
# flags) must not appear among the predictors, so drop every column whose name
# starts with one of these prefixes.
prefixes_to_drop = ['PCEPILFE_YoY', 'PCEPILFE_MoM', 'UNRATE', 'high_inflation_YoY', 'high_inflation_MoM', 'high_unemployment']
cols_to_drop = [col for col in df_inf_YoY.columns if col.startswith(tuple(prefixes_to_drop))]
features = df_inf_YoY.drop(columns=cols_to_drop)

# Define Features (X) and Target (y)
X_inf_YoY = features
y_inf_YoY = df_inf_YoY['high_inflation_YoY']

# Stratified 70/30 split.
# NOTE(review): train_test_split shuffles rows by default, so months adjacent
# to test months end up in the training set while the predictors contain
# lagged values of neighbouring months. This looks like temporal leakage that
# would inflate the reported scores -- consider a chronological split
# (shuffle=False) or TimeSeriesSplit; confirm intent before changing.
X_inf_YoY_train, X_inf_YoY_test, y_inf_YoY_train, y_inf_YoY_test = train_test_split(
    X_inf_YoY, y_inf_YoY, test_size=0.3, random_state=42, stratify=y_inf_YoY
)

# Preview the first rows of each split (features transposed: one row per variable).
for split_name, X_part, y_part in (
    ('Training Data', X_inf_YoY_train, y_inf_YoY_train),
    ('Test Data', X_inf_YoY_test, y_inf_YoY_test),
):
    print(f'\n\n{split_name}\n\nFeatures (X)')
    display(X_part.head().T)
    print('\nTarget (y)')
    display(y_part.head())

# Standardize continuous predictors; pass the three contemporaneous dummies
# through unchanged.
# NOTE(review): only the exact names in dummy_cols are treated as dummies, so
# derived columns such as 'USREC_lag_1' land in continuous_cols and are
# standardized as well -- confirm this is intended.
dummy_cols = ['USREC', 'ZLB_dummy', 'COVID_dummy']
continuous_cols = [col for col in X_inf_YoY_train.columns if col not in dummy_cols]
preprocessor = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), continuous_cols),    # Apply scaler to continuous columns
        ('passthrough', 'passthrough', dummy_cols)         # Leave dummy columns untouched
    ],
    remainder='passthrough' # Ensure no columns are accidentally dropped
)

# Fit the scaler on the training split only, then apply it to the test split.
X_inf_YoY_train_scaled = preprocessor.fit_transform(X_inf_YoY_train)
X_inf_YoY_test_scaled = preprocessor.transform(X_inf_YoY_test)

# Train and Evaluate the Naive Bayes Model
gnb_inf = GaussianNB()
gnb_inf.fit(X_inf_YoY_train_scaled, y_inf_YoY_train)
y_inf_YoY_pred = gnb_inf.predict(X_inf_YoY_test_scaled)

# Display evaluation results
print(f'\nModel Accuracy: {accuracy_score(y_inf_YoY_test, y_inf_YoY_pred):.4f}')
print('\nConfusion Matrix:')
print(confusion_matrix(y_inf_YoY_test, y_inf_YoY_pred))
print('\nClassification Report:')
print(classification_report(y_inf_YoY_test, y_inf_YoY_pred, target_names=['Low Inflation', 'High Inflation']))


--- Analysis 1: Predicting High Inflation Periods (YoY) ---


Training Data

Features (X)


Unnamed: 0,2024-01-01,2011-07-01,2020-02-01,2019-04-01,2016-02-01
NROU,4.342561,4.815260,4.472255,4.502626,4.624119
USREC,0.000000,0.000000,0.000000,0.000000,0.000000
ZLB_dummy,0.000000,1.000000,0.000000,0.000000,0.000000
COVID_dummy,0.000000,0.000000,0.000000,0.000000,0.000000
GDPC1,3.227165,0.821254,0.979738,2.925297,1.750628
...,...,...,...,...,...
PX5_MD_lag_9,3.000000,2.800000,2.600000,2.400000,2.800000
PX5_MD_lag_10,2.900000,2.700000,2.300000,2.600000,2.600000
PX5_MD_lag_11,2.900000,2.800000,2.500000,2.500000,2.800000
PX5_MD_lag_12,2.900000,2.900000,2.300000,2.500000,2.700000



Target (y)


2024-01-01    1
2011-07-01    1
2020-02-01    0
2019-04-01    0
2016-02-01    0
Name: high_inflation_YoY, dtype: int64



Test Data

Features (X)


Unnamed: 0,2018-06-01,2018-12-01,2023-01-01,2007-11-01,2012-03-01
NROU,4.466487,4.444159,4.373590,4.975451,4.926579
USREC,0.000000,0.000000,0.000000,0.000000,0.000000
ZLB_dummy,0.000000,0.000000,0.000000,0.000000,1.000000
COVID_dummy,0.000000,0.000000,0.000000,0.000000,0.000000
GDPC1,2.123301,0.566577,3.147075,2.097444,3.354363
...,...,...,...,...,...
PX5_MD_lag_9,2.500000,2.500000,3.000000,2.900000,3.000000
PX5_MD_lag_10,2.500000,2.500000,3.000000,3.000000,2.900000
PX5_MD_lag_11,2.600000,2.500000,3.000000,3.000000,2.900000
PX5_MD_lag_12,2.500000,2.400000,3.100000,3.000000,3.200000



Target (y)


2018-06-01    1
2018-12-01    1
2023-01-01    1
2007-11-01    1
2012-03-01    1
Name: high_inflation_YoY, dtype: int64


Model Accuracy: 0.8636

Confusion Matrix:
[[33  2]
 [ 7 24]]

Classification Report:
                precision    recall  f1-score   support

 Low Inflation       0.82      0.94      0.88        35
High Inflation       0.92      0.77      0.84        31

      accuracy                           0.86        66
     macro avg       0.87      0.86      0.86        66
  weighted avg       0.87      0.86      0.86        66



#### MoM (Month-over-Month)

In [6]:
print('\n--- Analysis 2: Predicting High Inflation Periods (MoM) ---')

# Build the modelling frame: 8-period rolling mean/std of every non-target
# column, no lags. This configuration gave accuracy 0.7059 in the saved run.
df_inf_MoM = feature_engineering(df, lags=[], windows=[8])

# Target series and regime flags (and anything derived from them) must not
# appear among the predictors, so drop every column whose name starts with one
# of these prefixes.
prefixes_to_drop = ['PCEPILFE_YoY', 'PCEPILFE_MoM', 'UNRATE', 'high_inflation_YoY', 'high_inflation_MoM', 'high_unemployment']
cols_to_drop = [col for col in df_inf_MoM.columns if col.startswith(tuple(prefixes_to_drop))]
features = df_inf_MoM.drop(columns=cols_to_drop)

# Define Features (X) and Target (y)
X_inf_MoM = features
y_inf_MoM = df_inf_MoM['high_inflation_MoM']

# Stratified 70/30 split.
# NOTE(review): train_test_split shuffles rows by default, while the rolling
# features of neighbouring months overlap heavily. This looks like temporal
# leakage between train and test -- consider a chronological split
# (shuffle=False) or TimeSeriesSplit; confirm intent before changing.
X_inf_MoM_train, X_inf_MoM_test, y_inf_MoM_train, y_inf_MoM_test = train_test_split(
    X_inf_MoM, y_inf_MoM, test_size=0.3, random_state=42, stratify=y_inf_MoM
)

# Preview the first rows of each split (features transposed: one row per variable).
for split_name, X_part, y_part in (
    ('Training Data', X_inf_MoM_train, y_inf_MoM_train),
    ('Test Data', X_inf_MoM_test, y_inf_MoM_test),
):
    print(f'\n\n{split_name}\n\nFeatures (X)')
    display(X_part.head().T)
    print('\nTarget (y)')
    display(y_part.head())

# Standardize continuous predictors; pass the three contemporaneous dummies
# through unchanged.
# NOTE(review): only the exact names in dummy_cols are treated as dummies, so
# derived columns such as 'USREC_roll_mean_8' land in continuous_cols and are
# standardized as well -- confirm this is intended.
dummy_cols = ['USREC', 'ZLB_dummy', 'COVID_dummy']
continuous_cols = [col for col in X_inf_MoM_train.columns if col not in dummy_cols]
preprocessor = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), continuous_cols),    # Apply scaler to continuous columns
        ('passthrough', 'passthrough', dummy_cols)         # Leave dummy columns untouched
    ],
    remainder='passthrough' # Ensure no columns are accidentally dropped
)

# Fit the scaler on the training split only, then apply it to the test split.
X_inf_MoM_train_scaled = preprocessor.fit_transform(X_inf_MoM_train)
X_inf_MoM_test_scaled = preprocessor.transform(X_inf_MoM_test)

# Custom SMOTE sampling strategy, derived from the training-set class counts.
y_train_counts = y_inf_MoM_train.value_counts()
count_class_0 = y_train_counts.get(0, 0)
count_class_1 = y_train_counts.get(1, 0)

# Identify the larger class; on a tie, class 1 is treated as the majority.
if count_class_1 < count_class_0:
    majority_label, majority_count = 0, count_class_0
    minority_label = 1
else:
    majority_label, majority_count = 1, count_class_1
    minority_label = 0

# NOTE(review): the smaller class is oversampled to TWICE the size of the
# larger one, which inverts the imbalance instead of removing it; GaussianNB
# then learns its class priors from the resampled counts. Confirm the 2.0
# factor is deliberate (a factor of 1.0 would balance the classes).
n_minority_new = int(majority_count * 2.0)

desired_strategy = {
    majority_label: majority_count,
    minority_label: n_minority_new
}

# Apply SMOTE to the TRAINING data only
# NOTE(review): SMOTE interpolates every column, including the 0/1 dummy
# columns, so synthetic rows may carry fractional dummy values.
smote = SMOTE(sampling_strategy=desired_strategy, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_inf_MoM_train_scaled, y_inf_MoM_train)

# Train and Evaluate the Naive Bayes Model
# NOTE(review): gnb_inf is also the name used for the YoY model, so running
# this cell replaces that fitted estimator.
gnb_inf = GaussianNB()
gnb_inf.fit(X_train_resampled, y_train_resampled)
y_inf_MoM_pred = gnb_inf.predict(X_inf_MoM_test_scaled)

# Display evaluation results
print(f'\nModel Accuracy: {accuracy_score(y_inf_MoM_test, y_inf_MoM_pred):.4f}')
print('\nConfusion Matrix:')
print(confusion_matrix(y_inf_MoM_test, y_inf_MoM_pred))
print('\nClassification Report:')
print(classification_report(y_inf_MoM_test, y_inf_MoM_pred, target_names=['Low Inflation', 'High Inflation']))


--- Analysis 2: Predicting High Inflation Periods (MoM) ---


Training Data

Features (X)


Unnamed: 0,2017-05-01,2020-06-01,2025-04-01,2016-02-01,2018-06-01
NROU,4.575059,4.397909,4.306004,4.624119,4.466487
USREC,0.000000,0.000000,0.000000,0.000000,0.000000
ZLB_dummy,0.000000,1.000000,0.000000,0.000000,0.000000
COVID_dummy,0.000000,1.000000,0.000000,0.000000,0.000000
GDPC1,2.728147,-31.635861,1.598321,1.750628,2.123301
...,...,...,...,...,...
T10YIE_roll_std_8,0.120410,0.308055,0.098456,0.149131,0.108874
PX_MD_roll_mean_8,2.487500,2.525000,3.737500,2.675000,2.737500
PX_MD_roll_std_8,0.155265,0.384522,1.417178,0.128174,0.140789
PX5_MD_roll_mean_8,2.450000,2.437500,3.437500,2.637500,2.487500



Target (y)


2017-05-01    0
2020-06-01    1
2025-04-01    1
2016-02-01    1
2018-06-01    0
Name: high_inflation_MoM, dtype: int64



Test Data

Features (X)


Unnamed: 0,2019-11-01,2020-10-01,2009-10-01,2014-03-01,2014-07-01
NROU,4.481298,4.448747,4.891111,4.783165,4.688606
USREC,0.000000,0.000000,0.000000,0.000000,0.000000
ZLB_dummy,0.000000,1.000000,1.000000,1.000000,1.000000
COVID_dummy,0.000000,1.000000,0.000000,0.000000,0.000000
GDPC1,3.488819,29.821405,2.349854,-1.380384,5.752823
...,...,...,...,...,...
T10YIE_roll_std_8,0.132726,0.287304,0.276607,0.024650,0.039584
PX_MD_roll_mean_8,2.650000,2.725000,2.687500,3.087500,3.175000
PX_MD_roll_std_8,0.151186,0.416619,0.379614,0.135620,0.103510
PX5_MD_roll_mean_8,2.437500,2.550000,2.850000,2.875000,2.837500



Target (y)


2019-11-01    0
2020-10-01    0
2009-10-01    1
2014-03-01    0
2014-07-01    1
Name: high_inflation_MoM, dtype: int64


Model Accuracy: 0.7059

Confusion Matrix:
[[30  5]
 [15 18]]

Classification Report:
                precision    recall  f1-score   support

 Low Inflation       0.67      0.86      0.75        35
High Inflation       0.78      0.55      0.64        33

      accuracy                           0.71        68
     macro avg       0.72      0.70      0.70        68
  weighted avg       0.72      0.71      0.70        68



### Analysis for High Unemployment

In [7]:
print("\n--- Analysis 3: Predicting High Unemployment Periods ---")

# Build the modelling frame: 12-period rolling mean/std of every non-target
# column, no lags. This configuration gave accuracy 0.8636 in the saved run.
df_unemp = feature_engineering(df, lags=[], windows=[12])

# Target series and regime flags (and anything derived from them) must not
# appear among the predictors, so drop every column whose name starts with one
# of these prefixes.
prefixes_to_drop = ['PCEPILFE_YoY', 'PCEPILFE_MoM', 'UNRATE', 'high_inflation_YoY', 'high_inflation_MoM', 'high_unemployment']
cols_to_drop = [col for col in df_unemp.columns if col.startswith(tuple(prefixes_to_drop))]
features = df_unemp.drop(columns=cols_to_drop)

# Define Features (X) and Target (y)
X_unemp = features
y_unemp = df_unemp['high_unemployment']

# Stratified 70/30 split.
# NOTE(review): train_test_split shuffles rows by default, while the rolling
# features of neighbouring months overlap heavily. This looks like temporal
# leakage between train and test -- consider a chronological split
# (shuffle=False) or TimeSeriesSplit; confirm intent before changing.
X_unemp_train, X_unemp_test, y_unemp_train, y_unemp_test = train_test_split(
    X_unemp, y_unemp, test_size=0.3, random_state=42, stratify=y_unemp
)

# Preview the first rows of each split (features transposed: one row per variable).
for split_name, X_part, y_part in (
    ('Training Data', X_unemp_train, y_unemp_train),
    ('Test Data', X_unemp_test, y_unemp_test),
):
    print(f'\n\n{split_name}\n\nFeatures (X)')
    display(X_part.head().T)
    print('\nTarget (y)')
    display(y_part.head())

# Standardize continuous predictors; pass the three contemporaneous dummies
# through unchanged.
# NOTE(review): only the exact names in dummy_cols are treated as dummies, so
# derived columns such as 'USREC_roll_mean_12' land in continuous_cols and are
# standardized as well -- confirm this is intended.
dummy_cols = ['USREC', 'ZLB_dummy', 'COVID_dummy']
continuous_cols = [col for col in X_unemp_train.columns if col not in dummy_cols]
preprocessor = ColumnTransformer(
    transformers=[
        ('scaler', StandardScaler(), continuous_cols),    # Apply scaler to continuous columns
        ('passthrough', 'passthrough', dummy_cols)         # Leave dummy columns untouched
    ],
    remainder='passthrough' # Ensure no columns are accidentally dropped
)

# Fit the scaler on the training split only, then apply it to the test split.
X_unemp_train_scaled = preprocessor.fit_transform(X_unemp_train)
X_unemp_test_scaled = preprocessor.transform(X_unemp_test)

# Train and Evaluate the Naive Bayes Model
gnb_unemp = GaussianNB()
gnb_unemp.fit(X_unemp_train_scaled, y_unemp_train)
y_unemp_pred = gnb_unemp.predict(X_unemp_test_scaled)

# Display evaluation results
print(f'\nModel Accuracy: {accuracy_score(y_unemp_test, y_unemp_pred):.4f}')
print('\nConfusion Matrix:')
print(confusion_matrix(y_unemp_test, y_unemp_pred))
print('\nClassification Report:')
print(classification_report(y_unemp_test, y_unemp_pred, target_names=['Low Unemployment', 'High Unemployment']))


--- Analysis 3: Predicting High Unemployment Periods ---


Training Data

Features (X)


Unnamed: 0,2010-08-01,2015-03-01,2023-08-01,2011-12-01,2013-01-01
NROU,4.854817,4.698048,4.355382,4.940132,4.751018
USREC,0.000000,0.000000,0.000000,0.000000,0.000000
ZLB_dummy,1.000000,1.000000,0.000000,1.000000,1.000000
COVID_dummy,0.000000,0.000000,0.000000,0.000000,0.000000
GDPC1,2.871684,3.601940,2.947611,4.492196,2.068054
...,...,...,...,...,...
T10YIE_roll_std_12,0.239970,0.238449,0.065400,0.219525,0.148673
PX_MD_roll_mean_12,2.733333,2.991667,4.150000,3.633333,3.258333
PX_MD_roll_std_12,0.238683,0.242930,0.614225,0.528004,0.260971
PX5_MD_roll_mean_12,2.816667,2.791667,2.950000,2.875000,2.841667



Target (y)


2010-08-01    1
2015-03-01    1
2023-08-01    0
2011-12-01    1
2013-01-01    1
Name: high_unemployment, dtype: int64



Test Data

Features (X)


Unnamed: 0,2021-08-01,2025-06-01,2021-06-01,2015-10-01,2014-04-01
NROU,4.420142,4.318228,4.380998,4.637622,4.698901
USREC,0.000000,0.000000,0.000000,0.000000,0.000000
ZLB_dummy,1.000000,0.000000,1.000000,1.000000,1.000000
COVID_dummy,0.000000,0.000000,0.000000,0.000000,0.000000
GDPC1,3.567698,3.250411,6.280179,1.853158,0.052786
...,...,...,...,...,...
T10YIE_roll_std_12,0.292916,0.096249,0.340626,0.138746,0.055880
PX_MD_roll_mean_12,3.450000,3.933333,3.183333,2.758333,3.091667
PX_MD_roll_std_12,0.846920,1.506551,0.637942,0.124011,0.116450
PX5_MD_roll_mean_12,2.708333,3.475000,2.675000,2.700000,2.875000



Target (y)


2021-08-01    1
2025-06-01    0
2021-06-01    1
2015-10-01    0
2014-04-01    1
Name: high_unemployment, dtype: int64


Model Accuracy: 0.8636

Confusion Matrix:
[[34  0]
 [ 9 23]]

Classification Report:
                   precision    recall  f1-score   support

 Low Unemployment       0.79      1.00      0.88        34
High Unemployment       1.00      0.72      0.84        32

         accuracy                           0.86        66
        macro avg       0.90      0.86      0.86        66
     weighted avg       0.89      0.86      0.86        66

