# Deposits Forecast Model using Random Forest

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from statsmodels.tsa.ar_model import AutoReg

In [2]:
from validmind.datasets.regression import fred_deposits as demo_dataset

# Load the demo series. Only deposits_df and fedfunds_df are used in the cells
# shown here; the remaining frames are unpacked but left unused.
(
    deposits_df,
    deposits_seasonality_df,
    fedfunds_df,
    tb3ms_df,
    gs10_df,
    gs30_df,
) = demo_dataset.load_data()

target_column = demo_dataset.target_column

# Work on a copy so the frame returned by the loader is not modified.
df = deposits_df.copy()

# Standard deviation of the Gaussian noise added below; raise or lower it to
# control how closely the synthetic feature tracks the target.
noise_level = 50

# Seed the global NumPy RNG first so the noise draw is reproducible.
np.random.seed(42)
noise = np.random.normal(loc=0, scale=noise_level, size=len(df))

# Synthetic feature: the target series itself plus the noise drawn above.
df['synthetic_var'] = df[target_column] + noise

# Add the FEDFUNDS column taken from fedfunds_df.
df['FEDFUNDS'] = fedfunds_df['FEDFUNDS']

selected_variables = [target_column, 'synthetic_var', 'FEDFUNDS']

In [3]:
def create_variable_figure(df, variable_name):
    """Plot one column of ``df`` as a blue line against the frame's index and show it.

    Args:
        df: DataFrame holding the series; its index supplies the x-axis values.
        variable_name: Name of the column to plot. It is also used as the trace
            name, the y-axis label and inside the figure title.

    Returns:
        None. The figure is rendered with ``fig.show()`` and not returned.
    """
    # Build the line trace for the requested column.
    series_trace = go.Scatter(
        x=df.index,
        y=df[variable_name],
        mode='lines',
        name=variable_name,
        line={'color': 'blue'},
    )

    fig = go.Figure()
    fig.add_trace(series_trace)

    # Title and axis labels so the chart stands on its own.
    fig.update_layout(
        title=f'Time Series of {variable_name}',
        xaxis_title='Date',
        yaxis_title=variable_name,
        legend_title='Legend',
    )

    fig.show()

In [4]:
# Render one time-series chart per selected column (the target column is
# part of the list, so it gets a chart as well).
variables = selected_variables
for variable in variables:
    create_variable_figure(df, variable)

In [5]:
# Model first differences instead of levels. diff() leaves NaN in the first
# row (it has no predecessor); dropna() then removes every row containing NaN.
diff_df = df.diff()
diff_df = diff_df.dropna()

In [6]:
# Display the differenced frame as this cell's output.
diff_df

Unnamed: 0_level_0,DPSACBW027NBOG,synthetic_var,FEDFUNDS
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-02-01,-5.65550,-37.404423,0.02
2010-03-01,25.17240,64.470042,0.03
2010-04-01,-0.99040,42.776666,0.04
2010-05-01,-54.65600,-142.515162,0.00
2010-06-01,21.36640,21.367221,-0.02
...,...,...,...
2022-08-01,28.55575,33.353518,0.65
2022-09-01,-106.20400,-157.527647,0.23
2022-10-01,-166.67950,-121.065579,0.52
2022-11-01,36.05110,39.092039,0.70


In [7]:
# Chronological hold-out: shuffling is disabled, so row order is preserved and
# the last 20% of the differenced rows become the test set.
train_df, test_df = train_test_split(diff_df, shuffle=False, test_size=0.2)

In [8]:
# Preview the first five rows of the training split.
train_df.head(n=5)

Unnamed: 0_level_0,DPSACBW027NBOG,synthetic_var,FEDFUNDS
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-02-01,-5.6555,-37.404423,0.02
2010-03-01,25.1724,64.470042,0.03
2010-04-01,-0.9904,42.776666,0.04
2010-05-01,-54.656,-142.515162,0.0
2010-06-01,21.3664,21.367221,-0.02


In [9]:
# Preview the first five rows of the test split.
test_df.head(n=5)

Unnamed: 0_level_0,DPSACBW027NBOG,synthetic_var,FEDFUNDS
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2020-06-01,213.7225,293.902427,0.03
2020-07-01,78.83195,-80.217648,0.01
2020-08-01,-6.60295,14.60898,0.01
2020-09-01,145.95435,179.251805,-0.01
2020-10-01,81.3394,51.183049,0.0


In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Separate the target from the features: every differenced column except the
# target is used as a predictor.
X_train, y_train = train_df.drop(columns=[target_column]), train_df[target_column]
X_test, y_test = test_df.drop(columns=[target_column]), test_df[target_column]

# Fit a 100-tree Random Forest; random_state is fixed for repeatable results.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predicted differences for the hold-out period.
y_pred = rf_model.predict(X_test)

# Error metrics on the hold-out period (computed on differences, not levels).
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'R-squared (R²): {r2}')

Mean Squared Error (MSE): 9986.703402738682
Root Mean Squared Error (RMSE): 99.93349489905114
Mean Absolute Error (MAE): 80.11810803225775
R-squared (R²): 0.37670327087694044


In [11]:
# Compare predicted and actual differenced values over the test period (Plotly)
import plotly.graph_objects as go

# Actual values: dotted blue line with markers.
actual_trace = go.Scatter(
    x=test_df.index,
    y=y_test,
    mode='lines+markers',
    name='Actual',
    line={'dash': 'dot', 'color': 'blue'},
)

# Predicted values: solid red line with markers.
predicted_trace = go.Scatter(
    x=test_df.index,
    y=y_pred,
    mode='lines+markers',
    name='Predicted',
    line={'color': 'red'},
)

# Assemble the figure, actual series first so it is listed first in the legend.
fig = go.Figure()
fig.add_trace(actual_trace)
fig.add_trace(predicted_trace)

fig.update_layout(
    title='Predicted vs Actual Values',
    xaxis_title='Date',
    yaxis_title=target_column,
    legend_title='Legend',
)

fig.show()

In [12]:
# Rebuild level series from the first differences of the test period.
#
# The anchor is the observed level on the last training date: adding the first
# test-period difference to it yields the level on the first test date. The
# anchor is looked up by date label instead of by row position, so it stays
# correct even if dropna() removed more than the leading row when the
# differenced frame was built. The column is taken from target_column rather
# than a hard-coded name, matching how the target is referenced elsewhere.
# NOTE(review): assumes df's date index is unique, so .loc returns a scalar — confirm.
last_train_date = train_df.index[-1]
initial_level = df[target_column].loc[last_train_date]

# Prepend the anchor and accumulate. Element 0 is the anchor itself, so only
# elements 1.. line up with the test dates. Prediction errors accumulate along
# the cumulative sum.
y_pred_levels = np.concatenate([[initial_level], y_pred]).cumsum()

# Same reconstruction applied to the actual test differences.
y_test_levels = np.concatenate([[initial_level], y_test]).cumsum()

# Create the plot for levels
fig_levels = go.Figure()

# Actual levels: dotted blue line with markers (anchor element skipped).
fig_levels.add_trace(go.Scatter(
    x=test_df.index,
    y=y_test_levels[1:],
    mode='lines+markers',
    name='Actual Levels',
    line=dict(dash='dot', color='blue')
))

# Predicted levels: solid red line with markers (anchor element skipped).
fig_levels.add_trace(go.Scatter(
    x=test_df.index,
    y=y_pred_levels[1:],
    mode='lines+markers',
    name='Predicted Levels',
    line=dict(color='red')
))

# Title and axis labels; the y-axis label follows the target column name.
fig_levels.update_layout(
    title='Predicted vs Actual Levels',
    xaxis_title='Date',
    yaxis_title=f'Levels of {target_column}',
    legend_title='Legend'
)

# Show the plot for levels
fig_levels.show()