# Macro to Micro Model Demo

## Introduction

#### Load Environment Variables and Libraries

In [None]:
# Load API key and secret from environment variables
%load_ext dotenv
%dotenv .env

import pandas as pd

%matplotlib inline

**Connect to ValidMind Project**

In [None]:

import os

import validmind as vm

# Credentials are read from the environment (populated from `.env` by the
# `%dotenv` magic) instead of being committed in the notebook.
# NOTE(review): VM_API_KEY / VM_API_SECRET are assumed variable names —
# confirm they match the entries in your .env file. The key and secret that
# were previously hard-coded here should be treated as leaked and rotated.
vm.init(
    api_host="http://localhost:3000/api/v1/tracking",
    api_key=os.environ["VM_API_KEY"],
    api_secret=os.environ["VM_API_SECRET"],
    project="clk2jf1yy0005o5y6u8a30v6l",
)

**Check Available Tests**

In [None]:
# Describe the "time_series_data_quality" test plan
vm.test_plans.describe_plan("time_series_data_quality")

## Data Description

#### Import Dataset

In [None]:
from validmind.datasets.regression import fred

# Target series and the explanatory series used throughout the notebook
target_column = 'DRSFRMACBS'
feature_columns = ['GDPC1', 'CSUSHPISA', 'UNRATE', 'CPIAUCSL', 'FEDFUNDS']

# Load the FRED data and keep the target followed by the feature columns
df = fred.load_all_data()[[target_column, *feature_columns]]

# Preview the last ten rows
df.tail(10)

#### Missing Values

In [None]:
from validmind.vm_models.test_context import TestContext
from validmind.tests.data_validation.TimeSeriesMissingValues import TimeSeriesMissingValues

# Wrap the raw DataFrame as a ValidMind dataset and build the test context
# that the data-quality tests in this section run against.
vm_df = vm.init_dataset(dataset=df)
test_context = TestContext(dataset=vm_df)

# Parameter passed to the missing-values test
params = {"min_threshold": 2}

# Run the test, log the result (top-level `await` needs a notebook/IPython
# session), then display it inline.
metric = TimeSeriesMissingValues(test_context, params)
metric.run()
await metric.result.log()
metric.result.show()

#### Outliers 

In [None]:
from validmind.tests.data_validation.TimeSeriesOutliers import TimeSeriesOutliers

# Parameter passed to the outlier test
params = {"zscore_threshold": 3}

# Run the test on the current test context, log the result, then display it
metric = TimeSeriesOutliers(test_context, params)
metric.run()
await metric.result.log()
metric.result.show()

#### Frequency

In [None]:
from validmind.tests.data_validation.TimeSeriesFrequency import TimeSeriesFrequency

# Run the frequency test with no explicit parameters, log the result, then
# display it
metric = TimeSeriesFrequency(test_context)
metric.run()
await metric.result.log()
metric.result.show()

## Data Preparation

In [None]:
# Aggregate to quarterly means ('QS-OCT' is a quarter-start frequency anchored
# on October, i.e. bins begin on 1 Jan, 1 Apr, 1 Jul and 1 Oct), drop rows with
# any missing value, take the first difference of every series, and drop the
# row left empty by differencing.
df = (
    df.resample('QS-OCT')
    .mean()
    .dropna()
    .diff()
    .dropna()
)

# Keep only observations dated before 2020
df = df.loc[df.index.year < 2020]

In [None]:
from validmind.vm_models import TestPlan

class DemoPlan(TestPlan):
    """Custom test plan grouping the three time-series data checks of this notebook."""

    # Identifier of the plan
    name = "data_description"
    # Context entries the plan declares as required
    required_context = ["dataset"]
    # Test classes that belong to the plan
    tests = [
        TimeSeriesMissingValues,
        TimeSeriesOutliers,
        TimeSeriesFrequency,
    ]

# Per-test parameter overrides for the plan. This has to be a mapping; the
# previous `{TimeSeriesMissingValues}` was a set literal containing the class.
# NOTE(review): the key is assumed to be the test's name — confirm it matches
# `TimeSeriesMissingValues.name` in the installed ValidMind version.
config = {"time_series_missing_values": {"min_threshold": 2}}

# Wrap the current DataFrame as a ValidMind dataset and run the plan on it
vm_df = vm.init_dataset(dataset=df)
test_context = TestContext(dataset=vm_df)
test_plan = DemoPlan(test_context=test_context, config=config)
test_plan.run()

In [None]:

from validmind.tests.data_validation.TimeSeriesMissingValues import TimeSeriesMissingValues

# Parameter passed to the missing-values test
params = {"min_threshold": 2}

# Run the test against the current test context and display the result;
# logging is disabled in this cell (the call below is commented out).
metric = TimeSeriesMissingValues(test_context, params)
metric.run()
# await metric.result.log()
metric.result.show()

In [None]:
from validmind.tests.data_validation.TimeSeriesOutliers import TimeSeriesOutliers

# Parameter passed to the outlier test
params = {"zscore_threshold": 3}

# Run the test against the current test context and display the result;
# logging is disabled in this cell (the call below is commented out).
metric = TimeSeriesOutliers(test_context, params)
metric.run()
# await metric.result.log()
metric.result.show()

In [None]:
from validmind.tests.data_validation.TimeSeriesFrequency import TimeSeriesFrequency

# Run the frequency test against the current test context, log the result,
# then display it
metric = TimeSeriesFrequency(test_context)
metric.run()
await metric.result.log()
metric.result.show()

## Data Sampling

#### Sampling Method

We use time series sampling to create our training and testing sets, a crucial step in our macro-to-micro model. This method maintains the temporal order of the data, preserving the inherent dependencies in our time series of macroeconomic indicators and default rates.

In [None]:
# Cut-off date between the training and testing periods
split_date = '2018-01-01'

# Chronological split: rows before the cut-off go to train, the rest to test
df_train = df[df.index < split_date]
df_test = df[df.index >= split_date]

# Separate the regressors (X) from the target (y) in each partition
X_train, y_train = df_train.drop(columns=target_column), df_train[target_column]
X_test, y_test = df_test.drop(columns=target_column), df_test[target_column]

# Rebuild each frame from its X and y parts (target becomes the last column)
df_train = pd.concat([X_train, y_train], axis=1)
df_test = pd.concat([X_test, y_test], axis=1)

## Univariate Analysis

In [None]:
# Describe the "time_series_univariate" test plan
vm.test_plans.describe_plan("time_series_univariate")

In [None]:
from validmind.tests.data_validation.TimeSeriesLinePlot import TimeSeriesLinePlot

# Wrap the training frame as a ValidMind dataset; the test context built here
# is reused by the following analysis cells.
vm_df_train = vm.init_dataset(dataset=df_train)
test_context = TestContext(dataset=vm_df_train)

# Run the line-plot test and display the result (logging is commented out)
metric = TimeSeriesLinePlot(test_context)
metric.run()
# await metric.result.log()
metric.result.show()

In [None]:
from validmind.tests.data_validation.TimeSeriesHistogram import TimeSeriesHistogram

# Run the histogram test and display the result (logging is commented out)
metric = TimeSeriesHistogram(test_context)
metric.run()
# await metric.result.log()
metric.result.show()

In [None]:
from validmind.tests.data_validation.ACFandPACFPlot import ACFandPACFPlot

# Run the ACF/PACF plot test and display the result (logging is commented out)
metric = ACFandPACFPlot(test_context)
metric.run()
# await metric.result.log()
metric.result.show()

In [None]:
from validmind.tests.data_validation.SeasonalDecompose import SeasonalDecompose

# Request an additive seasonal model
params = {"seasonal_model": 'additive'}

# Run the decomposition test and display the result (logging is commented out)
metric = SeasonalDecompose(test_context, params)
metric.run()
# await metric.result.log()
metric.result.show()

In [None]:
from validmind.tests.data_validation.AutoSeasonality import AutoSeasonality

# Range of seasonal periods to evaluate. The second key used to repeat
# "min_period", which silently overwrote the first entry and never set an
# upper bound.
params = {"min_period": 1,
          "max_period": 3}

# Run the test and display the result (logging is commented out)
metric = AutoSeasonality(test_context, params)
metric.run()
# await metric.result.log()
metric.result.show()

In [None]:
from validmind.tests.data_validation.AutoStationarity import AutoStationarity

# Parameters passed to the stationarity test
params = {"max_order": 3,
          "threshold": 0.05}

# Run the test and display the result (logging is commented out)
metric = AutoStationarity(test_context, params)
metric.run()
# await metric.result.log()
metric.result.show()

In [None]:
from validmind.tests.data_validation.RollingStatsPlot import RollingStatsPlot

# Rolling window size passed to the test
# NOTE(review): presumably 4 observations = one year of quarterly data — confirm
params = {"window_size": 4}

# Run the test and display the result (logging is commented out)
metric = RollingStatsPlot(test_context, params)
metric.run()
# await metric.result.log()
metric.result.show()


In [None]:
from validmind.tests.data_validation.AutoAR import AutoAR

# Maximum AR order passed to the test
params = {"max_ar_order": 2}

# Run the test and display the result (logging is commented out)
metric = AutoAR(test_context, params)
metric.run()
# await metric.result.log()
metric.result.show()

In [None]:
from validmind.tests.data_validation.AutoMA import AutoMA

# Maximum MA order passed to the test. The key was previously the AR test's
# "max_ar_order" (copy-paste), so the intended limit was not applied.
# NOTE(review): confirm "max_ma_order" against AutoMA.default_params in the
# installed ValidMind version.
params = {"max_ma_order": 2}

# Run the test and display the result (logging is commented out)
metric = AutoMA(test_context, params)
metric.run()
# await metric.result.log()
metric.result.show()

## Multivariate Analysis

In [None]:
# Describe the "time_series_multivariate" test plan
vm.test_plans.describe_plan("time_series_multivariate")

In [None]:
from validmind.tests.data_validation.ScatterPlot import ScatterPlot

# Run the scatter-plot test and display the result (logging is commented out)
metric = ScatterPlot(test_context)
metric.run()
# await metric.result.log()
metric.result.show()

In [None]:
from validmind.tests.data_validation.LaggedCorrelationHeatmap import LaggedCorrelationHeatmap

# Name the target column and the independent variables for the test
params = {"target_col": target_column,
          "independent_vars": feature_columns}

# The test object is built but not executed: run/log/show are all commented
# out below. NOTE(review): the reason is not recorded here — confirm before
# re-enabling.
metric = LaggedCorrelationHeatmap(test_context, params)
#metric.run()
# await metric.result.log()
#metric.result.show()

In [None]:
from validmind.tests.data_validation.EngleGrangerCoint import EngleGrangerCoint

# Threshold passed to the cointegration test
params = {"threshold": 0.05}

# Run the test and display the result (logging is commented out)
metric = EngleGrangerCoint(test_context, params)
metric.run()
# await metric.result.log()
metric.result.show()

In [None]:
from validmind.tests.data_validation.SpreadPlot import SpreadPlot

# Run the spread-plot test and display the result (logging is commented out)
metric = SpreadPlot(test_context)
metric.run()
# await metric.result.log()
metric.result.show()

## Feature Selection

## Feature Engineering

## Model Training

#### Fit Linear Regression Model

In [None]:
import statsmodels.api as sm

# Split the training frame into target and regressors
y_train = df_train[target_column]
X_train = df_train.drop(target_column, axis=1)

# Add a constant column so the regression includes an intercept term, then
# rebuild df_train so it carries the same columns the model is fitted on
X_train = sm.add_constant(X_train)
df_train = pd.concat([X_train, y_train], axis=1)

# Apply the same transformation to the test frame. df_test must be rebuilt
# from X_test / y_test: it was previously rebuilt from the training parts,
# which replaced the hold-out data with a copy of the training data.
y_test = df_test[target_column]
X_test = df_test.drop(target_column, axis=1)
X_test = sm.add_constant(X_test)
df_test = pd.concat([X_test, y_test], axis=1)

# Define the model
model = sm.OLS(y_train, X_train)

# Fit the model
model_fit = model.fit()

# Print out the statistics
print(model_fit.summary())


#### Remove Non-Significant Features

In [None]:
# Regressors removed from both partitions before the model is refitted
features_to_drop = ['GDPC1', 'CPIAUCSL']

# Drop in place, training frame first, so both frames keep matching columns
for _frame in (df_train, df_test):
    _frame.drop(columns=features_to_drop, inplace=True)

#### Update Model Fit 

In [None]:
# Rebuild the training regressors and target from the reduced training frame
X_train = df_train.drop(columns=target_column)
y_train = df_train[target_column]

# Re-estimate the OLS regression on the remaining regressors
model = sm.OLS(y_train, X_train)
model_fit = model.fit()

# Show the regression summary
print(model_fit.summary())

#### Create ValidMind Datasets and Model

In [None]:
# Wrap the current train and test frames as ValidMind datasets, declaring the
# target column for each
vm_train_ds = vm.init_dataset(dataset=df_train, type="generic", target_column=target_column)
vm_test_ds = vm.init_dataset(dataset=df_test, type="generic", target_column=target_column)

# Register the fitted regression together with its train and test datasets
vm_model = vm.init_model(
    model = model_fit, 
    train_ds=vm_train_ds, 
    test_ds=vm_test_ds)

In [None]:
# Describe the "regression_model_description" test plan
vm.test_plans.describe_plan("regression_model_description")

In [None]:
from validmind.tests.data_validation.DatasetSplit import DatasetSplit

# Build a test context around the ValidMind model (single-model form)
test_context = TestContext(model=vm_model)

# Run the dataset-split test and display the result (logging is commented out)
metric = DatasetSplit(test_context)
metric.run()
# await metric.result.log()
metric.result.show()

In [None]:
from validmind.tests.model_validation.ModelMetadata import ModelMetadata


# Run the model-metadata test on the current test context and display the
# result (logging is commented out)
metric = ModelMetadata(test_context)
metric.run()
# await metric.result.log()
metric.result.show()

In [None]:
# Describe the "regression_models_evaluation" test plan
vm.test_plans.describe_plan("regression_models_evaluation")

In [None]:
from validmind.tests.model_validation.statsmodels.RegressionModelsCoeffs import RegressionModelsCoeffs

# This test takes a list of models, so the context is built with `models=`
test_context = TestContext(models=[vm_model])
# Run the coefficients test and display the result (logging is commented out)
metric = RegressionModelsCoeffs(test_context)
metric.run()
# await metric.result.log()
metric.result.show()

In [None]:
from validmind.tests.model_validation.statsmodels.RegressionModelsPerformance import RegressionModelsPerformance

# Build a context holding the list of models to evaluate
test_context = TestContext(models=[vm_model])
# Run the performance test and display the result (logging is commented out)
metric = RegressionModelsPerformance(test_context)
metric.run()
# await metric.result.log()
metric.result.show()

## Model Evaluation

In [None]:
# Describe the "time_series_forecast" test plan
vm.test_plans.describe_plan("time_series_forecast")

In [None]:
from validmind.tests.model_validation.statsmodels.RegressionModelForecastPlotLevels import RegressionModelForecastPlotLevels

# Build a context holding the list of models to plot
test_context = TestContext(models=[vm_model])

# Request the "integrate" transformation for the plot
# NOTE(review): presumably this converts the differenced series back to
# levels — confirm against the test's documentation.
params = {"transformation": "integrate"}

# Pass `params` to the test; it was previously defined but never supplied, so
# the test ran with its defaults.
metric = RegressionModelForecastPlotLevels(test_context, params)
metric.run()
# await metric.result.log()
metric.result.show()