# Time Series Data Validation Demo

## Setup 

### Import Libraries

In [1]:
# Load API key and secret from environment variables
from dotenv import load_dotenv
load_dotenv()

# System libraries
import glob
import os
import pickle

# ML libraries
import pandas as pd

# ValidMind libraries 
import validmind as vm

### Connect to ValidMind MRM Platform

In [2]:
vm.init(
  api_host = "http://localhost:3000/api/v1/tracking",
  api_key = "e22b89a6b9c2a27da47cb0a09febc001",
  api_secret = "a61be901b5596e3c528d94231e4a3c504ef0bb803d16815f8dfd6857fac03e57",
  project = "clgo0g0rt0000fjy6ozl9pb69"
)

Connected to ValidMind


## Data Collection

In [3]:
file = '../datasets/time_series/fred_loan_rates.csv'
raw_df = pd.read_csv(file, parse_dates=['DATE'], index_col='DATE')
display(raw_df)

Unnamed: 0_level_0,GDPC1,GS5,GS10,GS3,MORTGAGE30US,UNRATE,CPIAUCSL,FEDFUNDS,GDP
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1947-01-01,2034.450,,,,,,21.48,,243.164
1947-02-01,,,,,,,21.62,,
1947-03-01,,,,,,,22.00,,
1947-04-01,2029.024,,,,,,22.00,,245.968
1947-05-01,,,,,,,21.95,,
...,...,...,...,...,...,...,...,...,...
2023-04-01,,,3.46,,,,,,
2023-04-06,,,,,6.28,,,,
2023-04-13,,,,,6.27,,,,
2023-04-20,,,,,6.39,,,,


In [4]:
selected_cols = ['MORTGAGE30US', 'UNRATE', 'GS10', 'FEDFUNDS'] 
raw_df = raw_df[selected_cols]

## Data Preparation

### List of Available Test Plans

In [5]:
vm.test_plans.list_plans()

ID,Name,Description
sklearn_classifier_metrics,SKLearnClassifierMetrics,Test plan for sklearn classifier metrics
sklearn_classifier_validation,SKLearnClassifierPerformance,Test plan for sklearn classifier models
sklearn_classifier_model_diagnosis,SKLearnClassifierDiagnosis,Test plan for sklearn classifier model diagnosis tests
sklearn_classifier,SKLearnClassifier,Test plan for sklearn classifier models that includes  both metrics and validation tests
tabular_dataset,TabularDataset,Test plan for generic tabular datasets
tabular_dataset_description,TabularDatasetDescription,Test plan to extract metadata and descriptive  statistics from a tabular dataset
tabular_data_quality,TabularDataQuality,Test plan for data quality on tabular datasets
normality_test_plan,NormalityTestPlan,Test plan to perform normality tests.
autocorrelation_test_plan,AutocorrelationTestPlan,Test plan to perform autocorrelation tests.
seasonality_test_plan,SesonalityTestPlan,Test plan to perform seasonality tests.


### Data Quality

#### Run Data Quality Test Plan

In [6]:
vm_dataset = vm.init_dataset(
    dataset=raw_df,
    target_column="MORTGAGE30US"
)

Pandas dataset detected. Initializing VM Dataset instance...
Inferring dataset types...


In [7]:
vm.test_plans.describe_plan("time_series_data_quality")

Attribute,Value
ID,time_series_data_quality
Name,TimeSeriesDataQuality
Description,Test plan for data quality on time series datasets
Required Context,['dataset']
Tests,"TimeSeriesOutliers (ThresholdTest), TimeSeriesMissingValues (ThresholdTest), TimeSeriesFrequency (ThresholdTest)"
Test Plans,[]


In [8]:
config={
    "time_series_outliers": {
        "zscore_threshold": 3,

    },
    "time_series_missing_values":{
        "min_threshold": 2,
    }
}

plan = vm.run_test_plan("time_series_data_quality", dataset=vm_dataset, config=config)

                                                                                                                                       

Handling Frequencies.

In [9]:
def identify_frequencies(df):
    """
    Identify the frequency of each series in the DataFrame.

    :param df: Time-series DataFrame
    :return: DataFrame with two columns: 'Variable' and 'Frequency'
    """
    frequencies = []
    for column in df.columns:
        series = df[column].dropna()
        if not series.empty:
            freq = pd.infer_freq(series.index)
            if freq == 'MS' or freq == 'M':
                label = 'Monthly'
            elif freq == 'Q':
                label = 'Quarterly'
            elif freq == 'A':
                label = 'Yearly'
            else:
                label = freq
        else:
            label = None

        frequencies.append({'Variable': column, 'Frequency': label})

    freq_df = pd.DataFrame(frequencies)

    return freq_df

In [10]:
frequencies = identify_frequencies(raw_df)
display(frequencies)

Unnamed: 0,Variable,Frequency
0,MORTGAGE30US,
1,UNRATE,Monthly
2,GS10,Monthly
3,FEDFUNDS,Monthly


Resample.

In [11]:
preprocessed_df = raw_df.resample('MS').last()
frequencies = identify_frequencies(preprocessed_df)
display(frequencies)

Unnamed: 0,Variable,Frequency
0,MORTGAGE30US,Monthly
1,UNRATE,Monthly
2,GS10,Monthly
3,FEDFUNDS,Monthly


Run Data Quality Test Plan.

In [12]:
vm_dataset = vm.init_dataset(
    dataset=preprocessed_df,
    target_column="MORTGAGE30US"
)
plan = vm.run_test_plan("time_series_data_quality", dataset=vm_dataset, config=config)

Pandas dataset detected. Initializing VM Dataset instance...
Inferring dataset types...


                                                                                                                                       

Remove missing values.

In [13]:
preprocessed_df = preprocessed_df.dropna()

Run Data Quality Test Plan. 

In [14]:
vm_dataset = vm.init_dataset(
    dataset=preprocessed_df,
    target_column="MORTGAGE30US"
)
plan = vm.run_test_plan("time_series_data_quality", dataset=vm_dataset, config=config)

Pandas dataset detected. Initializing VM Dataset instance...
Inferring dataset types...


                                                                                                                                       

## Exploratory Data Analysis

### Univariate Analysis

#### Run Time Series Univariate Test Plan

In [15]:
vm.test_plans.describe_plan("time_series_univariate")

Attribute,Value
ID,time_series_univariate
Name,TimeSeriesUnivariate
Description,Test plan to perform time series univariate analysis.
Required Context,['dataset']
Tests,"TimeSeriesLinePlot (Metric), TimeSeriesHistogram (Metric), ACFandPACFPlot (Metric), RollingStatsPlot (Metric)"
Test Plans,[]


In [16]:
target_column = ['MORTGAGE30US']
feature_columns = ['UNRATE', 'GS10', 'FEDFUNDS']

test_plan_config = {
    "time_series_line_plot": {
        "columns": target_column + feature_columns
    },
    "time_series_histogram": {
        "columns": target_column + feature_columns
    },
    "acf_pacf_plot": {
        "columns": target_column + feature_columns
    },
    "auto_ar": {
        "max_ar_order": 3
    },
    "auto_ma": {
        "max_ma_order": 3
    },
    "seasonal_decompose": {
        "seasonal_model": 'additive',
         "fig_size": (40,30)
    },
    "auto_seasonality": {
        "min_period": 1,
        "max_period": 3
    },
      "auto_stationarity": {
        "max_order": 3,
        "threshold": 0.05
    },
      "rolling_stats_plot": {
        "window_size": 12    
    },
}

vm_dataset = vm.init_dataset(
    dataset=preprocessed_df,
    target_column="MORTGAGE30US"
)
vm.run_test_plan("time_series_univariate", config=test_plan_config, dataset=vm_dataset)

Pandas dataset detected. Initializing VM Dataset instance...
Inferring dataset types...


Running Metric: acf_pacf_plot:  50%|█████     | 2/4 [00:00<00:00,  5.01it/s]        The default method 'yw' can produce PACF values outside of the [-1,1] interval. After 0.13, the default will change tounadjusted Yule-Walker ('ywm'). You can use this method now by setting method='ywm'.
                                                                                                                             

TimeSeriesUnivariate(test_context=TestContext(dataset=Dataset(raw_dataset=            MORTGAGE30US  UNRATE  GS10  FEDFUNDS
DATE                                            
1971-04-01          7.29     5.9  5.83      4.16
1971-05-01          7.46     5.9  6.39      4.63
1971-06-01          7.54     5.9  6.52      4.91
1971-07-01          7.69     6.0  6.73      5.31
1971-08-01          7.69     6.1  6.58      5.57
...                  ...     ...   ...       ...
2022-11-01          6.58     3.6  3.89      3.78
2022-12-01          6.42     3.5  3.62      4.10
2023-01-01          6.13     3.4  3.53      4.33
2023-02-01          6.50     3.6  3.75      4.57
2023-03-01          6.32     3.5  3.66      4.65

[624 rows x 4 columns], fields=[{'id': 'MORTGAGE30US', 'type': 'Numeric'}, {'id': 'UNRATE', 'type': 'Numeric'}, {'id': 'GS10', 'type': 'Numeric'}, {'id': 'FEDFUNDS', 'type': 'Numeric'}], sample=[{'id': 'head', 'data': [{'MORTGAGE30US': 7.29, 'UNRATE': 5.9, 'GS10': 5.83, 'FEDFUNDS': 4.16}

### Multivariate Analysis

#### Run Time Series Multivariate Test Plan

In [17]:
vm.test_plans.describe_plan("time_series_multivariate")

Attribute,Value
ID,time_series_multivariate
Name,TimeSeriesMultivariate
Description,Test plan to perform time series multivariate analysis.
Required Context,['dataset']
Tests,"ScatterPlot (Metric), LaggedCorrelationHeatmap (Metric), SpreadPlot (Metric)"
Test Plans,[]


In [18]:
test_plan_config = {
    "scatter_plot": {
        "columns": target_column + feature_columns
    },
    "lagged_correlation_heatmap": {
        "target_col": target_column,
        "independent_vars": feature_columns
    },
    "engle_granger_coint": {
        "threshold": 0.05
    },
}

vm.run_test_plan("time_series_multivariate", config=test_plan_config, dataset=vm_dataset)

                                                                                                                                  

TimeSeriesMultivariate(test_context=TestContext(dataset=Dataset(raw_dataset=            MORTGAGE30US  UNRATE  GS10  FEDFUNDS
DATE                                            
1971-04-01          7.29     5.9  5.83      4.16
1971-05-01          7.46     5.9  6.39      4.63
1971-06-01          7.54     5.9  6.52      4.91
1971-07-01          7.69     6.0  6.73      5.31
1971-08-01          7.69     6.1  6.58      5.57
...                  ...     ...   ...       ...
2022-11-01          6.58     3.6  3.89      3.78
2022-12-01          6.42     3.5  3.62      4.10
2023-01-01          6.13     3.4  3.53      4.33
2023-02-01          6.50     3.6  3.75      4.57
2023-03-01          6.32     3.5  3.66      4.65

[624 rows x 4 columns], fields=[{'id': 'MORTGAGE30US', 'type': 'Numeric'}, {'id': 'UNRATE', 'type': 'Numeric'}, {'id': 'GS10', 'type': 'Numeric'}, {'id': 'FEDFUNDS', 'type': 'Numeric'}], sample=[{'id': 'head', 'data': [{'MORTGAGE30US': 7.29, 'UNRATE': 5.9, 'GS10': 5.83, 'FEDFUNDS': 4.1