## Time Series Data Quality Test Plan Demo

### Set up ValidMind environment

In [None]:
# Load API key and secret from environment variables
%load_ext dotenv
%dotenv .env
import validmind as vm
vm.init(  api_host = "https://api.prod.validmind.ai/api/v1/tracking",
  project = "clhhzo21s006wl9rl0swhv40h")

### Load libraries

In [None]:
import glob
# ML libraries
import pandas as pd
# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Load data

In [None]:
def merge_fred_csv_files(file_pattern):
    """
    Merge every CSV file matching a glob pattern into a single DataFrame.

    Each file is read with its ``DATE`` column parsed as dates and used as
    the index. The resulting frames are concatenated column-wise
    (``axis=1``), so rows from different files are aligned on ``DATE``.

    :param file_pattern: Glob pattern selecting the CSV files to merge
    :return: DataFrame indexed by ``DATE`` holding the columns of all files,
        ordered by sorted file path
    :raises ValueError: If no file matches ``file_pattern``
    """
    # glob returns matches in a filesystem-dependent order; sort them so the
    # column order of the merged frame is reproducible across machines
    file_list = sorted(glob.glob(file_pattern))

    # Fail early with the offending pattern instead of letting pd.concat
    # raise its generic "No objects to concatenate" error
    if not file_list:
        raise ValueError(f"No CSV files found matching pattern: {file_pattern!r}")

    # Read each CSV file into a DataFrame indexed by its DATE column
    dataframes = [
        pd.read_csv(file, parse_dates=['DATE'], index_col='DATE')
        for file in file_list
    ]

    # Merge all the DataFrames into a single DataFrame, aligned on the index
    return pd.concat(dataframes, axis=1)


# Glob pattern for the raw FRED CSV files, relative to this notebook's location
file_pattern = './../../notebooks/datasets/time_series/raw/fred/*.csv'
# Merge all matching files into one DataFrame indexed by DATE
df = merge_fred_csv_files(file_pattern)
# Render the merged DataFrame in the notebook output
display(df)

In [None]:
# Columns kept for the rest of the notebook; MORTGAGE30US is used as the target later on
selected_cols = ['MORTGAGE30US', 'UNRATE', 'GS10', 'FEDFUNDS']
# Restrict the DataFrame to the selected columns (raises KeyError if one is missing)
df = df[selected_cols]

In [None]:
def plot_time_series(df, cols_to_plot=None, title=''):
    """
    Draw one line per selected column on a single shared set of axes.

    :param df: DataFrame with time-series data
    :param cols_to_plot: List of column names to draw. When None, every column of df is drawn.
    :param title: Text shown as the plot title, default is ''
    """
    # Fall back to every column when the caller gave no explicit selection
    selected = df.columns.tolist() if cols_to_plot is None else cols_to_plot
    subset = df[selected]

    # Apply the seaborn dark-grid style before creating the figure
    sns.set(style="darkgrid")
    plt.figure(figsize=(12, 6))

    # One labelled line per column, all on the current axes
    for name in subset.columns:
        sns.lineplot(data=subset[name], label=name)

    # Axis labels, title and legend, then render the figure
    plt.xlabel('Date')
    plt.ylabel('Value')
    plt.title(title)
    plt.legend()
    plt.show()


In [None]:
# Plot every selected series on one chart
plot_time_series(df, title='All Variables')
# Print the index, column dtypes and non-null counts of the DataFrame
df.info()

### Examine data quality using ValidMind framework

In [None]:
# Register the DataFrame with ValidMind, declaring MORTGAGE30US as the target column
vm_dataset = vm.init_dataset(
    dataset=df,
    target_column="MORTGAGE30US"
)

In [None]:
# List the test suites exposed by vm.test_suites
vm.test_suites.list_suites()

In [None]:
# Preview the first rows of the DataFrame before running the test suite
df.head()

In [None]:
# Parameter overrides passed to the suite
# NOTE(review): presumably keyed by test ID — confirm against the ValidMind docs
config = {
    "time_series_outliers": {
        "zscore_threshold": 3.5,
    },
    "time_series_missing_values": {
        "min_threshold": 2,
    },
}

# Run the "time_series_dataset" suite on the registered dataset with these overrides
test_suite = vm.run_test_suite(
    "time_series_dataset",
    dataset=vm_dataset,
    config=config,
)

### Adjust Frequencies, Remove missing values


In [None]:
# Resample to month-start ('MS') frequency taking the last value in each bin,
# then drop every row that still contains a missing value
df = df.resample('MS').last().dropna()

### Examine data quality again

In [None]:

# Re-register the resampled, NaN-free DataFrame with the same target column
vm_dataset = vm.init_dataset(
    dataset=df,
    target_column="MORTGAGE30US"
)
# Re-run the same suite, reusing the config defined earlier in the notebook
test_suite = vm.run_test_suite("time_series_dataset", dataset=vm_dataset, config=config)