In [1]:
# Importing dependencies
import numpy as np
import pandas as pd
from feast import FeatureStore
from feast.dqm.errors import ValidationFailed
from feast.dqm.profilers.ge_profiler import ge_profiler
from great_expectations.core.expectation_suite import ExpectationSuite
from great_expectations.dataset import PandasDataset

  def add_data_context_id_to_url(self, jinja_context, url, add_datetime=True):



In [122]:
# Open the feature repository that lives in the "driver_stats/" directory
store = FeatureStore(
    repo_path="driver_stats/",
)

# Load the saved dataset registered under the name 'driver_stats';
# it is used below as the reference data for profiling
dataset = store.get_saved_dataset(
    'driver_stats',
)

In [153]:
# Relative tolerance around the reference mean (0.1 -> mean * 0.9 .. mean * 1.1)
DELTA = 0.1


# Creating a profiler function
@ge_profiler
def stats_profiler(ds: PandasDataset) -> ExpectationSuite:
    """Build the expectation suite for the ``avg_daily_trips`` column.

    Two expectations are derived from the dataset passed in:

    * values lie between the observed minimum and maximum of the column,
      with ``mostly=0.99`` passed through to Great Expectations;
    * the column mean lies within a relative ``DELTA`` band around the
      observed mean.

    Args:
        ds: Dataset to profile; must contain an ``avg_daily_trips`` column.

    Returns:
        The expectation suite accumulated on ``ds``, including expectations
        that failed during profiling (``discard_failed_expectations=False``).
    """
    trips = ds["avg_daily_trips"]

    # Expected value range: the extremes observed in the profiled dataset
    ds.expect_column_values_to_be_between(
        column="avg_daily_trips",
        mostly=0.99,
        min_value=trips.min(),
        max_value=trips.max(),
    )

    # Expected mean range: the observed mean +/- DELTA (relative).
    # The two bounds are ordered explicitly because scaling a negative mean
    # by (1 - DELTA) and (1 + DELTA) would otherwise yield min_value > max_value,
    # i.e. an expectation that can never be satisfied.
    observed_mean = trips.mean()
    mean_low, mean_high = sorted(
        (observed_mean * (1 - DELTA), observed_mean * (1 + DELTA))
    )
    ds.expect_column_mean_to_be_between(
        column="avg_daily_trips",
        min_value=mean_low,
        max_value=mean_high,
    )

    # Return everything registered above, keeping failed expectations too
    return ds.get_expectation_suite(discard_failed_expectations=False)

In [154]:
# Run the profiler against the saved dataset and display the resulting
# profile (the cell's last expression is rendered as its output)
dataset.get_profile(
    profiler=stats_profiler,
)

03/22/2022 04:17:37 PM INFO:	2 expectation(s) included in expectation_suite. result_format settings filtered.


<GEProfile with expectations: [
  {
    "meta": {},
    "kwargs": {
      "column": "avg_daily_trips",
      "mostly": 0.99,
      "min_value": 2,
      "max_value": 998
    },
    "expectation_type": "expect_column_values_to_be_between"
  },
  {
    "meta": {},
    "kwargs": {
      "column": "avg_daily_trips",
      "min_value": 435.62050632911394,
      "max_value": 532.4250632911393
    },
    "expectation_type": "expect_column_mean_to_be_between"
  }
]>

In [155]:
# Pair the saved dataset with the profiler so it can be passed as the
# `validation_reference` argument when retrieving features later on
validation_reference = dataset.as_reference(
    profiler=stats_profiler,
)

In [156]:
# Hourly event timestamps from 2021-09-05 through 2021-09-06
timestamps = pd.DataFrame(
    {"event_timestamp": pd.date_range(start="2021-09-05", end="2021-09-06", freq='H')}
)

# Driver IDs (the entity keys) to retrieve features for
driver_ids = pd.DataFrame({"driver_id": [1001, 1002, 1003, 1004, 1005]})

# Cartesian product: every timestamp paired with every driver
entity_df = pd.merge(timestamps, driver_ids, how="cross")

# Build the retrieval job that joins the requested driver_stats_fv
# features onto the entity DataFrame
historical_features = store.get_historical_features(
    features=[
        "driver_stats_fv:conv_rate",
        "driver_stats_fv:acc_rate",
        "driver_stats_fv:avg_daily_trips",
    ],
    entity_df=entity_df,
)

In [157]:

# Materialise the retrieval job as a DataFrame while validating it against
# the reference; the result is bound to `_` because only the validation
# outcome matters in this cell
_ = historical_features.to_df(
    validation_reference=validation_reference,
)

03/22/2022 04:17:38 PM INFO:	2 expectation(s) included in expectation_suite. result_format settings filtered.
03/22/2022 04:17:38 PM INFO:Validating data_asset_name None with expectation_suite_name default


ValidationFailed: [
  {
    "result": {
      "element_count": 125,
      "missing_count": 0,
      "missing_percent": 0.0,
      "unexpected_count": 2,
      "unexpected_percent": 1.6,
      "unexpected_percent_total": 1.6,
      "unexpected_percent_nonmissing": 1.6,
      "partial_unexpected_list": [
        0,
        1
      ],
      "partial_unexpected_index_list": [
        61,
        73
      ],
      "partial_unexpected_counts": [
        {
          "value": 0,
          "count": 1
        },
        {
          "value": 1,
          "count": 1
        }
      ],
      "unexpected_list": [
        0,
        1
      ],
      "unexpected_index_list": [
        61,
        73
      ]
    },
    "meta": {},
    "success": false,
    "exception_info": {
      "raised_exception": false,
      "exception_message": null,
      "exception_traceback": null
    },
    "expectation_config": {
      "meta": {},
      "kwargs": {
        "column": "avg_daily_trips",
        "mostly": 0.99,
        "min_value": 2,
        "max_value": 998,
        "result_format": "COMPLETE"
      },
      "expectation_type": "expect_column_values_to_be_between"
    }
  }
]

In [158]:
# Hourly event timestamps from 2021-09-05 through 2021-09-15
timestamps = pd.DataFrame(
    {"event_timestamp": pd.date_range(start="2021-09-05", end="2021-09-15", freq='H')}
)

# Driver IDs (the entity keys) to retrieve features for
driver_ids = pd.DataFrame({"driver_id": [1001, 1002, 1003, 1004, 1005]})

# Cartesian product: every timestamp paired with every driver
entity_df = pd.merge(timestamps, driver_ids, how="cross")

# Build the retrieval job that joins the requested driver_stats_fv
# features onto the entity DataFrame
historical_features = store.get_historical_features(
    features=[
        "driver_stats_fv:conv_rate",
        "driver_stats_fv:acc_rate",
        "driver_stats_fv:avg_daily_trips",
    ],
    entity_df=entity_df,
)

In [159]:
# Materialise the retrieval job as a DataFrame while validating it against
# the reference; the result is bound to `_` because only the validation
# outcome matters in this cell
_ = historical_features.to_df(
    validation_reference=validation_reference,
)

03/22/2022 04:17:40 PM INFO:	2 expectation(s) included in expectation_suite. result_format settings filtered.
03/22/2022 04:17:40 PM INFO:Validating data_asset_name None with expectation_suite_name default


In [162]:
# Open the feature repository that lives in the "driver_stats/" directory
store = FeatureStore(
    repo_path="driver_stats/",
)

# Load the saved dataset registered under the name 'driver_stats_1001'
dataset_1001 = store.get_saved_dataset(
    'driver_stats_1001',
)

# Profile that dataset with the same profiler and display the result
dataset_1001.get_profile(
    profiler=stats_profiler,
)

03/22/2022 04:19:13 PM INFO:	2 expectation(s) included in expectation_suite. result_format settings filtered.


<GEProfile with expectations: [
  {
    "meta": {},
    "kwargs": {
      "column": "avg_daily_trips",
      "mostly": 0.99,
      "min_value": 2,
      "max_value": 998
    },
    "expectation_type": "expect_column_values_to_be_between"
  },
  {
    "meta": {},
    "kwargs": {
      "column": "avg_daily_trips",
      "min_value": 435.62050632911394,
      "max_value": 532.4250632911393
    },
    "expectation_type": "expect_column_mean_to_be_between"
  }
]>

In [151]:
# Hourly event timestamps from 2021-09-05 through 2021-09-15
timestamps = pd.DataFrame(
    {"event_timestamp": pd.date_range(start="2021-09-05", end="2021-09-15", freq='H')}
)

# A single driver ID (the entity key) to retrieve features for
driver_ids = pd.DataFrame({"driver_id": [1001]})

# Cartesian product: every timestamp paired with the driver
entity_df = pd.merge(timestamps, driver_ids, how="cross")

# Build the retrieval job that joins the requested driver_stats_fv
# features onto the entity DataFrame
historical_features = store.get_historical_features(
    features=[
        "driver_stats_fv:conv_rate",
        "driver_stats_fv:acc_rate",
        "driver_stats_fv:avg_daily_trips",
    ],
    entity_df=entity_df,
)

In [163]:
# Materialise the retrieval job as a DataFrame while validating it against
# the reference; the result is bound to `_` because only the validation
# outcome matters in this cell.
# NOTE(review): `validation_reference` was built from `dataset` (the
# 'driver_stats' saved dataset), not from `dataset_1001` - confirm that this
# is the intended reference for the single-driver check.
_ = historical_features.to_df(
    validation_reference=validation_reference,
)

03/22/2022 04:20:24 PM INFO:	2 expectation(s) included in expectation_suite. result_format settings filtered.
03/22/2022 04:20:24 PM INFO:Validating data_asset_name None with expectation_suite_name default


In [55]:
# Validate the retrieved features against the reference profile. When an
# expectation fails, catch the error and print its validation report instead
# of letting the cell end in a traceback.
# ValidationFailed is provided by feast.dqm.errors (imported with the other
# dependencies at the top of the notebook).
try:
    df = historical_features.to_df(validation_reference=validation_reference)
except ValidationFailed as exc:
    print(exc.validation_report)
else:
    # Reached only when every expectation passed
    print("success")

03/21/2022 04:36:30 PM INFO:	2 expectation(s) included in expectation_suite. result_format settings filtered.
03/21/2022 04:36:30 PM INFO:Validating data_asset_name None with expectation_suite_name default


[
  {
    "result": {
      "element_count": 1325,
      "missing_count": 0,
      "missing_percent": 0.0,
      "unexpected_count": 8,
      "unexpected_percent": 0.6037735849056604,
      "unexpected_percent_total": 0.6037735849056604,
      "unexpected_percent_nonmissing": 0.6037735849056604,
      "partial_unexpected_list": [
        0.9989146590232849,
        0.9969046711921692,
        0.9988154768943787,
        0.9993253350257874,
        0.0014650573721155524,
        0.0012101813917979598,
        0.9968545436859131,
        0.0014644580660387874
      ],
      "partial_unexpected_index_list": [
        144,
        326,
        704,
        761,
        773,
        1172,
        1174,
        1272
      ],
      "partial_unexpected_counts": [
        {
          "value": 0.0012101813917979598,
          "count": 1
        },
        {
          "value": 0.0014644580660387874,
          "count": 1
        },
        {
          "value": 0.0014650573721155524,
          "cou

In [56]:
# Same validation without a try/except: a failed expectation propagates
# as an exception out of this cell
df = historical_features.to_df(
    validation_reference=validation_reference,
)

03/21/2022 04:36:31 PM INFO:	2 expectation(s) included in expectation_suite. result_format settings filtered.
03/21/2022 04:36:31 PM INFO:Validating data_asset_name None with expectation_suite_name default


ValidationFailed: [
  {
    "result": {
      "element_count": 1325,
      "missing_count": 0,
      "missing_percent": 0.0,
      "unexpected_count": 8,
      "unexpected_percent": 0.6037735849056604,
      "unexpected_percent_total": 0.6037735849056604,
      "unexpected_percent_nonmissing": 0.6037735849056604,
      "partial_unexpected_list": [
        0.9989146590232849,
        0.9969046711921692,
        0.9988154768943787,
        0.9993253350257874,
        0.0014650573721155524,
        0.0012101813917979598,
        0.9968545436859131,
        0.0014644580660387874
      ],
      "partial_unexpected_index_list": [
        144,
        326,
        704,
        761,
        773,
        1172,
        1174,
        1272
      ],
      "partial_unexpected_counts": [
        {
          "value": 0.0012101813917979598,
          "count": 1
        },
        {
          "value": 0.0014644580660387874,
          "count": 1
        },
        {
          "value": 0.0014650573721155524,
          "count": 1
        },
        {
          "value": 0.9968545436859131,
          "count": 1
        },
        {
          "value": 0.9969046711921692,
          "count": 1
        },
        {
          "value": 0.9988154768943787,
          "count": 1
        },
        {
          "value": 0.9989146590232849,
          "count": 1
        },
        {
          "value": 0.9993253350257874,
          "count": 1
        }
      ],
      "unexpected_list": [
        0.9989146590232849,
        0.9969046711921692,
        0.9988154768943787,
        0.9993253350257874,
        0.0014650573721155524,
        0.0012101813917979598,
        0.9968545436859131,
        0.0014644580660387874
      ],
      "unexpected_index_list": [
        144,
        326,
        704,
        761,
        773,
        1172,
        1174,
        1272
      ]
    },
    "meta": {},
    "success": false,
    "expectation_config": {
      "kwargs": {
        "column": "conv_rate",
        "min_value": 0.002141552744433284,
        "max_value": 0.9957939982414246,
        "result_format": "COMPLETE"
      },
      "expectation_type": "expect_column_values_to_be_between",
      "meta": {}
    },
    "exception_info": {
      "raised_exception": false,
      "exception_message": null,
      "exception_traceback": null
    }
  },
  {
    "result": {
      "observed_value": 0.5067012782416851,
      "element_count": 1325,
      "missing_count": null,
      "missing_percent": null
    },
    "meta": {},
    "success": false,
    "expectation_config": {
      "kwargs": {
        "column": "conv_rate",
        "min_value": 0.4850536768952296,
        "max_value": 0.4850633780657792,
        "result_format": "COMPLETE"
      },
      "expectation_type": "expect_column_mean_to_be_between",
      "meta": {}
    },
    "exception_info": {
      "raised_exception": false,
      "exception_message": null,
      "exception_traceback": null
    }
  }
]