In [0]:
%pip install great-expectations

Python interpreter will be restarted.
Python interpreter will be restarted.


In [0]:
%run /Users/yashdholam04@gmail.com/DataQuality/expect_column_values

In [0]:
import datetime
import json
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, BooleanType, IntegerType
import great_expectations as gx
from great_expectations.dataset.sparkdf_dataset import SparkDFDataset

In [0]:
class DataContextManager:
    """Wrapper around a Great Expectations data context with filesystem store defaults.

    Groups the suite, checkpoint and Data Docs calls used by this notebook so
    the rest of the code talks to one object instead of the raw context.
    """

    def __init__(self, root_dir):
        """Build the wrapped context with its stores rooted at ``root_dir``."""
        self.context = self.init_data_context(root_dir)

    # The return annotation is a string so that it is not evaluated while the
    # class body executes.
    def init_data_context(self, root_dir: str) -> "gx.DataContext":
        """Create a data context whose store backends default to the filesystem.

        Args:
            root_dir: Passed as ``root_directory`` to
                ``FilesystemStoreBackendDefaults``.

        Returns:
            Whatever ``gx.get_context`` returns for that project config.
        """
        base = gx.data_context.types.base
        data_context_config = base.DataContextConfig(
            store_backend_defaults=base.FilesystemStoreBackendDefaults(
                root_directory=root_dir
            )
        )
        return gx.get_context(project_config=data_context_config)

    def get_expectation_suites(self, suite_name):
        """Return the suite named ``suite_name``, creating it if it is not listed.

        Args:
            suite_name: Name looked up in ``list_expectation_suite_names()``.

        Returns:
            The existing suite, or the one just added to the context.
        """
        if suite_name in self.context.list_expectation_suite_names():
            return self.context.get_expectation_suite(suite_name)
        return self.context.add_expectation_suite(suite_name)

    def create_checkpoint(self, checkpoint_name, suite_name):
        """Add or update a ``SimpleCheckpoint`` that validates against ``suite_name``."""
        self.context.add_or_update_checkpoint(
            name=checkpoint_name,
            config_version=1,
            class_name="SimpleCheckpoint",
            validations=[{"expectation_suite_name": suite_name}]
        )

    def display_checkpoint_results(self, result):
        """Render the Data Docs page of every validation result in ``result``.

        For each validation result identifier the context is asked for its docs
        site URLs; each page is read from disk and handed to ``displayHTML``
        (not defined in this file; presumably the Databricks notebook helper).

        Args:
            result: Object exposing ``list_validation_result_identifiers()``.
        """
        for result_id in result.list_validation_result_identifiers():
            docs = self.context.get_docs_sites_urls(resource_identifier=result_id)
            for doc in docs:
                path = doc["site_url"]
                if not path:
                    # No URL was reported for this site, so there is nothing to open.
                    continue
                if path.startswith("file://"):
                    path = path[len("file://"):]
                # Pages contain non-ASCII characters (e.g. the "≈" in the success
                # percentage); assumes they are stored as UTF-8 - TODO confirm.
                with open(path, "r", encoding="utf-8") as f:
                    displayHTML(f.read())

    def run_checkpoint(self, checkpoint_name, batch_request):
        """Run ``checkpoint_name`` on ``batch_request`` and return the context's result."""
        return self.context.run_checkpoint(
            checkpoint_name=checkpoint_name,
            batch_request=batch_request
        )

    def add_or_update_expectation_suite(self, expectation_suite):
        """Persist ``expectation_suite`` through the wrapped context."""
        self.context.add_or_update_expectation_suite(expectation_suite=expectation_suite)

In [0]:
class ExpectationConfigurationManager:
    """Builds column-level expectation configurations and adds them to a suite."""

    # Supported expectation types mapped to the label used in their markdown note.
    _NOTE_LABELS = {
        "expect_column_values_to_not_be_null": "Null Check",
        "expect_column_values_to_be_unique": "Unique Check",
        "expect_column_values_to_equal_to_given": "Custom Check",
    }

    @staticmethod
    def create_expectation_configuration(column_name, expectation_type, suite, value=38):
        """Create the configuration for one column check and add it to ``suite``.

        Args:
            column_name: Column the expectation applies to.
            expectation_type: One of the keys of ``_NOTE_LABELS``.
            suite: Object exposing ``add_expectation(expectation_configuration=...)``.
            value: Comparison value sent as the ``value`` kwarg of
                ``expect_column_values_to_equal_to_given``; ignored for the other
                types. Defaults to 38, the value that was previously hard-coded.

        Raises:
            ValueError: If ``expectation_type`` is not supported. Previously this
                surfaced as an ``UnboundLocalError`` when adding to the suite.
        """
        labels = ExpectationConfigurationManager._NOTE_LABELS
        if expectation_type not in labels:
            raise ValueError(
                f"Unsupported expectation type {expectation_type!r} for column "
                f"{column_name!r}; expected one of {sorted(labels)}"
            )

        expectation_kwargs = {"column": column_name}
        if expectation_type == "expect_column_values_to_equal_to_given":
            expectation_kwargs["value"] = value

        expectation_configuration = gx.core.ExpectationConfiguration(
            expectation_type=expectation_type,
            kwargs=expectation_kwargs,
            meta={
                "notes": {
                    "format": "markdown",
                    "content": f"{labels[expectation_type]} for {column_name}",
                }
            },
        )
        suite.add_expectation(expectation_configuration=expectation_configuration)

In [0]:
class SparkManager:
    """Runs the data-quality metadata queries and converts validation output to rows."""

    def __init__(self):
        """Get or create the Spark session used for every query."""
        self.spark = SparkSession.builder.appName("DataQualityManager").getOrCreate()

    def create_meta_table(self):
        """Join the ``dq_*`` metadata tables into one row per configured expectation.

        Returns:
            DataFrame with column, datasource, table, suite and expectation
            identifiers and names, as selected by the query below.
        """
        sql_query = """
        SELECT
            dc.column_id,
            dc.column_name,
            ds.datasource_id,
            ds.datasource_name,
            ds.datasource_type,
            dc.table_id,
            dt.table_name,
            des.suite_id,
            des.suite_name,
            dex.expectation_id,
            dex.expectation_type
        FROM
            dq_tables dt
        JOIN
            dq_columns dc ON dt.table_id = dc.table_id
        JOIN
            dq_datasources ds ON dt.datasource_id = ds.datasource_id
        JOIN
            dq_expectation_suites des ON dc.table_id = des.table_id
        JOIN
            dq_expectations dex ON des.suite_id = dex.suite_id
        AND
            dc.column_name=dex.column_name;
        """

        return self.spark.sql(sql_query)

    def create_separate_meta_tables(self, table_name):
        """Return the metadata rows of ``create_meta_table`` for a single table.

        The table name is compared as a column value rather than being spliced
        into the SQL text, so names containing quotes cannot break or alter the
        query.

        Args:
            table_name: Value matched against the ``table_name`` column.
        """
        meta_table_df = self.create_meta_table()
        return meta_table_df.filter(meta_table_df["table_name"] == table_name)

    def parse_checkpoint_result_spark(self, checkpoint_result, table_name,validation_type):
        """Flatten a JSON validation result into one DataFrame row per expectation.

        Args:
            checkpoint_result: JSON text. For ``"post"`` the results are read from
                the first ``run_results`` entry whose key contains
                ``"ValidationResultIdentifier"``; for ``"pre"`` from the
                top-level ``results`` list.
            table_name: Stored verbatim in every row.
            validation_type: ``"post"`` or ``"pre"``.

        Returns:
            DataFrame with columns result_id (1-based), success, table_name,
            expectation_type, column, result (JSON text), runtime and
            validation_type.

        Raises:
            ValueError: If ``validation_type`` is neither ``"post"`` nor ``"pre"``,
                or a ``"post"`` result holds no validation result entry.
        """
        data = json.loads(checkpoint_result)

        if validation_type == "post":
            validation_id = next(
                (key for key in data["run_results"] if "ValidationResultIdentifier" in key),
                None,
            )
            if validation_id is None:
                raise ValueError(
                    "checkpoint result has no 'ValidationResultIdentifier' entry in run_results"
                )
            results = data["run_results"][validation_id]["validation_result"]["results"]
            run_time = data["run_id"]["run_time"]
        elif validation_type == "pre":
            results = data["results"]
            run_time = data["meta"]["run_id"]["run_time"]
        else:
            raise ValueError(
                f"validation_type must be 'post' or 'pre', got {validation_type!r}"
            )

        rows = [
            {
                "result_id": result_id,
                "success": result["success"],
                "table_name": table_name,
                "expectation_type": result["expectation_config"]["expectation_type"],
                # Results without a "column" kwarg get a null column instead of a KeyError.
                "column": result["expectation_config"]["kwargs"].get("column"),
                "result": json.dumps(result["result"]),
                "runtime": run_time,
                "validation_type": validation_type,
            }
            for result_id, result in enumerate(results, start=1)
        ]

        schema = StructType([
            StructField("result_id", IntegerType(), True),
            StructField("success", BooleanType(), True),
            StructField("table_name", StringType(), True),
            StructField("expectation_type", StringType(), True),
            StructField("column", StringType(), True),
            StructField("result", StringType(), True),
            StructField("runtime", StringType(), True),
            StructField("validation_type", StringType(), True)
        ])
        # The rows are passed directly, so no SparkContext/RDD round trip is needed.
        return self.spark.createDataFrame(rows, schema)

In [0]:
class DataQualityManager:
    """Drives data-quality validation: metadata-driven checkpoints and ad-hoc checks."""

    def __init__(self, root_dir):
        """Create the context, Spark and expectation helpers.

        Args:
            root_dir: Root directory handed to ``DataContextManager``.
        """
        self.context_manager = DataContextManager(root_dir)
        self.spark_manager = SparkManager()
        self.expectation_manager = ExpectationConfigurationManager()

    def register_spark_data_source(self, datasource_name: str):
        """Add or update a Spark datasource with a runtime data connector.

        The connector is named ``spark_data_source_connector`` and declares a
        single batch identifier, ``timestamp``; ``create_batch_request`` uses
        the same names.
        """
        source = gx.datasource.Datasource(
            name=datasource_name,
            execution_engine={
                "module_name": "great_expectations.execution_engine",
                "class_name": "SparkDFExecutionEngine"
            },
            data_connectors={
                "spark_data_source_connector": {
                    "class_name": "RuntimeDataConnector",
                    "batch_identifiers": ["timestamp"]
                }
            }
        )
        self.context_manager.context.add_or_update_datasource(datasource=source)

    # The return annotation is a string so that it is not evaluated while the
    # class body executes.
    def create_batch_request(self, df, df_name: str, timestamp: str, datasource_name: str) -> "gx.core.batch.RuntimeBatchRequest":
        """Wrap ``df`` in a runtime batch request for ``datasource_name``.

        Args:
            df: Data passed as ``batch_data``.
            df_name: Used in the asset name ``spark_batch_<df_name>_<timestamp>``.
            timestamp: Batch identifier value, also part of the asset name.
            datasource_name: Datasource the request is addressed to.
        """
        return gx.core.batch.RuntimeBatchRequest(
            datasource_name=datasource_name,
            data_connector_name="spark_data_source_connector",
            data_asset_name="spark_batch_{}_{}".format(df_name, timestamp),
            runtime_parameters={"batch_data": df},
            batch_identifiers={
                "timestamp": timestamp,
            }
        )

    def current_timestamp(self) -> str:
        """Return the current UTC time as an ISO-8601 string without a UTC offset.

        Built from an aware "now" with the tzinfo stripped, which yields the
        same text as the deprecated ``datetime.utcnow().isoformat()``.
        """
        now_utc = datetime.datetime.now(datetime.timezone.utc)
        return now_utc.replace(tzinfo=None).isoformat()

    def validate_dataframe(self, df, expectations):
        """Apply ``expectations`` to ``df`` through ``SparkDFDataset`` and validate.

        Args:
            df: Data wrapped in ``SparkDFDataset``.
            expectations: Iterable of dicts with an ``"expectation"`` method name
                and optional ``"args"`` / ``"kwargs"`` for that method.

        Returns:
            The result of ``SparkDFDataset.validate()`` (also printed).
        """
        ge_df = SparkDFDataset(df)
        for expectation in expectations:
            func = getattr(ge_df, expectation["expectation"])
            func(*(expectation.get("args") or ()), **(expectation.get("kwargs") or {}))

        results = ge_df.validate()
        print(results)
        return results

    def _store_validation_results(self, validation_results):
        """Show the parsed results and append them to ``dq_validation_results``."""
        display(validation_results)
        validation_results.write.mode("append").option("mergeSchema", "true").saveAsTable("dq_validation_results")

    def _run_post_validation(self, table_name):
        """Build the suites from metadata, run a checkpoint on the table, store results."""
        meta_rows = self.spark_manager.create_separate_meta_tables(table_name).collect()
        if not meta_rows:
            # No expectations are configured for this table.
            return

        first_row = meta_rows[0]
        datasource_name_ex = first_row["datasource_name"]
        # Register the data source only once per table.
        self.register_spark_data_source(datasource_name=datasource_name_ex)

        # Load each suite once, add all of its expectations, then save it once,
        # instead of a load/save round trip for every metadata row.
        suites = {}
        for row in meta_rows:
            suite_name_ex = row["suite_name"]
            if suite_name_ex not in suites:
                suites[suite_name_ex] = self.context_manager.get_expectation_suites(suite_name_ex)
            self.expectation_manager.create_expectation_configuration(
                column_name=row["column_name"],
                expectation_type=row["expectation_type"],
                suite=suites[suite_name_ex],
            )
        for suite in suites.values():
            self.context_manager.add_or_update_expectation_suite(expectation_suite=suite)

        table_name_ex = first_row["table_name"]
        # The checkpoint validates against the suite of the first metadata row only.
        suite_name_ex = first_row["suite_name"]
        df = self.spark_manager.spark.table(table_name)
        timestamp = self.current_timestamp()
        batch_request = self.create_batch_request(
            df=df, df_name=table_name_ex, timestamp=timestamp, datasource_name=datasource_name_ex
        )
        checkpoint_name = f"{table_name_ex}_checkpoint"
        self.context_manager.create_checkpoint(checkpoint_name=checkpoint_name, suite_name=suite_name_ex)
        checkpoint_result = self.context_manager.run_checkpoint(
            checkpoint_name=checkpoint_name,
            batch_request=batch_request
        )
        print(checkpoint_result)
        validation_results = self.spark_manager.parse_checkpoint_result_spark(
            str(checkpoint_result), table_name=table_name_ex, validation_type="post"
        )
        self._store_validation_results(validation_results)
        self.context_manager.display_checkpoint_results(checkpoint_result)

    def process_table(self, table_name, validation_type="post", df=None, expectations=None):
        """Validate one table and append the outcome to ``dq_validation_results``.

        Args:
            table_name: Table to validate (``"post"``) or label stored with the
                results (``"pre"``).
            validation_type: ``"post"`` runs the metadata-driven checkpoint on the
                stored table; ``"pre"`` validates ``df`` against ``expectations``.
            df: DataFrame to validate; required for ``"pre"``, must be ``None``
                for ``"post"``.
            expectations: Expectation dicts for ``validate_dataframe``; required
                for ``"pre"``, must be ``None`` for ``"post"``.

        Raises:
            ValueError: If the arguments match neither mode. Previously this only
                printed ``"error"`` and skipped validation silently.
        """
        if validation_type == "post" and df is None and expectations is None:
            self._run_post_validation(table_name)
        elif validation_type == "pre" and df is not None and expectations is not None:
            # Validate the DataFrame with the given expectations.
            pre_validation_results = self.validate_dataframe(df, expectations)
            validation_results = self.spark_manager.parse_checkpoint_result_spark(
                str(pre_validation_results), table_name=table_name, validation_type="pre"
            )
            self._store_validation_results(validation_results)
        else:
            raise ValueError(
                f"Invalid arguments for table {table_name!r}: validation_type={validation_type!r} "
                "requires no df/expectations for 'post', and both df and expectations for 'pre'"
            )

    def main(self, df=None, expectations=None):
        """Run ``"post"`` validation for every table present in the metadata.

        ``df`` and ``expectations`` are accepted but not used. Tables are
        de-duplicated in Spark and processed in sorted order so repeated runs
        visit them in the same sequence.
        """
        meta_table_df = self.spark_manager.create_meta_table()
        name_rows = meta_table_df.select("table_name").distinct().collect()
        table_names = sorted(
            {row["table_name"] for row in name_rows if row["table_name"] is not None}
        )
        for table_name in table_names:
            self.process_table(table_name, validation_type="post")



# if __name__ == "__main__":
#     root_dir = "/dbfs/great_expectations/"
#     manager = DataQualityManager(root_dir)
#     manager.main()


[ { 'expect_column_values_to_be_unique': { 'column': 'User_ID',
                                           'domain': 'column'}},
  { 'expect_column_values_to_not_be_null': { 'column': 'Payment_Method',
                                             'domain': 'column'}},
  { 'expect_column_values_to_not_be_null': { 'column': 'Location',
                                             'domain': 'column'}},
  { 'expect_column_values_to_not_be_null': { 'column': 'Gender',
                                             'domain': 'column'}},
  { 'expect_column_values_to_not_be_null': { 'column': 'Age',
                                             'domain': 'column'}},
  { 'expect_column_values_to_not_be_null': { 'column': 'Membership_Status',
                                             'domain': 'column'}},
  { 'expect_column_values_to_not_be_null': { 'column': 'Plan',
                                             'domain': 'column'}},
  { 'expect_column_values_to_not_be_null': { 'column': 'Subscri

Calculating Metrics:   0%|          | 0/69 [00:00<?, ?it/s]

{
  "run_id": {
    "run_name": null,
    "run_time": "2024-06-25T17:19:00.210565+00:00"
  },
  "run_results": {
    "ValidationResultIdentifier::amazonprime_suite_1/__none__/20240625T171900.210565Z/6bf2a99d3a1477b33ab756f35ae17749": {
      "validation_result": {
        "success": false,
        "results": [
          {
            "success": false,
            "expectation_config": {
              "expectation_type": "expect_column_values_to_not_be_null",
              "kwargs": {
                "column": "Payment_Method",
                "batch_id": "6bf2a99d3a1477b33ab756f35ae17749"
              },
              "meta": {
                "notes": {
                  "content": "Null Check for Payment_Method",
                  "format": "markdown"
                }
              }
            },
            "result": {
              "element_count": 40,
              "unexpected_count": 2,
              "unexpected_percent": 5.0,
              "partial_unexpected_list": [
      

result_id,success,table_name,expectation_type,column,result,runtime,validation_type
1,False,amazon_prime,expect_column_values_to_not_be_null,Payment_Method,"{""element_count"": 40, ""unexpected_count"": 2, ""unexpected_percent"": 5.0, ""partial_unexpected_list"": [null, null], ""partial_unexpected_counts"": [{""value"": null, ""count"": 2}]}",2024-06-25T17:19:00.210565+00:00,post
2,False,amazon_prime,expect_column_values_to_not_be_null,Location,"{""element_count"": 40, ""unexpected_count"": 1, ""unexpected_percent"": 2.5, ""partial_unexpected_list"": [null], ""partial_unexpected_counts"": [{""value"": null, ""count"": 1}]}",2024-06-25T17:19:00.210565+00:00,post
3,True,amazon_prime,expect_column_values_to_not_be_null,Gender,"{""element_count"": 40, ""unexpected_count"": 0, ""unexpected_percent"": 0.0, ""partial_unexpected_list"": [], ""partial_unexpected_counts"": []}",2024-06-25T17:19:00.210565+00:00,post
4,True,amazon_prime,expect_column_values_to_not_be_null,Age,"{""element_count"": 40, ""unexpected_count"": 0, ""unexpected_percent"": 0.0, ""partial_unexpected_list"": [], ""partial_unexpected_counts"": []}",2024-06-25T17:19:00.210565+00:00,post
5,True,amazon_prime,expect_column_values_to_not_be_null,Membership_Status,"{""element_count"": 40, ""unexpected_count"": 0, ""unexpected_percent"": 0.0, ""partial_unexpected_list"": [], ""partial_unexpected_counts"": []}",2024-06-25T17:19:00.210565+00:00,post
6,True,amazon_prime,expect_column_values_to_not_be_null,Plan,"{""element_count"": 40, ""unexpected_count"": 0, ""unexpected_percent"": 0.0, ""partial_unexpected_list"": [], ""partial_unexpected_counts"": []}",2024-06-25T17:19:00.210565+00:00,post
7,True,amazon_prime,expect_column_values_to_not_be_null,Subscription_End_Date,"{""element_count"": 40, ""unexpected_count"": 0, ""unexpected_percent"": 0.0, ""partial_unexpected_list"": [], ""partial_unexpected_counts"": []}",2024-06-25T17:19:00.210565+00:00,post
8,True,amazon_prime,expect_column_values_to_not_be_null,Subscription_Start_Date,"{""element_count"": 40, ""unexpected_count"": 0, ""unexpected_percent"": 0.0, ""partial_unexpected_list"": [], ""partial_unexpected_counts"": []}",2024-06-25T17:19:00.210565+00:00,post
9,True,amazon_prime,expect_column_values_to_not_be_null,Subscription_Type,"{""element_count"": 40, ""unexpected_count"": 0, ""unexpected_percent"": 0.0, ""partial_unexpected_list"": [], ""partial_unexpected_counts"": []}",2024-06-25T17:19:00.210565+00:00,post
10,True,amazon_prime,expect_column_values_to_be_unique,User_ID,"{""element_count"": 40, ""unexpected_count"": 0, ""unexpected_percent"": 0.0, ""partial_unexpected_list"": [], ""missing_count"": 0, ""missing_percent"": 0.0, ""unexpected_percent_total"": 0.0, ""unexpected_percent_nonmissing"": 0.0, ""partial_unexpected_counts"": []}",2024-06-25T17:19:00.210565+00:00,post


Unnamed: 0,Unnamed: 1
Evaluated Expectations,11
Successful Expectations,9
Unsuccessful Expectations,2
Success Percent,≈81.82%

Unnamed: 0,Unnamed: 1
Great Expectations Version,0.18.16
Run Name,__none__
Run Time,2024-06-25T17:19:00Z

Unnamed: 0,Unnamed: 1
ge_load_time,20240625T171900.216269Z

Unnamed: 0,Unnamed: 1
batch_data,SparkDataFrame
data_asset_name,spark_batch_amazon_prime_2024-06-25T17:19:00.153810

Status,Expectation,Observed Value
,values must never be null.,100% not null

Status,Expectation,Observed Value
,values must never be null.,100% not null

Status,Expectation,Observed Value
Unexpected Value,Count,Unnamed: 2_level_1
,values must never be null.  1 unexpected values found. 2.5% of 40 total rows.  Unexpected Value  Count  null1,97.5% not null
Unexpected Value,Count,
,1,

Unexpected Value,Count
,1

Status,Expectation,Observed Value
,values must never be null.,100% not null

Status,Expectation,Observed Value
Unexpected Value,Count,Unnamed: 2_level_1
,values must never be null.  2 unexpected values found. 5% of 40 total rows.  Unexpected Value  Count  null2,95% not null
Unexpected Value,Count,
,2,

Unexpected Value,Count
,2

Status,Expectation,Observed Value
,values must never be null.,100% not null

Status,Expectation,Observed Value
,values must never be null.,100% not null

Status,Expectation,Observed Value
,values must never be null.,100% not null

Status,Expectation,Observed Value
,values must never be null.,100% not null

Status,Expectation,Observed Value
,values must be unique.,0% unexpected
,values must never be null.,100% not null


[ { 'expect_column_values_to_be_unique': { 'column': 'User_ID',
                                           'domain': 'column'}},
  { 'expect_column_values_to_equal_to_given': { 'column': 'Age',
                                                'domain': 'column',
                                                'value': 38}},
  { 'expect_column_values_to_not_be_null': { 'column': 'Plan_Duration',
                                             'domain': 'column'}},
  { 'expect_column_values_to_not_be_null': { 'column': 'Device',
                                             'domain': 'column'}},
  { 'expect_column_values_to_not_be_null': { 'column': 'Gender',
                                             'domain': 'column'}},
  { 'expect_column_values_to_not_be_null': { 'column': 'Age',
                                             'domain': 'column'}},
  { 'expect_column_values_to_not_be_null': { 'column': 'Country',
                                             'domain': 'column'}},
  { 'expec

Calculating Metrics:   0%|          | 0/75 [00:00<?, ?it/s]

An unexpected Exception occurred during data docs rendering.  Because of this error, certain parts of data docs will not be rendered properly and/or may not appear altogether.  Please use the trace, included in this message, to diagnose and repair the underlying issue.  Detailed information follows:
            TypeError: "_diagnostic_observed_value_renderer() missing 1 required positional argument: 'cls'".  Traceback: "Traceback (most recent call last):
  File "/local_disk0/.ephemeral_nfs/envs/pythonEnv-b809192a-d939-499c-8408-a2b94249df8a/lib/python3.9/site-packages/great_expectations/render/renderer/content_block/validation_results_table_content_block.py", line 180, in row_generator_fn
    observed_value_renderer[1](result=result)
  File "/local_disk0/.ephemeral_nfs/envs/pythonEnv-b809192a-d939-499c-8408-a2b94249df8a/lib/python3.9/site-packages/great_expectations/render/renderer/renderer.py", line 22, in inner_func
    return renderer_fn(*args, **kwargs)
  File "/local_disk0/.epheme

{
  "run_id": {
    "run_name": null,
    "run_time": "2024-06-25T17:19:17.499015+00:00"
  },
  "run_results": {
    "ValidationResultIdentifier::netflix_suite_1/__none__/20240625T171917.499015Z/48a56c65ae9fd2bf3370200de4cc5a1c": {
      "validation_result": {
        "success": false,
        "results": [
          {
            "success": true,
            "expectation_config": {
              "expectation_type": "expect_column_values_to_not_be_null",
              "kwargs": {
                "column": "Plan_Duration",
                "batch_id": "48a56c65ae9fd2bf3370200de4cc5a1c"
              },
              "meta": {
                "notes": {
                  "content": "Null Check for Plan_Duration",
                  "format": "markdown"
                }
              }
            },
            "result": {
              "element_count": 2500,
              "unexpected_count": 0,
              "unexpected_percent": 0.0,
              "partial_unexpected_list": [],
         

result_id,success,table_name,expectation_type,column,result,runtime,validation_type
1,True,netflix,expect_column_values_to_not_be_null,Plan_Duration,"{""element_count"": 2500, ""unexpected_count"": 0, ""unexpected_percent"": 0.0, ""partial_unexpected_list"": [], ""partial_unexpected_counts"": []}",2024-06-25T17:19:17.499015+00:00,post
2,True,netflix,expect_column_values_to_not_be_null,Device,"{""element_count"": 2500, ""unexpected_count"": 0, ""unexpected_percent"": 0.0, ""partial_unexpected_list"": [], ""partial_unexpected_counts"": []}",2024-06-25T17:19:17.499015+00:00,post
3,True,netflix,expect_column_values_to_not_be_null,Gender,"{""element_count"": 2500, ""unexpected_count"": 0, ""unexpected_percent"": 0.0, ""partial_unexpected_list"": [], ""partial_unexpected_counts"": []}",2024-06-25T17:19:17.499015+00:00,post
4,False,netflix,expect_column_values_to_equal_to_given,Age,"{""element_count"": 2500, ""unexpected_count"": 2412, ""unexpected_percent"": 96.51860744297718, ""partial_unexpected_list"": [""28"", ""35"", ""42"", ""51"", ""33"", ""29"", ""46"", ""39"", ""37"", ""44"", ""31"", ""45"", ""48"", ""27"", null, ""36"", ""30"", ""43"", ""32"", ""41""], ""missing_count"": 1, ""missing_percent"": 0.04, ""unexpected_percent_total"": 96.48, ""unexpected_percent_nonmissing"": 96.51860744297718}",2024-06-25T17:19:17.499015+00:00,post
5,False,netflix,expect_column_values_to_not_be_null,Age,"{""element_count"": 2500, ""unexpected_count"": 1, ""unexpected_percent"": 0.04, ""partial_unexpected_list"": [null], ""partial_unexpected_counts"": [{""value"": null, ""count"": 1}]}",2024-06-25T17:19:17.499015+00:00,post
6,True,netflix,expect_column_values_to_not_be_null,Country,"{""element_count"": 2500, ""unexpected_count"": 0, ""unexpected_percent"": 0.0, ""partial_unexpected_list"": [], ""partial_unexpected_counts"": []}",2024-06-25T17:19:17.499015+00:00,post
7,True,netflix,expect_column_values_to_not_be_null,Last_Payment_Date,"{""element_count"": 2500, ""unexpected_count"": 0, ""unexpected_percent"": 0.0, ""partial_unexpected_list"": [], ""partial_unexpected_counts"": []}",2024-06-25T17:19:17.499015+00:00,post
8,True,netflix,expect_column_values_to_not_be_null,Join_Date,"{""element_count"": 2500, ""unexpected_count"": 0, ""unexpected_percent"": 0.0, ""partial_unexpected_list"": [], ""partial_unexpected_counts"": []}",2024-06-25T17:19:17.499015+00:00,post
9,True,netflix,expect_column_values_to_not_be_null,Monthly_Revenue,"{""element_count"": 2500, ""unexpected_count"": 0, ""unexpected_percent"": 0.0, ""partial_unexpected_list"": [], ""partial_unexpected_counts"": []}",2024-06-25T17:19:17.499015+00:00,post
10,True,netflix,expect_column_values_to_not_be_null,Subscription_Type,"{""element_count"": 2500, ""unexpected_count"": 0, ""unexpected_percent"": 0.0, ""partial_unexpected_list"": [], ""partial_unexpected_counts"": []}",2024-06-25T17:19:17.499015+00:00,post


Unnamed: 0,Unnamed: 1
Evaluated Expectations,12
Successful Expectations,9
Unsuccessful Expectations,3
Success Percent,75%

Unnamed: 0,Unnamed: 1
Great Expectations Version,0.18.16
Run Name,__none__
Run Time,2024-06-25T17:19:17Z

Unnamed: 0,Unnamed: 1
ge_load_time,20240625T171917.508640Z

Unnamed: 0,Unnamed: 1
batch_data,SparkDataFrame
data_asset_name,spark_batch_netflix_2024-06-25T17:19:17.467628

Status,Expectation,Observed Value
Sampled Unexpected Values,Unnamed: 1_level_1,Unnamed: 2_level_1
Unexpected Value,Count,Unnamed: 2_level_2
,"expect_column_values_to_equal_to_given(**{'batch_id': '48a56c65ae9fd2bf3370200de4cc5a1c', 'column': 'Age', 'value': 38})  2412 unexpected values found. ≈96.52% of 2500 total rows.  Sampled Unexpected Values  28  35  42  51  33  29  46  39  37  44  31  45  48  27  36  30  43  32  41",--
Sampled Unexpected Values,,
28,,
35,,
42,,
51,,
33,,
29,,
46,,
39,,

Sampled Unexpected Values
28
35
42
51
33
29
46
39
37
44

Unexpected Value,Count
,1

Status,Expectation,Observed Value
,values must never be null.,100% not null

Status,Expectation,Observed Value
,values must never be null.,100% not null

Status,Expectation,Observed Value
,values must never be null.,100% not null

Status,Expectation,Observed Value
,values must never be null.,100% not null

Status,Expectation,Observed Value
,values must never be null.,100% not null

Status,Expectation,Observed Value
,values must never be null.,100% not null

Status,Expectation,Observed Value
,values must never be null.,100% not null

Status,Expectation,Observed Value
,values must never be null.,100% not null

Status,Expectation,Observed Value
Unexpected Value,Count,Unnamed: 2_level_1
,values must be unique.,0% unexpected
,values must never be null.  5 unexpected values found. 0.2% of 2500 total rows.  Unexpected Value  Count  null5,99.8% not null
Unexpected Value,Count,
,5,

Unexpected Value,Count
,5


In [0]:
%sql
select * from dq_validation_results



result_id,success,table_name,expectation_type,column,result,runtime,validation_type
4,False,netflix,expect_column_values_to_equal_to_given,Age,"{""element_count"": 2500, ""unexpected_count"": 2412, ""unexpected_percent"": 96.51860744297718, ""partial_unexpected_list"": [28, 35, 42, 51, 33, 29, 46, 39, 37, 44, 31, 45, 48, 27, null, 36, 30, 43, 32, 41], ""missing_count"": 1, ""missing_percent"": 0.04, ""unexpected_percent_total"": 96.48, ""unexpected_percent_nonmissing"": 96.51860744297718}",2024-06-29T03:16:54.083990+00:00,post
7,True,amazon_prime,expect_column_values_to_not_be_null,Subscription_End_Date,"{""element_count"": 40, ""unexpected_count"": 0, ""unexpected_percent"": 0.0, ""partial_unexpected_list"": [], ""partial_unexpected_counts"": []}",2024-06-29T03:16:32.030542+00:00,post
8,True,amazon_prime,expect_column_values_to_not_be_null,Subscription_Start_Date,"{""element_count"": 40, ""unexpected_count"": 0, ""unexpected_percent"": 0.0, ""partial_unexpected_list"": [], ""partial_unexpected_counts"": []}",2024-06-29T03:16:32.030542+00:00,post
1,True,demo,expect_column_values_to_be_unique,id,"{""element_count"": 2, ""missing_count"": 0, ""missing_percent"": 0.0, ""unexpected_count"": 0, ""unexpected_percent"": 0.0, ""unexpected_percent_total"": 0.0, ""unexpected_percent_nonmissing"": 0.0, ""partial_unexpected_list"": []}",2024-06-29T03:23:46.841211+00:00,pre
1,False,amazon_prime,expect_column_values_to_not_be_null,Payment_Method,"{""element_count"": 40, ""unexpected_count"": 2, ""unexpected_percent"": 5.0, ""partial_unexpected_list"": [null, null], ""partial_unexpected_counts"": [{""value"": null, ""count"": 2}]}",2024-06-29T03:16:32.030542+00:00,post
8,True,netflix,expect_column_values_to_not_be_null,Join_Date,"{""element_count"": 2500, ""unexpected_count"": 0, ""unexpected_percent"": 0.0, ""partial_unexpected_list"": [], ""partial_unexpected_counts"": []}",2024-06-29T03:16:54.083990+00:00,post
9,True,netflix,expect_column_values_to_not_be_null,Monthly_Revenue,"{""element_count"": 2500, ""unexpected_count"": 0, ""unexpected_percent"": 0.0, ""partial_unexpected_list"": [], ""partial_unexpected_counts"": []}",2024-06-29T03:16:54.083990+00:00,post
2,False,amazon_prime,expect_column_values_to_not_be_null,Location,"{""element_count"": 40, ""unexpected_count"": 1, ""unexpected_percent"": 2.5, ""partial_unexpected_list"": [null], ""partial_unexpected_counts"": [{""value"": null, ""count"": 1}]}",2024-06-29T03:16:32.030542+00:00,post
3,True,amazon_prime,expect_column_values_to_not_be_null,Gender,"{""element_count"": 40, ""unexpected_count"": 0, ""unexpected_percent"": 0.0, ""partial_unexpected_list"": [], ""partial_unexpected_counts"": []}",2024-06-29T03:16:32.030542+00:00,post
4,True,amazon_prime,expect_column_values_to_not_be_null,Age,"{""element_count"": 40, ""unexpected_count"": 0, ""unexpected_percent"": 0.0, ""partial_unexpected_list"": [], ""partial_unexpected_counts"": []}",2024-06-29T03:16:32.030542+00:00,post
