In [1]:
import os
import great_expectations as gx
from dotenv import load_dotenv
from great_expectations.exceptions import DataContextError
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
load_dotenv()

True

In [2]:
pg_host = os.getenv("POSTGRES_HOST")
pg_port = os.getenv("POSTGRES_PORT")
pg_user = os.getenv("POSTGRES_USER")
pg_password = os.getenv("POSTGRES_PASSWORD")
pg_db = os.getenv("POSTGRES_DB")

# Basic check to ensure variables are loaded
if not all([pg_host, pg_port, pg_user, pg_password, pg_db]):
    print("Error: One or more PostgreSQL environment variables are not set or not loaded.")
    print("Please ensure POSTGRES_HOST, POSTGRES_PORT, POSTGRES_USER, POSTGRES_PASSWORD, and POSTGRES_DB are in your .env file and that load_dotenv() in the previous cell found it.")
    # You might want to stop execution here or raise an error if variables are missing
else:
    print("PostgreSQL environment variables retrieved successfully.")
    connection_string = f"postgresql+psycopg2://{pg_user}:{pg_password}@{pg_host}:{pg_port}/{pg_db}"
    print(f"Connection string constructed (password is redacted for display): postgresql+psycopg2://{pg_user}:********@{pg_host}:{pg_port}/{pg_db}")

PostgreSQL environment variables retrieved successfully.
Connection string constructed (password is redacted for display): postgresql+psycopg2://silverlineage:********@localhost:5432/flight_db


In [11]:
context = gx.get_context(context_root_dir="/Users/danielmak/Documents/MLOps_project/gx")

In [4]:
datasource_name = "flight_data" 

datasource = context.sources.add_or_update_postgres(
    name=datasource_name,
    connection_string=connection_string
)
print(f"Datasource '{datasource_name}' ensured in context.")

Datasource 'flight_data' ensured in context.


In [5]:
table_name_to_test = "routes"  # Change to 'flights', 'airlines', or 'airports' as needed

try:
    # Try to get the asset if it already exists
    asset = datasource.get_asset(asset_name=table_name_to_test)
    print(f"Asset '{asset.name}' already exists and was retrieved from datasource '{datasource.name}'.")
except LookupError: # Or gx.exceptions.UnknownDataAssetNameError, depending on GX version
    print(f"Asset '{table_name_to_test}' not found in datasource '{datasource.name}'. Adding it now...")
    # Add the table asset if it doesn't exist
    asset = datasource.add_table_asset(
        name=table_name_to_test,      # This is the name GX will use for the asset
        table_name=table_name_to_test # This is the actual table name in your DB
        # schema_name="public"  # Optional: specify if not in default 'public' schema
    )
    print(f"Asset '{asset.name}' added to datasource '{datasource.name}'.")

# Build a batch request and get a validator to inspect data
batch_request = asset.build_batch_request()
validator = context.get_validator(batch_request=batch_request)


Asset 'routes' not found in datasource 'flight_data'. Adding it now...
Asset 'routes' added to datasource 'flight_data'.


In [6]:
print("--- Check to ensure all columns are present ---")
expected_columns = [
    "id", "raw_payload", "date_pulled", 
    "departure_iata", "arrival_iata", "departure_airport", "departure_timezone", 
    "departure_icao", "departure_terminal", "departure_time", 
    "arrival_airport", "arrival_timezone", "arrival_icao", "arrival_terminal", "arrival_time", 
    "airline_name", "airline_callsign", "airline_icao", "airline_iata", "flight_number"
]

# Use the correct snake_case method name
column_names_checker_result = validator.expect_table_columns_to_match_set(
    column_set=expected_columns,
    exact_match=True 
)
print(column_names_checker_result)


--- Check to ensure all columns are present ---


Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_table_columns_to_match_set",
    "kwargs": {
      "column_set": [
        "id",
        "raw_payload",
        "date_pulled",
        "departure_iata",
        "arrival_iata",
        "departure_airport",
        "departure_timezone",
        "departure_icao",
        "departure_terminal",
        "departure_time",
        "arrival_airport",
        "arrival_timezone",
        "arrival_icao",
        "arrival_terminal",
        "arrival_time",
        "airline_name",
        "airline_callsign",
        "airline_icao",
        "airline_iata",
        "flight_number"
      ],
      "exact_match": true,
      "batch_id": "flight_data-routes"
    },
    "meta": {}
  },
  "result": {
    "observed_value": [
      "id",
      "airline_iata",
      "flight_number",
      "departure_iata",
      "arrival_iata",
      "departure_airport",
      "departure_timezone",
      "departure_icao",
      "departure_terminal"

In [12]:
print("--- EDA for 'airline_iata' nulls ---")
result_null_airline_iata = validator.expect_column_values_to_not_be_null(column='airline_iata')
print(result_null_airline_iata)

--- EDA for 'airline_iata' nulls ---


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": false,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_not_be_null",
    "kwargs": {
      "column": "airline_iata",
      "batch_id": "flight_data-routes"
    },
    "meta": {}
  },
  "result": {
    "element_count": 207911,
    "unexpected_count": 3218,
    "unexpected_percent": 1.547777654861936,
    "partial_unexpected_list": [
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null
    ]
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}


In [7]:
print("check for departure_iata, arrival_iata, airline_iata, departure_time, departure_timezone, arrival_time, arrival_timezone, arrival_icao, departure_icao to not be more than 5% missing")

validator.expect_column_values_to_not_be_null(column='departure_iata',mostly=0.95, meta={'notes': 'departure_iata is missing in 5% of the rows'})
validator.expect_column_values_to_not_be_null(column='arrival_iata',mostly=0.95, meta={'notes': 'arrival_iata is missing in 5% of the rows'})
validator.expect_column_values_to_not_be_null(column='departure_icao',mostly=0.95, meta={'notes': 'departure_icao is missing in 5% of the rows'})
validator.expect_column_values_to_not_be_null(column='arrival_icao',mostly=0.95, meta={'notes': 'arrival_icao is missing in 5% of the rows'})
validator.expect_column_values_to_not_be_null(column='airline_iata',mostly=0.95, meta={'notes': 'airline_iata is missing in 5% of the rows'})
validator.expect_column_values_to_not_be_null(column='departure_time',mostly=0.95, meta={'notes': 'departure_time is missing in 5% of the rows'})
validator.expect_column_values_to_not_be_null(column='departure_timezone',mostly=0.95, meta={'notes': 'departure_timezone is missing in 5% of the rows'})

check for departure_iata, arrival_iata, airline_iata, departure_time, departure_timezone, arrival_time, arrival_timezone, arrival_icao, departure_icao to not be more than 5% missing


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 207911,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 207911,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 207911,
    "unexpected_count": 2480,
    "unexpected_percent": 1.192818080813425,
    "partial_unexpected_list": [
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null
    ]
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 207911,
    "unexpected_count": 2600,
    "unexpected_percent": 1.250535084723752,
    "partial_unexpected_list": [
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null
    ]
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 207911,
    "unexpected_count": 3218,
    "unexpected_percent": 1.547777654861936,
    "partial_unexpected_list": [
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null
    ]
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": false,
  "result": {
    "element_count": 207911,
    "unexpected_count": 26241,
    "unexpected_percent": 12.621265830090762,
    "partial_unexpected_list": [
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null
    ]
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 207911,
    "unexpected_count": 1859,
    "unexpected_percent": 0.8941325855774827,
    "partial_unexpected_list": [
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null,
      null
    ]
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [13]:


validator.save_expectation_suite("routes_suite_v1")


In [25]:
import os
import great_expectations as gx

# Assuming 'context' is your already loaded Great Expectations Data Context object
# If you re-run this in a new session, you might need to get the context again:
# context = gx.get_context()

print(f"Notebook's Current Working Directory (CWD): {os.getcwd()}")
if 'context' in locals() and context:
    print(f"Great Expectations Context Root Directory: {context.root_directory}")
else:
    print("Great Expectations context not found. Please ensure it's loaded in a previous cell.")


Notebook's Current Working Directory (CWD): /Users/danielmak/Documents/MLOps_project
Great Expectations Context Root Directory: /Users/danielmak/Documents/MLOps_project/gx


In [10]:
import great_expectations as gx

context = gx.get_context(context_root_dir="/Users/danielmak/Documents/MLOps_project/gx")
print("Context Root Directory:", context.root_directory)


Context Root Directory: /Users/danielmak/Documents/MLOps_project/gx
