# Dataframe Schema Compliance Notebook

This Jupyter notebook demonstrates schema compliance validation for dataframes, focusing on data type consistency across Pandas and PyArrow frameworks. It explores different methods for writing and reading Parquet files, ensuring data types align with an expected schema.
Schema
## Overview
The notebook defines an `expected_schema` for columns (`order_id`, `customer_id`, `revenue`, `order_time`) with PyArrow data types (e.g., `int64[pyarrow]`, `timestamp[ns, tz=UTC][pyarrow]`). It then tests schema compliance using a `generate_compliance_report` function, which compares actual data types against the schema and reports compliance percentages.

## Key Sections
1. **Pandas Write/Read (Default)** ❌ : Writes a Pandas dataframe to Parquet and reads it back without PyArrow backend, resulting in non-compliant types (e.g., `object` instead of `string` and `int64` instead of `int64[pyarrow]`), yielding 0% compliance.
2. **Pandas Write/Read (PyArrow Backend)** ✅: Uses `dtype_backend='pyarrow'` when reading, achieving 100% compliance by preserving PyArrow types.
3. **PyArrow Write/Read** ❌ : Creates a PyArrow table, saves it to Parquet, and converts it to Pandas without PyArrow backend, resulting in 0% compliance due to type mismatches.
4. **PyArrow Write, Pandas Read (PyArrow Backend)** ✅: Reads the PyArrow-written Parquet file with `dtype_backend='pyarrow'`, ensuring 100% compliance.

## Requirements
- Python 3.10
- Pandas, PyArrow

This notebook highlights the importance of using the PyArrow backend for consistent data type handling across frameworks.

In [1]:
import pandas as pd
import pyarrow as pa
from pandas.api.types import is_integer_dtype, is_string_dtype, is_float_dtype

# Define the expected schema
expected_schema = {
    "order_id": {"data_type": "int64[pyarrow]", "nullable": False},
    "customer_id": {"data_type": "string[pyarrow]", "nullable": False},
    "revenue": {"data_type": "double[pyarrow]", "nullable": False},
    "order_time": {"data_type": "timestamp[ns, tz=UTC][pyarrow]", "nullable": False}
}

def generate_compliance_report(df, schema):
    """
    Calculate the percentage of rows in each column that have the correct data type and satisfy nullability.
    
    Args:
        df (pd.DataFrame): The dataframe to check.
        schema (dict): The expected schema with data types and nullability.
    
    Returns:
        dict: A report mapping each column to its compliance percentage.
    """
    report = {}
    total_rows = len(df)
    
    for col, spec in schema.items():
        if col not in df.columns:
            print(f"col:{col}, actual_dtype:missing")
            report[col] = 0.0
            continue
        
        expected_dtype = spec["data_type"]
        actual_dtype = str(df[col].dtype)
        print(f"col:{col}, actual_dtype:{actual_dtype}")
        
        # Initialize compliance array (True for compliant rows)
        compliance = pd.Series(True, index=df.index)
        
        # Check data type for each row
        if expected_dtype == "int64[pyarrow]":
            # Check if each value can be interpreted as an integer
            compliance &= df[col].apply(lambda x: isinstance(x, (int, pd.Int64Dtype)) and not pd.isna(x))
        elif expected_dtype == "string[pyarrow]":
            # Check if each value is a string
            compliance &= df[col].apply(lambda x: isinstance(x, str) and not pd.isna(x))
        elif expected_dtype == "double[pyarrow]":
            # Check if each value is a float
            compliance &= df[col].apply(lambda x: isinstance(x, float) and not pd.isna(x))
        elif expected_dtype == "timestamp[ns, tz=UTC][pyarrow]":
            # Check if each value is a timestamp with UTC timezoneUntitled
            compliance &= df[col].apply(
                lambda x: pd.api.types.is_datetime64_any_dtype(pd.Series([x])) and
                          hasattr(x, 'tz') and str(x.tz) == 'UTC' and
                          not pd.isna(x)
            )
        
        # Check nullability constraint
        if not spec["nullable"]:
            compliance &= df[col].notnull()
        
        # Calculate percentage of compliant rows
        compliant_rows = compliance.sum()
        compliance_percentage = (compliant_rows / total_rows) * 100 if total_rows > 0 else 0.0
        report[col] = compliance_percentage
    
    return report

# Sample dataset
df = pd.DataFrame({
    "order_id": [1, "invalid", 3],  # Second row has incorrect type
    "customer_id": ["A", "B", 123],  # Third row has incorrect type
    "revenue": [10.5, 20.0, "invalid"],  # Third row has incorrect type
    "order_time": pd.date_range("2023-01-01", periods=3, tz="UTC")  # All correct
})

# Generate and print the compliance report
report = generate_compliance_report(df, expected_schema)
for col, compliance in report.items():
    print(f"{col}: {compliance:.2f}%")

col:order_id, actual_dtype:object
col:customer_id, actual_dtype:object
col:revenue, actual_dtype:object
col:order_time, actual_dtype:datetime64[ns, UTC]
order_id: 66.67%
customer_id: 66.67%
revenue: 66.67%
order_time: 100.00%


# however, we cannot store a column with mixed datatypes in parquet

In [2]:
import pandas as pd
import pyarrow as pa

# Define the expected schema (unchanged)
expected_schema = {
    "order_id": {"data_type": "int64[pyarrow]", "nullable": False},
    "customer_id": {"data_type": "string[pyarrow]", "nullable": False},
    "revenue": {"data_type": "double[pyarrow]", "nullable": False},
    "order_time": {"data_type": "timestamp[ns, tz=UTC][pyarrow]", "nullable": False}
}

# Assuming generate_compliance_report is defined elsewhere, generate and print the report
# For demonstration, here's a placeholder if you don't have it:
def generate_compliance_report(df, schema):
    report = {}
    for col, spec in schema.items():        
        expected_dtype = spec["data_type"]
        actual_dtype = str(df[col].dtype)
        print(f"col:{col}, actual_dtype:{actual_dtype}")
        compliance = 100.0 if actual_dtype == expected_dtype else 0.0
        report[col] = compliance
    return report


# pandas write parquet, pandas read (normal) ❌

In [3]:
# Sample dataset (unchanged)
df = pd.DataFrame({
    "order_id": [1, 2, 3],
    "customer_id": ["A", "B", None],
    "revenue": [10.5, 20.0, 30.5],
    "order_time": pd.date_range("2023-01-01", periods=3, tz="UTC")
})
df.to_parquet('pd_write.parquet')

df=pd.read_parquet('pd_write.parquet')
report = generate_compliance_report(df, expected_schema)
for col, compliance in report.items():
    print(f"{col}: {compliance:.2f}%")

col:order_id, actual_dtype:int64
col:customer_id, actual_dtype:object
col:revenue, actual_dtype:float64
col:order_time, actual_dtype:datetime64[ns, UTC]
order_id: 0.00%
customer_id: 0.00%
revenue: 0.00%
order_time: 0.00%


# pandas write parquet, pandas read (dtype_backend='pyarrow') ✅

In [4]:
# Sample dataset (unchanged)
df = pd.DataFrame({
    "order_id": [1, 2, 3],
    "customer_id": ["A", "B", None],
    "revenue": [10.5, 20.0, 30.5],
    "order_time": pd.date_range("2023-01-01", periods=3, tz="UTC")
})
df.to_parquet('pd_write.parquet')

df=pd.read_parquet('pd_write.parquet',dtype_backend='pyarrow') #add this dtype_backend='pyarrow'
report = generate_compliance_report(df, expected_schema)
for col, compliance in report.items():
    print(f"{col}: {compliance:.2f}%")


col:order_id, actual_dtype:int64[pyarrow]
col:customer_id, actual_dtype:string[pyarrow]
col:revenue, actual_dtype:double[pyarrow]
col:order_time, actual_dtype:timestamp[ns, tz=UTC][pyarrow]
order_id: 100.00%
customer_id: 100.00%
revenue: 100.00%
order_time: 100.00%


# pyarrow write parquet, pyarrow read ❌

In [5]:
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd

# Step 1: Create PyArrow arrays for each column
order_ids = pa.array([1, 2, 3], type=pa.int64())
customer_ids = pa.array(["A", "B", None], type=pa.string())
revenues = pa.array([10.5, 20.0, 30.5], type=pa.float64())
order_times = pa.array([pd.Timestamp("2023-01-01 12:00:00", tz="UTC"),
                        pd.Timestamp("2023-01-02 12:00:00", tz="UTC"),
                        pd.Timestamp("2023-01-03 12:00:00", tz="UTC")],
                       type=pa.timestamp('ns', tz='UTC'))

# Step 2: Define the schema
schema = pa.schema([
    pa.field("order_id", pa.int64()),
    pa.field("customer_id", pa.string()),
    pa.field("revenue", pa.float64()),
    pa.field("order_time", pa.timestamp('ns', tz='UTC'))
])

# Step 3: Create the PyArrow table
table = pa.Table.from_arrays([order_ids, customer_ids, revenues, order_times], schema=schema)
[pyarrow]
# Step 4: Save the table to a Parquet file
pq.write_table(table, "pa_write.parquet")

# Step 5: Load the Parquet file back into a PyArrow table
table_loaded = pq.read_table("pa_write.parquet")

# Step 6: Convert the loaded table to a Pandas dataframe
df = table_loaded.to_pandas()

# Optional: Verify the dataframe by printing it and checking data types
# print("Loaded Pandas DataFrame:")
# print(df)
# print("\nData types:")
# print(df.dtypes)

report = generate_compliance_report(df, expected_schema)
for col, compliance in report.items():
    print(f"{col}: {compliance:.2f}%")

NameError: name 'pyarrow' is not defined

# pyarrow write parquet, pandas read ✅

In [None]:
df=pd.read_parquet('pa_write.parquet',dtype_backend='pyarrow')
report = generate_compliance_report(df, expected_schema)
for col, compliance in report.items():
    print(f"{col}: {compliance:.2f}%")