In [0]:
%pip install pytest
dbutils.library.restartPython()


Collecting pytest
  Downloading pytest-8.3.5-py3-none-any.whl.metadata (7.6 kB)
Collecting iniconfig (from pytest)
  Downloading iniconfig-2.0.0-py3-none-any.whl.metadata (2.6 kB)
Collecting pluggy<2,>=1.5 (from pytest)
  Downloading pluggy-1.5.0-py3-none-any.whl.metadata (4.8 kB)
Downloading pytest-8.3.5-py3-none-any.whl (343 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/343.6 kB[0m [31m?[0m eta [36m-:--:--[0m
[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━[0m [32m317.4/343.6 kB[0m [31m9.4 MB/s[0m eta [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m343.6/343.6 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pluggy-1.5.0-py3-none-any.whl (20 kB)
Downloading iniconfig-2.0.0-py3-none-any.whl (5.9 kB)
Installing collected packages: pluggy, iniconfig, pytest
  Attempting uninstall: pluggy
    Found existing installation: pluggy 1.0.0
    Not uninstalling pluggy at /databricks/python3/l

In [0]:
import pytest
import logging
from pyspark.testing import assertSchemaEqual
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType, BooleanType, DoubleType, FloatType

# Enable logging
# Set the root logger threshold to INFO so the logging.info(...) progress
# messages emitted by the checks below are not filtered out.
# NOTE(review): basicConfig is a no-op when the root logger already has
# handlers attached — confirm it takes effect on the target runtime.
logging.basicConfig(level=logging.INFO)

# Define the expected schemas for different tables.
#
# Maps a fully qualified table name ("silver.<table>") to the StructType the
# table is expected to expose. Every field is declared nullable. The key order
# is the order in which the tables are loaded and checked further down.
#
# Each table name must appear exactly once: a repeated key in a dict literal
# silently replaces the earlier entry, which would hide any divergence between
# two copies of the same schema.
expected_schemas = {
    "silver.appearance": StructType([
        StructField("appearance_id", StringType(), True),
        StructField("game_id", IntegerType(), True),
        StructField("player_id", IntegerType(), True),
        StructField("player_club_id", IntegerType(), True),
        StructField("player_current_club_id", IntegerType(), True),
        StructField("date", DateType(), True),
        StructField("player_name", StringType(), True),
        StructField("competition_id", StringType(), True),
        StructField("yellow_cards", IntegerType(), True),
        StructField("red_cards", IntegerType(), True),
        StructField("goals", IntegerType(), True),
        StructField("assists", IntegerType(), True),
        StructField("minutes_played", IntegerType(), True)
    ]),
    "silver.game_events": StructType([
        StructField("game_event_id", StringType(), True),
        StructField("date", DateType(), True),
        StructField("game_id", IntegerType(), True),
        StructField("minute", IntegerType(), True),
        StructField("type", StringType(), True),
        StructField("club_id", IntegerType(), True),
        StructField("player_id", IntegerType(), True),
        StructField("description", StringType(), True)
    ]),
    "silver.club_games": StructType([
        StructField("game_id", IntegerType(), True),
        StructField("club_id", IntegerType(), True),
        StructField("own_goals", IntegerType(), True),
        StructField("own_position", IntegerType(), True),
        StructField("own_manager_name", StringType(), True),
        StructField("opponent_id", IntegerType(), True),
        StructField("opponent_goals", IntegerType(), True),
        StructField("opponent_position", IntegerType(), True),
        StructField("opponent_manager_name", StringType(), True),
        StructField("hosting", StringType(), True),
        StructField("is_win", BooleanType(), True)
    ]),
    "silver.clubs": StructType([
        StructField("club_id", IntegerType(), True),
        StructField("club_code", StringType(), True),
        StructField("name", StringType(), True),
        # NOTE(review): IntegerType here, while competition_id is StringType in
        # the other tables — kept as in the original; confirm against the table.
        StructField("domestic_competition_id", IntegerType(), True),
        StructField("squad_size", IntegerType(), True),
        StructField("average_age", DoubleType(), True),
        StructField("foreigners_number", IntegerType(), True),
        StructField("foreigners_percentage", DoubleType(), True),
        StructField("national_team_players", IntegerType(), True),
        StructField("stadium_name", StringType(), True),
        StructField("stadium_seats", IntegerType(), True),
        StructField("net_transfer_record", StringType(), True),
        StructField("last_season", IntegerType(), True),
        StructField("filename", StringType(), True),
        StructField("url", StringType(), True)
    ]),
    "silver.competitions": StructType([
        StructField("competition_id", StringType(), True),
        StructField("competition_code", StringType(), True),
        StructField("name", StringType(), True),
        StructField("sub_type", StringType(), True),
        StructField("type", StringType(), True),
        StructField("country_id", IntegerType(), True),
        StructField("country_name", StringType(), True),
        StructField("domestic_league_code", StringType(), True),
        StructField("confederation", StringType(), True),
        StructField("is_major_national_league", BooleanType(), True)
    ]),
    "silver.game_lineups": StructType([
        # NOTE(review): "date" is StringType here but DateType in the other
        # tables — kept as in the original; confirm this is intentional.
        StructField("date", StringType(), True),
        StructField("game_id", IntegerType(), True),
        StructField("player_id", IntegerType(), True),
        StructField("club_id", IntegerType(), True),
        StructField("player_name", StringType(), True),
        StructField("type", StringType(), True),
        StructField("position", StringType(), True),
        StructField("number", IntegerType(), True)
    ]),
    "silver.games": StructType([
        StructField("game_id", IntegerType(), True),
        StructField("competition_id", StringType(), True),
        StructField("season", IntegerType(), True),
        StructField("round", StringType(), True),
        StructField("date", DateType(), True),
        StructField("home_club_id", IntegerType(), True),
        StructField("away_club_id", IntegerType(), True),
        StructField("home_club_goals", IntegerType(), True),
        StructField("away_club_goals", IntegerType(), True),
        StructField("home_club_position", FloatType(), True),
        StructField("away_club_position", FloatType(), True),
        StructField("home_club_manager_name", StringType(), True),
        StructField("away_club_manager_name", StringType(), True),
        StructField("stadium", StringType(), True),
        StructField("attendance", IntegerType(), True),
        StructField("referee", StringType(), True),
        StructField("home_club_formation", StringType(), True),
        StructField("away_club_formation", StringType(), True),
        StructField("home_club_name", StringType(), True),
        StructField("away_club_name", StringType(), True),
        StructField("competition_type", StringType(), True)
    ]),
    "silver.player_valuations": StructType([
        StructField("player_id", IntegerType(), True),
        StructField("date", DateType(), True),
        StructField("market_value_in_eur", FloatType(), True),
        StructField("current_club_id", IntegerType(), True),
        StructField("player_club_domestic_competition_id", StringType(), True)
    ]),
    "silver.players": StructType([
        StructField("player_id", IntegerType(), True),
        StructField("first_name", StringType(), True),
        StructField("last_name", StringType(), True),
        StructField("last_season", DateType(), True),
        StructField("current_club_id", IntegerType(), True),
        StructField("player_code", StringType(), True),
        StructField("country_of_birth", StringType(), True),
        StructField("city_of_birth", StringType(), True),
        StructField("country_of_citizenship", StringType(), True),
        StructField("date_of_birth", DateType(), True),
        StructField("sub_position", StringType(), True),
        StructField("position", StringType(), True),
        StructField("foot", StringType(), True),
        StructField("height_in_cm", DoubleType(), True),
        StructField("contract_expiration_date", DateType(), True),
        StructField("agent_name", StringType(), True),
        StructField("current_club_domestic_competition_id", StringType(), True),
        StructField("current_club_name", StringType(), True),
        StructField("market_value_in_eur", FloatType(), True),
        StructField("highest_market_value_in_eur", FloatType(), True)
    ]),
    "silver.transfers": StructType([
        StructField("player_id", IntegerType(), True),
        StructField("transfer_date", DateType(), True),
        StructField("transfer_season", StringType(), True),
        StructField("from_club_id", IntegerType(), True),
        StructField("to_club_id", IntegerType(), True),
        StructField("from_club_name", StringType(), True),
        StructField("to_club_name", StringType(), True),
        StructField("market_value_in_eur", DoubleType(), True),
        StructField("player_name", StringType(), True)
    ])
}


In [0]:
# Open one DataFrame per table that has an expected schema, keyed by the same
# fully qualified table name used in expected_schemas.
tables = {
    qualified_name: spark.read.table(qualified_name)
    for qualified_name in expected_schemas
}

In [0]:
# Strict Schema Validation
def test_strict_schema():
    """Require every loaded table's schema to equal its expected schema.

    Walks the module-level ``tables`` mapping and compares each DataFrame's
    schema to the matching ``expected_schemas`` entry via ``assertSchemaEqual``.

    Raises:
        AssertionError: on the first table whose schema differs. The message
            names the table and shows both schemas; the original assertion
            error is chained as the cause.
    """
    for name, frame in tables.items():
        wanted = expected_schemas[name]
        try:
            assertSchemaEqual(frame.schema, wanted)
        except AssertionError as mismatch:
            # Re-raise with a message that identifies the offending table.
            details = f"Schema mismatch in {name}! Expected: {wanted}, but got: {frame.schema}"
            logging.error(details)
            raise AssertionError(details) from mismatch
        else:
            logging.info(f"Schema validation passed for {name}")

In [0]:
# Schema Evolution: Ignore Extra Columns
def test_schema_ignore_extra():
    """Check that no expected column is absent, tolerating additional ones.

    Only column names are compared; data types and nullability are not
    inspected here.

    Raises:
        AssertionError: on the first table that lacks one or more of the
            columns named in its ``expected_schemas`` entry.
    """
    for name, frame in tables.items():
        required = {field.name for field in expected_schemas[name].fields}
        present = {field.name for field in frame.schema.fields}

        # Columns present only in the table are deliberately ignored.
        absent = required - present
        assert not absent, f"Missing columns in {name}: {absent}"

        logging.info(f"Schema validation passed (ignoring extra columns) for {name}")


In [0]:
# Schema Evolution: Log Warnings Instead of Failing
def test_schema_with_warnings():
    """Report schema drift through log warnings without ever failing.

    For each expected column, a warning is logged when the column is missing
    from the table or when its data type differs from the expected one.
    Columns that exist only in the table are not reported.
    """
    for name, frame in tables.items():
        found_types = {f.name: f.dataType for f in frame.schema.fields}
        wanted_types = {f.name: f.dataType for f in expected_schemas[name].fields}

        for column, wanted_type in wanted_types.items():
            if column not in found_types:
                logging.warning(f"Warning: Column {column} is missing in {name}!")
                continue

            found_type = found_types[column]
            if found_type != wanted_type:
                logging.warning(f"Warning: Column {column} changed from {wanted_type} to {found_type} in {name}!")

        logging.info(f"Schema check completed with warnings (if any) for {name}")


In [0]:
# Dynamic Schema Update
def test_dynamic_schema_update():
    """Overwrite each expected schema with the table's current schema.

    Mutates the module-level ``expected_schemas`` dict in place (entries are
    reassigned, the name is never rebound, so no ``global`` declaration is
    required). After this has run, any comparison against
    ``expected_schemas`` compares a table's schema with itself.
    """
    for name, frame in tables.items():
        live_schema = frame.schema
        expected_schemas[name] = live_schema
        logging.info(f"Updated expected schema for {name}")


In [0]:
# Check for Duplicate Rows
def test_duplicate_rows():
    """Assert that no loaded table contains fully identical rows.

    Compares the row count of each DataFrame with the row count after
    ``dropDuplicates()`` (all columns considered).

    Raises:
        AssertionError: on the first table where the two counts differ; the
            message reports both counts.
    """
    for name, frame in tables.items():
        rows_total = frame.count()
        rows_distinct = frame.dropDuplicates().count()

        assert rows_total == rows_distinct, (
            f"Duplicate rows detected in {name}! Total: {rows_total}, Unique: {rows_distinct}"
        )
        logging.info(f"No duplicate rows found in {name}")

In [0]:
# Validate Data Types
def test_data_types():
    """Assert that each expected column has the expected Spark data type.

    A column missing from the table is looked up as ``None`` and therefore
    also fails the comparison. Columns present only in the table are ignored.

    Raises:
        AssertionError: on the first column whose actual type differs from the
            expected type.
    """
    for name, frame in tables.items():
        wanted_types = {f.name: f.dataType for f in expected_schemas[name].fields}
        found_types = {f.name: f.dataType for f in frame.schema.fields}

        for column, wanted_type in wanted_types.items():
            found_type = found_types.get(column)  # None when the column is absent
            assert found_type == wanted_type, (
                f"Column {column} type mismatch in {name}: Expected {wanted_type}, got {found_type}"
            )

        logging.info(f"Data type validation passed for {name}")


In [0]:
# Run All Tests
if __name__ == "__main__":
    # Checks that compare the tables against the hand-written expected_schemas
    # must all run before anything rewrites that mapping.
    test_strict_schema()
    test_schema_ignore_extra()
    test_schema_with_warnings()
    test_duplicate_rows()
    test_data_types()

    # Runs last on purpose: test_dynamic_schema_update() replaces every entry
    # of expected_schemas with the table's live schema, so any schema or type
    # check executed after it would compare a schema with itself and could
    # never fail.
    test_dynamic_schema_update()

    print("All tests completed successfully!")

All tests completed successfully!
