In [0]:
%sql
-- Make sure the `test` schema exists before the notebook writes its test tables
-- (test.player_stats_and_valuations / test.player_predictions) into it.
-- IF NOT EXISTS keeps the cell safe to re-run.
CREATE SCHEMA IF NOT EXISTS test;

In [0]:
# Import required libraries
import unittest
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import mlflow
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Fully-qualified names of the tables used by the test run
test_input_table = "test.player_stats_and_valuations"
test_output_table = "test.player_predictions"

# Input-table layout as (column name, Spark type) pairs, in column order.
# Each type is instantiated when the StructType is built below.
_field_specs = [
    ("player_id", IntegerType),
    ("first_name", StringType),
    ("last_name", StringType),
    ("position", StringType),
    ("foot", StringType),
    ("age", IntegerType),
    ("height_in_cm", IntegerType),
    ("contract_months_left_to_expire", IntegerType),
    ("total_goals", IntegerType),
    ("total_assists", IntegerType),
    ("avg_minutes_played", FloatType),
    ("total_yellow_cards", IntegerType),
    ("total_red_cards", IntegerType),
    ("total_game_events", IntegerType),
    ("max_market_value_at_transfer", FloatType),
    ("min_market_value_at_transfer", FloatType),
    ("squad_size", IntegerType),
    ("average_age", FloatType),
    ("foreigners_percentage", FloatType),
    ("is_major_national_league", BooleanType),
    ("market_value_in_eur", FloatType),
]
schema = StructType([
    StructField(field_name, field_type())
    for field_name, field_type in _field_specs
])

# Two hand-written player rows; values follow the column order of `schema`
test_data = [
    (
        1, "Test", "Player1", "Forward", "Right",
        25, 180, 12, 10, 5,
        90.0, 2, 0, 20,
        1000000.0, 500000.0,
        25, 26.0, 50.0, True,
        750000.0,
    ),
    (
        2, "Test", "Player2", "Midfielder", "Left",
        28, 175, 6, 5, 10,
        85.5, 1, 0, 15,
        800000.0, 400000.0,
        30, 27.5, 60.0, False,
        600000.0,
    ),
]

# Materialise the rows as a Delta table, replacing any previous contents
spark = SparkSession.builder.getOrCreate()
test_df = spark.createDataFrame(test_data, schema=schema)
(
    test_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(test_input_table)
)


class TestPlayerValuationPipeline(unittest.TestCase):
    """End-to-end checks of the player valuation pipeline run on the test tables.

    ``setUpClass`` executes the whole pipeline once (load, index the
    categorical columns, assemble the feature vector, split, train an
    XGBRegressor, predict, log to MLflow, write the predictions table) and
    stores the intermediate results as class attributes. The ``test_*``
    methods only inspect that stored state.
    """

    @classmethod
    def setUpClass(cls):
        """Run the pipeline once and keep its intermediate results on the class.

        Raises:
            RuntimeError: if any pipeline step fails. The original exception
                is attached as ``__cause__``.
        """
        cls.spark = SparkSession.builder.getOrCreate()

        # XGBoost hyperparameters; the same dict is logged to MLflow below.
        cls.parameters = {
            "colsample_bytree": 0.4816791503913271,
            "learning_rate": 0.012644116888864644,
            "max_depth": 9,
            "min_child_weight": 7,
            "n_estimators": 656,
            "n_jobs": 100,
            "subsample": 0.5964879726165608,
            "verbosity": 0,
            "random_state": 614612684,
        }

        # Point the notebook-level table names at the test tables.
        # NOTE(review): presumably other notebook code reads these globals;
        # nothing outside this method uses them in the visible source.
        global input_table, output_table
        input_table = test_input_table
        output_table = test_output_table

        try:
            # Load input data
            cls.data = cls.spark.read.format("delta").table(input_table)

            # Preprocessing: replace the categorical string columns with
            # numeric indices so they can go into the feature vector.
            from pyspark.ml.feature import StringIndexer, VectorAssembler

            indexers = [
                StringIndexer(inputCol=column, outputCol=f"{column}_indexed").fit(cls.data)
                for column in ["position", "foot"]
            ]
            for indexer in indexers:
                cls.data = indexer.transform(cls.data)
            cls.data = cls.data.drop("position", "foot")

            cls.feature_columns = [
                "position_indexed", "foot_indexed", "age", "height_in_cm",
                "contract_months_left_to_expire", "total_goals", "total_assists",
                "avg_minutes_played", "total_yellow_cards", "total_red_cards",
                "total_game_events", "max_market_value_at_transfer",
                "min_market_value_at_transfer", "squad_size", "average_age",
                "foreigners_percentage", "is_major_national_league"
            ]

            assembler = VectorAssembler(inputCols=cls.feature_columns, outputCol="features")
            cls.data = assembler.transform(cls.data).select(
                "player_id", "first_name", "last_name", "features", "market_value_in_eur"
            )

            # Train-test split (fixed seed)
            cls.train_data, cls.test_data = cls.data.randomSplit([0.8, 0.2], seed=42)

            # Model training: collect to pandas and unpack each feature
            # vector into a plain array for XGBoost.
            cls.model = XGBRegressor(**cls.parameters)
            train_pdf = cls.train_data.toPandas()
            cls.X_train = train_pdf["features"].apply(lambda x: x.toArray()).tolist()
            cls.y_train = train_pdf["market_value_in_eur"].values
            cls.model.fit(cls.X_train, cls.y_train)

            # Generate predictions
            test_pdf = cls.test_data.toPandas()
            cls.X_test = test_pdf["features"].apply(lambda x: x.toArray()).tolist()
            cls.y_test = test_pdf["market_value_in_eur"].values
            cls.y_pred = cls.model.predict(cls.X_test)

            # Log metrics and hyperparameters to MLflow; the context manager
            # ends the run on exit.
            mae = mean_absolute_error(cls.y_test, cls.y_pred)
            r2 = r2_score(cls.y_test, cls.y_pred)
            with mlflow.start_run():
                mlflow.log_metric("MAE", mae)
                mlflow.log_metric("R2", r2)
                mlflow.log_params(cls.parameters)

            # Save predictions
            test_pdf['actual_market_value'] = cls.y_test
            test_pdf['predicted_market_value'] = cls.y_pred

            # Select only the required columns for the output
            output_columns = ["player_id", "first_name", "last_name", "actual_market_value", "predicted_market_value"]
            # Use the class-level session, consistent with every other step.
            predictions_df = cls.spark.createDataFrame(test_pdf[output_columns])
            predictions_df.write.format("delta").mode("overwrite").saveAsTable(output_table)

        except Exception as e:
            # unittest skips tearDownClass when setUpClass raises, which would
            # leave the test tables behind. Class cleanups still run in that
            # case, so register the teardown as one.
            cls.addClassCleanup(cls.tearDownClass)
            raise RuntimeError(f"Pipeline execution failed: {e}") from e

    def test_data_loading(self):
        """Test if input data is loaded correctly"""
        self.assertIsNotNone(self.data)
        self.assertGreater(self.data.count(), 0)
        self.assertIn("market_value_in_eur", self.data.columns)

    def test_preprocessing(self):
        """Test feature engineering steps"""
        columns = self.data.columns
        self.assertNotIn("position", columns)
        self.assertNotIn("foot", columns)
        self.assertIn("features", columns)

    def test_train_test_split(self):
        """Test data splitting: both parts are non-empty and cover every row"""
        total_count = self.data.count()
        train_count = self.train_data.count()
        test_count = self.test_data.count()

        self.assertGreater(train_count, 0)
        self.assertGreater(test_count, 0)
        self.assertEqual(train_count + test_count, total_count)

    def test_model_training(self):
        """Test that the fitted model has one importance value per feature column"""
        self.assertEqual(len(self.model.feature_importances_),
                         len(self.feature_columns))

    def test_output_generation(self):
        """Test output table creation"""
        output_df = self.spark.read.table(test_output_table)
        required_columns = {"player_id", "first_name", "last_name",
                            "actual_market_value", "predicted_market_value"}
        # Report exactly which columns are absent instead of a bare "False is not true".
        missing_columns = required_columns - set(output_df.columns)
        self.assertFalse(
            missing_columns,
            f"Output table is missing columns: {sorted(missing_columns)}",
        )

    @classmethod
    def tearDownClass(cls):
        """Clean up test tables"""
        cls.spark.sql(f"DROP TABLE IF EXISTS {test_input_table}")
        cls.spark.sql(f"DROP TABLE IF EXISTS {test_output_table}")


# Run Tests

suite = unittest.TestLoader().loadTestsFromTestCase(TestPlayerValuationPipeline)
runner = unittest.TextTestRunner(verbosity=2)
test_result = runner.run(suite)


# Verify MLflow logging
# Fetch only the newest run. An unrestricted search returns every run in the
# experiment, and the resulting frame has one column per metric/param logged by
# ANY of them, so the membership checks below could pass because of an older
# run even if the newest one logged nothing.
runs = mlflow.search_runs(order_by=["attributes.start_time DESC"], max_results=1)
if runs.empty:
    raise RuntimeError("No MLflow runs found. Please ensure metrics and parameters are logged.")

latest_run = runs.iloc[0]

# Check if active run exists
assert mlflow.active_run() is None, "Active run found - did you forget to end the run?"

# Verify metrics
assert "metrics.MAE" in latest_run, "MAE metric not logged"
assert "metrics.R2" in latest_run, "R2 metric not logged"

# Verify parameters
assert "params.n_estimators" in latest_run, "Model parameters not logged"

print("\nMLflow logging verified successfully!")

# NOTE(review): failing unit tests only print a message here; the cell itself
# still completes. Confirm whether a scheduled run should fail in that case.
if test_result.wasSuccessful():
    print("All tests passed successfully!")
else:
    print("Some tests failed. Please check the output above for details.")
    

Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

2025/03/23 09:56:35 INFO mlflow.tracking._tracking_service.client: 🏃 View run useful-chimp-794 at: adb-4332105040219628.8.azuredatabricks.net/ml/experiments/1684593945464055/runs/5e43e45ad94e48a7989f3fb0964a4783.
2025/03/23 09:56:35 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: adb-4332105040219628.8.azuredatabricks.net/ml/experiments/1684593945464055.
test_data_loading (__main__.TestPlayerValuationPipeline.test_data_loading)
Test if input data is loaded correctly ... ok
test_model_training (__main__.TestPlayerValuationPipeline.test_model_training)
Test model initialization and training ... ok
test_output_generation (__main__.TestPlayerValuationPipeline.test_output_generation)
Test output table creation ... ok
test_preprocessing (__main__.TestPlayerValuationPipeline.test_preprocessing)
Test feature engineering steps ... ok
test_train_test_split (__main__.TestPlayerValuationPipeline.test_train_test_split)
Test data splitting ... ok

-------------------------------


MLflow logging verified successfully!
All tests passed successfully!
