### XGBoost: Player Market Value Prediction

**Import Libraries**

In [0]:
# Import Libraries
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer, VectorAssembler
import pandas as pd
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import mlflow

# Obtain the Spark session for this notebook (created if none is active yet)
spark = (
    SparkSession.builder
    .appName("XGBoostPipeline")
    .getOrCreate()
)


**Read data from the gold layer and specify the output table**

In [0]:
# Source table read by this notebook and destination table for the predictions.
input_table = "gold.player_stats_and_valuations"
output_table = "gold.player_predictions"

# Load the Delta table
try:
    data = spark.read.format("delta").table(input_table)
except Exception as exc:
    # Still surface a ValueError naming the table, but chain the original
    # exception explicitly so the underlying Spark/Delta error is reported
    # as the direct cause in the traceback.
    raise ValueError(f"Failed to load input table {input_table}: {exc}") from exc


**Data Processing**

In [0]:

# Preprocess the data: index the categorical columns, assemble every model
# input into a single "features" vector column, and keep only the columns
# that the later cells read.
try:
    # Categorical columns that are replaced by "<name>_indexed" columns.
    categorical_columns = ["position", "foot"]

    # Fit one StringIndexer per categorical column on the full dataset
    indexers = [
        StringIndexer(inputCol=column, outputCol=f"{column}_indexed").fit(data)
        for column in categorical_columns
    ]

    # Apply the fitted indexers to the data
    for indexer in indexers:
        data = indexer.transform(data)

    # Drop the original categorical columns and keep the indexed ones
    data = data.drop(*categorical_columns)

    # Assemble all features into a single vector column
    feature_columns = [
        "position_indexed",
        "foot_indexed",
        "age",
        "height_in_cm",
        "contract_months_left_to_expire",
        "total_goals",
        "total_assists",
        "avg_minutes_played",
        "total_yellow_cards",
        "total_red_cards",
        "total_game_events",
        "max_market_value_at_transfer",
        "min_market_value_at_transfer",
        "squad_size",
        "average_age",
        "foreigners_percentage",
        "is_major_national_league",
    ]
    assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
    data = assembler.transform(data)

    # Select the features, target variable, and identifiers (player_id, first_name, last_name)
    data = data.select("player_id", "first_name", "last_name", "features", "market_value_in_eur")

except Exception as exc:
    # Chain explicitly so the original Spark error stays attached as the cause.
    raise ValueError(f"Data preprocessing failed: {exc}") from exc


**Split dataset into train and test sets**

In [0]:

# Split the data into training and testing sets (80%-20%)
train_data, test_data = data.randomSplit([0.8, 0.2], seed=42)


def _unpack_split(split_pdf):
    """Break one collected split into the pieces the model cells use.

    Returns a 5-tuple: (features, target, player ids, first names, last names).
    The features are a list with one ``toArray()`` result per row of the
    "features" column; the other four items are the ``.values`` of the
    "market_value_in_eur", "player_id", "first_name" and "last_name" columns.
    """
    return (
        split_pdf["features"].apply(lambda x: x.toArray()).tolist(),
        split_pdf["market_value_in_eur"].values,
        split_pdf["player_id"].values,
        split_pdf["first_name"].values,
        split_pdf["last_name"].values,
    )


# Convert Spark DataFrames to Pandas DataFrames
try:
    train_pdf = train_data.toPandas()
    test_pdf = test_data.toPandas()

    # Separate features, target variable, and identifiers for each split
    (
        X_train,
        y_train,
        player_ids_train,
        first_names_train,
        last_names_train,
    ) = _unpack_split(train_pdf)

    (
        X_test,
        y_test,
        player_ids_test,
        first_names_test,
        last_names_test,
    ) = _unpack_split(test_pdf)

except Exception as exc:
    # Chain explicitly so the original conversion error stays attached as the cause.
    raise ValueError(f"Data conversion to Pandas failed: {exc}") from exc


**Parameters for the model**

In [0]:

# Hyperparameters for the XGBoost regressor. The same dict is logged to MLflow
# and unpacked into XGBRegressor(**parameters) in the training cell.
parameters = dict(
    colsample_bytree=0.4816791503913271,
    learning_rate=0.012644116888864644,
    max_depth=9,
    min_child_weight=7,
    n_estimators=656,
    n_jobs=100,
    subsample=0.5964879726165608,
    verbosity=0,
    random_state=614612684,
)


**Model Training**

In [0]:
try:
    # Track this training run in MLflow; the run ends when the block exits
    with mlflow.start_run():
        # Record the hyperparameters used for this run
        mlflow.log_params(parameters)

        # Fit the regressor on the training split
        model = XGBRegressor(**parameters)
        model.fit(X_train, y_train)

        # Predict on the test split; y_pred is reused when the predictions
        # table is built in a later cell
        y_pred = model.predict(X_test)

        # Test-set metrics, rounded before they are logged and printed
        mae = round(mean_absolute_error(y_test, y_pred), 2)
        r2 = round(r2_score(y_test, y_pred), 4)

        # Log metrics
        mlflow.log_metric("MAE", mae)
        mlflow.log_metric("R2", r2)

        # Log the fitted model as a run artifact
        mlflow.sklearn.log_model(model, "player_valuation_model")
        print("===========================================\n")

        print(f"Model trained successfully. MAE: {mae}, R-squared: {r2}")

        print("============================================\n")

except Exception as exc:
    # Chain explicitly so the original training/MLflow error stays attached
    # as the direct cause of the ValueError.
    raise ValueError(f"Model training or evaluation failed: {exc}") from exc




Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

2025/03/22 11:18:17 INFO mlflow.tracking._tracking_service.client: 🏃 View run delightful-ape-596 at: adb-4332105040219628.8.azuredatabricks.net/ml/experiments/3612831496567537/runs/f5597fcd99874c03ab4eaf806c8ab836.
2025/03/22 11:18:17 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: adb-4332105040219628.8.azuredatabricks.net/ml/experiments/3612831496567537.



Model trained successfully. MAE: 2075680.75, R-squared: 0.7451



In [0]:
# Pair every test-set prediction with the player's identifiers and the actual
# market value so each output row can be traced back to a player.
prediction_columns = dict(
    player_id=player_ids_test,
    first_name=first_names_test,
    last_name=last_names_test,
    actual_market_value=y_test,
    predicted_market_value=y_pred,
)
predictions_df = pd.DataFrame(prediction_columns)


**Save the predictions to Delta table**

In [0]:
# Save the predictions to Delta table (the existing table is overwritten)
try:
    predictions_spark_df = spark.createDataFrame(predictions_df)
    predictions_spark_df.write.format("delta").mode("overwrite").saveAsTable(output_table)
    print(f"Predictions saved to Delta table: {output_table}")
except Exception as exc:
    # Chain explicitly so the original Spark/Delta write error stays attached
    # as the direct cause of the ValueError.
    raise ValueError(f"Failed to save predictions to Delta table {output_table}: {exc}") from exc

Predictions saved to Delta table: gold.player_predictions


**Optimize and Z-Order by player_id and actual_market_value**

In [0]:
%sql
-- Run Delta OPTIMIZE on the predictions table and Z-Order it by
-- player_id and actual_market_value.
-- NOTE: the table name is hard-coded here; keep it in sync with the
-- `output_table` value used by the Python cells.
OPTIMIZE gold.player_predictions
ZORDER BY (player_id, actual_market_value);

path,metrics
abfss://unity-catalog-storage@dbstoragebmlwakr3lruh6.dfs.core.windows.net/4332105040219628/__unitystorage/catalogs/882c4f0d-67c4-4c44-a2ce-e33636e464fc/tables/eca94125-028f-4a52-a70f-4d26305f07bb,"List(1, 2, List(33077, 33077, 33077.0, 1, 33077), List(18196, 18316, 18256.0, 2, 36512), 0, List(minCubeSize(107374182400), List(0, 0), List(2, 36512), 0, List(2, 36512), 1, null), null, 0, 1, 2, 0, false, 0, 0, 1742642300542, 1742642304375, 4, 1, null, List(0, 0), 5, 5, 164, 0, null)"
