### Gold Layer

In [0]:
%sql
CREATE SCHEMA IF NOT EXISTS gold

In [0]:
from pyspark.sql.functions import * 

#### Load Data From Silver Layer

In [0]:
import logging
from pyspark.sql import SparkSession

# Configure structured logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(module)s - %(message)s",
    handlers=[logging.StreamHandler()]
)

# Initialize Spark session
spark = SparkSession.builder.getOrCreate()

try:
    # Log the start of the process
    logging.info("Starting to read Silver tables for merging.")

    # Read Silver tables
    try:
        df_appearance_silver = spark.read.table("silver.appearance")
        logging.info("Successfully read table: silver.appearance")
    except Exception as e:
        logging.error(f"Failed to read table: silver.appearance. Error: {str(e)}")
        raise RuntimeError(f"Error reading silver.appearance: {str(e)}") from e

    try:
        df_players_silver = spark.read.table("silver.players")
        logging.info("Successfully read table: silver.players")
    except Exception as e:
        logging.error(f"Failed to read table: silver.players. Error: {str(e)}")
        raise RuntimeError(f"Error reading silver.players: {str(e)}") from e

    try:
        df_clubs_silver = spark.read.table("silver.clubs")
        logging.info("Successfully read table: silver.clubs")
    except Exception as e:
        logging.error(f"Failed to read table: silver.clubs. Error: {str(e)}")
        raise RuntimeError(f"Error reading silver.clubs: {str(e)}") from e

    try:
        df_game_events_silver = spark.read.table("silver.game_events")
        logging.info("Successfully read table: silver.game_events")
    except Exception as e:
        logging.error(f"Failed to read table: silver.game_events. Error: {str(e)}")
        raise RuntimeError(f"Error reading silver.game_events: {str(e)}") from e

    try:
        df_transfers_silver = spark.read.table("silver.transfers")
        logging.info("Successfully read table: silver.transfers")
    except Exception as e:
        logging.error(f"Failed to read table: silver.transfers. Error: {str(e)}")
        raise RuntimeError(f"Error reading silver.transfers: {str(e)}") from e

    try:
        df_competitions_silver = spark.read.table("silver.competitions")
        logging.info("Successfully read table: silver.competitions")
    except Exception as e:
        logging.error(f"Failed to read table: silver.competitions. Error: {str(e)}")
        raise RuntimeError(f"Error reading silver.competitions: {str(e)}") from e

    try:
        df_club_games_silver = spark.read.table("silver.club_games")
        logging.info("Successfully read table: silver.club_games")
    except Exception as e:
        logging.error(f"Failed to read table: silver.club_games. Error: {str(e)}")
        raise RuntimeError(f"Error reading silver.club_games: {str(e)}") from e

    try:
        df_player_valuations_silver = spark.read.table("silver.player_valuations")
        logging.info("Successfully read table: silver.player_valuations")
    except Exception as e:
        logging.error(f"Failed to read table: silver.player_valuations. Error: {str(e)}")
        raise RuntimeError(f"Error reading silver.player_valuations: {str(e)}") from e

    try:
        df_games_silver = spark.read.table("silver.games")
        logging.info("Successfully read table: silver.games")
    except Exception as e:
        logging.error(f"Failed to read table: silver.games. Error: {str(e)}")
        raise RuntimeError(f"Error reading silver.games: {str(e)}") from e

    # Perform joins
    try:
        logging.info("Starting to merge tables.")
        df_merged = df_appearance_silver.join(df_players_silver, "player_id", "left") \
                                        .join(df_clubs_silver, df_appearance_silver.player_club_id == df_clubs_silver.club_id, "left")
        logging.info("Successfully merged tables: silver.appearance, silver.players, and silver.clubs.")
    except Exception as e:
        logging.error(f"Failed to merge tables. Error: {str(e)}")
        raise RuntimeError(f"Error during table merging: {str(e)}") from e

except Exception as e:
    logging.error(f"Critical failure during the process. Error: {str(e)}")
    raise RuntimeError(f"Critical failure: {str(e)}") from e

### Preparing Players Stats 

In [0]:
import logging
from pyspark.sql.functions import sum, avg, desc

try:
    # Log the start of the aggregation process
    logging.info("Starting to aggregate player statistics.")

    # Aggregate player statistics
    try:
        df_player_stats = (
            df_appearance_silver.groupBy("player_id")
            .agg(
                sum("goals").alias("total_goals"),
                sum("assists").alias("total_assists"),
                round(avg("minutes_played"), 2).alias("avg_minutes_played")
            )
        )
        logging.info("Successfully aggregated player statistics.")
    except Exception as e:
        logging.error(f"Failed to aggregate player statistics. Error: {str(e)}")
        raise RuntimeError(f"Error during aggregation: {str(e)}") from e

    # Join with df_players to get first_name and last_name
    try:
        logging.info("Starting to join player statistics with player names.")
        df_player_stats_with_names = (
            df_player_stats
            .join(df_players_silver, df_player_stats.player_id == df_players_silver.player_id, "inner")
            .select(
                df_players_silver.player_id,
                df_players_silver.first_name,
                df_players_silver.last_name,
                df_player_stats.total_goals,
                df_player_stats.total_assists,
                df_player_stats.avg_minutes_played
            )
        )
        logging.info("Successfully joined player statistics with player names.")
    except Exception as e:
        logging.error(f"Failed to join player statistics with player names. Error: {str(e)}")
        raise RuntimeError(f"Error during join operation: {str(e)}") from e

except Exception as e:
    logging.error(f"Critical failure during the process. Error: {str(e)}")
    raise RuntimeError(f"Critical failure: {str(e)}") from e

In [0]:
import logging
from pyspark.sql.functions import col, desc

try:
    # Log the start of the process
    logging.info("Starting to create the Gold DataFrame.")

    # Create the Gold DataFrame by joining player stats, players, and clubs
    try:
        df_gold = (
            df_player_stats.join(df_players_silver, df_player_stats.player_id == df_players_silver.player_id, "inner")
            .join(df_clubs_silver, df_players_silver.current_club_id == df_clubs_silver.club_id, "left")
            .select(
                df_player_stats.player_id.alias("player_id"),
                "first_name",
                "last_name",
                "total_goals",
                "total_assists",
                "avg_minutes_played",
                "position",
                col("name").alias("club_name"),
                "market_value_in_eur"
            )
        )
        logging.info("Successfully created the Gold DataFrame.")
    except Exception as e:
        logging.error(f"Failed to create the Gold DataFrame. Error: {str(e)}")
        raise RuntimeError(f"Error during Gold DataFrame creation: {str(e)}") from e

    # Sort by total_goals in descending order
    try:
        logging.info("Sorting the Gold DataFrame by total_goals in descending order.")
        df_sorted_players = df_gold.orderBy(desc("total_goals"))
        logging.info("Successfully sorted the Gold DataFrame.")
    except Exception as e:
        logging.error(f"Failed to sort the Gold DataFrame. Error: {str(e)}")
        raise RuntimeError(f"Error during sorting: {str(e)}") from e

    # Display the results
    try:
        logging.info("Displaying the top 10 players by total_goals.")
        display(df_sorted_players.head(10))
        logging.info("Successfully displayed the top 10 players.")
    except Exception as e:
        logging.error(f"Failed to display the results. Error: {str(e)}")
        raise RuntimeError(f"Error during result display: {str(e)}") from e

except Exception as e:
    logging.error(f"Critical failure during the process. Error: {str(e)}")
    raise RuntimeError(f"Critical failure: {str(e)}") from e

player_id,first_name,last_name,total_goals,total_assists,avg_minutes_played,position,club_name,market_value_in_eur
38253,Robert,Lewandowski,216,44,83.35,Attack,Futbol Club Barcelona,15000000.0
418560,Erling,Haaland,203,41,81.94,Attack,Manchester City Football Club,200000000.0
342229,Kylian,Mbappé,176,60,82.16,Attack,Real Madrid Club de Fútbol,160000000.0
132098,Harry,Kane,169,54,84.74,Attack,FC Bayern München,90000000.0
406625,Lautaro,Martínez,124,43,71.97,Attack,Football Club Internazionale Milano S.p.A.,100000000.0
105521,Ciro,Immobile,114,26,74.31,Attack,Beşiktaş Jimnastik Kulübü,3300000.0
96341,Romelu,Lukaku,110,35,72.76,Attack,Società Sportiva Calcio Napoli,25000000.0
533738,Jonathan,David,108,22,77.14,Attack,Lille Olympique Sporting Club,45000000.0
401923,Victor,Osimhen,102,24,74.55,Attack,Galatasaray Spor Kulübü,75000000.0
324503,Vangelis,Pavlidis,101,41,79.87,Attack,Sport Lisboa e Benfica,23000000.0


In [0]:
try:
    # Log the start of the write operation
    logging.info("Starting to write the sorted player stats DataFrame to the Delta table: gold.player_stats.")

    # Write the sorted DataFrame to the Delta table
    try:
        df_sorted_players.write.format("delta") \
                         .mode("overwrite") \
                         .option("mergeSchema", "true") \
                         .saveAsTable("gold.player_stats")
        logging.info("Successfully wrote the sorted player stats DataFrame to the Delta table: gold.player_stats.")
        print("Successfully wrote the sorted player stats DataFrame to the Delta table: gold.player_stats.")
    except Exception as e:
        logging.error(f"Failed to write the sorted player stats DataFrame to the Delta table. Error: {str(e)}")
        raise RuntimeError(f"Error during Delta table write: {str(e)}") from e

except Exception as e:
    logging.error(f"Critical failure during the process. Error: {str(e)}")
    raise RuntimeError(f"Critical failure: {str(e)}") from e

Successfully wrote the sorted player stats DataFrame to the Delta table: gold.player_stats.


**Optimize and Z-Order by club_name, position  and market_value_in_eur**

In [0]:
%sql
OPTIMIZE gold.player_stats
ZORDER BY (club_name, position, market_value_in_eur);

path,metrics
abfss://unity-catalog-storage@dbstoragebmlwakr3lruh6.dfs.core.windows.net/4332105040219628/__unitystorage/catalogs/882c4f0d-67c4-4c44-a2ce-e33636e464fc/tables/dbebc63d-5a56-43fb-97c5-540607955bf9,"List(0, 0, List(null, null, 0.0, 0, 0), List(null, null, 0.0, 0, 0), 0, List(minCubeSize(107374182400), List(0, 0), List(1, 268167), 0, List(0, 0), 0, null), null, 0, 0, 1, 1, false, 0, 0, 1742383345016, 1742383347360, 4, 0, null, List(0, 0), 9, 9, 0, 0, null)"


++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

#### Player Stat with more feature for ML prediction

In [0]:
from pyspark.sql.functions import sum, avg, count, max, min, col

try:
    # Log the start of the aggregation process
    logging.info("Starting to aggregate performance metrics from df_appearance_silver.")

    # Aggregate performance metrics from df_appearance
    try:
        df_performance = (
            df_appearance_silver.groupBy("player_id")
            .agg(
                sum("goals").alias("total_goals"),
                sum("assists").alias("total_assists"),
                round(avg("minutes_played"), 2).alias("avg_minutes_played"),
                sum("yellow_cards").alias("total_yellow_cards"),
                sum("red_cards").alias("total_red_cards")
            )
        )
        logging.info("Successfully aggregated performance metrics from df_appearance_silver.")
    except Exception as e:
        logging.error(f"Failed to aggregate performance metrics from df_appearance_silver. Error: {str(e)}")
        raise RuntimeError(f"Error during aggregation of df_appearance_silver: {str(e)}") from e

    # Aggregate game events from df_game_events
    try:
        logging.info("Starting to aggregate game events from df_game_events_silver.")
        df_game_events_agg = (
            df_game_events_silver.groupBy("player_id")
            .agg(
                count("game_event_id").alias("total_game_events"),
                sum((col("type") == "Goal").cast("int")).alias("total_goals_from_events"),
                sum((col("type") == "Assist").cast("int")).alias("total_assists_from_events")
            )
        )
        logging.info("Successfully aggregated game events from df_game_events_silver.")
    except Exception as e:
        logging.error(f"Failed to aggregate game events from df_game_events_silver. Error: {str(e)}")
        raise RuntimeError(f"Error during aggregation of df_game_events_silver: {str(e)}") from e

    # Aggregate transfer history from df_transfers
    try:
        logging.info("Starting to aggregate transfer history from df_transfers_silver.")
        df_transfer_history = (
            df_transfers_silver.groupBy("player_id")
            .agg(
                max("market_value_in_eur").alias("max_market_value_at_transfer"),
                min("market_value_in_eur").alias("min_market_value_at_transfer")
            )
        )
        logging.info("Successfully aggregated transfer history from df_transfers_silver.")
    except Exception as e:
        logging.error(f"Failed to aggregate transfer history from df_transfers_silver. Error: {str(e)}")
        raise RuntimeError(f"Error during aggregation of df_transfers_silver: {str(e)}") from e

except Exception as e:
    logging.error(f"Critical failure during the aggregation process. Error: {str(e)}")
    raise RuntimeError(f"Critical failure: {str(e)}") from e

In [0]:

# Calculate age
df_players_with_age = df_players_silver.withColumn(
    "age",
    floor(datediff(current_date(), col("date_of_birth")) / 365.25)  
)


In [0]:

try:
    # Log the start of the process
    logging.info("Starting to join performance metrics into df_player_features.")

    # Join performance metrics
    try:
        df_player_features = (
            df_players_with_age.join(df_performance, df_players_with_age.player_id == df_performance.player_id, "left")
            .join(df_game_events_agg, df_players_with_age.player_id == df_game_events_agg.player_id, "left")
            .join(df_transfer_history, df_players_with_age.player_id == df_transfer_history.player_id, "left")
        )
        logging.info("Successfully joined performance metrics into df_player_features.")
    except Exception as e:
        logging.error(f"Failed to join performance metrics into df_player_features. Error: {str(e)}")
        raise RuntimeError(f"Error during joining performance metrics: {str(e)}") from e

    # Add club-related features
    try:
        logging.info("Adding club-related features to df_player_features.")
        df_club_features = (
            df_clubs_silver.select(
                "club_id",
                "squad_size",
                "average_age",
                "foreigners_percentage"
            )
        )
        df_player_features = (
            df_player_features
            .join(df_club_features, df_player_features.current_club_id == df_club_features.club_id, "left")
        )
        logging.info("Successfully added club-related features to df_player_features.")
    except Exception as e:
        logging.error(f"Failed to add club-related features to df_player_features. Error: {str(e)}")
        raise RuntimeError(f"Error during adding club-related features: {str(e)}") from e

    # Add competition-related features
    try:
        logging.info("Adding competition-related features to df_player_features.")
        df_competition_features = (
            df_competitions_silver.select(
                "competition_id",
                "is_major_national_league",
                "country_name"
            )
        )
        df_player_features = (
            df_player_features
            .join(df_competition_features, df_player_features.current_club_domestic_competition_id == df_competition_features.competition_id, "left")
        )
        logging.info("Successfully added competition-related features to df_player_features.")
    except Exception as e:
        logging.error(f"Failed to add competition-related features to df_player_features. Error: {str(e)}")
        raise RuntimeError(f"Error during adding competition-related features: {str(e)}") from e

except Exception as e:
    logging.error(f"Critical failure during the feature engineering process. Error: {str(e)}")
    raise RuntimeError(f"Critical failure: {str(e)}") from e

In [0]:
df_final = (
    df_player_features.select(
        df_players_silver.player_id,
        "first_name",
        "last_name",
        "position",
        "age",  
        "height_in_cm",
        "foot",
        "contract_expiration_date",
        "total_goals",
        "total_assists",
        "avg_minutes_played",
        "total_yellow_cards",
        "total_red_cards",
        "total_game_events",
        "max_market_value_at_transfer",
        "min_market_value_at_transfer",
        "squad_size",
        "average_age",
        "foreigners_percentage",
        "is_major_national_league",
        "market_value_in_eur"  
    )
    .dropna() 
)

In [0]:
from pyspark.sql.functions import datediff, current_date, col

try:
    # Log the start of the process
    logging.info("Starting to transform the 'contract_expiration_date' column.")

    # Replace the old column with the calculated value
    try:
        df_final = df_final.withColumn(
            "contract_expiration_date",  
            (datediff(col("contract_expiration_date"), current_date()) / 30).cast("integer")
        )
        logging.info("Successfully replaced 'contract_expiration_date' with calculated months left.")
    except Exception as e:
        logging.error(f"Failed to calculate months left for 'contract_expiration_date'. Error: {str(e)}")
        raise RuntimeError(f"Error during transformation of 'contract_expiration_date': {str(e)}") from e

    # Rename the column to a more meaningful name
    try:
        df_final = df_final.withColumnRenamed(
            "contract_expiration_date",  
            "contract_months_left_to_expire"  
        )
        logging.info("Successfully renamed 'contract_expiration_date' to 'contract_months_left_to_expire'.")
    except Exception as e:
        logging.error(f"Failed to rename 'contract_expiration_date' to 'contract_months_left_to_expire'. Error: {str(e)}")
        raise RuntimeError(f"Error during column renaming: {str(e)}") from e

except Exception as e:
    logging.error(f"Critical failure during the transformation process. Error: {str(e)}")
    raise RuntimeError(f"Critical failure: {str(e)}") from e

In [0]:
row_count = df_final.count()
print(f"Number of rows: {row_count}")


Number of rows: 6762


In [0]:
missing_values_count = df_final.select([sum(col(c).isNull().cast("int")).alias(c) for c in df_final.columns])

missing_values_count.show()

+---------+----------+---------+--------+---+------------+----+------------------------------+-----------+-------------+------------------+------------------+---------------+-----------------+----------------------------+----------------------------+----------+-----------+---------------------+------------------------+-------------------+
|player_id|first_name|last_name|position|age|height_in_cm|foot|contract_months_left_to_expire|total_goals|total_assists|avg_minutes_played|total_yellow_cards|total_red_cards|total_game_events|max_market_value_at_transfer|min_market_value_at_transfer|squad_size|average_age|foreigners_percentage|is_major_national_league|market_value_in_eur|
+---------+----------+---------+--------+---+------------+----+------------------------------+-----------+-------------+------------------+------------------+---------------+-----------------+----------------------------+----------------------------+----------+-----------+---------------------+-----------------------

In [0]:
num_records = df_final.count()
print(f"Number of records ingested: {num_records}")

Number of records ingested: 6762


In [0]:
display(df_final.head(10))

player_id,first_name,last_name,position,age,height_in_cm,foot,contract_months_left_to_expire,total_goals,total_assists,avg_minutes_played,total_yellow_cards,total_red_cards,total_game_events,max_market_value_at_transfer,min_market_value_at_transfer,squad_size,average_age,foreigners_percentage,is_major_national_league,market_value_in_eur
45026,Rui,Patrício,Goalkeeper,37,190.0,left,3,0,0,90.26,8,0,12,10000000.0,1000000.0,26,27.6,53.8,True,1000000.0
125685,Luke,Berry,Midfield,32,177.0,right,15,2,3,23.75,2,0,15,400000.0,400000.0,30,26.1,53.3,True,400000.0
126014,Leandro,Bacuna,Defender,33,180.0,right,3,3,2,75.48,6,0,20,1800000.0,900000.0,28,24.4,32.1,False,350000.0
168678,Marvin,Schulz,Midfield,30,186.0,right,3,0,0,30.0,2,0,10,800000.0,800000.0,30,26.0,40.0,True,700000.0
257980,Jeroen,Houwen,Goalkeeper,29,188.0,right,3,0,0,86.42,0,0,2,400000.0,250000.0,30,26.5,46.7,False,300000.0
262226,Timothy,Castagne,Defender,29,185.0,right,27,7,12,78.88,9,0,38,25000000.0,18000000.0,25,28.2,80.0,True,15000000.0
273945,Miroslav,Bogosavac,Defender,28,176.0,left,27,0,14,82.25,13,0,24,1700000.0,1400000.0,28,27.6,39.3,False,1800000.0
290532,Saïd,Benrahma,Attack,29,172.0,right,27,24,22,56.95,4,1,124,22000000.0,20000000.0,27,28.4,66.7,True,20000000.0
308133,Ado,Onaiwu,Attack,29,180.0,right,15,4,1,29.53,0,1,17,1500000.0,1200000.0,28,25.2,60.7,True,1800000.0
325223,Mama,Baldé,Attack,29,176.0,right,27,30,15,60.5,15,2,112,8000000.0,4000000.0,26,27.1,38.5,True,4500000.0


In [0]:
try:
    # Log the start of the write operation
    logging.info("Starting to write the final DataFrame to the Delta table: gold.player_stats_and_valuations.")

    # Write the final DataFrame to the Delta table
    try:
        df_final.write.format("delta") \
                .mode("overwrite") \
                .option("mergeSchema", "true") \
                .saveAsTable("gold.player_stats_and_valuations")
        logging.info("Successfully wrote the final DataFrame to the Delta table: gold.player_stats_and_valuations.")
        print("Final DataFrame written to the Delta table: gold.player_stats_and_valuations.")
    except Exception as e:
        logging.error(f"Failed to write the final DataFrame to the Delta table. Error: {str(e)}")
        raise RuntimeError(f"Error during Delta table write: {str(e)}") from e

except Exception as e:
    logging.error(f"Critical failure during the process. Error: {str(e)}")
    raise RuntimeError(f"Critical failure: {str(e)}") from e

Final DataFrame written to the Delta table: gold.player_stats_and_valuations.


**Optimize and Z-Order by player_id, position and market_value_in_eur**

In [0]:
%sql
OPTIMIZE gold.player_stats_and_valuations
ZORDER BY (player_id, position, market_value_in_eur);

path,metrics
abfss://unity-catalog-storage@dbstoragebmlwakr3lruh6.dfs.core.windows.net/4332105040219628/__unitystorage/catalogs/882c4f0d-67c4-4c44-a2ce-e33636e464fc/tables/cf5bff45-5804-4ce1-b3d5-f048cadbd736,"List(1, 2, List(232283, 232283, 232283.0, 1, 232283), List(106025, 145973, 125999.0, 2, 251998), 0, List(minCubeSize(107374182400), List(0, 0), List(2, 251998), 0, List(2, 251998), 1, null), null, 0, 1, 2, 0, false, 0, 0, 1742384177992, 1742384182532, 4, 1, null, List(0, 0), 21, 21, 607, 0, null)"


**Aggregate performance metrics for each club**

In [0]:
from pyspark.sql.functions import sum, count, when, col

try:
    # Log the start of the process
    logging.info("Starting to aggregate performance metrics for each club.")

    # Aggregate performance metrics for each club
    try:
        df_team_performance = (
            df_club_games_silver.groupBy("club_id")
            .agg(
                sum(when(col("is_win") == 1, 1).otherwise(0)).alias("total_wins"),
                sum(when(col("is_win") == 0, 1).otherwise(0)).alias("total_losses"),
                sum(when(col("own_goals") == col("opponent_goals"), 1).otherwise(0)).alias("total_draws"),
                sum("own_goals").alias("total_goals_scored"),
                sum("opponent_goals").alias("total_goals_conceded"),
                count("game_id").alias("total_games_played")
            )
        )
        logging.info("Successfully aggregated performance metrics for each club.")
    except Exception as e:
        logging.error(f"Failed to aggregate performance metrics for clubs. Error: {str(e)}")
        raise RuntimeError(f"Error during aggregation of club performance metrics: {str(e)}") from e

    # Calculate points (3 points for a win, 1 point for a draw)
    try:
        df_team_performance = (
            df_team_performance.withColumn(
                "total_points",
                col("total_wins") * 3 + col("total_draws")
            )
        )
        logging.info("Successfully calculated total points for each club.")
    except Exception as e:
        logging.error(f"Failed to calculate total points for clubs. Error: {str(e)}")
        raise RuntimeError(f"Error during calculation of total points: {str(e)}") from e

    # Add average goals scored and conceded per game
    try:
        df_team_performance = (
            df_team_performance.withColumn(
                "avg_goals_scored_per_game",
                col("total_goals_scored") / col("total_games_played")
            ).withColumn(
                "avg_goals_conceded_per_game",
                col("total_goals_conceded") / col("total_games_played")
            )
        )
        logging.info("Successfully calculated average goals scored and conceded per game for each club.")
    except Exception as e:
        logging.error(f"Failed to calculate average goals scored and conceded per game. Error: {str(e)}")
        raise RuntimeError(f"Error during calculation of average goals: {str(e)}") from e

except Exception as e:
    logging.error(f"Critical failure during the process. Error: {str(e)}")
    raise RuntimeError(f"Critical failure: {str(e)}") from e

**Join with df_clubs to get club names**

In [0]:
# Join with df_clubs to get club names
df_team_performance_with_names = (
    df_team_performance
    .join(df_clubs_silver.select("club_id", "name"), df_team_performance.club_id == df_clubs_silver.club_id, "left")
    .select(
        "name",
        "total_wins",
        "total_losses",
        "total_draws",
        "total_goals_scored",
        "total_goals_conceded",
        "total_points",
        round("avg_goals_scored_per_game", 2).alias("avg_goals_scored_per_game"),
        round("avg_goals_conceded_per_game", 2).alias("avg_goals_conceded_per_game")
    )
)

**Show the top-performing teams**

In [0]:
# Sort by total points in descending order
df_team_performance_sorted = df_team_performance_with_names.orderBy(col("total_points").desc())

# Show the top-performing teams
display(df_team_performance_sorted.head(10))

name,total_wins,total_losses,total_draws,total_goals_scored,total_goals_conceded,total_points,avg_goals_scored_per_game,avg_goals_conceded_per_game
Real Madrid Club de Fútbol,500,232,121,1734,743,1621,2.37,1.02
Futbol Club Barcelona,497,228,119,1768,705,1610,2.44,0.97
FC Bayern München,488,166,80,1784,624,1544,2.73,0.95
Manchester City Football Club,463,227,103,1627,674,1492,2.36,0.98
The Celtic Football Club,450,207,102,1519,594,1452,2.31,0.9
Juventus Football Club,434,241,128,1229,577,1430,1.82,0.85
Club Atlético de Madrid S.A.D.,411,282,150,1193,596,1383,1.72,0.86
Sport Lisboa e Benfica,422,202,103,1354,560,1369,2.17,0.9
Paris Saint-Germain Football Club,415,190,105,1420,523,1350,2.35,0.86
Liverpool Football Club,402,265,143,1393,729,1349,2.09,1.09


### Transfer Market Trend

**Aggregate transfer metrics by season**

In [0]:
try:
    # Log the start of the process
    logging.info("Starting to aggregate transfer metrics by season.")

    # Aggregate transfer metrics by season
    try:
        df_transfer_trends = (
            df_transfers_silver.groupBy("transfer_season")
            .agg(
                round(avg("market_value_in_eur"), 2).alias("avg_market_value"),
                count("player_id").alias("total_transfers")
            )
        )
        logging.info("Successfully aggregated transfer metrics by season.")
    except Exception as e:
        logging.error(f"Failed to aggregate transfer metrics by season. Error: {str(e)}")
        raise RuntimeError(f"Error during aggregation of transfer metrics: {str(e)}") from e

except Exception as e:
    logging.error(f"Critical failure during the process. Error: {str(e)}")
    raise RuntimeError(f"Critical failure: {str(e)}") from e

**Show the results**

In [0]:
# Show the results
display(df_transfer_trends.head(10))

transfer_season,avg_market_value,total_transfers
18/19,0.0,2
24/25,2970707.19,6038
19/20,2100439.37,2276
21/22,1758433.82,7397
25/26,5730743.24,74
23/24,2233307.09,8509
22/23,2095775.05,8135
20/21,1543683.09,6819
26/27,500000.0,1


**Save team performance analysis**

In [0]:
try:
    # Log the start of the write operation
    logging.info("Starting to save the team performance analysis to the Delta table: gold.team_performance_analysis.")

    # Save team performance analysis to Delta table
    try:
        df_team_performance_sorted.write.format("delta") \
                                  .mode("overwrite") \
                                  .option("mergeSchema", "true") \
                                  .saveAsTable("gold.team_performance_analysis")
        logging.info("Successfully saved the team performance analysis to the Delta table: gold.team_performance_analysis.")
        print("Team performance analysis saved to the Delta table: gold.team_performance_analysis.")
    except Exception as e:
        logging.error(f"Failed to save the team performance analysis to the Delta table. Error: {str(e)}")
        raise RuntimeError(f"Error during Delta table write: {str(e)}") from e

except Exception as e:
    logging.error(f"Critical failure during the process. Error: {str(e)}")
    raise RuntimeError(f"Critical failure: {str(e)}") from e

Team performance analysis saved to the Delta table: gold.team_performance_analysis.


**Optimize and Z-Order by name and total_points**

In [0]:
%sql
OPTIMIZE gold.team_performance_analysis
ZORDER BY (name, total_points);

path,metrics
abfss://unity-catalog-storage@dbstoragebmlwakr3lruh6.dfs.core.windows.net/4332105040219628/__unitystorage/catalogs/882c4f0d-67c4-4c44-a2ce-e33636e464fc/tables/0ffc746f-e50e-406e-b7a1-392cdde126ed,"List(0, 0, List(null, null, 0.0, 0, 0), List(null, null, 0.0, 0, 0), 0, List(minCubeSize(107374182400), List(0, 0), List(1, 39724), 0, List(0, 0), 0, null), null, 0, 0, 1, 1, false, 0, 0, 1742350839993, 1742350842337, 4, 0, null, List(0, 0), 9, 9, 0, 0, null)"


**Save transfer market trends**

In [0]:
try:
    # Log the start of the write operation
    logging.info("Starting to save the transfer market trends to the Delta table: gold.transfer_market_trends.")

    # Save transfer market trends to Delta table
    try:
        df_transfer_trends.write.format("delta") \
                          .mode("overwrite") \
                          .option("mergeSchema", "true") \
                          .saveAsTable("gold.transfer_market_trends")
        logging.info("Successfully saved the transfer market trends to the Delta table: gold.transfer_market_trends.")
        print("Transfer market trends saved to the Delta table: gold.transfer_market_trends.")
    except Exception as e:
        logging.error(f"Failed to save the transfer market trends to the Delta table. Error: {str(e)}")
        raise RuntimeError(f"Error during Delta table write: {str(e)}") from e

except Exception as e:
    logging.error(f"Critical failure during the process. Error: {str(e)}")
    raise RuntimeError(f"Critical failure: {str(e)}") from e

Transfer market trends saved to the Delta table: gold.transfer_market_trends.


**Optimize and Z-Order by transfer_season and avg_market_value**

In [0]:
%sql
OPTIMIZE gold.transfer_market_trends
ZORDER BY (transfer_season, avg_market_value);

path,metrics
abfss://unity-catalog-storage@dbstoragebmlwakr3lruh6.dfs.core.windows.net/4332105040219628/__unitystorage/catalogs/882c4f0d-67c4-4c44-a2ce-e33636e464fc/tables/921228a9-0965-463b-8256-9e2b179cb328,"List(0, 0, List(null, null, 0.0, 0, 0), List(null, null, 0.0, 0, 0), 0, List(minCubeSize(107374182400), List(0, 0), List(1, 1417), 0, List(0, 0), 0, null), null, 0, 0, 1, 1, false, 0, 0, 1742350847027, 1742350849343, 4, 0, null, List(0, 0), 3, 3, 0, 0, null)"
