# Initialize MongoDB client
See README.md for setup instructions.

In [2]:
import os
from urllib.parse import quote_plus
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
import warnings

# Ignore FutureWarning messages for the rest of the session.
warnings.simplefilter(action='ignore', category=FutureWarning)

# The password comes from the environment so it is never stored in the
# notebook. Fail with an explicit message when it is missing: passing None to
# quote_plus() would otherwise raise an opaque TypeError.
raw_password = os.environ.get('MONGODB_PASSWORD')
if raw_password is None:
    raise RuntimeError(
        "The MONGODB_PASSWORD environment variable is not set; "
        "define it before running this notebook."
    )

# Percent-encode the credentials so reserved characters cannot break the URI.
username = quote_plus('common')
password = quote_plus(raw_password)
uri = f"mongodb+srv://{username}:{password}@playervaluations.v7jevdf.mongodb.net/?retryWrites=true&w=majority"
# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))
# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    # Best effort: report the failure without aborting the notebook.
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [3]:
"""import json

db = client['player_valuations']
collection = db['players']
player = collection.find_one({'player_id': 10})

# Print the result
if player:
    print("Player found:", json.dumps(player, indent=4, default=str))
else:
    print("No player found with player_id", 65)"""

'import json\n\ndb = client[\'player_valuations\']\ncollection = db[\'players\']\nplayer = collection.find_one({\'player_id\': 10})\n\n# Print the result\nif player:\n    print("Player found:", json.dumps(player, indent=4, default=str))\nelse:\n    print("No player found with player_id", 65)'

In [4]:
import pandas as pd
from pyspark.sql.types import StructType, StructField, ArrayType, MapType, StringType, IntegerType, DoubleType

# Fetch every document of the "players" collection (find() with no filter).
db = client['player_valuations']
collection = db['players']
res = collection.find()

# Materialise the cursor into a DataFrame and discard the "_id" column.
# errors="ignore" keeps the cell from raising KeyError when the collection is
# empty (the frame then has no columns at all), and returning a new frame
# instead of mutating in place keeps the cell safe to re-run.
df = pd.DataFrame(list(res)).drop(columns="_id", errors="ignore")

In [5]:
def _nullable_struct(fields):
    """Build a StructType from (name, data type) pairs; every field is nullable."""
    return StructType([StructField(name, dtype, True) for name, dtype in fields])


# Layout of one entry inside a player's "valuations" array.
_valuation_schema = _nullable_struct([
    ("player_id", IntegerType()),
    ("date", StringType()),
    ("datetime", StringType()),
    ("dateweek", StringType()),
    ("market_value_in_eur", IntegerType()),
    ("current_club_id", IntegerType()),
    ("player_club_domestic_competition_id", StringType()),
])

# Explicit schema for the player records; field order is kept as declared here.
schema = _nullable_struct([
    ("player_id", IntegerType()),
    ("first_name", StringType()),
    ("last_name", StringType()),
    ("name", StringType()),
    ("last_season", IntegerType()),
    ("current_club_id", IntegerType()),
    ("player_code", StringType()),
    ("country_of_birth", StringType()),
    ("city_of_birth", StringType()),
    ("country_of_citizenship", StringType()),
    ("date_of_birth", StringType()),
    ("sub_position", StringType()),
    ("position", StringType()),
    ("foot", StringType()),
    ("height_in_cm", DoubleType()),
    ("contract_expiration_date", StringType()),
    ("agent_name", StringType()),
    ("image_url", StringType()),
    ("url", StringType()),
    ("current_club_domestic_competition_id", StringType()),
    ("current_club_name", StringType()),
    ("market_value_in_eur", DoubleType()),
    ("highest_market_value_in_eur", DoubleType()),
    # Array elements themselves may be null (containsNull=True).
    ("valuations", ArrayType(_valuation_schema, True)),
])

# Starting a Spark session and loading the raw data into DataFrames

In [6]:
from pyspark.sql import SparkSession

# Local Spark session; getOrCreate() reuses an already-running session if any.
ss = SparkSession.builder.master("local[*]").getOrCreate()

# Convert the pandas frame loaded from MongoDB using the explicit schema.
player_valuation_df = ss.createDataFrame(df, schema=schema)

# Forward slashes keep these relative paths portable: the previous
# backslash-separated paths only resolved on Windows.
appearances_df = ss.read.csv("raw_data/transfermarkt/appearances.csv", header=True, inferSchema=True)
games_df = ss.read.option("multiline", "true").json("raw_data/transfermarkt/games.json")

# Some cleaning operations

In [7]:
from pyspark.sql.functions import when

# Drop the player columns that the analysis does not use.
trimmed_players_df = player_valuation_df.drop("image_url", "url", "name", "player_code")

# True for players whose last recorded season is not 2023.
last_season_not_2023 = trimmed_players_df["last_season"] != 2023

# Placeholder written to each column for those players; all other rows keep
# their existing value.
placeholders = {
    "current_club_id": -1,
    "current_club_domestic_competition_id": "-1",
    "current_club_name": "Retired",
    "market_value_in_eur": 0,
}
player_valuation_df = trimmed_players_df.withColumns({
    column_name: when(last_season_not_2023, placeholder).otherwise(trimmed_players_df[column_name])
    for column_name, placeholder in placeholders.items()
})

# Drop the game columns that the analysis does not use.
games_df = games_df.drop("url", "aggregate", "home_club_formation", "away_club_formation")

# Player(s) with the highest market value in 2023. If several players tie for the maximum, all of them are returned.

In [58]:
from pyspark.sql.functions import explode, col, expr

# One row per valuation record, keeping those whose date string starts with "2023".
valuations = player_valuation_df.select("valuations")
flattened_valuations = valuations.select(explode("valuations").alias("valuation"))
valuations2023 = flattened_valuations.filter("substring(valuation.date, 1, 4) = '2023'")

# Highest 2023 valuation per player, joined with the player's name and sorted
# from highest to lowest.
max_market_value_players_2023 = (
    valuations2023
    .select("valuation.*")
    .groupBy("player_id")
    .max("market_value_in_eur")
    .withColumnRenamed("max(market_value_in_eur)", "market_value_in_eur")
    .join(player_valuation_df.select("player_id", "first_name", "last_name"), on="player_id", how="inner")
    .orderBy('market_value_in_eur', ascending=False)
    .select("first_name", "last_name", "market_value_in_eur")
)

# first() returns None when there is no row at all (e.g. no valuation dated
# 2023); guard against it instead of failing with a TypeError on the lookup.
top_row = max_market_value_players_2023.first()
if top_row is None:
    max_value = None
    # Empty result with the same columns, so show() still works.
    max_value_players = max_market_value_players_2023.limit(0)
else:
    # The frame is sorted descending, so the first row carries the maximum.
    max_value = top_row["market_value_in_eur"]
    # Keep every player tied at the maximum, not just the first one.
    max_value_players = max_market_value_players_2023.filter(col("market_value_in_eur") == max_value)

max_value_players.show()

+----------+----------+-------------------+
|first_name| last_name|market_value_in_eur|
+----------+----------+-------------------+
|    Kylian|    Mbappé|          180000000|
|    Erling|   Haaland|          180000000|
|      Jude|Bellingham|          180000000|
+----------+----------+-------------------+



# Closing the MongoDB client and the Spark session

In [None]:
"""client.close()
ss.stop()"""