In [0]:
%python
%pip install wordcloud


In [0]:
%python
# Mount ADLS Gen2 to DBFS
#
# Mounts the ADLS Gen2 container onto DBFS using a service principal
# (OAuth2 client-credentials flow), then lists the mount to verify it.

# Necessary parameters
storage_account_name = "tokyodatastorageacc"
container_name = "paris-olympic-data"
mount_point = "/mnt/rawPris"  # DBFS mount path
client_id = "e16e52b2-d6f3-40d1-8a0a-c758b4e220a0"  # Service principal's Application ID
tenant_id = "4c0016a4-d7d3-4090-9c66-7d803a9fb487"  # Azure AD tenant ID

# SECURITY: the service principal secret must not live in the notebook source.
# It is read from a Databricks secret scope instead.
# TODO: the scope/key names below are placeholders - create them (or point these
# at an existing scope) and rotate the secret that used to be hardcoded here,
# because it has been exposed in the notebook history.
secret_scope = "paris-olympic"
secret_key = "sp-client-secret"
client_secret = dbutils.secrets.get(scope=secret_scope, key=secret_key)  # Service principal secret

# Build OAuth2 authorization configuration
configs = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": client_id,
    "fs.azure.account.oauth2.client.secret": client_secret,
    "fs.azure.account.oauth2.client.endpoint": f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
}

# Mount storage.
# Mounting an already-mounted path raises, so check the existing mounts first;
# this keeps the cell safe to re-run (Restart & Run All) without a misleading
# "Mount failed" message.
already_mounted = any(m.mountPoint == mount_point for m in dbutils.fs.mounts())
if already_mounted:
    print(f"{mount_point} is already mounted; skipping mount.")
else:
    try:
        dbutils.fs.mount(
            source=f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/",
            mount_point=mount_point,
            extra_configs=configs
        )
        print(f"Mount successful! Storage account {storage_account_name} mounted to {mount_point}")
    except Exception as e:
        # Best-effort: report the failure and let the verification step below
        # show whether the mount point is usable.
        print(f"Mount failed: {str(e)}")

# Verify if mount was successful
try:
    display(dbutils.fs.ls(mount_point))
except Exception as e:
    print(f"Verification failed: {str(e)}")

In [0]:
# Data importing and data schema checking
# Every raw extract is a CSV with a header row; column types are inferred by Spark.
mount_point = "dbfs:/mnt/rawPris/raw-data"


def _load_raw_csv(file_name, **reader_options):
    """Read one CSV file located under ``mount_point``.

    Args:
        file_name: File name relative to ``mount_point`` (e.g. "teams.csv").
        **reader_options: Extra keyword options forwarded to ``spark.read.csv``.

    Returns:
        The DataFrame produced by ``spark.read.csv`` with ``header=True`` and
        ``inferSchema=True``.
    """
    return spark.read.csv(
        mount_point + "/" + file_name,
        header=True,
        inferSchema=True,
        **reader_options,
    )


# The athletes file carries free-text fields containing commas and doubled
# quotes, so it is read with explicit quote/escape characters and multiLine.
athletes = _load_raw_csv("athletes.csv", quote='"', escape='"', multiLine=True)
coaches = _load_raw_csv("coaches.csv")
events = _load_raw_csv("events.csv")
medals = _load_raw_csv("medals.csv")
medals_lists = _load_raw_csv("medallists.csv")
medals_total = _load_raw_csv("medalstotal.csv")
teams = _load_raw_csv("teams.csv")

In [0]:
# Print the column names and inferred types of the raw athletes DataFrame.
athletes.printSchema()


In [0]:
%python
from pyspark.sql.functions import col, to_date, explode, split, regexp_replace, trim, lower

# Columns that are not needed downstream. Note that the original 'name' column
# is dropped here; 'name_tv' takes over the name "name" further down.
columns_to_remove = [
    'name_short', 'current', 'name', 'function', 'residence_place',
    'country_long', 'nationality', 'nationality_long', 'nationality_code',
    'birth_place', 'birth_country', 'residence_country', 'nickname',
    'hobbies', 'family', 'lang', 'ritual', 'other_sports', 'sporting_relatives'
]

# Free-text columns whose missing values are replaced with an empty string.
text_defaults = {
    "education": "",
    "reason": "",
    "hero": "",
    "influence": "",
    "philosophy": "",
}

# Cleaning pipeline, always rebuilt from the raw `athletes` frame:
#   1. keep only rows where 'current' is True
#   2. drop the unneeded columns
#   3. rename 'name_tv' -> 'name' and 'code' -> 'athlete_id'
#   4. fill missing free-text values with ""
#   5. parse 'birth_date' into a date using the yyyy-MM-dd pattern
athletes_spark_df = (
    athletes
    .filter(col("current") == True)
    .drop(*columns_to_remove)
    .withColumnRenamed("name_tv", "name")
    .withColumnRenamed("code", "athlete_id")
    .fillna(text_defaults)
    .withColumn("birth_date", to_date(col("birth_date"), 'yyyy-MM-dd'))
)


In [0]:
# Keep only the structured athlete attributes and export them as CSV.
athlete_output_columns = [
    "athlete_id", "name", "birth_date", "gender",
    "country", "height", "weight", "education",
]
athletes_spark_df_cleaned = athletes_spark_df.select(*athlete_output_columns)

# repartition(1) collapses the data into a single partition before writing;
# any existing output at the target path is overwritten.
(
    athletes_spark_df_cleaned
    .repartition(1)
    .write
    .mode("overwrite")
    .csv("dbfs:/mnt/rawPris/transformed-data/Athletes", header=True)
)

In [0]:
# Show the first few rows of the free-text columns.
# The identifier is selected as "athlete_id": the cleaning step renames "code"
# to "athlete_id", so selecting "code" fails when the notebook is run top to bottom.
display(
    athletes_spark_df
    .select("athlete_id", "reason", "hero", "influence", "philosophy")
    .limit(15)
)

code,reason,hero,influence,philosophy
1532872,He followed his father and his uncle into the sport,"Footballer Zinedine Zidane (FRA), World Cup winner (1998) and European champion (2000) with France, won the Champions League as a player and three times as a manager with Real Madrid, three-time FIFA World Player of the Year","His father, Gevorg Aleksanyan","""Wrestling is my life."" (mediamax.am. 18 May 2016)"
1532873,,,,"""To become a good athlete, you first have to be a good person."" (ankakh.com, 6 Oct 2018)"
1532874,,,,
1532944,"While doing karate he noticed wrestlers training and decided to give it a try. He also tried judo but his father, a former wrestler, did not allow him to do both, so he chose wrestling. (sport.mediamax.am, 10 July 2017)","Wrestler Armen Nazaryan (ARM, BUL), two-time Olympic champion (1996, 2000) and 2004 bronze medallist. Eight-time world championship medallist (three gold, two silver, three bronze)",,"“Nothing is impossible, set goals in front of you, fight and achieve it.” (Instagram, 13 May 2023)"
1532945,"“My family did not like wrestling very much. At first I wanted to do boxing but my older friends advised me to go to wrestling training, and after a week, I started to like the sport.” (myInfo)",,,
1532951,,"Race walker Luis Fernando Lopez (COL), four-time Olympian (2004, 2008, 2012, 2016), 2011 world champion (20km walk)",,
1533112,"""I was thrown over [an argument about] a Pokemon card and wanted to learn to throw immediately."" (Athlete, 25 Jun 2024)","Boxer Muhammad Ali, born Cassius Clay (USA), former undisputed heavyweight champion of the world, 1960 Olympic champion (light heavyweight), nicknamed 'The Greatest' and regarded as one of the most significant sports figures of the 20th century","""My coach Luke Preston. We've been a team for the last 12 years."" (Athlete, 25 Jun 2024)",
1533136,"“I started running when I was in primary school, like we have junior championships. It's where I started running and it’s where I started to notice that, if I work hard I will be a great athlete.” (olympics.com, 24 Apr 2024)","Sprinter Shelly-Ann Fraser-Pryce (JAM), three time Olympic champion (four silver, one bronze), 16-time world championship medallist (10 gold, five silver, one bronze). In the 100m, two-time Olympic champion (2008, 2012), five-time world champion (2009, 2013, 2015, 2019, 2022). ""Shelly-Ann, she is consistent. She is a mother but she still loves what she is doing and she is still performing as she did before."" (Tales of Hagie Drammeh Youtube, 16 Jan 2023)Sprinter Marie-Josee Ta Lou (CIV), three fourth places at the Olympic Games (2016, 2020), double world silver medallist (100m-200m) in 2017, world bronze medallist in 2019 (100m)",,"""If you believe in yourself, never be discouraged."" (worldathletics.org, 17 Dec 2019)"
1533176,"""I love running and I was fast a child."" (Athlete, 7 Jul 2024)","Sprinter Gina Mariam Bass Bittaye (GAM), two-time Olympian (Rio 2016, Tokyo 2020), 2019 World Championships 200m finalist, four-time African championships medallist (two gold, two bronze), four-time African Games medallist (three gold, one silver)","Momodou Lamin Kujabi, a former international athlete from The Gambia who was once his physical education teacher","""What does not kill you makes you stronger."" (Athlete, 7 Jul 2024)"
1533188,Was a competitive swimmer from age 12 and went on to race internationally in marathon swimming,,,


In [0]:
# Coaches: keep the core columns, rename the key to coach_id, parse birth dates
# (yyyy-MM-dd) and replace missing birth dates with the 1900-01-01 placeholder.
coaches_df = (
    coaches
    .select("code", "name", "gender", "function", "country", "disciplines", "birth_date")
    .withColumnRenamed("code", "coach_id")
    .withColumn("birth_date", to_date(col("birth_date"), 'yyyy-MM-dd'))
    .fillna({"birth_date": "1900-01-01"})
)

# Preview a small sample before exporting.
display(coaches_df.limit(15))

# Export as CSV from a single partition, overwriting any previous output.
(
    coaches_df
    .repartition(1)
    .write
    .mode("overwrite")
    .csv("dbfs:/mnt/rawPris/transformed-data/Coaches", header=True)
)

coach_id,name,gender,function,country,disciplines,birth_date
1533246,PEDRERO Ofelia,Female,Coach,Mexico,Artistic Swimming,1988-03-28
1535775,RADHI SHENAISHIL,Male,Head Coach,Iraq,Football,1965-07-01
1536055,AFLAKIKHAMSEH Majid,Male,Coach,IR Iran,Taekwondo,1973-08-26
1536059,YOUSEFY Mehrdad,Male,Coach,IR Iran,Taekwondo,1972-06-12
1536060,MADDAH Minoo,Female,Coach,IR Iran,Taekwondo,1976-05-17
1536328,LOFTUS Adriana,Female,Coach,Mexico,Artistic Swimming,1958-06-28
1538313,FERRARA Fernando,Male,Head Coach,Argentina,Hockey,1968-07-24
1538315,GULLA Alejandra,Female,Assistant Coach,Argentina,Hockey,1977-07-04
1538317,CAPURRO Santiago,Male,Assistant Coach,Argentina,Hockey,1975-04-08
1538745,RONCONI Mariano,Male,Head Coach,Argentina,Hockey,1900-01-01


In [0]:

# Per-country medal counts: keep the country and the four count columns.
medal_total_columns = ["country", "Gold Medal", "Silver Medal", "Bronze Medal", "Total"]
medals_total_df = medals_total.select(*medal_total_columns)
(
    medals_total_df
    .repartition(1)
    .write
    .mode("overwrite")
    .csv("dbfs:/mnt/rawPris/transformed-data/MedalsTotal", header=True)
)

# Individual medal records: keep date, type, athlete code and discipline, and
# rename the athlete code to athlete_id so it lines up with the athletes table.
medal_list_df = (
    medals_lists
    .select("medal_date", "medal_type", "code_athlete", "discipline")
    .withColumnRenamed("code_athlete", "athlete_id")
)
(
    medal_list_df
    .repartition(1)
    .write
    .mode("overwrite")
    .csv("dbfs:/mnt/rawPris/transformed-data/MedalsList", header=True)
)


In [0]:

# Attach each athlete's country-level medal totals (left join keeps athletes
# whose country has no row in the totals table).
athlete_medal_df = athletes_spark_df_cleaned.join(
    medals_total_df,
    "country",
    "left",
)

# Enrich every medal record with the athlete (and country totals) it belongs to;
# medal records without a matching athlete_id are kept with null athlete fields.
athlete_medallist_df = medal_list_df.join(
    athlete_medal_df,
    "athlete_id",
    "left",
)


In [0]:
# Export the joined medal/athlete dataset as CSV from a single partition,
# overwriting any previous output at the target path.
olympic_writer = athlete_medallist_df.repartition(1).write.mode("overwrite")
olympic_writer.csv("dbfs:/mnt/rawPris/transformed-data/Olympic", header=True)

In [0]:

# Build the text dataset (athlete id + cleaned free-text answers).
# "code" was already renamed to "athlete_id" when athletes_spark_df was built,
# so the renamed column is selected directly; selecting "code" fails when the
# notebook is run top to bottom.
athletes_spark_df_ml_cleaned = athletes_spark_df.select("athlete_id", "reason", "philosophy")


def clean_text(column_name):
    """Return a column expression that normalizes free text.

    Applied innermost first:
      1. remove parenthesised spans, e.g. "(olympics.com, 24 Apr 2024)"
      2. remove "<...>" tag-like spans
      3. remove every character that is not an ASCII letter, digit or whitespace
      4. lowercase the result
      5. trim leading/trailing whitespace

    Args:
        column_name: Column (or column name) accepted by ``regexp_replace``.

    Returns:
        A Column expression holding the cleaned text.
    """
    without_parentheses = regexp_replace(column_name, r"\([^)]*\)", "")
    without_tags = regexp_replace(without_parentheses, r"<[^>]+>", "")
    alphanumeric_only = regexp_replace(without_tags, r"[^a-zA-Z0-9\s]", "")
    return trim(lower(alphanumeric_only))


columns_to_clean = ["reason", "philosophy"]

for column in columns_to_clean:
    athletes_spark_df_ml_cleaned = athletes_spark_df_ml_cleaned.withColumn(
        column, clean_text(col(column))
    )

# NOTE(review): nulls in these columns were already replaced with "" when
# athletes_spark_df was built, so this drop looks like a no-op (rows with empty
# text are kept). Confirm whether empty rows should be filtered out instead.
athletes_spark_df_ml_cleaned = athletes_spark_df_ml_cleaned.na.drop(subset=columns_to_clean)

# Export as CSV from a single partition, overwriting any previous output.
athletes_spark_df_ml_cleaned.repartition(1).write.mode("overwrite").csv(
    "dbfs:/mnt/rawPris/transformed-data/MLText",
    header=True
)

# Show the first few rows
display(athletes_spark_df_ml_cleaned.limit(15))

athlete_id,reason,philosophy
1532872,he followed his father and his uncle into the sport,wrestling is my life
1532873,,to become a good athlete you first have to be a good person
1532874,,
1532944,while doing karate he noticed wrestlers training and decided to give it a try he also tried judo but his father a former wrestler did not allow him to do both so he chose wrestling,nothing is impossible set goals in front of you fight and achieve it
1532945,my family did not like wrestling very much at first i wanted to do boxing but my older friends advised me to go to wrestling training and after a week i started to like the sport,
1532951,,
1533112,i was thrown over an argument about a pokemon card and wanted to learn to throw immediately,
1533136,i started running when i was in primary school like we have junior championships its where i started running and its where i started to notice that if i work hard i will be a great athlete,if you believe in yourself never be discouraged
1533176,i love running and i was fast a child,what does not kill you makes you stronger
1533188,was a competitive swimmer from age 12 and went on to race internationally in marathon swimming,
