# 1 Installer Java, PySpark, et Findspark :

In [3]:
# Installer Java 8

!apt-get install openjdk-8-jdk-headless -qq > /dev/null


In [4]:
# Installer PySpark

!pip install -q pyspark
!pip install -q findspark


# 2 Télécharger les packages nécessaires pour lire les fichiers XML et Excel

In [5]:
# Télécharger le package spark-xml
!wget -P /content/ https://repo1.maven.org/maven2/com/databricks/spark-xml_2.12/0.11.0/spark-xml_2.12-0.11.0.jar

# Télécharger le package spark-excel
!wget -P /content/ https://repo1.maven.org/maven2/com/crealytics/spark-excel_2.12/0.13.5/spark-excel_2.12-0.13.5.jar


--2024-05-28 07:50:29--  https://repo1.maven.org/maven2/com/databricks/spark-xml_2.12/0.11.0/spark-xml_2.12-0.11.0.jar
Resolving repo1.maven.org (repo1.maven.org)... 199.232.192.209, 199.232.196.209, 2a04:4e42:4c::209, ...
Connecting to repo1.maven.org (repo1.maven.org)|199.232.192.209|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 158494 (155K) [application/java-archive]
Saving to: ‘/content/spark-xml_2.12-0.11.0.jar’


2024-05-28 07:50:29 (4.47 MB/s) - ‘/content/spark-xml_2.12-0.11.0.jar’ saved [158494/158494]

--2024-05-28 07:50:29--  https://repo1.maven.org/maven2/com/crealytics/spark-excel_2.12/0.13.5/spark-excel_2.12-0.13.5.jar
Resolving repo1.maven.org (repo1.maven.org)... 199.232.192.209, 199.232.196.209, 2a04:4e42:4c::209, ...
Connecting to repo1.maven.org (repo1.maven.org)|199.232.192.209|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6131669 (5.8M) [application/java-archive]
Saving to: ‘/content/spark-excel_2.12-0.13.5.ja

# 3 .Configurer les variables d'environnement et initialiser Spark :


In [6]:
import os
import findspark

# Définir les variables d'environnement
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/usr/local/lib/python3.10/dist-packages/pyspark"

# Initialiser findspark
findspark.init()


# 4. Initialiser une session Spark en ajoutant les packages nécessaires :

In [7]:
from pyspark.sql import SparkSession

# Créer une session Spark en ajoutant les packages spark-xml et spark-excel
spark = SparkSession.builder \
    .appName("OlympicData") \
    .getOrCreate()


# 5. Charger et lire les différents fichiers :



1.   Lire le fichier JSON






In [8]:
df_athletes = spark.read.json("/content/sample_data/JO/olympic_athletes.json")
df_athletes.show()


+--------------------+--------------------+--------------------+------------------+--------------------+----------------+--------------------+
|   athlete_full_name|      athlete_medals|         athlete_url|athlete_year_birth|                 bio|      first_game|games_participations|
+--------------------+--------------------+--------------------+------------------+--------------------+----------------+--------------------+
|Cooper WOODS-TOPA...|                NULL|https://olympics....|            2000.0|                NULL|    Beijing 2022|                   1|
|      Felix ELOFSSON|                NULL|https://olympics....|            1995.0|                NULL|PyeongChang 2018|                   2|
|       Dylan WALCZYK|                NULL|https://olympics....|            1993.0|                NULL|    Beijing 2022|                   1|
|       Olli PENTTALA|                NULL|https://olympics....|            1995.0|                NULL|    Beijing 2022|                   1|

2. Lire le fichier XML

In [10]:
import xml.etree.ElementTree as ET
import findspark
findspark.init()

xml_file_path = "/content/sample_data/JO/olympic_hosts.xml"
def parse_xml(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()
    data = []
    for row in root.findall('row'):
          index = row.find('index').text
          game_slug = row.find('game_slug').text
          game_end_date = row.find('game_end_date').text
          game_start_date = row.find('game_start_date').text
          game_location = row.find('game_location').text
          game_name = row.find('game_name').text
          game_season = row.find('game_season').text
          game_year = row.find('game_year').text
          data.append((index, game_slug, game_end_date, game_start_date, game_location, game_name, game_season, game_year))
    return data
# Lire et parser le fichier XML
parsed_data = parse_xml(xml_file_path)
# Créer un DataFrame Spark à partir des données parsées
columns = ["Index", "Game_Slug", "Game_End_Date", "Game_Start_Date", "Game_Location", "Game_Name", "Game_Season", "Game_Year"]
df_hosts = spark.createDataFrame(parsed_data, columns)
# Afficher le DataFrame
df_hosts.show()


+-----+-------------------+--------------------+--------------------+------------------+-------------------+-----------+---------+
|Index|          Game_Slug|       Game_End_Date|     Game_Start_Date|     Game_Location|          Game_Name|Game_Season|Game_Year|
+-----+-------------------+--------------------+--------------------+------------------+-------------------+-----------+---------+
|    0|       beijing-2022|2022-02-20T12:00:00Z|2022-02-04T15:00:00Z|             China|       Beijing 2022|     Winter|     2022|
|    1|         tokyo-2020|2021-08-08T14:00:00Z|2021-07-23T11:00:00Z|             Japan|         Tokyo 2020|     Summer|     2020|
|    2|   pyeongchang-2018|2018-02-25T08:00:00Z|2018-02-08T23:00:00Z| Republic of Korea|   PyeongChang 2018|     Winter|     2018|
|    3|           rio-2016|2016-08-21T21:00:00Z|2016-08-05T12:00:00Z|            Brazil|           Rio 2016|     Summer|     2016|
|    4|         sochi-2014|2014-02-23T16:00:00Z|2014-02-07T04:00:00Z|Russian Federa

3. Lire le fichier Excel


In [11]:

!pip install pandas openpyxl




In [12]:
import pandas as pd

# Lire le fichier Excel
file_path_excel = '/content/sample_data/JO/olympic_medals.xlsx'
df = pd.read_excel(file_path_excel, sheet_name='Sheet1')  # Remplacer 'Sheet1' par le nom de la feuille si différent

# Convertir en CSV
file_path_csv = '/content/sample_data/JO/olympic_medals.csv'
df.to_csv(file_path_csv, index=False)


In [13]:
# Lire le fichier CSV converti
df_medals = spark.read.csv(file_path_csv, header=True, inferSchema=True)
df_medals.show()



+----------+----------------+------------+--------------------+------------+----------+----------------+-----------------+--------------------+--------------------+--------------------+------------+---------------------+
|Unnamed: 0|discipline_title|   slug_game|         event_title|event_gender|medal_type|participant_type|participant_title|         athlete_url|   athlete_full_name|        country_name|country_code|country_3_letter_code|
+----------+----------------+------------+--------------------+------------+----------+----------------+-----------------+--------------------+--------------------+--------------------+------------+---------------------+
|         0|         Curling|beijing-2022|       Mixed Doubles|       Mixed|      GOLD|        GameTeam|            Italy|https://olympics....|Stefania CONSTANTINI|               Italy|          IT|                  ITA|
|         1|         Curling|beijing-2022|       Mixed Doubles|       Mixed|      GOLD|        GameTeam|            

4. Lire le fichier HTML

In [14]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Lire le fichier HTML avec pandas
html_tables = pd.read_html("/content/sample_data/JO/olympic_results.html")

# Supposons que les données que vous voulez sont dans la première table
df_results_pd = html_tables[0]

# Convertir les types de colonnes spécifiques pour correspondre aux attentes de Spark
df_results_pd['rank_equal'] = df_results_pd['rank_equal'].astype('bool', errors='ignore')
df_results_pd['rank_position'] = pd.to_numeric(df_results_pd['rank_position'], errors='coerce').fillna(0).astype('int')
df_results_pd['Unnamed: 0'] = pd.to_numeric(df_results_pd['Unnamed: 0'], errors='coerce').fillna(0).astype('int')

# Afficher les types de colonnes
print(df_results_pd.dtypes)

# Spécifier explicitement le schéma pour Spark DataFrame
schema = StructType([
    StructField("Unnamed: 0", IntegerType(), True),
    StructField("discipline_title", StringType(), True),
    StructField("event_title", StringType(), True),
    StructField("slug_game", StringType(), True),
    StructField("participant_type", StringType(), True),
    StructField("medal_type", StringType(), True),
    StructField("athletes", StringType(), True),
    StructField("rank_equal", BooleanType(), True),
    StructField("rank_position", IntegerType(), True),
    StructField("country_name", StringType(), True),
    StructField("country_code", StringType(), True),
    StructField("country_3_letter_code", StringType(), True),
    StructField("athlete_url", StringType(), True),
    StructField("athlete_full_name", StringType(), True),
    StructField("value_unit", StringType(), True),
    StructField("value_type", StringType(), True)
])


Unnamed: 0                int64
discipline_title         object
event_title              object
slug_game                object
participant_type         object
medal_type               object
athletes                 object
rank_equal                 bool
rank_position             int64
country_name             object
country_code             object
country_3_letter_code    object
athlete_url              object
athlete_full_name        object
value_unit               object
value_type               object
dtype: object


In [15]:
# Convertir le DataFrame pandas en DataFrame Spark
df_results = spark.createDataFrame(df_results_pd, schema=schema)
df_results.show()


+----------+----------------+-------------+------------+----------------+----------+--------------------+----------+-------------+--------------------+------------+---------------------+-----------+-----------------+----------+----------+
|Unnamed: 0|discipline_title|  event_title|   slug_game|participant_type|medal_type|            athletes|rank_equal|rank_position|        country_name|country_code|country_3_letter_code|athlete_url|athlete_full_name|value_unit|value_type|
+----------+----------------+-------------+------------+----------------+----------+--------------------+----------+-------------+--------------------+------------+---------------------+-----------+-----------------+----------+----------+
|         0|         Curling|Mixed Doubles|beijing-2022|        GameTeam|      GOLD|[('Stefania CONST...|     false|            1|               Italy|          IT|                  ITA|        NaN|              NaN|       NaN|       NaN|
|         1|         Curling|Mixed Doubles|b

# Gestion des valeurs manquantes


In [16]:
df_athletes = df_athletes.dropna()
df_hosts = df_hosts.dropna()
df_results = df_results.dropna()
df_medals=df_medals.dropna()

In [17]:
df_hosts.show()

+-----+-------------------+--------------------+--------------------+------------------+-------------------+-----------+---------+
|Index|          Game_Slug|       Game_End_Date|     Game_Start_Date|     Game_Location|          Game_Name|Game_Season|Game_Year|
+-----+-------------------+--------------------+--------------------+------------------+-------------------+-----------+---------+
|    0|       beijing-2022|2022-02-20T12:00:00Z|2022-02-04T15:00:00Z|             China|       Beijing 2022|     Winter|     2022|
|    1|         tokyo-2020|2021-08-08T14:00:00Z|2021-07-23T11:00:00Z|             Japan|         Tokyo 2020|     Summer|     2020|
|    2|   pyeongchang-2018|2018-02-25T08:00:00Z|2018-02-08T23:00:00Z| Republic of Korea|   PyeongChang 2018|     Winter|     2018|
|    3|           rio-2016|2016-08-21T21:00:00Z|2016-08-05T12:00:00Z|            Brazil|           Rio 2016|     Summer|     2016|
|    4|         sochi-2014|2014-02-23T16:00:00Z|2014-02-07T04:00:00Z|Russian Federa

## Exploration et visualisation des données

1 France a organisé 6 JO : 3 d’hiver et 3 d’été (en comptant celui de 2024) :

In [18]:
from pyspark.sql.functions import col

# Filtrer les données pour la France
df_france_hosts = df_hosts.filter(col("game_location") == "France")

# Compter le nombre de jeux d'été et d'hiver
df_france_hosts.groupBy("game_season").count().show()


+-----------+-----+
|game_season|count|
+-----------+-----+
|     Winter|    3|
|     Summer|    2|
+-----------+-----+



2. France est le 2è pays qui a organisé le plus de JO après les USA (8 JO) :


In [19]:
# Compter le nombre de jeux organisés par chaque pays
df_country_counts = df_hosts.groupBy("game_location").count().orderBy(col("count").desc())
df_country_counts.show()


+--------------------+-----+
|       game_location|count|
+--------------------+-----+
|       United States|    8|
|              France|    5|
|               Japan|    4|
|       Great Britain|    3|
|               Italy|    3|
|              Canada|    3|
|   Republic of Korea|    2|
|              Greece|    2|
|               China|    2|
|              Norway|    2|
|             Austria|    2|
|             Germany|    2|
|         Switzerland|    2|
|               Spain|    1|
|  Russian Federation|    1|
|Federal Republic ...|    1|
|          Yugoslavia|    1|
|                USSR|    1|
|              Brazil|    1|
|           Australia|    1|
+--------------------+-----+
only showing top 20 rows



3.Les JO d’hiver sont nés à Chamonix en 1924 :

In [20]:
# Filtrer les données pour les jeux d'hiver en 1924
df_winter_1924 = df_hosts.filter((col("game_year") == 1924) & (col("game_season") == "Winter"))
df_winter_1924.show()


+-----+-------------+--------------------+--------------------+-------------+-------------+-----------+---------+
|Index|    Game_Slug|       Game_End_Date|     Game_Start_Date|Game_Location|    Game_Name|Game_Season|Game_Year|
+-----+-------------+--------------------+--------------------+-------------+-------------+-----------+---------+
|   46|chamonix-1924|1924-02-05T20:00:00Z|1924-01-25T08:00:00Z|       France|Chamonix 1924|     Winter|     1924|
+-----+-------------+--------------------+--------------------+-------------+-------------+-----------+---------+



4.JO de Paris, en 1900 : les femmes peuvent participer aux JO :


In [21]:
# Filtrer les données pour les jeux de Paris 1900
# df_paris_1900 = df_hosts.filter((col("game_year") == 1900) & (col("game_location") == "France"))

# # Filtrer les données des athlètes pour Paris 1900 et vérifier la participation des femmes
# df_paris_1900_athletes = df_athletes.filter((col("game_year") == 1900) & (col("first_game") == "Paris 1900"))

# df_paris_1900_athletes.show()
df_paris_1900_results = df_results.filter((col("slug_game") == "paris-1900"))


df_paris_1900_women = df_paris_1900_results.filter(col("event_title") == "Women")
# Afficher le DataFrame pour vérifier
df_paris_1900_women.show(truncate=False)

# Compter le nombre de participantes féminines
count_women = df_paris_1900_women.count()
print(f"Number of female participants in Paris 1900: {count_women}")

+----------+----------------+-----------+---------+----------------+----------+--------+----------+-------------+------------+------------+---------------------+-----------+-----------------+----------+----------+
|Unnamed: 0|discipline_title|event_title|slug_game|participant_type|medal_type|athletes|rank_equal|rank_position|country_name|country_code|country_3_letter_code|athlete_url|athlete_full_name|value_unit|value_type|
+----------+----------------+-----------+---------+----------------+----------+--------+----------+-------------+------------+------------+---------------------+-----------+-----------------+----------+----------+
+----------+----------------+-----------+---------+----------------+----------+--------+----------+-------------+------------+------------+---------------------+-----------+-----------------+----------+----------+

Number of female participants in Paris 1900: 0



5.   Seuls 4 athlètes ont remporté des médailles à la fois aux JO d’hiver
et d’été. Une seule d’entre eux, Christa Ludinger-Rothenburger, a remporté des médailles au cours de la même année ?

In [32]:
from pyspark.sql.functions import col

df_joined = df_results.join(df_hosts, df_results.slug_game == df_hosts.Game_Slug)

df_joined=df_joined.select("athlete_full_name","game_season","game_year","medal_type")
# df_joined.show()

# # Filtrer les athlètes ayant remporté des médailles aux JO d'hiver
df_winter_medals = df_joined.filter(col("game_season") == "Winter").select("athlete_full_name").distinct()

# # Filtrer les athlètes ayant remporté des médailles aux JO d'été
df_summer_medals = df_joined.filter(col("game_season") == "Summer").select("athlete_full_name").distinct()

 # Trouver les athlètes ayant remporté des médailles aux deux saisons
df_both_medals = df_winter_medals.intersect(df_summer_medals)
df_both_medals.show()
# # Afficher les athlètes ayant remporté des médailles aux deux saisons
print("Athletes who won medals in both Winter and Summer Olympics:")
df_both_medals.show()

# # Filtrer les résultats pour Christa Ludinger-Rothenburger
df_christa = df_joined.filter(col("athlete_full_name") == "Christa Ludinger-Rothenburger")

# # Afficher les résultats pour Christa Ludinger-Rothenburger
print("Results for Christa Ludinger-Rothenburger:")
df_christa.show()

# # Vérifier si elle a remporté des médailles dans la même année
df_christa_grouped = df_christa.groupBy("game_year").count()
print("Number of medals won by Christa Ludinger-Rothenburger in each year:")
df_christa_grouped.show()

+--------------------+
|   athlete_full_name|
+--------------------+
|       Stig LINDBERG|
|         Ake NILSSON|
|      Leila GYENESEI|
|              Yi ZHU|
|Timo Juhani GRONLUND|
| Laurine VAN RIESSEN|
|        David MERCER|
|       Jan SZYMANSKI|
|       Markus KELLER|
|       Luis GONZALEZ|
|         Johan STÃA|
|     Christine WITTY|
|             Jun DAI|
|   Arthur MANNSBARTH|
|        Hans MÃLLER|
|Christa LUDING-RO...|
|   Mikhail SIAMIONAU|
|              Wei LI|
|       Josef NEMECKY|
|           Xin ZHANG|
+--------------------+
only showing top 20 rows

Athletes who won medals in both Winter and Summer Olympics:
+--------------------+
|   athlete_full_name|
+--------------------+
|       Stig LINDBERG|
|         Ake NILSSON|
|      Leila GYENESEI|
|              Yi ZHU|
|Timo Juhani GRONLUND|
| Laurine VAN RIESSEN|
|        David MERCER|
|       Jan SZYMANSKI|
|       Markus KELLER|
|       Luis GONZALEZ|
|         Johan STÃA|
|     Christine WITTY|
|             Jun


6. De 1924 à 1992, les JO d’hiver et d’été avaient lieu au cours de la même année. Désormais, ils sont organisés selon des cycles distincts avec une alternance de 2 ans ?

In [41]:

#Filtrer les données pour les années 1924 à 1992

df_1924_1992 = df_hosts.filter((col("game_year") >= 1924) & (col("game_year") <= 1992))
df_1924_1992.show()
df_seasons_per_year = df_1924_1992.groupBy("game_year",).agg({"game_season": "count"})
df_seasons_per_year.show()


 #Vérifier si les Jeux d'hiver et d'été ont eu lieu la même année
df_same_year = df_seasons_per_year.filter(col("count(game_season)") == 2)

# Analyser le changement de cycle après 1992
df_after_1992 = df_hosts.filter(col("game_year") > 1992)
df_seasons_after_1992 = df_after_1992.groupBy("game_year", "game_season").count()
df_seasons_after_1992.show()

+-----+--------------------+-------------+---------------+--------------------+--------------------+-----------+---------+
|index|           Game_Slug|game_end_date|game_start_date|       Game_Location|           Game_Name|Game_Season|game_year|
+-----+--------------------+-------------+---------------+--------------------+--------------------+-----------+---------+
|   15|      barcelona-1992|   1992-08-09|     1992-07-25|               Spain|      Barcelona 1992|     Summer|     1992|
|   16|    albertville-1992|   1992-02-23|     1992-02-08|              France|    Albertville 1992|     Winter|     1992|
|   17|          seoul-1988|   1988-10-02|     1988-09-16|   Republic of Korea|          Seoul 1988|     Summer|     1988|
|   18|        calgary-1988|   1988-02-28|     1988-02-13|              Canada|        Calgary 1988|     Winter|     1988|
|   19|    los-angeles-1984|   1984-08-12|     1984-07-28|       United States|    Los Angeles 1984|     Summer|     1984|
|   20|       sa




7. Tarzan lui-même a participé aux JO. En effet, Johnny Weissmuller, ancien athlète devenu acteur et ayant interprété 12 films de Tarzan, a remporté 5 médailles d’or en natation dans les années 1920 ?



In [42]:
#pour verifier cette affirmation ,nous allons concentrer sur les points suivants
# Vérifier si Johnny Weissmuller a participé aux Jeux Olympiques.
# Vérifier s'il a remporté 5 médailles d'or en natation dans les années 1920.
# Filtrer les résultats pour Johnny Weissmuller
df_johnny = df_results.filter(col("athlete_full_name") == "Johnny Weissmuller")

# Afficher les résultats pour Johnny Weissmuller
df_johnny.show(truncate=False)

+----------+----------------+-----------+---------+----------------+----------+--------+----------+-------------+------------+------------+---------------------+-----------+-----------------+----------+----------+
|Unnamed: 0|discipline_title|event_title|slug_game|participant_type|medal_type|athletes|rank_equal|rank_position|country_name|country_code|country_3_letter_code|athlete_url|athlete_full_name|value_unit|value_type|
+----------+----------------+-----------+---------+----------------+----------+--------+----------+-------------+------------+------------+---------------------+-----------+-----------------+----------+----------+
+----------+----------------+-----------+---------+----------------+----------+--------+----------+-------------+------------+------------+---------------------+-----------+-----------------+----------+----------+




8.  Les JO de Londres de 2012 ont été les 1ers JO durant lesquels tous les pays participants ont envoyé des athlètes de sexe féminin ?





9.  Les sports suivants ne font (malheureusement) plus partie des J.O : la natation synchronisée en solo, le tir à la corde, la corde à grimper, la montgolfière, le duel au pistolet, le vélo tandem, la course d’obstacles à la nage et le plongeon à distance. Par chance, le tir au pigeon n’a été mis en place qu’une seule fois pendant les Jeux Olympiques de Paris de 1900 ?




10. Les Jeux de 2016, à Rio, marqueront la 1è manifestation des JO en Amérique du Sud ?






11.   Pendant les 17 jours des JO d’été de 2016, 10 500 athlètes, originaires de 205 pays, représenteront 42 sports différents et participeront à 306 épreuves, à Rio ?






#- Questions ouvertes :
   

   



1.   Combien de médailles la France a remporté : en tout, en Or, en argent et en Bronze (depuis le début des JO) ?








2.  Lors de quelle JO la France a eu le plus (le moins) de succès ?





3.   Élément de listePeut-on considérer que la France est la grande spécialiste de certaine(s) discipline(s) sportive(s), comparativement aux autres notions ? Laquelle / Lesquelles ?



In [23]:
df_winter_medals=df_hosts.filter(col("Game_season")=="Winter").select("Game_Slug")



4.   Quelles sont les sports les plus dominants dans les JO au fil des ans ?










In [24]:
df_hosts.show()
df_athletes.show()


+-----+-------------------+--------------------+--------------------+------------------+-------------------+-----------+---------+
|Index|          Game_Slug|       Game_End_Date|     Game_Start_Date|     Game_Location|          Game_Name|Game_Season|Game_Year|
+-----+-------------------+--------------------+--------------------+------------------+-------------------+-----------+---------+
|    0|       beijing-2022|2022-02-20T12:00:00Z|2022-02-04T15:00:00Z|             China|       Beijing 2022|     Winter|     2022|
|    1|         tokyo-2020|2021-08-08T14:00:00Z|2021-07-23T11:00:00Z|             Japan|         Tokyo 2020|     Summer|     2020|
|    2|   pyeongchang-2018|2018-02-25T08:00:00Z|2018-02-08T23:00:00Z| Republic of Korea|   PyeongChang 2018|     Winter|     2018|
|    3|           rio-2016|2016-08-21T21:00:00Z|2016-08-05T12:00:00Z|            Brazil|           Rio 2016|     Summer|     2016|
|    4|         sochi-2014|2014-02-23T16:00:00Z|2014-02-07T04:00:00Z|Russian Federa

In [25]:

df_results.show()
df_medals.show()

+----------+----------------+-------------+------------+----------------+----------+--------------------+----------+-------------+--------------------+------------+---------------------+-----------+-----------------+----------+----------+
|Unnamed: 0|discipline_title|  event_title|   slug_game|participant_type|medal_type|            athletes|rank_equal|rank_position|        country_name|country_code|country_3_letter_code|athlete_url|athlete_full_name|value_unit|value_type|
+----------+----------------+-------------+------------+----------------+----------+--------------------+----------+-------------+--------------------+------------+---------------------+-----------+-----------------+----------+----------+
|         0|         Curling|Mixed Doubles|beijing-2022|        GameTeam|      GOLD|[('Stefania CONST...|     false|            1|               Italy|          IT|                  ITA|        NaN|              NaN|       NaN|       NaN|
|         1|         Curling|Mixed Doubles|b

# Supprimer les colonnes inutiles


In [26]:
athletes_cleaned = df_athletes.drop("athlete_url")
results_cleaned = df_results.drop("athlete_url", "value_unit", "value_type")
medals_cleaned=df_medals.drop("athlete_url","Unnamed: 0")
# results_cleaned.show()

# Conversion des types de données si nécessaire


In [27]:
df_hosts.dtypes

[('Index', 'string'),
 ('Game_Slug', 'string'),
 ('Game_End_Date', 'string'),
 ('Game_Start_Date', 'string'),
 ('Game_Location', 'string'),
 ('Game_Name', 'string'),
 ('Game_Season', 'string'),
 ('Game_Year', 'string')]

In [28]:
from pyspark.sql.functions import col, to_date
df_hosts = df_hosts.withColumn("game_year", col("game_year").cast("integer"))
df_hosts = df_hosts.withColumn("index", col("index").cast("integer"))
df_hosts = df_hosts.withColumn("game_start_date", to_date(col("game_start_date")))
df_hosts = df_hosts.withColumn("game_end_date", to_date(col("game_end_date")))
# Afficher le schéma pour vérifier les conversions
df_hosts.printSchema()

root
 |-- index: integer (nullable = true)
 |-- Game_Slug: string (nullable = true)
 |-- game_end_date: date (nullable = true)
 |-- game_start_date: date (nullable = true)
 |-- Game_Location: string (nullable = true)
 |-- Game_Name: string (nullable = true)
 |-- Game_Season: string (nullable = true)
 |-- game_year: integer (nullable = true)



# Normalisation des valeurs numériques


In [29]:
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler

In [30]:
df_medals.show()

+----------+----------------+------------+--------------------+------------+----------+----------------+--------------------+--------------------+--------------------+--------------------+------------+---------------------+
|Unnamed: 0|discipline_title|   slug_game|         event_title|event_gender|medal_type|participant_type|   participant_title|         athlete_url|   athlete_full_name|        country_name|country_code|country_3_letter_code|
+----------+----------------+------------+--------------------+------------+----------+----------------+--------------------+--------------------+--------------------+--------------------+------------+---------------------+
|         0|         Curling|beijing-2022|       Mixed Doubles|       Mixed|      GOLD|        GameTeam|               Italy|https://olympics....|Stefania CONSTANTINI|               Italy|          IT|                  ITA|
|         1|         Curling|beijing-2022|       Mixed Doubles|       Mixed|      GOLD|        GameTeam|