In [2]:
import os
%pip install pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, sum
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator
import joblib


# Start (or reuse) the Spark session used by the rest of this notebook.
# The MySQL Connector/J jar is passed through `spark.jars` as a relative path.
# NOTE(review): the output recorded for this cell ends with
# "FileNotFoundError: [WinError 2]"; the traceback is truncated, so which
# statement raised it is not shown. Possibly the JVM launch inside
# getOrCreate() — check that Java is installed and that the jar file below
# exists next to the notebook. TODO confirm.
spark = (
    SparkSession.builder
    .appName("Predicting Olympic Medals")
    .config("spark.jars", "mysql-connector-java-8.0.26.jar.3")
    .getOrCreate()
)

# Database connection settings.
url = "jdbc:mysql://mysql-hackathonipssi.alwaysdata.net:3306/hackathonipssi_mia4"


def _jdbc_credentials(env):
    """Return the JDBC ``user``/``password`` entries read from ``env``.

    Credentials are taken from the environment instead of being written in the
    notebook, so they are not committed or shared along with it.

    Args:
        env: Mapping of environment variable names to values
            (normally ``os.environ``).

    Returns:
        dict with the keys ``"user"`` and ``"password"``.

    Raises:
        RuntimeError: if ``OLYMPICS_DB_USER`` or ``OLYMPICS_DB_PASSWORD`` is
            unset or empty; the message names the missing variable(s).
    """
    variable_for = {"user": "OLYMPICS_DB_USER", "password": "OLYMPICS_DB_PASSWORD"}
    missing = [name for name in variable_for.values() if not env.get(name)]
    if missing:
        raise RuntimeError(
            "Missing database credentials: set the environment variable(s) "
            + ", ".join(missing)
            + " before running this cell."
        )
    return {key: env[name] for key, name in variable_for.items()}


# NOTE(review): a user name and password used to be hardcoded here; since they
# were stored in the notebook they should be considered exposed and rotated.
# NOTE(review): the SSL options below disable transport encryption for the
# connection — confirm this is acceptable for this database host.
properties = {
    **_jdbc_credentials(os.environ),
    "driver": "com.mysql.cj.jdbc.Driver",
    "useSSL": "false",
    "requireSSL": "false",
    "verifyServerCertificate": "false"
}

# Load the historical "Result" table over JDBC and add one 0/1 indicator
# column per medal colour, derived from `medal_type`.
df = (
    spark.read.jdbc(url=url, table="Result", properties=properties)
    .withColumn("gold", when(col("medal_type") == "GOLD", 1).otherwise(0))
    .withColumn("silver", when(col("medal_type") == "SILVER", 1).otherwise(0))
    .withColumn("bronze", when(col("medal_type") == "BRONZE", 1).otherwise(0))
)

# Per-country medal counts plus their overall total.
# `sum` is pyspark.sql.functions.sum (imported at the top of the notebook),
# which shadows Python's builtin sum in this cell.
medal_columns = ["gold", "silver", "bronze"]
medals_df = (
    df.groupBy("country_name")
    .agg(*[sum(name).alias(name) for name in medal_columns])
    .withColumn("total", col("gold") + col("silver") + col("bronze"))
)
# Seed shared by the train/test split and the forest so that the reported RMSE
# is reproducible from one run to the next.
SEED = 42

# Feature preparation: pack the medal counts into a single vector column.
# NOTE(review): "total" is both an input feature and the label used below, and
# it is the exact sum of the other three features. The model is therefore asked
# to reproduce one of its own inputs, so the RMSE says little about real
# predictive power. Choose features that do not contain the label (e.g. counts
# from earlier Games) before relying on these numbers.
assembler = VectorAssembler(inputCols=["gold", "silver", "bronze", "total"], outputCol="features")
# Cached because the frame is reused by the split, the scaler fit, the training
# and the evaluation; without it each action re-runs the JDBC read and the
# aggregation.
data = assembler.transform(medals_df).cache()

# Split BEFORE fitting the scaler: fitting it on the full dataset would leak
# statistics of the test rows into the training pipeline.
train_raw, test_raw = data.randomSplit([0.8, 0.2], seed=SEED)

# Feature scaling, with mean/std statistics learned from the training split only.
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
scalerModel = scaler.fit(train_raw)
train_data = scalerModel.transform(train_raw)
test_data = scalerModel.transform(test_raw)

# Model training.
rf = RandomForestRegressor(featuresCol="scaledFeatures", labelCol="total", seed=SEED)
model = rf.fit(train_data)

# Persist the model. overwrite() lets the cell be re-run: a plain save() raises
# when the "model_spark" directory already exists.
model.write().overwrite().save("model_spark")

# Model evaluation on the held-out split.
predictions = model.transform(test_data)
evaluator = RegressionEvaluator(labelCol="total", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("RMSE on test data:", rmse)

# Input rows for the 2024 predictions (hand-written values; adapt them to the
# data actually available).
rows_2024 = [
    ("France", 10, 12, 9),
    ("USA", 35, 32, 30),
    ("China", 40, 30, 20)
]
columns_2024 = ["country_name", "gold", "silver", "bronze"]

# Run the new rows through the same steps as the historical data:
# add the total, assemble the feature vector, then apply the fitted scaler.
with_total_2024 = spark.createDataFrame(rows_2024, columns_2024).withColumn(
    "total", col("gold") + col("silver") + col("bronze")
)
new_data_2024 = scalerModel.transform(assembler.transform(with_total_2024))

# Predict with the trained model and display one prediction per country.
predictions_2024 = model.transform(new_data_2024)
predictions_2024.select("country_name", "prediction").show()

# Shut down the Spark session.
spark.stop()


Collecting pyspark
  Using cached pyspark-3.5.1.tar.gz (317.0 MB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting py4j==0.10.9.7 (from pyspark)
  Obtaining dependency information for py4j==0.10.9.7 from https://files.pythonhosted.org/packages/10/30/a58b32568f1623aaad7db22aa9eafc4c6c194b429ff35bdc55ca2726da47/py4j-0.10.9.7-py2.py3-none-any.whl.metadata
  Using cached py4j-0.10.9.7-py2.py3-none-any.whl.metadata (1.5 kB)
Using cached py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py): started
  Building wheel for pyspark (setup.py): finished with status 'done'
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488539 sha256=c9a48fb2850e06ea69b3fe0bca67d3f8ff55373e02cd25fe4942b2ed6937abc3
  Stored in directory: c:\users\user\appdata\local\pip\cache\wheels\80\1d\60\2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Su


[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


FileNotFoundError: [WinError 2] Le fichier spécifié est introuvable