Import dependencies


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
import numpy as np
import os
import joblib
import matplotlib.pyplot as plt

In [None]:
# Read the cleaned dataset from its CSV file into a pandas DataFrame.
DATA_PATH = "cleaned_dataset.csv"
df = pd.read_csv(DATA_PATH)

# Columns used as explanatory variables (features).
# The order of this list is the column order of the model input built from it.
features = [
    "Average journey time",
    "Number of scheduled trains",
    "Number of cancelled trains",
    "Number of trains delayed > 15min",
    "Number of trains delayed > 30min",
    "Number of trains delayed > 60min",
    "Average delay of all trains at departure",
    "Average delay of late trains at arrival",
    "Average delays",
    "Year",
]

# Target column(s) to predict. This is a list; the cells below only read
# its first entry (targets[0]).
TARGET_COLUMN = "Average delay of all trains at arrival"
targets = [TARGET_COLUMN]

In [None]:
# Keep only the feature columns plus the target, and drop every row that has
# a missing value in any of them.
target_name = targets[0]
data = df[features + [target_name]].dropna()

# Keep only rows whose target is non-negative (zero included).
mask = data[target_name] >= 0
X = data.loc[mask, features]

# The models are trained on log1p(target); predictions must be mapped back
# with expm1 to get values on the original scale.
y = np.log1p(data.loc[mask, target_name])

# 80/20 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=70,
)

In [None]:
# Build the single feature row used to predict `future_year`: every feature
# other than "Year" is extrapolated with its own one-variable linear trend
# (feature ~ Year) fitted on the raw frame.
future_year = 2026

# Loop-invariant one-row frame handed to every trend model; its column name
# matches the column used in fit().
future_year_frame = pd.DataFrame({"Year": [future_year]})

feature_values_2026 = {}

# Every fitted trend model, keyed by feature name. `reg` is rebound on each
# iteration and therefore only ever refers to the model fitted last, so this
# dict is what to inspect or persist when all per-feature trends are needed.
trend_models = {}

for feat in features:
    if feat == "Year":
        # The year itself is known, not extrapolated.
        feature_values_2026[feat] = future_year
        continue

    # Only rows where both Year and this feature are present take part.
    temp = df[["Year", feat]].dropna()

    if temp["Year"].nunique() > 1:
        reg = LinearRegression()
        reg.fit(temp[["Year"]], temp[feat])
        trend_models[feat] = reg
        feature_values_2026[feat] = reg.predict(future_year_frame)[0]
    else:
        # A trend needs at least two distinct years; fall back to the mean.
        feature_values_2026[feat] = temp[feat].mean()

# One-row frame whose columns follow the order of `features`.
input_2026 = pd.DataFrame([feature_values_2026])[features]

In [None]:
# Fit gradient boosting on the training split. The target was log1p-transformed
# upstream, so the model learns and predicts on the log1p scale.
gb_model = GradientBoostingRegressor(n_estimators=200, random_state=70)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)

# R^2 on the held-out split, computed on the log1p scale (no back-transform).
r2_gb = r2_score(y_test, y_pred_gb)
print(f"GradientBoostingRegressor R^2: {r2_gb:.3f}")

# Predict the extrapolated feature row, then undo log1p with expm1.
pred_log = gb_model.predict(input_2026)[0]
pred_delay = np.expm1(pred_log)
# The year shown comes from `future_year` so the message cannot drift from the
# year that was actually used to build `input_2026`.
print(
    f"Retard moyen prédit à l'arrivée pour {future_year} (GBR) : {pred_delay:.2f} minutes"
)

In [None]:
# RandomForestRegressor
print("RandomForestRegressor")

# Single source of truth for the hyperparameters: the estimator is built from
# this dict and the log line below reads from it, so the two cannot disagree.
rf_params = {
    "random_state": 70,
    "n_estimators": 280,
    "max_depth": 140,
    "min_samples_split": 72,
}
random_forest = RandomForestRegressor(n_jobs=-1, **rf_params)
random_forest.fit(X_train, y_train)

# Score on the held-out split; the target (and so R^2) is on the log1p scale.
y_pred_rfr = random_forest.predict(X_test)
r2_rfr = r2_score(y_test, y_pred_rfr)
print(
    f"  random_state  {rf_params['random_state']}"
    f"  estimator  {rf_params['n_estimators']}"
    f"  max_depth  {rf_params['max_depth']}"
    f"  R^2 score {r2_rfr:.6f}"
)

In [None]:
# Persist the fitted estimators and the feature list under models/.
# NOTE(review): `reg` is whichever LinearRegression the extrapolation loop
# fitted last (the trend of a single feature), not one model per feature, and
# it is undefined if that loop never fitted a regression — confirm this is the
# artifact that consumers of models/reg.joblib expect.
artifacts = {
    "models/rfr_model.joblib": random_forest,
    "models/gb_model.joblib": gb_model,
    "models/features.joblib": features,
    "models/reg.joblib": reg,
}

os.makedirs("models", exist_ok=True)
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

In [None]:
# Predicted vs. true values for the gradient boosting model on the test split.
# Both axes are on the log1p scale, since the target was transformed upstream.
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(y_test, y_pred_gb, alpha=0.4)

# Dashed red diagonal = perfect prediction (predicted == true).
diag_min, diag_max = y_test.min(), y_test.max()
ax.plot([diag_min, diag_max], [diag_min, diag_max], "r--", lw=1.5)

ax.set_xlabel("True Value")
ax.set_ylabel("Prédictions")
ax.set_title("GradientBoostingRegressor: Prédictions vs True Values")
plt.show()

In [None]:
# Predicted vs. true values for the random forest model on the test split.
# Both axes are on the log1p scale, since the target was transformed upstream.
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(y_test, y_pred_rfr, alpha=0.4)

# Dashed red diagonal = perfect prediction (predicted == true).
diag_min, diag_max = y_test.min(), y_test.max()
ax.plot([diag_min, diag_max], [diag_min, diag_max], "r--", lw=1.5)

ax.set_xlabel("True Value")
ax.set_ylabel("Prédictions")
ax.set_title("RandomForestRegressor: Prédictions vs True Values")
plt.show()