In [51]:
import pandas as pd  
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.model_selection import cross_val_score, validation_curve
import numpy as np
import matplotlib.pyplot as plt
import joblib
from sklearn import preprocessing, pipeline
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, LabelBinarizer, MinMaxScaler, StandardScaler, OneHotEncoder, RobustScaler, PolynomialFeatures, PowerTransformer, Binarizer, KBinsDiscretizer
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
from sklearn.metrics import mean_squared_error,mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV



#sklearn.preprocessing     # encodage et normalisation
#sklearn.feature_selection  # selection des variables utiles
#sklearn.feature_extraction  # extractuon de caractéristiques (generation de nouvelles variables)

In [52]:
# Étape 1 : Chargement des données
file_path = "data/4072eb5e-e963-4a17-a794-3ea028d0a9c4.csv"
data = pd.read_csv(file_path)
categorical_features = ["sex", "smoker", "region"]
numeric_features = ["age", "bmi", "children", "charges"]
X = data[numeric_features]
Xnp = data[numeric_features].to_numpy()
X.head()
data.head()
data2 = data


In [53]:
# Convertir en DataFrame
df = pd.DataFrame(data)



df['BMI_category'] = pd.cut(
    df['age'],
    bins=[0, 15, 30, 40, float('inf')],
    labels=['maigre', 'normal', 'surpoids', 'obèse'],  # Noms des catégories
    right=False  # Inclure la borne inférieure dans chaque intervalle
)

df['age_category'] = pd.cut(
    df['age'],
    bins=[0, 30, 50, float('inf')],
    labels=['Djeuns', 'Adultes', 'Séniors',],  # Noms des catégories
    right=False  # Inclure la borne inférieure dans chaque intervalle
)

df.drop_duplicates()



# Définir les features et la target
categorical_features = ["smoker", "sex", "region", "BMI_category", "age_category"]
numeric_features = ["age", "bmi", "children"]
X = df[categorical_features + numeric_features]
y = df["charges"]

# Diviser en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Préprocesseur
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat_ord", OrdinalEncoder(), ["smoker", "sex", "BMI_category", "age_category"]),
        ("cat_onehot", OneHotEncoder(), ["region"])
    ]
)

# Ajouter des Polynomial Features
poly = PolynomialFeatures(degree=2, include_bias=False)

# Modèles à tester
models = {
    # "Linear Regression": make_pipeline(preprocessor, poly, LinearRegression()),
    "Ridge": make_pipeline(preprocessor, poly, Ridge(alpha=2.154)),
    "Lasso": make_pipeline(preprocessor, poly, Lasso(alpha=46.41589)),
    # "ElasticNet Regression": make_pipeline(preprocessor, poly, ElasticNet(alpha=0.1, l1_ratio=0.5))
}




In [54]:
# Définir la grille des hyperparamètres pour Ridge et Lasso
param_grid = {
    "Ridge": {"ridge__alpha": np.logspace(-3, 3, 10)},  # Valeurs de alpha de 0.001 à 1000
    "Lasso": {"lasso__alpha": np.logspace(-3, 3, 10)},  # Pareil pour Lasso
}

score_dict = {"Ridge": [], "Lasso": []}
alpha_values = np.linspace(0.15, 0.2, 10)  


# Recherche de la meilleure valeur de alpha pour chaque modèle
for name, pipeline in models.items():
    print(f"Optimisation pour {name}...")
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid[name],
        scoring="r2",  # Utiliser le score R² comme métrique
        cv=5,  # Validation croisée avec 5 sous-échantillons
        n_jobs=-1  # Utiliser tous les cœurs pour paralléliser
    )
    grid_search.fit(X_train, y_train)
    
    # Récupérer le meilleur modèle et ses performances
    best_models[name] = grid_search.best_estimator_
    best_alpha = grid_search.best_params_[f"{name.lower().split()[0]}__alpha"]
    best_score = grid_search.best_score_
    
    results[name] = {
        "Best Alpha": best_alpha,
        "Best CV R²": best_score
    }
    
    print(f"  Meilleur alpha pour {name}: {best_alpha:.5f}")
    print(f"  Meilleur score R² CV: {best_score:.5f}\n")


Optimisation pour Ridge...


  Meilleur alpha pour Ridge: 2.15443
  Meilleur score R² CV: 0.82240

Optimisation pour Lasso...


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


  Meilleur alpha pour Lasso: 46.41589
  Meilleur score R² CV: 0.82778



  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [55]:

#test david
# 
data["male"] = data["sex"].apply(lambda x: 1 if x == "male" else 0)
data["female"] = data["sex"].apply(lambda x: 1 if x == "female" else 0)
data["southwest"] = data["region"].apply(lambda x: 1 if x == "southwest" else 0)
data["northwest"] = data["region"].apply(lambda x: 1 if x == "northwest" else 0)
data["northeast"] = data["region"].apply(lambda x: 1 if x == "northeast" else 0)
data["southeast"] = data["region"].apply(lambda x: 1 if x == "southeast" else 0)
data["smoker"] = data["smoker"].apply(lambda x: 1 if x == "yes" else 0)
data = data.drop(columns=['sex', 'region','female'])


# # Catégorisation de la variable 'age'
# data['age_category'] = pd.cut(
#     data['age'],
#     bins=[0, 25, 50, 60, float('inf')],  
#     labels=['jeunes', 'adultes', 'pre-séniors', 'séniors'],  # Noms des catégories
#     right=False  # Inclure la borne inférieure dans chaque intervalle
# )

# # Catégorisation de la variable 'age'
# data['bmi_category'] = pd.cut(
#     data['age'],
#     bins=[0, 15, 30, 40, float('inf')],
#     labels=['maigre', 'normal', 'surpoids', 'obèse'],  # Noms des catégories
#     right=False  # Inclure la borne inférieure dans chaque intervalle
# )

data.head()



Unnamed: 0,age,bmi,children,smoker,charges,male,southwest,northwest,northeast,southeast
0,19,27.9,0,1,16884.924,0,1,0,0,0
1,18,33.77,1,0,1725.5523,1,0,0,0,1
2,28,33.0,3,0,4449.462,1,0,0,0,1
3,33,22.705,0,0,21984.47061,1,0,1,0,0
4,32,28.88,0,0,3866.8552,1,0,1,0,0


In [57]:
# Définir les features et la target
categorical_features = ["smoker", "male", "southwest", "northwest", "northeast", "southeast"]
numeric_features = ["age", "bmi", "children"]
X = data[categorical_features + numeric_features]
y = data["charges"]




poly = PolynomialFeatures(degree=2)
poly_variables = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, train_size=0.85, random_state=42, stratify=X['smoker'])

alpha_ranges = np.arange(1,20,.05)
maxscore = 0
maxalp = 0
for alp in alpha_ranges:
    regression = linear_model.Lasso(alpha=alp, tol=0.08)
    model = regression.fit(X_train, y_train)
    if model.score(X_test,y_test) > maxscore:
        maxscore = model.score(X_test,y_test)
        maxalp = alp
print(maxalp, maxscore)

19.950000000000017 0.7800321475280458


In [None]:
# Évaluer chaque modèle
results = {}
best_model_name = None
best_model = None
best_r2 = 0

for name, pipeline in models.items():
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[name] = {"MSE": mse, "R²": r2}

    if r2> best_r2:
        best_r2 = r2
        best_model_name = name
        best_model = pipeline

    print(f"{name}:\n  Mean Squared Error: {mse:.3f}\n  R² Score: {r2:.3f}\n")


# Afficher le meilleur modèle
print(f"Le meilleur modèle est : {best_model_name} avec un R² de {best_r2:.3f}")

Ridge:
  Mean Squared Error: 20944127.414
  R² Score: 0.865

Lasso:
  Mean Squared Error: 20057502.607
  R² Score: 0.871

Le meilleur modèle est : Lasso avec un R² de 0.871


In [11]:
# Exporter le meilleur modèle
model_filename = f"best_model_{best_model_name.replace(' ', '_').lower()}.pkl"
joblib.dump(best_model, model_filename)
print(f"Modèle {best_model_name} exporté en fichier : {model_filename}")


Modèle Ridge Regression exporté en fichier : best_model_ridge_regression.pkl
