### Import des données

In [1]:
import pandas as pd
import numpy as np 

df = pd.read_csv("clean_data.csv")
df = df.drop(["age_category", "bmi_index", "sex_categ"], axis=1) # avec bmi 
df 

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,bmi_categories
0,19,female,27.900,0,1,southwest,16884.92400,overweight
1,18,male,33.770,1,0,southeast,1725.55230,obesity class 1
2,28,male,33.000,3,0,southeast,4449.46200,obesity class 1
3,33,male,22.705,0,0,northwest,21984.47061,normal
4,32,male,28.880,0,0,northwest,3866.85520,overweight
...,...,...,...,...,...,...,...,...
1332,50,male,30.970,3,0,northwest,10600.54830,obesity class 1
1333,18,female,31.920,0,0,northeast,2205.98080,obesity class 1
1334,18,female,36.850,0,0,southeast,1629.83350,obesity class 2
1335,21,female,25.800,0,0,southwest,2007.94500,overweight


### Nettoyage/Encodage des données & pipelines, transformers, modèles...

In [2]:
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.dummy import DummyRegressor
from sklearn.preprocessing import RobustScaler, OneHotEncoder, PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer 

from sklearn.preprocessing import PolynomialFeatures

# selection des features et de la cible
X = df.drop('charges', axis=1)
y = df.charges

# Création du train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, train_size=0.85, random_state=42, stratify=X['smoker'])
#X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state=42)

# Sélection des variables numériques
num_col = list(X.select_dtypes(include=[float,int]).columns)
# Sélection des variables catégorielles
cat_col = list(X.select_dtypes(include=[object]).columns)


# pipeline pour les valeurs numériques
# cela permet de créer une succession d'étapes (ici il n'y en a qu'une)
my_num_pipe = make_pipeline(PolynomialFeatures(2), RobustScaler())
# Créer un pipeline pour les variables catégorielles
categorical_pipeline = make_pipeline(OneHotEncoder(), PolynomialFeatures(2))




preprocessing = ColumnTransformer([
    ("numeric", my_num_pipe, num_col),
    ("categorical", categorical_pipeline, cat_col)
])

# Créer le pipeline complet avec préprocesseur et modèle
my_pipe_elasticnet = make_pipeline(
    preprocessing,
    ElasticNet(alpha=1e-05, l1_ratio=1)
)




# # ColumnTransformer permet de faire des opérations sur des sélections de colonnes
# preprocessing = ColumnTransformer([
#     ("one_hot", OneHotEncoder(), cat_col),
#     ("scaling", my_num_pipe, num_col)
# ])

#### Voir si les données ont bien été modifiées

In [3]:
# Appliquez le pipeline sur les données d'entraînement et de test
# X_train_transformed = preprocessing.fit_transform(X_train)
# X_train_transformed

In [4]:
# # Supposez que X_train_transformed soit une matrice NumPy résultante de la transformation
# # Appliquer le ColumnTransformer sur les données d'entraînement
# X_train_transformed = preprocessing.fit_transform(X_train)

# # Obtenez les noms de colonnes après la transformation
# all_columns = preprocessing.transformers_[0][1].get_feature_names_out(cat_col).tolist() + num_col

# # Créez un DataFrame pandas à partir des données transformées et des noms de colonnes
# X_train_transformed_df = pd.DataFrame(X_train_transformed, columns=all_columns)

# # Affichez le DataFrame
# print(X_train_transformed_df.head())

### DummyRegressor

In [5]:
# DUMMY REGRESSOR (ne prend pas en compte les variables, prédit la moyenne de chaque valeur Y)
dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(X, y)
print(f"regression dummy : {dummy_regr.score(X_test, y_test)}")

regression dummy : -0.0020162626112618653


### LinearRegression

In [6]:
# REGRESSION LINEAIRE
# j'ajoute la régression linéaire au pipeline
my_pipe_lr = make_pipeline(preprocessing, LinearRegression())
# Je fit sur les données
my_pipe_lr.fit(X_train, y_train)
# afficher le score
print(f"regression linéaire : {my_pipe_lr.score(X_test, y_test)}")

regression linéaire : 0.8992496385212491


In [7]:
my_pipe_lr[-1].coef_
# plus le coef est gros plus l'info est importante

array([ 0.00000000e+00, -1.82594695e+03,  5.32848473e+02,  2.63981714e+03,
       -1.06797700e+04,  7.53989416e+03,  6.91534946e+02, -1.33016929e+02,
       -9.62240735e+00, -2.04995473e+03, -1.95586944e+02,  1.49710990e+03,
       -4.30941102e+02, -3.93918539e+02, -1.06797700e+04, -1.03264983e-10,
        2.50384713e+02, -2.50384713e+02,  3.51613474e+02,  1.11681303e+02,
       -1.35274048e+02, -3.28020729e+02, -4.61156623e+02,  4.34568378e+02,
        7.31934907e+02,  3.08324784e+02, -5.44202824e+02, -4.69468623e+02,
        2.50384713e+02,  0.00000000e+00,  2.55581867e+02, -8.16034402e+01,
        1.13335861e+01,  6.50727002e+01, -1.91189277e+02, -3.76891925e+00,
       -2.35666941e+02, -3.11251968e+02, -5.85187635e+02,  1.57744945e+03,
       -2.50384713e+02,  9.60316074e+01,  1.93284743e+02, -1.46607634e+02,
       -3.93093430e+02, -2.69967345e+02,  4.38337297e+02,  9.67601848e+02,
        6.19576752e+02,  4.09848110e+01, -2.04691808e+03,  3.51613474e+02,
        0.00000000e+00,  

### Lasso

In [8]:
# REGRESSION LASSO CV
my_pipe_lasso = make_pipeline(preprocessing, Lasso(alpha=9.91))
# Je fit sur les données
my_pipe_lasso.fit(X_train, y_train)
# le score
print(f"regression lasso : {my_pipe_lasso.score(X_test, y_test)}")

regression lasso : 0.9027399804464591


In [9]:
from sklearn.model_selection import GridSearchCV
param = {"lasso__alpha" : np.arange(0.01,10, 0.1)}

grid = GridSearchCV(my_pipe_lasso, param, cv=5)
grid.fit(X_train, y_train)

  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


In [10]:
len(np.arange(0.01,10, 0.1))

100

In [11]:
grid.best_params_

{'lasso__alpha': 9.91}

### Ridge

In [12]:
# REGRESSION RIDGE CV 
my_pipe_ridge = make_pipeline(preprocessing, Ridge())
# Je fit sur les données
my_pipe_ridge.fit(X_train, y_train)
# le score
print(f"regression ridge : {my_pipe_ridge.score(X_test, y_test)}")

regression ridge : 0.9002734723366195


In [13]:
from sklearn.model_selection import GridSearchCV
param = {"ridge__alpha" : np.arange(0.00001, 0.0001, 0.1)}

grid = GridSearchCV(my_pipe_ridge, param, cv=5)
grid.fit(X_train, y_train)

In [14]:
grid.best_params_

{'ridge__alpha': 1e-05}

### ElasticNet

In [15]:
# REGRESSION ELASTICNETCV
my_pipe_elasticnet = make_pipeline(preprocessing, ElasticNet(alpha=1e-05, l1_ratio=1)) # avant alpha = 0.01
# Je fit sur les données
my_pipe_elasticnet.fit(X_train, y_train)
# le score
print(f"regression elasticnet : {my_pipe_elasticnet.score(X_test, y_test)}")

# si l1_ratio = 1 alors elasticnet devient un lasso
# si l1_ratio = 0 alors elasticnet devient un ridge

regression elasticnet : 0.8992387417493726


  model = cd_fast.sparse_enet_coordinate_descent(


In [16]:
from sklearn.model_selection import GridSearchCV
param = {"elasticnet__alpha" : np.arange(0.00001, 0.0001, 0.1),
         "elasticnet__l1_ratio" : np.arange(0.001, 1, 0.1)}

grid = GridSearchCV(my_pipe_elasticnet, param, cv=5)
grid.fit(X_train, y_train)

  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


In [17]:
len(np.arange(0.01, 10, 0.1))

100

In [18]:
grid.best_params_

{'elasticnet__alpha': 1e-05, 'elasticnet__l1_ratio': 0.001}