### Import des données

In [112]:
import pandas as pd
import numpy as np 

df = pd.read_csv("clean_data.csv")
df = df.drop(['sex', 'sex_categ', "region", "children", "age_category", "bmi_categories", "bmi_index"], axis=1)

### Nettoyage/Encodage des données & pipelines, transformers, modèles...

In [106]:
from sklearn.pipeline import make_pipeline
from sklearn.dummy import DummyRegressor
from sklearn.preprocessing import RobustScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer 

# selection des features et de la cible
X = df.drop('charges', axis=1)
y = df.charges

# Création du train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state=42)

# Sélection des variables numériques
num_col = list(X.select_dtypes(include=[float,int]).columns)
# Sélection des variables catégorielles
cat_col = list(X.select_dtypes(include=[object]).columns)


# pipeline pour les valeurs numériques
# cela permet de créer une succession d'étapes (ici il n'y en a qu'une)
my_num_pipe = make_pipeline(RobustScaler())

# ColumnTransformer permet de faire des opérations sur des sélections de colonnes
preprocessing = ColumnTransformer([
    ("one_hot", OneHotEncoder(),cat_col),
    ("scaling", my_num_pipe, num_col)
])

### DummyRegressor

In [107]:
# DUMMY REGRESSOR (ne prend pas en compte les variables, prédit la moyenne de chaque valeur Y)
dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(X, y)
print(f"regression dummy : {dummy_regr.score(X_test, y_test)}")

regression dummy : -0.0053648447378846775


### LinearRegression

In [108]:
# REGRESSION LINEAIRE
# j'ajoute la régression linéaire au pipeline
my_pipe_lr = make_pipeline(preprocessing, LinearRegression())
# Je fit sur les données
my_pipe_lr.fit(X_train, y_train)
# afficher le score
print(f"regression linéaire : {my_pipe_lr.score(X_test, y_test)}")

regression linéaire : 0.8049502474643171


### Lasso

In [109]:
# REGRESSION LASSO CV
my_pipe_lasso = make_pipeline(preprocessing, Lasso(alpha=17.01))
# Je fit sur les données
my_pipe_lasso.fit(X_train, y_train)
# le score
print(f"regression lasso : {my_pipe_lasso.score(X_test, y_test)}")

regression lasso : 0.8038946550701336


In [103]:
from sklearn.model_selection import GridSearchCV
param = {"lasso__alpha" : np.arange(0.01,100)}

grid = GridSearchCV(my_pipe_lasso, param, cv=5)
grid.fit(X_train, y_train)

### Ridge

In [110]:
# REGRESSION RIDGE CV 
my_pipe_ridge = make_pipeline(preprocessing, Ridge())
# Je fit sur les données
my_pipe_ridge.fit(X_train, y_train)
# le score
print(f"regression ridge : {my_pipe_ridge.score(X_test, y_test)}")

regression ridge : 0.8039517442411992


### ElasticNet

In [111]:
# REGRESSION ELASTICNETCV
my_pipe_elasticnet = make_pipeline(preprocessing, ElasticNet(alpha=0.01))
# Je fit sur les données
my_pipe_elasticnet.fit(X_train, y_train)
# le score
print(f"regression elasticnet : {my_pipe_elasticnet.score(X_test, y_test)}")

regression elasticnet : 0.7993329887009215


In [86]:
from sklearn.model_selection import GridSearchCV
param = {"elasticnet__alpha" : np.arange(0.01, 40)}

grid = GridSearchCV(my_pipe_elasticnet, param, cv=5)
grid.fit(X_train, y_train)

In [76]:
grid.best_params_

{'elasticnet__alpha': 0.01}