### Import des données

In [1]:
import pandas as pd
import numpy as np 

df = pd.read_csv("clean_data.csv")
df["smoker_bmi"] = df["smoker"]*df["bmi"]
df["smoker_age"] = df["smoker"]*df["age"]
df = df.drop(["age_category", "bmi_index", "sex_categ", "region"], axis=1) # avec bmi 
df 

Unnamed: 0,age,sex,bmi,children,smoker,charges,bmi_categories,smoker_bmi,smoker_age
0,19,female,27.900,0,1,16884.92400,overweight,27.90,19
1,18,male,33.770,1,0,1725.55230,obesity class 1,0.00,0
2,28,male,33.000,3,0,4449.46200,obesity class 1,0.00,0
3,33,male,22.705,0,0,21984.47061,normal,0.00,0
4,32,male,28.880,0,0,3866.85520,overweight,0.00,0
...,...,...,...,...,...,...,...,...,...
1332,50,male,30.970,3,0,10600.54830,obesity class 1,0.00,0
1333,18,female,31.920,0,0,2205.98080,obesity class 1,0.00,0
1334,18,female,36.850,0,0,1629.83350,obesity class 2,0.00,0
1335,21,female,25.800,0,0,2007.94500,overweight,0.00,0


### Nettoyage/Encodage des données & pipelines, transformers, modèles...

In [2]:
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.dummy import DummyRegressor
from sklearn.preprocessing import RobustScaler, OneHotEncoder, PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer 

from sklearn.preprocessing import PolynomialFeatures

# selection des features et de la cible
X = df.drop('charges', axis=1)
y = df.charges

# Création du train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, train_size=0.85, random_state=42, stratify=X['smoker'])
#X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state=42)

# Sélection des variables numériques
num_col = list(X.select_dtypes(include=[float,int]).columns)
# Sélection des variables catégorielles
cat_col = list(X.select_dtypes(include=[object]).columns)


# pipeline pour les valeurs numériques
# cela permet de créer une succession d'étapes (ici il n'y en a qu'une)
my_num_pipe = make_pipeline(PolynomialFeatures(3), RobustScaler())
# Créer un pipeline pour les variables catégorielles
categorical_pipeline = make_pipeline(OneHotEncoder(), PolynomialFeatures(3))


# ColumnTransformer permet de faire des opérations sur des sélections de colonnes
preprocessing = ColumnTransformer([
    ("numeric", my_num_pipe, num_col),
    ("categorical", categorical_pipeline, cat_col)
])


# preprocessing = ColumnTransformer([
#     ("one_hot", OneHotEncoder(), cat_col),
#     ("scaling", my_num_pipe, num_col)
# ])

In [3]:
# import pandas as pd
# from sklearn.pipeline import make_pipeline
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import RobustScaler, OneHotEncoder, PolynomialFeatures
# from sklearn.linear_model import LinearRegression
# from sklearn.model_selection import train_test_split


# # Features and target
# X = df.drop('charges', axis=1)
# y = df['charges']

# # Train-test split
# X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, train_size=0.85, random_state=42)

# # Numeric columns
# num_col = list(X.select_dtypes(include=[float, int]).columns)

# # Categorical columns
# cat_col = list(X.select_dtypes(include=[object]).columns)

# # Numeric pipeline
# num_pipe = make_pipeline(PolynomialFeatures(2, include_bias=False), RobustScaler())

# # Categorical pipeline
# cat_pipe = make_pipeline(OneHotEncoder(), PolynomialFeatures(2, include_bias=False))

# # ColumnTransformer
# preprocessing = ColumnTransformer([
#     ("numeric", num_pipe, num_col),
#     ("categorical", cat_pipe, cat_col)
# ])

# # Full pipeline including preprocessing and linear regression
# full_pipeline = make_pipeline(preprocessing, LinearRegression())

# # Fit the pipeline to the training data
# full_pipeline.fit(X_train, y_train)

# # Extract coefficients from the linear regression model
# coefficients = full_pipeline[-1].coef_

# # Extract feature names from PolynomialFeatures
# poly_feature_names = preprocessing.transformers_[0][1][-1].get_feature_names_out(num_col + cat_col)

# # Create a DataFrame with coefficients and feature names
# coef_df = pd.DataFrame({'Feature': poly_feature_names, 'Coefficient': coefficients})

# # Display the DataFrame
# print(coef_df)

#### Voir si les données ont bien été modifiées

In [4]:
# Appliquez le pipeline sur les données d'entraînement et de test
# X_train_transformed = preprocessing.fit_transform(X_train)
# X_train_transformed

In [5]:
# # Supposez que X_train_transformed soit une matrice NumPy résultante de la transformation
# # Appliquer le ColumnTransformer sur les données d'entraînement
# X_train_transformed = preprocessing.fit_transform(X_train)

# # Obtenez les noms de colonnes après la transformation
# all_columns = preprocessing.transformers_[0][1].get_feature_names_out(cat_col).tolist() + num_col

# # Créez un DataFrame pandas à partir des données transformées et des noms de colonnes
# X_train_transformed_df = pd.DataFrame(X_train_transformed, columns=all_columns)

# # Affichez le DataFrame
# print(X_train_transformed_df.head())

### DummyRegressor

In [6]:
# DUMMY REGRESSOR (ne prend pas en compte les variables, prédit la moyenne de chaque valeur Y)
dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(X, y)
print(f"regression dummy : {dummy_regr.score(X_test, y_test)}")

regression dummy : -0.0020162626112618653


### LinearRegression

In [7]:
# REGRESSION LINEAIRE
# j'ajoute la régression linéaire au pipeline
my_pipe_lr = make_pipeline(preprocessing, LinearRegression())
# Je fit sur les données
my_pipe_lr.fit(X_train, y_train)
# afficher le score
print(f"regression linéaire : {my_pipe_lr.score(X_test, y_test)}")

regression linéaire : 0.8672978757333859


In [8]:
len(my_pipe_lr[-1].coef_)

249

In [9]:
my_pipe_lr[-1].coef_
# plus le coef est gros plus l'info est importante
# coef des variables (avec polynomialfeatures)

array([-5.16740805e+12,  1.55820969e+03,  2.72220586e+04,  1.27823967e+04,
        4.15645681e+04, -2.40850069e+03, -1.02431401e+02,  1.73005156e+04,
       -1.02988619e+04, -1.73946068e+04, -1.02431396e+02,  3.01332227e+00,
        1.75187832e+00, -5.14937827e+04,  4.97353816e+03, -2.40850069e+03,
        8.07924506e+01,  3.01332128e+00, -6.87825613e+03, -4.54606241e+03,
        2.26454525e+02, -5.22100751e+01,  4.15645681e+04, -2.40850069e+03,
       -1.02431396e+02,  8.07924500e+01,  3.01332081e+00,  1.75187888e+00,
       -5.71488490e+03, -1.46262527e+03,  2.23113351e+03,  1.75187871e+00,
        2.67151558e-02, -4.27507064e-02,  4.93458076e+03,  4.50071578e+03,
        3.01332126e+00, -1.00951584e-01,  2.67151521e-02,  4.76688336e+03,
       -5.22100746e+01,  9.94600537e-01,  7.75448060e-01, -1.02431396e+02,
        3.01332132e+00,  1.75187797e+00, -1.00952796e-01,  2.67155247e-02,
       -4.27519216e-02,  2.23296757e+04, -2.70264179e+03,  8.07924506e+01,
       -1.27006822e+00, -

In [10]:
# # Extract coefficients from the linear regression model
# coefficients = my_pipe_lr[-1].coef_
# poly = PolynomialFeatures(2)
# # Extract feature names from PolynomialFeatures after fitting
# poly_feature_names = poly.get_feature_names_out(X.columns)

# # Create a DataFrame with coefficients and feature names
# coef_df = pd.DataFrame({'Feature': poly_feature_names, 'Coefficient': coefficients})

# # Display the DataFrame
# print(coef_df)

In [11]:
# import pandas as pd
# import numpy as np
# from sklearn.linear_model import LinearRegression
# from sklearn.preprocessing import PolynomialFeatures
# from sklearn.pipeline import make_pipeline

# # selection des features et de la cible
# X = X
# y = y

# # Create polynomial features and linear regression
# degree = 2  # You can adjust the degree as needed
# poly = PolynomialFeatures(degree)

# # Fit the pipeline to the data
# my_pipe_lr.fit(X, y)

# # Extract coefficients from the linear regression model
# coefficients = my_pipe_lr[-1].coef_

# # Extract feature names from PolynomialFeatures
# poly_feature_names = poly.get_feature_names_out(X.columns)

# # Create a DataFrame with coefficients and feature names
# coef_df = pd.DataFrame({'Feature': poly_feature_names, 'Coefficient': coefficients})

# # Display the DataFrame
# print(coef_df)

### Lasso

In [12]:
# REGRESSION LASSO CV
my_pipe_lasso = make_pipeline(preprocessing, Lasso(alpha=46.5))  # avant 21.5
# Je fit sur les données
my_pipe_lasso.fit(X_train, y_train)
# le score
print(f"regression lasso : {my_pipe_lasso.score(X_test, y_test)}")

regression lasso : 0.9055239629338527


  model = cd_fast.enet_coordinate_descent(


In [13]:
from sklearn.model_selection import GridSearchCV
param = {"lasso__alpha" : np.arange(0.5,50, 0.5)}

grid = GridSearchCV(my_pipe_lasso, param, cv=5)
grid.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

In [14]:
len(np.arange(0.01,10, 0.1))

100

In [15]:
grid.best_params_

{'lasso__alpha': 29.0}

### Ridge

In [16]:
# REGRESSION RIDGE CV 
my_pipe_ridge = make_pipeline(preprocessing, Ridge(alpha=1e-05))
# Je fit sur les données
my_pipe_ridge.fit(X_train, y_train)
# le score
print(f"regression ridge : {my_pipe_ridge.score(X_test, y_test)}")

regression ridge : 0.9014409290345482


In [17]:
from sklearn.model_selection import GridSearchCV
param = {"ridge__alpha" : np.arange(0.00001, 0.0001, 0.1)}

grid = GridSearchCV(my_pipe_ridge, param, cv=5)
grid.fit(X_train, y_train)

In [18]:
grid.best_params_

{'ridge__alpha': 1e-05}

### ElasticNet

In [19]:
# REGRESSION ELASTICNETCV
my_pipe_elasticnet = make_pipeline(preprocessing, ElasticNet(alpha=9.1e-05)) # avant alpha = 0.01
# Je fit sur les données
my_pipe_elasticnet.fit(X_train, y_train)
# le score
print(f"regression elasticnet : {my_pipe_elasticnet.score(X_test, y_test)}")

# si l1_ratio = 1 alors elasticnet devient un lasso
# si l1_ratio = 0 alors elasticnet devient un ridge

regression elasticnet : 0.9010818571700009


  model = cd_fast.enet_coordinate_descent(


In [20]:
from sklearn.model_selection import GridSearchCV
param = {"elasticnet__alpha" : np.arange(0.000001, 0.0001, 0.00001),
         "elasticnet__l1_ratio" : np.arange(0.001, 1, 0.1)}

grid = GridSearchCV(my_pipe_elasticnet, param, cv=5)
grid.fit(X_train, y_train)

# np.arange(0.000001, 0.0001, 0.1)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

In [21]:
len(np.arange(0.000001, 0.0001, 0.000001))

100

In [22]:
grid.best_params_

{'elasticnet__alpha': 9.1e-05, 'elasticnet__l1_ratio': 0.001}