In [19]:
import pandas as pd
import numpy as np
from sklearn.compose import make_column_selector, ColumnTransformer, make_column_transformer
# sklearn.compose: The sklearn.compose module is a submodule of the sklearn library for machine learning in Python. It provides functions for creating complex preprocessing and modeling pipelines.
from sklearn.preprocessing import OneHotEncoder,StandardScaler,PolynomialFeatures,RobustScaler
#sklearn.preprocessing: The sklearn.preprocessing module is a submodule of the sklearn library that provides functions for preprocessing data, such as scaling and normalizing features, imputing missing values, and encoding categorical variables.
from sklearn.linear_model import Ridge,LinearRegression,Lasso, ElasticNet
# sklearn.linear_model: The sklearn.linear_model module is a submodule of the sklearn library that provides functions for fitting linear models for regression and classification.
from sklearn.pipeline import make_pipeline
# sklearn.pipeline: The sklearn.pipeline module is a submodule of the sklearn library that provides functions for creating and working with pipelines of transformers and models.
from sklearn.model_selection import train_test_split,GridSearchCV,learning_curve, RandomizedSearchCV, cross_val_score, KFold
# sklearn.model_selection: The sklearn.model_selection module is a submodule of the sklearn library that provides functions for splitting data into training and test sets, evaluating models using cross-validation, and hyperparameter tuning.
from sklearn.dummy import DummyRegressor
# sklearn.dummy: The sklearn.dummy module is a submodule of the sklearn library that provides simple dummy models for regression and classification.

from sklearn.metrics import *
from scipy.stats import probplot
import matplotlib.pyplot as plt
import seaborn as sns
import my_functions
from statistics import mean

In [20]:
# Load the raw dataset from '../data.csv' (path is relative to the working directory).
data = pd.read_csv(filepath_or_buffer='../data.csv')
def classify_bmi(row):
    """Return the BMI category label for one record.

    Parameters
    ----------
    row : mapping-like
        Anything indexable with the key ``"bmi"`` (e.g. a DataFrame row).

    Returns
    -------
    str
        ``"normal"`` when bmi < 25, ``"overweight"`` when 25 <= bmi < 30,
        and ``"obese"`` otherwise.
    """
    bmi = row["bmi"]
    if bmi < 25:
        return "normal"
    # Everything that is not below 30 is "obese". A NaN bmi fails both
    # comparisons and therefore also ends up in this last category.
    return "overweight" if bmi < 30 else "obese"

# Label every record with its BMI category; classify_bmi is applied row by row.
data["bmi_class"] = data.apply(classify_bmi, axis="columns")

# De-duplicated copy used from here on; `data` itself keeps every row.
df = data.drop_duplicates(keep="first")

In [21]:
## Disabled alternative: load the data and bin BMI into five classes, then drop the raw 'bmi' column
#data = pd.read_csv('../data.csv')
#def classify_bmi(row):
#    if row["bmi"] < 18.5:
#        return "underweight"
#    elif row["bmi"] < 25:
#        return "normal"
#    elif row["bmi"] < 30:
#        return "overweight"
#    elif row["bmi"] < 35:
#        return "obese"
#    else:
#        return "severely obese"
#data["bmi_class"] = data.apply(classify_bmi, axis=1)
## Remove duplicates from the 'data' DataFrame
#df = data.drop_duplicates()
## Drop Bmi
#df.drop("bmi", axis=1, inplace=True)

In [22]:
# Features: every column of the de-duplicated frame `df` except the target.
X = df.drop(columns='charges')
# Target: 'charges', kept as a one-column DataFrame (hence the double brackets).
y = df[['charges']]
# Collects one result row per model; the rows are turned into a table at the end.
metrics = []

def make_pipeline_to_ML(X, y, train_size=0.8, random_state=42, stratify_col='smoker'):
    """Split the data and build the (unfitted) preprocessing transformer.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Must contain ``stratify_col`` unless that is ``None``.
    y : pandas.DataFrame or pandas.Series
        Target values aligned with ``X``.
    train_size : float, default 0.8
        Fraction of the rows assigned to the training split.
    random_state : int, default 42
        Seed for the shuffled split, so the split is reproducible.
    stratify_col : str or None, default 'smoker'
        Column of ``X`` whose class proportions are preserved in both splits.
        ``None`` disables stratification.

    Returns
    -------
    tuple
        ``(preprocessor, X_train, X_test, y_train, y_test)``. The
        preprocessor is returned unfitted.

    Raises
    ------
    KeyError
        If ``stratify_col`` is not a column of ``X``.
    """
    if stratify_col is None:
        stratify = None
    elif stratify_col in X.columns:
        # Passed as a one-column frame, exactly as the hard-coded version did.
        stratify = X[[stratify_col]]
    else:
        raise KeyError(
            f"stratify column {stratify_col!r} not found in X; "
            f"available columns: {list(X.columns)}"
        )

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, shuffle=True, train_size=train_size,
        random_state=random_state, stratify=stratify,
    )

    # Columns are routed by dtype: numeric ones are scaled, all others one-hot encoded.
    numerical_features = make_column_selector(dtype_include=np.number)
    categorical_features = make_column_selector(dtype_exclude=np.number)

    # with_mean=False: scale to unit variance without centering.
    numerical_pipeline = make_pipeline(StandardScaler(with_mean=False))
    # handle_unknown='ignore': categories unseen during fit do not raise at transform time.
    categorical_pipeline = make_pipeline(OneHotEncoder(handle_unknown='ignore'))

    preprocessor = make_column_transformer(
        (numerical_pipeline, numerical_features),
        (categorical_pipeline, categorical_features),
    )
    return preprocessor, X_train, X_test, y_train, y_test




In [23]:
# Build the preprocessor and the single train/test split used by the model cells.
(preprocessor,
 X_train, X_test,
 y_train, y_test) = make_pipeline_to_ML(X=X, y=y)


#### Cook's distance (currently disabled): drops the flagged rows from the training set

#print(f"len(X_train) : {len(X_train)}")
#index_to_be_removed = my_functions.get_index_to_remove_by_Cooks_Distance(X_train=X_train, y_train=y_train, preprocessor=preprocessor)
#X_train = X_train.drop(index=index_to_be_removed.values)
#y_train = y_train.drop(index=index_to_be_removed.values)
#print(f"New len(X_train) : {len(X_train)}")

In [24]:
# Run my_functions.LR_with_CV on degree-1 polynomial features with the shared
# train/test split and preprocessor. The plot, info and learning-curve flags
# are all passed as False.
(
    R2, MAE, RMSE, Model_score_test, Model_score_training,
    scores_mean, scores_std, LR_model_1,
) = my_functions.LR_with_CV(
    PolynomialFeatures_degree=1,
    X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test,
    preprocessor=preprocessor, shuffle=True, random_state=42,
    isplot=False, isinfo=False, include_learning_curve=False,
)

# One row for the final comparison table. No hyperparameter is passed for this
# model, so the eighth field is an explicit None. Every row then has the same
# eight fields as the table's column list, instead of relying on pandas to pad
# a short row against longer ones.
metrics.append([
    "LR with Kfold CV (Polynomial degree=1)", R2, MAE, RMSE, Model_score_test,
    Model_score_training, "%0.4f (+/- %0.2f)" % (scores_mean, scores_std), None,
])



In [25]:
# Run my_functions.LR_with_CV on degree-2 polynomial features with the shared
# train/test split and preprocessor. The plot, info and learning-curve flags
# are all passed as False.
(
    R2, MAE, RMSE, Model_score_test, Model_score_training,
    scores_mean, scores_std, LR_model_2,
) = my_functions.LR_with_CV(
    PolynomialFeatures_degree=2,
    X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test,
    preprocessor=preprocessor, shuffle=True, random_state=42,
    isplot=False, isinfo=False, include_learning_curve=False,
)

# One row for the final comparison table. No hyperparameter is passed for this
# model, so the eighth field is an explicit None. Every row then has the same
# eight fields as the table's column list, instead of relying on pandas to pad
# a short row against longer ones.
metrics.append([
    "LR with Kfold CV (Polynomial degree=2)", R2, MAE, RMSE, Model_score_test,
    Model_score_training, "%0.4f (+/- %0.2f)" % (scores_mean, scores_std), None,
])

In [26]:
################################
################################
#
# Get best hyperparameters
#
################################
################################


#my_functions.get_best_params(
#    PolynomialFeatures_degree = 2,  
#    model=Lasso(max_iter=100000, 
#    tol=0.0001,
#    random_state=42, 
#    selection='cyclic'),   
#    param_grid= {'lasso__alpha': np.linspace(30,40,200)}, 
#    preprocessor = preprocessor, 
#    X_train = X_train, 
#    y_train = y_train)


In [27]:
# Run my_functions.LASSO_with_CV on degree-1 polynomial features with a fixed
# alpha (presumably taken from an earlier get_best_params search; that search
# is commented out in this notebook).
(
    R2, MAE, RMSE, Model_score_test, Model_score_training,
    scores_mean, scores_std, Best_alpha, Lasso_model_1,
) = my_functions.LASSO_with_CV(
    PolynomialFeatures_degree=1,
    Best_alpha=39.4321608040201,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    preprocessor=preprocessor,
    shuffle=True,
    random_state=42,
    isplot=False,
    isinfo=False,
    include_learning_curve=False,
)

# Result row for the comparison table; the last field records the alpha used.
metrics.append([
    "LASSO (Polynomial degree=1)",
    R2,
    MAE,
    RMSE,
    Model_score_test,
    Model_score_training,
    "%0.4f (+/- %0.2f)" % (scores_mean, scores_std),
    f"Best_alpha : {round(Best_alpha,3)}",
])


In [28]:
# Run my_functions.LASSO_with_CV on degree-2 polynomial features with the same
# fixed alpha as the degree-1 Lasso cell.
(
    R2, MAE, RMSE, Model_score_test, Model_score_training,
    scores_mean, scores_std, Best_alpha, Lasso_model_2,
) = my_functions.LASSO_with_CV(
    PolynomialFeatures_degree=2,
    Best_alpha=39.4321608040201,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    preprocessor=preprocessor,
    shuffle=True,
    random_state=42,
    isplot=False,
    isinfo=False,
    include_learning_curve=False,
)

# Result row for the comparison table; the last field records the alpha used.
metrics.append([
    "LASSO (Polynomial degree=2)",
    R2,
    MAE,
    RMSE,
    Model_score_test,
    Model_score_training,
    "%0.4f (+/- %0.2f)" % (scores_mean, scores_std),
    f"Best_alpha : {round(Best_alpha,3)}",
])

In [29]:
#best = my_functions.get_best_params(PolynomialFeatures_degree = 1,  model=Ridge(max_iter=100000, tol=0.0001,random_state=42),   param_grid= {'ridge__alpha': np.linspace(1,3,200)}, preprocessor = preprocessor, X_train = X_train, y_train = y_train)
#best

In [30]:
# Run my_functions.Ridge_with_CV on degree-1 polynomial features with a fixed
# alpha (presumably the result of the commented-out get_best_params search).
(
    R2, MAE, RMSE, Model_score_test, Model_score_training,
    scores_mean, scores_std, Best_alpha, Ridge_model_1,
) = my_functions.Ridge_with_CV(
    PolynomialFeatures_degree=1,
    Best_alpha=1.4623115577889447,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    preprocessor=preprocessor,
    shuffle=True,
    random_state=42,
    isplot=False,
    isinfo=False,
    include_learning_curve=False,
)

# Result row for the comparison table; the last field records the alpha used.
metrics.append([
    "Ridg (Polynomial degree=1)",
    R2,
    MAE,
    RMSE,
    Model_score_test,
    Model_score_training,
    "%0.4f (+/- %0.2f)" % (scores_mean, scores_std),
    f"Best_alpha : {round(Best_alpha,3)}",
])


In [31]:
# Run my_functions.Ridge_with_CV on degree-2 polynomial features with the same
# fixed alpha as the degree-1 Ridge cell.
(
    R2, MAE, RMSE, Model_score_test, Model_score_training,
    scores_mean, scores_std, Best_alpha, Ridge_model_2,
) = my_functions.Ridge_with_CV(
    PolynomialFeatures_degree=2,
    Best_alpha=1.4623115577889447,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    preprocessor=preprocessor,
    shuffle=True,
    random_state=42,
    isplot=False,
    isinfo=False,
    include_learning_curve=False,
)

# Result row for the comparison table; the last field records the alpha used.
metrics.append([
    "Ridg (Polynomial degree=2)",
    R2,
    MAE,
    RMSE,
    Model_score_test,
    Model_score_training,
    "%0.4f (+/- %0.2f)" % (scores_mean, scores_std),
    f"Best_alpha : {round(Best_alpha,3)}",
])


In [32]:
# best = my_functions.get_best_params(PolynomialFeatures_degree = 2,  model=ElasticNet(max_iter=100000, tol=0.0001,random_state=42),   param_grid = {'elasticnet__alpha': np.linspace(49, 51, 200), 
#                 'elasticnet__l1_ratio': [0.9999999]}, 
#                 preprocessor = preprocessor, X_train = X_train, y_train = y_train)

In [33]:
# Run my_functions.ElasticNet_with_CV on degree-1 polynomial features with a
# fixed alpha and l1_ratio.
(
    R2, MAE, RMSE, Model_score_test, Model_score_training,
    scores_mean, scores_std, Best_alpha, Best_elasticnet__l1_ratio,
    ElasticNet_model_1,
) = my_functions.ElasticNet_with_CV(
    PolynomialFeatures_degree=1,
    Best_alpha=33.24120603015076,
    Best_elasticnet__l1_ratio=0.9999999,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    preprocessor=preprocessor,
    shuffle=True,
    random_state=42,
    isplot=False,
    isinfo=False,
    include_learning_curve=False,
)

# Result row for the comparison table. Both values are rounded to 3 decimals
# for display, so an l1_ratio this close to 1 is shown as 1.0.
metrics.append([
    "ElasticNet (Polynomial degree=1)",
    R2,
    MAE,
    RMSE,
    Model_score_test,
    Model_score_training,
    "%0.4f (+/- %0.2f)" % (scores_mean, scores_std),
    f"Best_alpha : {round(Best_alpha,3)} Best_ratio : {round(Best_elasticnet__l1_ratio,3)}",
])

In [34]:
# Run my_functions.ElasticNet_with_CV on degree-2 polynomial features with a
# fixed alpha and l1_ratio.
# NOTE(review): l1_ratio here is 0.999999 (six 9s), while the degree-1 cell and
# the commented-out search grid use 0.9999999 (seven 9s) - confirm which one
# is intended.
(
    R2, MAE, RMSE, Model_score_test, Model_score_training,
    scores_mean, scores_std, Best_alpha, Best_elasticnet__l1_ratio,
    ElasticNet_model_2,
) = my_functions.ElasticNet_with_CV(
    PolynomialFeatures_degree=2,
    Best_alpha=50.00502512562814,
    Best_elasticnet__l1_ratio=0.999999,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    preprocessor=preprocessor,
    shuffle=True,
    random_state=42,
    isplot=False,
    isinfo=False,
    include_learning_curve=False,
)

# Result row for the comparison table. Both values are rounded to 3 decimals
# for display, so an l1_ratio this close to 1 is shown as 1.0.
metrics.append([
    "ElasticNet (Polynomial degree=2)",
    R2,
    MAE,
    RMSE,
    Model_score_test,
    Model_score_training,
    "%0.4f (+/- %0.2f)" % (scores_mean, scores_std),
    f"Best_alpha : {round(Best_alpha,3)}  Best_ratio : {round(Best_elasticnet__l1_ratio,3)}",
])

In [35]:

# Assemble the per-model rows collected in `metrics` into one comparison table.
df_result = pd.DataFrame(
    metrics,
    columns=['Model', 'R2', 'MAE', 'RMSE', 'Score (test)', 'Score (trainging)',
             'CV Accuracy', 'HyperParamter'],
)

# 'CV Accuracy' holds display text such as "0.8488 (+/- 0.04)". Rank on the
# leading number instead of on the text itself: a string sort misorders
# negative scores (e.g. "-0.9000" would be placed above "-0.1000").
cv_mean = df_result['CV Accuracy'].str.split().str[0].astype(float)
# mergesort is stable, so models with an equal CV mean keep their insertion order.
ranking = cv_mean.sort_values(ascending=False, kind='mergesort').index
# To rank by error instead, sort ascending on ['MAE', 'RMSE'].
df_result = df_result.loc[ranking].set_index('Model')
df_result

Unnamed: 0_level_0,R2,MAE,RMSE,Score (test),Score (trainging),CV Accuracy,HyperParamter
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
LASSO (Polynomial degree=2),0.9124,1957.294489,3340.682271,0.922583,0.858246,0.8488 (+/- 0.04),Best_alpha : 39.432
ElasticNet (Polynomial degree=2),0.9124,1960.457382,3335.036643,0.922844,0.857777,0.8488 (+/- 0.04),Best_alpha : 50.005 Best_ratio : 1.0
Ridg (Polynomial degree=2),0.9088,2029.465675,3419.335355,0.918894,0.861147,0.8432 (+/- 0.04),Best_alpha : 1.462
LR with Kfold CV (Polynomial degree=2),0.908,2059.129706,3441.72265,0.917829,0.861231,0.8395 (+/- 0.04),
LASSO (Polynomial degree=1),0.7717,3655.59983,5092.514964,0.820099,0.739239,0.7286 (+/- 0.04),Best_alpha : 39.432
ElasticNet (Polynomial degree=1),0.7724,3662.001424,5094.071744,0.819989,0.739371,0.7285 (+/- 0.04),Best_alpha : 33.241 Best_ratio : 1.0
LR with Kfold CV (Polynomial degree=1),0.7759,3708.067361,5107.965335,0.819006,0.739695,0.7282 (+/- 0.04),
Ridg (Polynomial degree=1),0.7743,3706.798635,5107.543835,0.819036,0.739682,0.7282 (+/- 0.04),Best_alpha : 1.462


In [36]:
import pickle

# Persist the degree-2 Lasso model to "Lasso_Model.pkl" in the working directory.
# The with-block closes the file even if pickle.dump raises, which the previous
# manual open()/close() pair did not guarantee.
with open("Lasso_Model.pkl", "wb") as pickle_out:
    pickle.dump(Lasso_model_2, pickle_out)