In [2]:
import pandas as pd
import numpy as np
from sklearn.compose import make_column_selector, ColumnTransformer
# sklearn.compose: The sklearn.compose module is a submodule of the sklearn library for machine learning in Python. It provides functions for creating complex preprocessing and modeling pipelines.
from sklearn.preprocessing import OneHotEncoder,StandardScaler,PolynomialFeatures,RobustScaler
# sklearn.preprocessing: The sklearn.preprocessing module is a submodule of the sklearn library that provides functions for preprocessing data, such as scaling and normalizing features, generating polynomial features, and encoding categorical variables. (Imputation of missing values is provided by the separate sklearn.impute module.)
from sklearn.linear_model import Ridge,LinearRegression,Lasso
# sklearn.linear_model: The sklearn.linear_model module is a submodule of the sklearn library that provides functions for fitting linear models for regression and classification.
from sklearn.pipeline import make_pipeline
# sklearn.pipeline: The sklearn.pipeline module is a submodule of the sklearn library that provides functions for creating and working with pipelines of transformers and models.
from sklearn.model_selection import train_test_split,GridSearchCV,learning_curve, RandomizedSearchCV
# sklearn.model_selection: The sklearn.model_selection module is a submodule of the sklearn library that provides functions for splitting data into training and test sets, evaluating models using cross-validation, and hyperparameter tuning.
from sklearn.dummy import DummyRegressor
# sklearn.dummy: The sklearn.dummy module is a submodule of the sklearn library that provides simple dummy models for regression and classification.



In [3]:
# Load and display the data: read the CSV located one directory above the
# notebook into the `data` DataFrame.
data = pd.read_csv('../data.csv')
def classify_bmi(row):
    """Map the "bmi" value of a row to a coarse weight category.

    Parameters
    ----------
    row : mapping-like
        Any object supporting ``row["bmi"]`` (e.g. a DataFrame row or a dict).

    Returns
    -------
    str
        "normal" when bmi is below 25, "overweight" when it is below 30,
        and "obese" for everything else (values that compare False to both
        thresholds, such as NaN, also end up here).
    """
    bmi = row["bmi"]
    if bmi < 25:
        return "normal"
    # Only the "< 30" test decides between the two remaining labels.
    return "overweight" if bmi < 30 else "obese"

# Compute one BMI label per row (row-wise apply), store it as a new column,
# then print the first rows to check the result.
bmi_labels = data.apply(classify_bmi, axis=1)
data["bmi_class"] = bmi_labels
print(data.head())

   age     sex     bmi  children smoker     region      charges   bmi_class
0   19  female  27.900         0    yes  southwest  16884.92400  overweight
1   18    male  33.770         1     no  southeast   1725.55230       obese
2   28    male  33.000         3     no  southeast   4449.46200       obese
3   33    male  22.705         0     no  northwest  21984.47061      normal
4   32    male  28.880         0     no  northwest   3866.85520  overweight


In [4]:
# Print a summary of the DataFrame: column names, non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        1338 non-null   int64  
 1   sex        1338 non-null   object 
 2   bmi        1338 non-null   float64
 3   children   1338 non-null   int64  
 4   smoker     1338 non-null   object 
 5   region     1338 non-null   object 
 6   charges    1338 non-null   float64
 7   bmi_class  1338 non-null   object 
dtypes: float64(2), int64(2), object(4)
memory usage: 83.8+ KB


In [5]:
# De-duplicate `data`: of each group of fully identical rows only the first is kept.
df = data.drop_duplicates(keep='first')

# Show (n_rows, n_columns) of the de-duplicated frame.
print(df.shape)

(1337, 8)


In [6]:
# Target: 'charges', kept as a one-column DataFrame (list selector -> 2-D result).
y = df.loc[:, ['charges']]

# Features: every column of `df` except the target.
X = df.drop('charges', axis=1)

# Print the shape of the target frame, then of the feature frame (one per line).
print(y.shape, X.shape, sep="\n")

(1337, 1)
(1337, 7)


In [7]:
# Split features and target into train / test subsets.
#   train_size=0.8  -> 80% of the rows go to the training set, the remaining 20% to the test set.
#   shuffle=True    -> rows are shuffled before the split (False would split in DataFrame order).
#   random_state=42 -> seeds the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    shuffle=True,
    random_state=42,
)



In [8]:
#dummy = DummyRegressor()


In [9]:
from sklearn.compose import make_column_transformer

# Column selectors: numeric dtypes on one side, every other dtype on the other.
numerical_features = make_column_selector(dtype_include=np.number)
categorical_features = make_column_selector(dtype_exclude=np.number)

# Numeric columns are standardized.
numerical_pipeline = make_pipeline(StandardScaler())

# Non-numeric columns are one-hot encoded. handle_unknown='ignore' controls what
# happens with categories seen at transform time but not during fit: they are
# encoded as all zeros instead of raising an error.
categorical_pipeline = make_pipeline(OneHotEncoder(handle_unknown='ignore'))

# Route each column group through its own pipeline.
preprocessor = make_column_transformer(
    (numerical_pipeline, numerical_features),
    (categorical_pipeline, categorical_features),
)

In [10]:
# Chain the preprocessing with a linear regression and fit it on the training
# split (Pipeline.fit returns the fitted pipeline itself).
model = make_pipeline(preprocessor, LinearRegression()).fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred = model.predict(X_test)

# Cell output: score of the fitted pipeline on the test split.
model.score(X_test, y_test)

0.8054717970876826

In [11]:
from sklearn.model_selection import cross_val_score, KFold

def train_and_validate_regression_model(data_path, target_column, shuffle, train_size, random_state, cv, stratify_column='smoker'):
  """Fit a preprocessing + LinearRegression pipeline on a CSV dataset and evaluate it.

  Steps: read the CSV, drop duplicate rows, split into train/test, build a
  pipeline (StandardScaler on numeric columns, OneHotEncoder on the others,
  then LinearRegression), cross-validate on the training split, refit on the
  whole training split and score on the test split.

  Parameters
  ----------
  data_path : path to the CSV file passed to ``pd.read_csv``.
  target_column : name of the column to predict.
  shuffle : whether ``train_test_split`` shuffles rows before splitting.
  train_size : ``train_size`` argument of ``train_test_split``.
  random_state : seed forwarded to ``train_test_split``.
  cv : ``cv`` argument forwarded to ``cross_val_score``.
  stratify_column : feature column used to stratify the train/test split
    (default ``'smoker'``, the column that used to be hard-coded). Pass
    ``None`` to disable stratification.

  Returns
  -------
  (scores, test_score) : the array returned by ``cross_val_score`` on the
    training split and the value of ``model.score`` on the test split
    (both use the estimator's default score, R^2 for scikit-learn regressors).
  """
  df = pd.read_csv(data_path).drop_duplicates()
  y = df[[target_column]]
  X = df.drop(columns=[target_column])

  # train_test_split raises ValueError when stratify is given together with
  # shuffle=False, so stratification is only requested when shuffling.
  stratify = X[[stratify_column]] if (shuffle and stratify_column is not None) else None
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, shuffle=shuffle, train_size=train_size, random_state=random_state, stratify=stratify)

  numerical_features = make_column_selector(dtype_include=np.number)
  categorical_features = make_column_selector(dtype_exclude=np.number)
  numerical_pipeline = make_pipeline(StandardScaler())
  categorical_pipeline = make_pipeline(OneHotEncoder(handle_unknown='ignore'))
  preprocessor = make_column_transformer((numerical_pipeline, numerical_features),
                                         (categorical_pipeline, categorical_features))
  model = make_pipeline(preprocessor, LinearRegression())

  # cross_val_score works on clones, so `model` still has to be fitted afterwards.
  scores = cross_val_score(model, X_train, y_train, cv=cv)
  model.fit(X_train, y_train)
  test_score = model.score(X_test, y_test)
  return scores, test_score


# 80/20 shuffled split (seed 42) and 10-fold cross-validation on the training split.
scores, test_score = train_and_validate_regression_model(
    data_path='../data.csv',
    target_column='charges',
    shuffle=True,
    train_size=0.8,
    random_state=42,
    cv=10,
)
print(scores)
test_score

[0.67616826 0.7768447  0.73834607 0.73636039 0.77490605 0.68236163
 0.78436305 0.76235248 0.70033763 0.60316271]


0.8204970129593311

In [17]:
def train_and_validate_regression_model(data_path, target_column, shuffle, train_size, random_state, n_splits, stratify_column='smoker'):
  """Fit a preprocessing + LinearRegression pipeline and evaluate it with shuffled K-fold CV.

  Steps: read the CSV, drop duplicate rows, split into train/test, build a
  pipeline (StandardScaler on numeric columns, OneHotEncoder on the others,
  then LinearRegression), cross-validate on the training split with a
  shuffled ``KFold``, refit on the whole training split and score on the
  test split.

  Parameters
  ----------
  data_path : path to the CSV file passed to ``pd.read_csv``.
  target_column : name of the column to predict.
  shuffle : whether ``train_test_split`` shuffles rows before splitting.
  train_size : ``train_size`` argument of ``train_test_split``.
  random_state : seed used for both ``train_test_split`` and ``KFold``.
  n_splits : number of folds of the ``KFold`` splitter.
  stratify_column : feature column used to stratify the train/test split
    (default ``'smoker'``, the column that used to be hard-coded). Pass
    ``None`` to disable stratification.

  Returns
  -------
  (scores, test_score) : the array returned by ``cross_val_score`` on the
    training split and the value of ``model.score`` on the test split
    (both use the estimator's default score, R^2 for scikit-learn regressors).
  """
  df = pd.read_csv(data_path).drop_duplicates()
  y = df[[target_column]]
  X = df.drop(columns=[target_column])

  # train_test_split raises ValueError when stratify is given together with
  # shuffle=False, so stratification is only requested when shuffling.
  stratify = X[[stratify_column]] if (shuffle and stratify_column is not None) else None
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, shuffle=shuffle, train_size=train_size, random_state=random_state, stratify=stratify)

  numerical_features = make_column_selector(dtype_include=np.number)
  categorical_features = make_column_selector(dtype_exclude=np.number)
  numerical_pipeline = make_pipeline(StandardScaler())
  categorical_pipeline = make_pipeline(OneHotEncoder(handle_unknown='ignore'))
  preprocessor = make_column_transformer((numerical_pipeline, numerical_features),
                                         (categorical_pipeline, categorical_features))
  model = make_pipeline(preprocessor, LinearRegression())

  # The folds are always shuffled here (seeded), independently of `shuffle`,
  # which only governs the train/test split above.
  kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
  # cross_val_score works on clones, so `model` still has to be fitted afterwards.
  scores = cross_val_score(model, X_train, y_train, cv=kfold)
  model.fit(X_train, y_train)
  test_score = model.score(X_test, y_test)
  return scores, test_score


# 80/20 shuffled split (seed 42) and 5 shuffled folds on the training split.
scores, test_score = train_and_validate_regression_model(
    data_path='../data.csv',
    target_column='charges',
    shuffle=True,
    train_size=0.8,
    random_state=42,
    n_splits=5,
)
print(scores)
test_score

[0.6979856  0.67572351 0.76802403 0.70980435 0.76070805]


0.8204970129593311

In [25]:
def train_and_validate_regression_model(data_path, target_column, shuffle, train_size, random_state, cv, n_splits, stratify_column='smoker'):
  """Tune a preprocessing + Lasso pipeline with a grid search and evaluate the best model.

  Steps: read the CSV, drop duplicate rows, split into train/test, build a
  pipeline (StandardScaler on numeric columns, OneHotEncoder on the others,
  then Lasso), grid-search ``alpha`` over ``np.linspace(0.1, 100, 1000)`` on
  the training split, print the best parameters, cross-validate the best
  estimator with a shuffled ``KFold`` and score it on the test split.

  Parameters
  ----------
  data_path : path to the CSV file passed to ``pd.read_csv``.
  target_column : name of the column to predict.
  shuffle : whether ``train_test_split`` shuffles rows before splitting.
  train_size : ``train_size`` argument of ``train_test_split``.
  random_state : seed used for both ``train_test_split`` and ``KFold``.
  cv : ``cv`` argument of the ``GridSearchCV`` used for tuning.
  n_splits : number of folds of the ``KFold`` used for the final CV scores.
  stratify_column : feature column used to stratify the train/test split
    (default ``'smoker'``, the column that used to be hard-coded). Pass
    ``None`` to disable stratification.

  Returns
  -------
  (scores, test_score) : the array returned by ``cross_val_score`` for the
    tuned pipeline on the training split and the value of ``model.score`` on
    the test split (both use the estimator's default score, R^2 for
    scikit-learn regressors).
  """
  df = pd.read_csv(data_path).drop_duplicates()
  y = df[[target_column]]
  X = df.drop(columns=[target_column])

  # train_test_split raises ValueError when stratify is given together with
  # shuffle=False, so stratification is only requested when shuffling.
  stratify = X[[stratify_column]] if (shuffle and stratify_column is not None) else None
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, shuffle=shuffle, train_size=train_size, random_state=random_state, stratify=stratify)

  numerical_features = make_column_selector(dtype_include=np.number)
  categorical_features = make_column_selector(dtype_exclude=np.number)
  numerical_pipeline = make_pipeline(StandardScaler())
  categorical_pipeline = make_pipeline(OneHotEncoder(handle_unknown='ignore'))
  preprocessor = make_column_transformer((numerical_pipeline, numerical_features),
                                         (categorical_pipeline, categorical_features))

  # 'lasso__alpha' addresses the `alpha` parameter of the pipeline step that
  # make_pipeline names 'lasso'.
  param_grid = {'lasso__alpha': np.linspace(0.1, 100, 1000)}
  grid_search = GridSearchCV(make_pipeline(preprocessor, Lasso()), param_grid, cv=cv)
  grid_search.fit(X_train, y_train)
  print(grid_search.best_params_)

  # With GridSearchCV's default refit=True, best_estimator_ is already refit on
  # the whole training split, so no additional model.fit call is needed.
  model = grid_search.best_estimator_

  # NOTE: alpha was selected on this same training split, so the CV scores
  # below are an optimistic estimate; test_score is the unbiased one.
  kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
  scores = cross_val_score(model, X_train, y_train, cv=kfold)
  test_score = model.score(X_test, y_test)
  return scores, test_score


# 80/20 shuffled split (seed 42), 5-fold grid search, then 10 shuffled folds for the final CV scores.
scores, test_score = train_and_validate_regression_model(
    data_path='../data.csv',
    target_column='charges',
    shuffle=True,
    train_size=0.8,
    random_state=42,
    cv=5,
    n_splits=10,
)
print(scores)
test_score

{'lasso__alpha': 50.0}
[0.70254937 0.70111686 0.70702414 0.63858058 0.70989915 0.81545106
 0.71928002 0.69648111 0.70360675 0.81746816]


0.8210936451530005