# Exploitation des Données

### Installation

In [415]:
#%pip install pandas
#%pip install matplotlib
#%pip install seaborn
#%pip install scipy
#%pip install statsmodels

### Importation


In [416]:
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.formula.api import ols
import plotly.express as px
import matplotlib

### Chargement des données

In [417]:
# Read the raw CSV into a DataFrame, then print its schema summary
# (row count, column dtypes, non-null counts, memory footprint).
raw_csv_path = "data_.csv"
df = pd.read_csv(raw_csv_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


L'objectif est d'estimer les charges en fonction des autres informations (age, sex, bmi, children, smoker, region). Il s'agit de prédire les charges des nouveaux sujets.

### Analyse des données

In [418]:
# Show only the rows flagged as duplicates of an earlier row.
is_duplicate = df.duplicated()
duplicates_rows = df.loc[is_duplicate]
print(duplicates_rows)

     age   sex    bmi  children smoker     region    charges
581   19  male  30.59         0     no  northwest  1639.5631


In [419]:
# Remove duplicated rows (pandas keeps the first occurrence by default),
# then print the schema again to confirm the new row count.
# The original index labels are kept, so the index is no longer contiguous.
df = df.drop_duplicates()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1337 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1337 non-null   int64  
 1   sex       1337 non-null   object 
 2   bmi       1337 non-null   float64
 3   children  1337 non-null   int64  
 4   smoker    1337 non-null   object 
 5   region    1337 non-null   object 
 6   charges   1337 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 83.6+ KB


- Détection des valeurs manquantes

In [420]:
# Number of non-missing values per column. A column has missing values
# only if its count is lower than the number of rows in the frame.
df.count()

age         1337
sex         1337
bmi         1337
children    1337
smoker      1337
region      1337
charges     1337
dtype: int64

- Inspection des données

In [421]:
# Preview the first five rows of the de-duplicated frame.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [422]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1337.0,1337.0,1337.0,1337.0
mean,39.222139,30.663452,1.095737,13279.121487
std,14.044333,6.100468,1.205571,12110.359656
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29,0.0,4746.344
50%,39.0,30.4,1.0,9386.1613
75%,51.0,34.7,2.0,16657.71745
max,64.0,53.13,5.0,63770.42801


In [423]:
# Print the rows whose `charges` value is strictly greater than 20000.
print(df.loc[df["charges"].gt(20000)])

      age     sex     bmi  children smoker     region      charges
3      33    male  22.705         0     no  northwest  21984.47061
9      60  female  25.840         0     no  northwest  28923.13692
11     62  female  26.290         0    yes  southeast  27808.72510
14     27    male  42.130         0    yes  southeast  39611.75770
19     30    male  35.300         0    yes  southwest  36837.46700
...   ...     ...     ...       ...    ...        ...          ...
1313   19  female  34.700         2    yes  southwest  36397.57600
1321   62    male  26.695         0    yes  northeast  28101.33305
1323   42  female  40.370         2    yes  southeast  43896.37630
1328   23  female  24.225         2     no  northeast  22395.74424
1337   61  female  29.070         0    yes  northwest  29141.36030

[273 rows x 7 columns]


In [424]:
# Summary of the object-dtype columns only: count, number of distinct values,
# most frequent value and its frequency.
df.describe(include="O")

Unnamed: 0,sex,smoker,region
count,1337,1337,1337
unique,2,2,4
top,male,no,southeast
freq,675,1063,364


In [425]:
# Print the schema summary once more (dtypes and non-null counts) before selecting columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1337 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1337 non-null   int64  
 1   sex       1337 non-null   object 
 2   bmi       1337 non-null   float64
 3   children  1337 non-null   int64  
 4   smoker    1337 non-null   object 
 5   region    1337 non-null   object 
 6   charges   1337 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 83.6+ KB


In [426]:
# Pin the column selection and order explicitly before exporting.
base_columns = ["age", "sex", "bmi", "children", "smoker", "region", "charges"]
df = df[base_columns]

In [427]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
df.to_csv("df_assurance_clean.csv", index=False)

- Ajout de paramètres

In [428]:
# --- Feature engineering -----------------------------------------------------
# Experiments that are currently disabled in this cell:
#   - log transforms: np.log1p(df['charges']), np.log(df[col] + 1) for
#     'age', 'bmi' and 'children'
#   - ratio / product features built from the target:
#     df['charges'] / (df['children'] + 1) and df['bmi'] * df['charges']

# Bin edges and labels used to discretise age and BMI.
bins_age = [0, 28, 51, 65, np.inf]    # alternative tried: [0, 30, 45, 60, np.inf]
labels_age = ['Jeune', 'Mature', 'Âgé', 'Senior']
bins_bmi = [0, 18, 30, 40, np.inf]    # alternative tried: [0, 16, 28, 34, np.inf]
labels_bmi = ['Maigre', 'Normal', 'Surpoids', 'Obèse']

df = (
    df
    # Categorical buckets plus a 0/1 encoding of the smoker flag.
    .assign(
        age_group=pd.cut(df['age'], bins=bins_age, labels=labels_age),
        bmi_category=pd.cut(df['bmi'], bins=bins_bmi, labels=labels_bmi),
        smoker_encoded=df['smoker'].map({'yes': 1, 'no': 0}),
    )
    # Pairwise interaction terms; the smoker interactions are 0 for non-smokers.
    .assign(
        bmi_smoker=lambda frame: frame['bmi'] * frame['smoker_encoded'],
        age_smoker=lambda frame: frame['age'] * frame['smoker_encoded'],
        age_bmi=lambda frame: frame['age'] * frame['bmi'],
    )
)

# Outlier fences on `charges`: 1.5 x IQR below Q1 and above Q3.
# The bounds are only computed here; the row filter itself is left disabled:
#   df = df[(df['charges'] >= lower_bound) & (df['charges'] <= upper_bound)]
q1, q3 = df['charges'].quantile([0.25, 0.75]).tolist()
iqr = q3 - q1
iqr_margin = 1.5 * iqr
lower_bound = q1 - iqr_margin
upper_bound = q3 + iqr_margin

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1337 entries, 0 to 1337
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   age             1337 non-null   int64   
 1   sex             1337 non-null   object  
 2   bmi             1337 non-null   float64 
 3   children        1337 non-null   int64   
 4   smoker          1337 non-null   object  
 5   region          1337 non-null   object  
 6   charges         1337 non-null   float64 
 7   age_group       1337 non-null   category
 8   bmi_category    1337 non-null   category
 9   smoker_encoded  1337 non-null   int64   
 10  bmi_smoker      1337 non-null   float64 
 11  age_smoker      1337 non-null   int64   
 12  age_bmi         1337 non-null   float64 
dtypes: category(2), float64(4), int64(4), object(3)
memory usage: 128.4+ KB


In [429]:
#df = df[["age", "sex", "bmi", "children", "smoker", "region", "log_age", "log_bmi", "age_group", "bmi_category","log_children", "charges"]]

In [430]:
#df = df[["age", "sex", "bmi", "children", "smoker", "region", "log_age", "log_bmi", "age_group", "bmi_category","log_children", "charges"]]

In [431]:
#df = df[["age", "sex", "bmi", "children", "smoker", "region", "age_bmi","age_group", "bmi_category","bmi_smoker","age_smoker","log_age","log_children","charges"]]

In [432]:
# Preview the first five rows, now including the engineered columns.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,age_group,bmi_category,smoker_encoded,bmi_smoker,age_smoker,age_bmi
0,19,female,27.9,0,yes,southwest,16884.924,Jeune,Normal,1,27.9,19,530.1
1,18,male,33.77,1,no,southeast,1725.5523,Jeune,Surpoids,0,0.0,0,607.86
2,28,male,33.0,3,no,southeast,4449.462,Jeune,Surpoids,0,0.0,0,924.0
3,33,male,22.705,0,no,northwest,21984.47061,Mature,Normal,0,0.0,0,749.265
4,32,male,28.88,0,no,northwest,3866.8552,Mature,Normal,0,0.0,0,924.16


In [433]:
# Keep the original predictors and the engineered features, with `charges`
# placed last; `smoker_encoded` is not part of the selection.
model_columns = [
    "age", "sex", "bmi", "children", "smoker", "region",
    "age_bmi", "age_group", "bmi_category", "bmi_smoker", "age_smoker",
    "charges",
]
df = df[model_columns]

In [434]:
#df = df[["age", "sex","bmi","children", "smoker", "region","charges"]]

In [435]:
# Write the selected columns to CSV; index=False leaves the row index out of the file.
df.to_csv("df_assurance_ag_sex_bmi_ch_sm_re_agebmi_agegrp_bmicat_bmismok_agesmok.csv", index=False)

In [436]:
# Bucket `age` a second time, reusing the `bins_age` edges defined in an
# earlier cell, and store the result as a new categorical column.
labels_age2 = ['Jeune', 'Mature', 'Âgé', 'Senior']
df = df.assign(age_group2=pd.cut(df['age'], bins=bins_age, labels=labels_age2))

In [437]:
# Display the new categorical column to check the assigned age buckets.
df['age_group2']

0        Jeune
1        Jeune
2        Jeune
3       Mature
4       Mature
         ...  
1333    Mature
1334     Jeune
1335     Jeune
1336     Jeune
1337       Âgé
Name: age_group2, Length: 1337, dtype: category
Categories (4, object): ['Jeune' < 'Mature' < 'Âgé' < 'Senior']