## Import du dataset

In [14]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Import link
link = 'https://raw.githubusercontent.com/MaskiVal/DataSets/main/kidney_disease.csv'

# Load the data
data_kidney_disease = pd.read_csv(link)

## Exploration du dataset

In [15]:
data_kidney_disease.head()

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.02,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.02,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.01,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.01,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd


In [16]:
data_kidney_disease.info

<bound method DataFrame.info of       id   age    bp     sg   al   su     rbc        pc         pcc  \
0      0  48.0  80.0  1.020  1.0  0.0     NaN    normal  notpresent   
1      1   7.0  50.0  1.020  4.0  0.0     NaN    normal  notpresent   
2      2  62.0  80.0  1.010  2.0  3.0  normal    normal  notpresent   
3      3  48.0  70.0  1.005  4.0  0.0  normal  abnormal     present   
4      4  51.0  80.0  1.010  2.0  0.0  normal    normal  notpresent   
..   ...   ...   ...    ...  ...  ...     ...       ...         ...   
395  395  55.0  80.0  1.020  0.0  0.0  normal    normal  notpresent   
396  396  42.0  70.0  1.025  0.0  0.0  normal    normal  notpresent   
397  397  12.0  80.0  1.020  0.0  0.0  normal    normal  notpresent   
398  398  17.0  60.0  1.025  0.0  0.0  normal    normal  notpresent   
399  399  58.0  80.0  1.025  0.0  0.0  normal    normal  notpresent   

             ba  ...  pcv    wc   rc  htn   dm  cad appet   pe  ane  \
0    notpresent  ...   44  7800  5.2  yes  y

In [17]:
data_kidney_disease.describe

<bound method NDFrame.describe of       id   age    bp     sg   al   su     rbc        pc         pcc  \
0      0  48.0  80.0  1.020  1.0  0.0     NaN    normal  notpresent   
1      1   7.0  50.0  1.020  4.0  0.0     NaN    normal  notpresent   
2      2  62.0  80.0  1.010  2.0  3.0  normal    normal  notpresent   
3      3  48.0  70.0  1.005  4.0  0.0  normal  abnormal     present   
4      4  51.0  80.0  1.010  2.0  0.0  normal    normal  notpresent   
..   ...   ...   ...    ...  ...  ...     ...       ...         ...   
395  395  55.0  80.0  1.020  0.0  0.0  normal    normal  notpresent   
396  396  42.0  70.0  1.025  0.0  0.0  normal    normal  notpresent   
397  397  12.0  80.0  1.020  0.0  0.0  normal    normal  notpresent   
398  398  17.0  60.0  1.025  0.0  0.0  normal    normal  notpresent   
399  399  58.0  80.0  1.025  0.0  0.0  normal    normal  notpresent   

             ba  ...  pcv    wc   rc  htn   dm  cad appet   pe  ane  \
0    notpresent  ...   44  7800  5.2  yes 

In [18]:
data_kidney_disease.columns

Index(['id', 'age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr',
       'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad',
       'appet', 'pe', 'ane', 'classification'],
      dtype='object')

In [19]:
# Calculer le pourcentage de valeurs nulles pour chaque colonne
null_percentage = data_kidney_disease.isnull().mean() * 100

# Afficher le résultat
null_percentage

id                 0.00
age                2.25
bp                 3.00
sg                11.75
al                11.50
su                12.25
rbc               38.00
pc                16.25
pcc                1.00
ba                 1.00
bgr               11.00
bu                 4.75
sc                 4.25
sod               21.75
pot               22.00
hemo              13.00
pcv               17.50
wc                26.25
rc                32.50
htn                0.50
dm                 0.50
cad                0.50
appet              0.25
pe                 0.25
ane                0.25
classification     0.00
dtype: float64

## Detection des valeurs aberrantes par la méthode de l'IQR 

In [20]:
def detect_outliers_iqr(data_kidney_disease):
    outliers = pd.DataFrame()
    for col in data_kidney_disease.select_dtypes(include='number').columns:
        Q1 = data_kidney_disease[col].quantile(0.25)
        Q3 = data_kidney_disease[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        outliers_in_col = data_kidney_disease[(data_kidney_disease[col] < lower_bound) | (data_kidney_disease[col] > upper_bound)]
        outliers = pd.concat([outliers, outliers_in_col])
    return outliers.drop_duplicates()

outliers_iqr = detect_outliers_iqr(data_kidney_disease)
print("Valeurs aberrantes détectées par la méthode de l'IQR:")
print(outliers_iqr)

Valeurs aberrantes détectées par la méthode de l'IQR:
      id   age    bp     sg   al   su     rbc        pc         pcc  \
1      1   7.0  50.0  1.020  4.0  0.0     NaN    normal  notpresent   
75    75   5.0   NaN  1.015  1.0  0.0     NaN    normal  notpresent   
131  131   5.0  50.0  1.010  0.0  0.0     NaN    normal  notpresent   
150  150   8.0  60.0  1.025  3.0  0.0  normal    normal  notpresent   
185  185   4.0   NaN  1.020  1.0  0.0     NaN    normal  notpresent   
..   ...   ...   ...    ...  ...  ...     ...       ...         ...   
203  203   NaN  90.0    NaN  NaN  NaN     NaN       NaN  notpresent   
3      3  48.0  70.0  1.005  4.0  0.0  normal  abnormal     present   
162  162  59.0  70.0    NaN  NaN  NaN     NaN       NaN  notpresent   
171  171  83.0  70.0  1.020  3.0  0.0  normal    normal  notpresent   
60    60  67.0  90.0  1.020  1.0  0.0     NaN  abnormal     present   

             ba  ...  pcv     wc   rc  htn   dm  cad appet   pe  ane  \
1    notpresent  ... 

## Remplacement des valeurs aberrantes

In [21]:
# Remplacer les valeurs aberrantes
data_kidney_disease['pot'] = data_kidney_disease['pot'].replace({39.0: 3.90, 47.0: 4.70})

# Afficher le DataFrame après la modification
print(r"Après la modification:")
print(data_kidney_disease)

Après la modification:
      id   age    bp     sg   al   su     rbc        pc         pcc  \
0      0  48.0  80.0  1.020  1.0  0.0     NaN    normal  notpresent   
1      1   7.0  50.0  1.020  4.0  0.0     NaN    normal  notpresent   
2      2  62.0  80.0  1.010  2.0  3.0  normal    normal  notpresent   
3      3  48.0  70.0  1.005  4.0  0.0  normal  abnormal     present   
4      4  51.0  80.0  1.010  2.0  0.0  normal    normal  notpresent   
..   ...   ...   ...    ...  ...  ...     ...       ...         ...   
395  395  55.0  80.0  1.020  0.0  0.0  normal    normal  notpresent   
396  396  42.0  70.0  1.025  0.0  0.0  normal    normal  notpresent   
397  397  12.0  80.0  1.020  0.0  0.0  normal    normal  notpresent   
398  398  17.0  60.0  1.025  0.0  0.0  normal    normal  notpresent   
399  399  58.0  80.0  1.025  0.0  0.0  normal    normal  notpresent   

             ba  ...  pcv    wc   rc  htn   dm  cad appet   pe  ane  \
0    notpresent  ...   44  7800  5.2  yes  yes   no  