In [301]:
import numpy as np
import mlflow
from mlflow.models import infer_signature
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score
from urllib.parse import urlparse
import os 

In [302]:
#mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")


In [303]:
# 1 - Load the kidney-disease dataset from the CSV file next to the notebook
Data = pd.read_csv(
    "kidney_disease.csv",
    sep=",",
    decimal=".",
)
display(Data)

Unnamed: 0,id,age,bp,sg,al,su,rbc,pc,pcc,ba,...,pcv,wc,rc,htn,dm,cad,appet,pe,ane,classification
0,0,48.0,80.0,1.020,1.0,0.0,,normal,notpresent,notpresent,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,1,7.0,50.0,1.020,4.0,0.0,,normal,notpresent,notpresent,...,38,6000,,no,no,no,good,no,no,ckd
2,2,62.0,80.0,1.010,2.0,3.0,normal,normal,notpresent,notpresent,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,4,51.0,80.0,1.010,2.0,0.0,normal,normal,notpresent,notpresent,...,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,395,55.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,...,47,6700,4.9,no,no,no,good,no,no,notckd
396,396,42.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,54,7800,6.2,no,no,no,good,no,no,notckd
397,397,12.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,...,49,6600,5.4,no,no,no,good,no,no,notckd
398,398,17.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,...,51,7200,5.9,no,no,no,good,no,no,notckd


In [304]:
# 2 - Inspect the column dtypes and non-null counts.
# Data.info() already lists every dtype; a bare `Data.dtypes` placed before it
# was never displayed because only the last expression of a cell is rendered.
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              400 non-null    int64  
 1   age             391 non-null    float64
 2   bp              388 non-null    float64
 3   sg              353 non-null    float64
 4   al              354 non-null    float64
 5   su              351 non-null    float64
 6   rbc             248 non-null    object 
 7   pc              335 non-null    object 
 8   pcc             396 non-null    object 
 9   ba              396 non-null    object 
 10  bgr             356 non-null    float64
 11  bu              381 non-null    float64
 12  sc              383 non-null    float64
 13  sod             313 non-null    float64
 14  pot             312 non-null    float64
 15  hemo            348 non-null    float64
 16  pcv             330 non-null    object 
 17  wc              295 non-null    obj

In [305]:
# 3 - Number of observations and variables.
# Data.shape is (rows, columns); the header line is not counted as a row.
# The ID column is still included in the column count here and must not be
# treated as a feature.
dimension = Data.shape
NbrLignes, NbrColonnes = dimension
print("Dimension :", dimension)
print("Nombre de lignes :", NbrLignes)
print("Nombre de colonnes :", NbrColonnes)

Dimension : (400, 26)
Nombre de lignes : 400
Nombre de colonnes : 26


In [306]:
# 4 - Rename the raw columns to descriptive names and drop the identifier.
# The mapping is keyed by the original column name instead of assigning a
# positional list to Data.columns: re-running this cell is then harmless
# (already-renamed columns are simply not matched), whereas the positional
# form raised a length-mismatch error once the ID column had been dropped.
NOMS_COLONNES = {
    'age': 'âge', 'bp': 'pression_sang', 'sg': 'gravité_spécifique', 'al': 'albumine',
    'su': 'sucre', 'rbc': 'cellules_rouges', 'pc': 'pus_cell', 'pcc': 'pus_cell_clumps',
    'ba': 'bactéries', 'bgr': 'blood_glucose_random', 'bu': 'blood_urea',
    'sc': 'serum_creatinine', 'sod': 'sodium', 'pot': 'potassium', 'hemo': 'hémoglobine',
    'pcv': 'volume_concentré_cellules', 'wc': 'nombre_globules_blancs',
    'rc': 'nombre_globules_rouges', 'htn': 'hypertension', 'dm': 'diabète_mellitus',
    'cad': 'maladie_artérielle_coronaire', 'appet': 'appétit', 'pe': 'œdème_péda',
    'ane': 'anémie',
}

# 5 - The identifier carries no predictive information; errors='ignore' keeps
# the drop safe when the column has already been removed by a previous run.
Data = Data.drop(columns=['id', 'ID'], errors='ignore').rename(columns=NOMS_COLONNES)
display(Data)

Unnamed: 0,âge,pression_sang,gravité_spécifique,albumine,sucre,cellules_rouges,pus_cell,pus_cell_clumps,bactéries,blood_glucose_random,...,volume_concentré_cellules,nombre_globules_blancs,nombre_globules_rouges,hypertension,diabète_mellitus,maladie_artérielle_coronaire,appétit,œdème_péda,anémie,classification
0,48.0,80.0,1.020,1.0,0.0,,normal,notpresent,notpresent,121.0,...,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.020,4.0,0.0,,normal,notpresent,notpresent,,...,38,6000,,no,no,no,good,no,no,ckd
2,62.0,80.0,1.010,2.0,3.0,normal,normal,notpresent,notpresent,423.0,...,31,7500,,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.0,...,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.010,2.0,0.0,normal,normal,notpresent,notpresent,106.0,...,35,7300,4.6,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,140.0,...,47,6700,4.9,no,no,no,good,no,no,notckd
396,42.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,75.0,...,54,7800,6.2,no,no,no,good,no,no,notckd
397,12.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,100.0,...,49,6600,5.4,no,no,no,good,no,no,notckd
398,17.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,114.0,...,51,7200,5.9,no,no,no,good,no,no,notckd


In [307]:
# Count the missing values per column and list, in decreasing order,
# only the columns that have at least one
DataManquante = Data.isna().sum()
print(DataManquante.loc[DataManquante.gt(0)].sort_values(ascending=False))

cellules_rouges                 152
nombre_globules_rouges          130
nombre_globules_blancs          105
potassium                        88
sodium                           87
volume_concentré_cellules        70
pus_cell                         65
hémoglobine                      52
sucre                            49
gravité_spécifique               47
albumine                         46
blood_glucose_random             44
blood_urea                       19
serum_creatinine                 17
pression_sang                    12
âge                               9
bactéries                         4
pus_cell_clumps                   4
hypertension                      2
diabète_mellitus                  2
maladie_artérielle_coronaire      2
appétit                           1
œdème_péda                        1
anémie                            1
dtype: int64


In [308]:
# 6 - Measurement columns that were read as object (text) and need converting
text_col = [
    'volume_concentré_cellules',
    'nombre_globules_blancs',
    'nombre_globules_rouges',
]

# Show the current dtype of each of them
for col in text_col:
    print(f"{col} -: {Data.dtypes[col]}")

volume_concentré_cellules -: object
nombre_globules_blancs -: object
nombre_globules_rouges -: object


In [309]:
# Convert a text column into a numeric column
def Convertir_col_texte_2_col_num (dataframe, feature):
    """Convert ``dataframe[feature]`` to a numeric dtype, in place.

    Values that cannot be parsed as numbers become NaN (``errors='coerce'``).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column; it is modified in place.
    feature : str
        Name of the column to convert.

    Returns
    -------
    None
    """
    # Read from the frame that was passed in, not from the notebook-level
    # `Data`, so the function works on any DataFrame.
    dataframe[feature] = pd.to_numeric(dataframe[feature], errors='coerce')

# Convert every text-typed measurement column and report its new dtype.
# The label now shows the column name (it used to print the literal
# "text_col" for every column), matching the format of the dtype listing above.
for col in text_col:
    Convertir_col_texte_2_col_num(Data, col)
    print(f"{col} -: {Data[col].dtype}")
Data.dtypes

text_col: float64
text_col: float64
text_col: float64


âge                             float64
pression_sang                   float64
gravité_spécifique              float64
albumine                        float64
sucre                           float64
cellules_rouges                  object
pus_cell                         object
pus_cell_clumps                  object
bactéries                        object
blood_glucose_random            float64
blood_urea                      float64
serum_creatinine                float64
sodium                          float64
potassium                       float64
hémoglobine                     float64
volume_concentré_cellules       float64
nombre_globules_blancs          float64
nombre_globules_rouges          float64
hypertension                     object
diabète_mellitus                 object
maladie_artérielle_coronaire     object
appétit                          object
œdème_péda                       object
anémie                           object
classification                   object


In [310]:
import numpy as np

# Replace the missing values of a numeric column with that column's mean
def mean_imputation(dataframe, feature):
    """Fill the missing values of ``dataframe[feature]`` with the column mean, in place.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column; it is modified in place.
    feature : str
        Name of a numeric column.

    Returns
    -------
    None
    """
    mean_value = dataframe[feature].mean()
    # Assign the filled column back rather than calling
    # dataframe[feature].fillna(..., inplace=True): that form operates on an
    # intermediate Series, triggers a chained-assignment warning on recent
    # pandas, and no longer updates the frame once Copy-on-Write is enabled.
    dataframe[feature] = dataframe[feature].fillna(mean_value)

# Sub-frame made of the numeric (non-object) columns
Data_num = Data.select_dtypes(exclude=['object'])

# Fill the missing values of every numeric column with that column's mean
for column_name in Data_num.columns:
    mean_imputation(Data, column_name)
    


# Replace the missing values of a categorical column with its most frequent value
def mode_imputation(dataframe, feature):
    """Fill the missing values of ``dataframe[feature]`` with the column mode, in place.

    When several values tie for the highest frequency, the first one returned
    by ``Series.mode()`` is used. A column that has no non-missing value has
    no mode and is left unchanged instead of raising.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the column; it is modified in place.
    feature : str
        Name of the column to impute.

    Returns
    -------
    None
    """
    modes = dataframe[feature].mode()
    if modes.empty:
        # Nothing to impute with: every value in the column is missing.
        return
    dataframe[feature] = dataframe[feature].fillna(modes.iloc[0])

# Sub-frame made of the categorical (object-typed) columns
Data_obj = Data.select_dtypes(include=[object])

# Fill the missing values of every categorical column with its most frequent value.
# Only object-typed columns are handled here: the numeric column
# "pression_sang" is imputed with its mean by the numeric loop, so the extra
# mode imputation that used to be applied to it has been removed.
for column_name in Data_obj:
    mode_imputation(Data,column_name)

In [311]:
# Render the DataFrame as the cell output to review the result of the imputation
Data

Unnamed: 0,âge,pression_sang,gravité_spécifique,albumine,sucre,cellules_rouges,pus_cell,pus_cell_clumps,bactéries,blood_glucose_random,...,volume_concentré_cellules,nombre_globules_blancs,nombre_globules_rouges,hypertension,diabète_mellitus,maladie_artérielle_coronaire,appétit,œdème_péda,anémie,classification
0,48.0,80.0,1.020,1.0,0.0,normal,normal,notpresent,notpresent,121.000000,...,44.0,7800.0,5.200000,yes,yes,no,good,no,no,ckd
1,7.0,50.0,1.020,4.0,0.0,normal,normal,notpresent,notpresent,148.036517,...,38.0,6000.0,4.707435,no,no,no,good,no,no,ckd
2,62.0,80.0,1.010,2.0,3.0,normal,normal,notpresent,notpresent,423.000000,...,31.0,7500.0,4.707435,no,yes,no,poor,no,yes,ckd
3,48.0,70.0,1.005,4.0,0.0,normal,abnormal,present,notpresent,117.000000,...,32.0,6700.0,3.900000,yes,no,no,poor,yes,yes,ckd
4,51.0,80.0,1.010,2.0,0.0,normal,normal,notpresent,notpresent,106.000000,...,35.0,7300.0,4.600000,no,no,no,good,no,no,ckd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,140.000000,...,47.0,6700.0,4.900000,no,no,no,good,no,no,notckd
396,42.0,70.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,75.000000,...,54.0,7800.0,6.200000,no,no,no,good,no,no,notckd
397,12.0,80.0,1.020,0.0,0.0,normal,normal,notpresent,notpresent,100.000000,...,49.0,6600.0,5.400000,no,no,no,good,no,no,notckd
398,17.0,60.0,1.025,0.0,0.0,normal,normal,notpresent,notpresent,114.000000,...,51.0,7200.0,5.900000,no,no,no,good,no,no,notckd


In [312]:
# Re-count the missing values per column; an empty result means none are left
valeurs_manquantes = Data.isna().sum()
valeurs_manquantes.loc[valeurs_manquantes.gt(0)].sort_values(ascending=False)

Series([], dtype: int64)

In [313]:
# Confirm the dtype of the three converted measurement columns
text_columns = [
    'volume_concentré_cellules',
    'nombre_globules_blancs',
    'nombre_globules_rouges',
]

for column in text_columns:
    print(f"{column} -: {Data.dtypes[column]}")

volume_concentré_cellules -: float64
nombre_globules_blancs -: float64
nombre_globules_rouges -: float64


In [314]:
# Names of the columns still stored with the object dtype (categorical features)
catig_col = [name for name, dtype in Data.dtypes.items() if dtype == 'object']

print(catig_col)
print('---------------------------------------------------------------------------------------------------------------')

# List the distinct values of each categorical column to spot inconsistent labels
for col_name in catig_col:
    levels = Data[col_name].unique()
    print(f"{col_name} -: {levels}")

['cellules_rouges', 'pus_cell', 'pus_cell_clumps', 'bactéries', 'hypertension', 'diabète_mellitus', 'maladie_artérielle_coronaire', 'appétit', 'œdème_péda', 'anémie', 'classification']
---------------------------------------------------------------------------------------------------------------
cellules_rouges -: ['normal' 'abnormal']
pus_cell -: ['normal' 'abnormal']
pus_cell_clumps -: ['notpresent' 'present']
bactéries -: ['notpresent' 'present']
hypertension -: ['yes' 'no']
diabète_mellitus -: ['yes' 'no' ' yes' '\tno' '\tyes']
maladie_artérielle_coronaire -: ['no' 'yes' '\tno']
appétit -: ['good' 'poor']
œdème_péda -: ['no' 'yes']
anémie -: ['no' 'yes']
classification -: ['ckd' 'ckd\t' 'notckd']


In [315]:
# Normalize the labels that carry stray whitespace/tab characters, and spell
# the negative class as 'not ckd'
_label_fixes = {
    'diabète_mellitus': {' yes': 'yes', '\tno': 'no', '\tyes': 'yes'},
    'maladie_artérielle_coronaire': {'\tno': 'no'},
    'classification': {'ckd\t': 'ckd', 'notckd': 'not ckd'},
}
for _column, _fixes in _label_fixes.items():
    Data[_column] = Data[_column].replace(to_replace=_fixes)

In [316]:
# Check the class balance of the target variable (number of rows per label)
Data['classification'].value_counts()

classification
ckd        250
not ckd    150
Name: count, dtype: int64

In [317]:
# Encode every categorical column as 0/1 using an explicit mapping per column.
# Series.map turns any value that is absent from its mapping into NaN.
_binary_codes = {
    'classification': {'ckd': 1, 'not ckd': 0},
    'cellules_rouges': {'normal': 1, 'abnormal': 0},
    'pus_cell': {'normal': 1, 'abnormal': 0},
    'pus_cell_clumps': {'present': 1, 'notpresent': 0},
    'bactéries': {'present': 1, 'notpresent': 0},
    'hypertension': {'yes': 1, 'no': 0},
    'diabète_mellitus': {'yes': 1, 'no': 0},
    'maladie_artérielle_coronaire': {'yes': 1, 'no': 0},
    'appétit': {'good': 1, 'poor': 0},
    'œdème_péda': {'yes': 1, 'no': 0},
    'anémie': {'yes': 1, 'no': 0},
}
for _column, _codes in _binary_codes.items():
    Data[_column] = Data[_column].map(_codes)

In [318]:
# Display the DataFrame after the categorical columns have been encoded
display(Data)

Unnamed: 0,âge,pression_sang,gravité_spécifique,albumine,sucre,cellules_rouges,pus_cell,pus_cell_clumps,bactéries,blood_glucose_random,...,volume_concentré_cellules,nombre_globules_blancs,nombre_globules_rouges,hypertension,diabète_mellitus,maladie_artérielle_coronaire,appétit,œdème_péda,anémie,classification
0,48.0,80.0,1.020,1.0,0.0,1,1,0,0,121.000000,...,44.0,7800.0,5.200000,1,1,0,1,0,0,1
1,7.0,50.0,1.020,4.0,0.0,1,1,0,0,148.036517,...,38.0,6000.0,4.707435,0,0,0,1,0,0,1
2,62.0,80.0,1.010,2.0,3.0,1,1,0,0,423.000000,...,31.0,7500.0,4.707435,0,1,0,0,0,1,1
3,48.0,70.0,1.005,4.0,0.0,1,0,1,0,117.000000,...,32.0,6700.0,3.900000,1,0,0,0,1,1,1
4,51.0,80.0,1.010,2.0,0.0,1,1,0,0,106.000000,...,35.0,7300.0,4.600000,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,0.0,1,1,0,0,140.000000,...,47.0,6700.0,4.900000,0,0,0,1,0,0,0
396,42.0,70.0,1.025,0.0,0.0,1,1,0,0,75.000000,...,54.0,7800.0,6.200000,0,0,0,1,0,0,0
397,12.0,80.0,1.020,0.0,0.0,1,1,0,0,100.000000,...,49.0,6600.0,5.400000,0,0,0,1,0,0,0
398,17.0,60.0,1.025,0.0,0.0,1,1,0,0,114.000000,...,51.0,7200.0,5.900000,0,0,0,1,0,0,0


In [319]:
# Safety net: fill any value that is still missing with its column median
Data = Data.fillna(Data.median())

In [320]:
# Print the dtype and non-null count of every column after encoding
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   âge                           400 non-null    float64
 1   pression_sang                 400 non-null    float64
 2   gravité_spécifique            400 non-null    float64
 3   albumine                      400 non-null    float64
 4   sucre                         400 non-null    float64
 5   cellules_rouges               400 non-null    int64  
 6   pus_cell                      400 non-null    int64  
 7   pus_cell_clumps               400 non-null    int64  
 8   bactéries                     400 non-null    int64  
 9   blood_glucose_random          400 non-null    float64
 10  blood_urea                    400 non-null    float64
 11  serum_creatinine              400 non-null    float64
 12  sodium                        400 non-null    float64
 13  potas

In [321]:
# Display the fully prepared DataFrame before it is split into features and target
display(Data)

Unnamed: 0,âge,pression_sang,gravité_spécifique,albumine,sucre,cellules_rouges,pus_cell,pus_cell_clumps,bactéries,blood_glucose_random,...,volume_concentré_cellules,nombre_globules_blancs,nombre_globules_rouges,hypertension,diabète_mellitus,maladie_artérielle_coronaire,appétit,œdème_péda,anémie,classification
0,48.0,80.0,1.020,1.0,0.0,1,1,0,0,121.000000,...,44.0,7800.0,5.200000,1,1,0,1,0,0,1
1,7.0,50.0,1.020,4.0,0.0,1,1,0,0,148.036517,...,38.0,6000.0,4.707435,0,0,0,1,0,0,1
2,62.0,80.0,1.010,2.0,3.0,1,1,0,0,423.000000,...,31.0,7500.0,4.707435,0,1,0,0,0,1,1
3,48.0,70.0,1.005,4.0,0.0,1,0,1,0,117.000000,...,32.0,6700.0,3.900000,1,0,0,0,1,1,1
4,51.0,80.0,1.010,2.0,0.0,1,1,0,0,106.000000,...,35.0,7300.0,4.600000,0,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,55.0,80.0,1.020,0.0,0.0,1,1,0,0,140.000000,...,47.0,6700.0,4.900000,0,0,0,1,0,0,0
396,42.0,70.0,1.025,0.0,0.0,1,1,0,0,75.000000,...,54.0,7800.0,6.200000,0,0,0,1,0,0,0
397,12.0,80.0,1.020,0.0,0.0,1,1,0,0,100.000000,...,49.0,6600.0,5.400000,0,0,0,1,0,0,0
398,17.0,60.0,1.025,0.0,0.0,1,1,0,0,114.000000,...,51.0,7200.0,5.900000,0,0,0,1,0,0,0


In [322]:
# 'classification' is the target; every remaining column is a feature
TARGET_COLUMN = "classification"
y = Data[TARGET_COLUMN]
X = Data.drop(columns=[TARGET_COLUMN])

In [323]:
# Hold out 30% of the rows for testing and train on the remaining 70%;
# the fixed random_state makes the split reproducible.
# (train_test_split is already imported in the notebook's first cell.)
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [324]:
## Standardize the features
from sklearn.preprocessing import StandardScaler
import pickle

# Fit the scaler on the training split only, then apply the same
# transformation to the test split so no test information leaks into it.
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

# Persist the fitted scaler so the same scaling can be applied at prediction
# time; the context manager guarantees the file is flushed and closed.
with open('data_scaling.pkl','wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [325]:
# Inspect the standardized training features returned by scaler.fit_transform
X_train

array([[ 1.79759044e+00,  2.36752214e-01, -1.36674970e+00, ...,
         5.60851131e-01, -4.88812902e-01, -4.37594974e-01],
       [-1.04301299e+00, -5.06600884e-01,  4.79997262e-01, ...,
        -1.78300434e+00, -4.88812902e-01, -4.37594974e-01],
       [ 1.67923196e+00,  2.36752214e-01,  1.40337074e+00, ...,
         5.60851131e-01, -4.88812902e-01, -4.37594974e-01],
       ...,
       [-5.69579086e-01, -5.06600884e-01,  1.40337074e+00, ...,
         5.60851131e-01, -4.88812902e-01, -4.37594974e-01],
       [-3.92041371e-01, -5.06600884e-01, -1.36674970e+00, ...,
         5.60851131e-01, -4.88812902e-01,  2.28521820e+00],
       [ 1.20579806e+00,  2.36752214e-01,  1.30789445e-03, ...,
        -1.78300434e+00, -4.88812902e-01,  2.28521820e+00]])

In [326]:
# Inspect the standardized test features returned by scaler.transform
X_test

array([[-2.04906004e+00, -1.24995398e+00,  1.40337074e+00, ...,
         5.60851131e-01, -4.88812902e-01, -4.37594974e-01],
       [ 1.20579806e+00,  9.80105312e-01,  1.30789445e-03, ...,
        -1.78300434e+00, -4.88812902e-01, -4.37594974e-01],
       [-1.39808842e+00, -5.06600884e-01,  4.79997262e-01, ...,
         5.60851131e-01, -4.88812902e-01, -4.37594974e-01],
       ...,
       [ 4.36467963e-01, -5.06600884e-01, -1.36674970e+00, ...,
         5.60851131e-01, -4.88812902e-01, -4.37594974e-01],
       [-2.58167318e+00, -1.99330708e+00,  4.79997262e-01, ...,
         5.60851131e-01,  2.04577252e+00, -4.37594974e-01],
       [ 1.99751010e-01,  2.36752214e-01,  4.79997262e-01, ...,
         5.60851131e-01, -4.88812902e-01, -4.37594974e-01]])

In [327]:
# Confirm that the training split holds about 70% of the records and the test
# split about 30%. The labels name the arrays that are actually printed
# (they previously read 'X' and 'y').
print(f"'X_train' shape: {X_train.shape}")
print(f"'X_test' shape: {X_test.shape}")

'X' shape: (280, 24)
'y' shape: (120, 24)


In [330]:
# Credentials for the remote MLflow tracking server.
# SECURITY: the access token must never be written in the notebook. The token
# that was committed here before should be considered leaked and be revoked.
# The values are taken from the environment when already set; otherwise the
# token is asked for interactively and is not echoed or stored in the file.
from getpass import getpass

os.environ.setdefault('MLFLOW_TRACKING_USERNAME', 'tanjmon')
if not os.environ.get('MLFLOW_TRACKING_PASSWORD'):
    os.environ['MLFLOW_TRACKING_PASSWORD'] = getpass('MLFLOW_TRACKING_PASSWORD: ')

In [331]:
# Model
mlflow.set_experiment("MLflow Quickstart01")
#mlflow.sklearn.autolog()
with mlflow.start_run(run_name='regressionLog01'):
    # Fit a logistic-regression classifier on the training split
    # (fit returns the fitted estimator itself)
    model = LogisticRegression().fit(X_train, Y_train)

2024/01/05 23:12:46 INFO mlflow.tracking.fluent: Experiment with name 'MLflow Quickstart01' does not exist. Creating a new experiment.


In [332]:
# Predicted class label for every row of the test split.
# (A leading `np.unique(y)` was removed: its result was discarded because only
# the last expression of a cell is displayed.)
reg_pred=model.predict(X_test)
reg_pred

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1,
       0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 0], dtype=int64)

In [333]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
# scikit-learn metrics take the ground truth first: accuracy_score(y_true, y_pred).
# Accuracy itself is symmetric, so the value is unchanged.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  1.0


In [334]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
# scikit-learn metrics take the ground truth first: accuracy_score(y_true, y_pred).
# Accuracy itself is symmetric, so the value is unchanged.
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
print('Accuracy on Test data : ', test_data_accuracy)

Accuracy on Test data :  0.9833333333333333


In [None]:
# Disabled draft of the MLflow logging step: the code below is wrapped in a
# string literal, so it is not executed; the cell only echoes the string.
""" with mlflow.start_run():
    mlflow.log_param("model_type", "LogisticRegression")
    mlflow.log_param("dataset", "kidney_disease.csv")
    mlflow.log_metric("accuracy", accuracy)

    # Save the model
    mlflow.sklearn.log_model(model, "model") """

' with mlflow.start_run():\n    mlflow.log_param("model_type", "LogisticRegression")\n    mlflow.log_param("dataset", "kidney_disease.csv")\n    mlflow.log_metric("accuracy", accuracy)\n\n    # Save the model\n    mlflow.sklearn.log_model(model, "model") '

In [335]:
def accuracymeasures(y_test,predictions,avg_method):
    """Compute accuracy, precision, recall and F1 for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    predictions : array-like
        Labels predicted by the model.
    avg_method : str
        Value passed as ``average`` to precision, recall and F1.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1score)`` in that order.
    """
    accuracy = accuracy_score(y_test, predictions)
    # The three remaining metrics share the same averaging option
    averaged = tuple(
        scorer(y_test, predictions, average=avg_method)
        for scorer in (precision_score, recall_score, f1_score)
    )
    return (accuracy, *averaged)

In [None]:
# End the currently active MLflow run, if there is one, before starting a new run
mlflow.end_run()

In [336]:
# Start an MLflow run
with mlflow.start_run():

    # Evaluate the fitted model on the held-out test split
    y_pred = model.predict(X_test)
    accuracy,precision,recall,f1score = accuracymeasures(Y_test,y_pred,'weighted')

    # Log each evaluation metric exactly once. "accuracy" used to be logged
    # twice (once from a variable computed in an earlier cell), which wrote two
    # points for the same metric and made this cell depend on that cell.
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1score)

    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Training Info", "Basic CKD model")

    # Infer the model signature
    signature = infer_signature(X_train, model.predict(X_train))

    # Log the model. Only a few rows are stored as the input example; passing
    # the whole training matrix saved the full training set with the model.
    model_info = mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="CKD_model",
        signature=signature,
        input_example=X_train[:5],
        registered_model_name="tracking",
    )

# NOTE(review): the remote tracking URI is only set here, after the run above
# has finished, so that run was recorded in whichever tracking store was active
# before this point. If the run is meant to go to the remote server, call
# mlflow.set_tracking_uri() before mlflow.set_experiment() instead.
# Also note that mlflow.get_artifact_uri() needs an active run; called outside
# of one it appears to start a new run implicitly — confirm and end it if so.
remote_server_uri= "https://dagshub.com/tanjmon/CKDProject.mlflow"
mlflow.set_tracking_uri(remote_server_uri)
tracking_url_type_store = urlparse(mlflow.get_artifact_uri()).scheme


Successfully registered model 'tracking'.
2024/01/05 23:13:14 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: tracking, version 1
Created version '1' of model 'tracking'.


In [None]:
# Reload the logged model through MLflow's generic pyfunc interface, using the
# model URI returned by log_model.
# NOTE(review): the tracking URI was switched after the model was logged —
# confirm the URI still resolves against the currently configured server.
loaded_model = mlflow.pyfunc.load_model(model_info.model_uri)

In [None]:
import pickle

# Persist the trained classifier to disk; the context manager guarantees the
# file is flushed and closed (the bare open() call left the handle open).
with open('ckd_model.pkl','wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Reload the classifier saved by the previous cell and close the file afterwards.
# SECURITY: pickle.load executes code embedded in the file — only load pickles
# that were produced by this notebook or another trusted source.
with open('ckd_model.pkl','rb') as model_file:
    CKD_Predict = pickle.load(model_file)

In [None]:
input_data = (1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1)
#(62,5,10,140,0,0,0,160,0,3.6,0,2,2,62,0,0,140,268,0,0,160,0,3.6,0)

# Convert the input tuple to a NumPy array
input_data_as_numpy_array= np.asarray(input_data)

# Reshape to a 2-D array holding a single sample (one row)
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# The classifier was fitted on standardized features, so a raw sample must go
# through the same fitted scaler before prediction; feeding raw values gives
# predictions on a scale the model has never seen.
input_data_scaled = scaler.transform(input_data_reshaped)

prediction = model.predict(input_data_scaled)
print(prediction)

if (prediction[0]== 0):
  print('La personne ne a pas le CKD')
else:
  print('La personne  a le CKD')

# Same prediction through the model reloaded from the pickle file
y_predi = CKD_Predict.predict(input_data_scaled)

[1]
La personne  a le CKD


In [None]:
# Show the prediction returned by the model reloaded from the pickle file
y_predi

array([1], dtype=int64)

In [None]:
# Translate the predicted class into a message (0 = no CKD, anything else = CKD)
message = 'La personne ne a pas le CKD' if y_predi[0] == 0 else 'La personne  a le CKD'
print(message)

La personne  a le CKD


In [None]:
input_data = (62,5,10,140,0,0,0,160,0,3.6,0,2,2,62,0,0,140,268,0,0,160,0,3.6,0)

# Rebuild the array from the NEW tuple: reshaping the array left over from the
# previous sample would silently predict on the old input.
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

# Apply the scaler fitted on the training data before predicting, because the
# classifier was trained on standardized features.
CKD_Predict.predict(scaler.transform(input_data_reshaped))

array([1], dtype=int64)

In [None]:
# Convert the current input tuple to a NumPy array and display it
input_data_as_numpy_array= np.asarray(input_data)
input_data_as_numpy_array

array([ 62. ,   5. ,  10. , 140. ,   0. ,   0. ,   0. , 160. ,   0. ,
         3.6,   0. ,   2. ,   2. ,  62. ,   0. ,   0. , 140. , 268. ,
         0. ,   0. , 160. ,   0. ,   3.6,   0. ])

In [None]:
# Reshape the array to a single row (one sample) and display it
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)
input_data_reshaped

array([[ 62. ,   5. ,  10. , 140. ,   0. ,   0. ,   0. , 160. ,   0. ,
          3.6,   0. ,   2. ,   2. ,  62. ,   0. ,   0. , 140. , 268. ,
          0. ,   0. , 160. ,   0. ,   3.6,   0. ]])

In [None]:
# Standardize the raw sample with the scaler fitted on the training data before
# predicting: the classifier was trained on standardized features.
CKD_Predict.predict(scaler.transform(input_data_reshaped))

array([1], dtype=int64)