In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the dataset from the project-relative CSV file, then preview ten
# randomly chosen rows (the sample is unseeded, so it changes on every run).
dataset_path = 'data/dataset.csv'
data = pd.read_csv(dataset_path)
data.sample(n=10)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
49460,Female,71.0,1,0,never,28.8,5.0,126,0
72667,Male,75.0,0,1,current,22.81,6.2,140,0
43974,Male,63.0,0,0,never,26.64,6.6,130,0
13462,Female,62.0,0,0,never,24.12,6.0,140,0
58383,Male,3.0,0,0,No Info,15.38,5.0,100,0
1880,Male,79.0,0,0,not current,27.32,6.6,158,0
87423,Female,49.0,0,0,current,31.95,5.8,130,0
76292,Female,26.0,0,0,former,17.51,6.0,126,0
17411,Female,34.0,0,0,No Info,22.86,4.5,126,0
60254,Male,27.0,0,0,No Info,27.32,5.0,100,0


In [4]:
# Remove rows that are exact duplicates of an earlier row (first occurrence
# is kept, original index labels are preserved), then preview ten random rows.
data = data.drop_duplicates()
data.sample(n=10)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
53055,Female,48.0,0,0,never,22.9,6.0,159,0
43286,Female,53.0,0,0,current,25.78,6.0,126,0
90114,Female,57.0,0,1,never,26.6,4.0,130,0
48472,Female,80.0,0,0,never,23.91,6.6,126,0
61484,Female,50.0,0,0,No Info,19.09,5.7,145,0
2612,Female,80.0,0,0,never,27.32,6.1,90,0
42105,Female,45.0,0,0,not current,33.6,5.0,155,0
77143,Female,34.0,0,0,never,27.29,4.8,100,0
15223,Male,22.0,0,0,No Info,27.32,6.0,158,0
61674,Male,6.0,0,0,No Info,27.32,6.2,130,0


In [5]:
# Convert gender to numerical format: Female -> 0, Male -> 1, Other -> 2.
# Any value missing from the mapping becomes NaN.
gender_mapping = dict(Female=0, Male=1, Other=2)
gender_codes = data['gender'].map(gender_mapping)
data['gender'] = gender_codes
data['gender']

0        0
1        0
2        1
3        0
4        1
        ..
99994    0
99996    0
99997    1
99998    0
99999    0
Name: gender, Length: 96146, dtype: int64

In [6]:
# Convert smoking history to numerical format.
# 'No Info' keeps its own code (-1); 'never' and 'not current' share 0,
# 'former' is 1, and 'current' and 'ever' share 2.
smoking_history_mapping = {
    'No Info': -1,
    'never': 0,
    'not current': 0,
    'former': 1,
    'current': 2,
    'ever': 2,
}
smoking_codes = data['smoking_history'].map(smoking_history_mapping)
data['smoking_history'] = smoking_codes
data['smoking_history']

0        0
1       -1
2        0
3        2
4        2
        ..
99994   -1
99996   -1
99997    1
99998    0
99999    2
Name: smoking_history, Length: 96146, dtype: int64

In [7]:
## Cast the age column from float to int (any fractional part is truncated),
## then show the first five rows.
age_as_int = data['age'].astype(int)
data['age'] = age_as_int
data.head(5)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80,0,1,0,25.19,6.6,140,0
1,0,54,0,0,-1,27.32,6.6,80,0
2,1,28,0,0,0,27.32,5.7,158,0
3,0,36,0,0,2,23.45,5.0,155,0
4,1,76,1,1,2,20.14,4.8,155,0


In [8]:
# Replace the 0/1 target column with human-readable class labels.
diabetes_mapping = {1: "Diabetic", 0: "Not Diabetic"}
diabetes_labels = data['diabetes'].map(diabetes_mapping)
data['diabetes'] = diabetes_labels
data['diabetes']

0        Not Diabetic
1        Not Diabetic
2        Not Diabetic
3        Not Diabetic
4        Not Diabetic
             ...     
99994    Not Diabetic
99996    Not Diabetic
99997    Not Diabetic
99998    Not Diabetic
99999    Not Diabetic
Name: diabetes, Length: 96146, dtype: object

In [9]:
import mlflow
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn import metrics
# Turn on MLflow autologging for scikit-learn. It stays enabled until a later
# cell calls mlflow.sklearn.autolog(disable=True).
mlflow.sklearn.autolog()

# Features are every column except the target label.
X = data.drop(columns=['diabetes'])
y = data['diabetes']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Fit the scaler on the training split only and reuse it for the test split,
# keeping the original column names and row index on both frames.
scaler = StandardScaler()
X_train_rescaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_rescaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)

# Baseline: a default logistic regression. (This estimator used to be stored in
# a variable called knn_classifier, which did not match the model type.)
baseline_classifier = LogisticRegression()
baseline_classifier.fit(X_train_rescaled, y_train)
y_test_pred = baseline_classifier.predict(X_test_rescaled)
acc = metrics.accuracy_score(y_test, y_test_pred)
acc  # display the baseline test accuracy as the cell output

2023/05/06 16:34:36 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '5d2eb57f10f4411bb8e579547280c8d4', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


In [10]:
import os
from pickle import dump

# Persist the fitted scaler to disk; later cells attach this file to each
# MLflow run with mlflow.log_artifact("models/standard_scaler.pkl").
os.makedirs('models', exist_ok=True)  # the output folder may not exist on a fresh checkout
# The context manager guarantees the file is flushed and closed before it is read again.
with open('models/standard_scaler.pkl', 'wb') as scaler_file:
    dump(scaler, scaler_file)

In [11]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn import metrics

In [12]:
# K-nearest-neighbours experiment: one MLflow run holding the tags, the
# hyper-parameter, the test accuracy, the fitted model and the scaler file.
with mlflow.start_run():
    # Use the same developer tag as the other experiment runs in this notebook
    # so that all of them can be filtered together by the "dev" tag.
    mlflow.set_tag("dev", "Tejus Kavishwar")
    mlflow.set_tag("algo", "KNN")
    # log the data for each run using log_param, log_metric, log_model
    mlflow.log_param("data-path", "data/dataset.csv")
    k = 53
    mlflow.log_param("n_neighbors", k)
    # Train on the scaled training split, then score on the scaled hold-out split.
    knn_classifier = KNeighborsClassifier(n_neighbors=k)
    knn_classifier.fit(X_train_rescaled, y_train)
    y_test_pred = knn_classifier.predict(X_test_rescaled)
    acc = metrics.accuracy_score(y_test, y_test_pred)
    mlflow.log_metric("accuracy", acc)
    # Store the fitted model and the scaler pickle with the run.
    mlflow.sklearn.log_model(knn_classifier, artifact_path="models")
    mlflow.log_artifact("models/standard_scaler.pkl")

In [13]:
# Logistic-regression experiment tracked as a single MLflow run.
with mlflow.start_run():
    # Tags record who ran the experiment and which algorithm it used.
    for tag_key, tag_value in (("dev", "Tejus Kavishwar"), ("algo", "Logit")):
        mlflow.set_tag(tag_key, tag_value)

    # Parameters: dataset location and the value passed as LogisticRegression(C=...).
    C = 0.1
    for param_key, param_value in (("data-path", "data/dataset.csv"), ("C", C)):
        mlflow.log_param(param_key, param_value)

    # Train on the scaled training split, then score on the scaled hold-out split.
    lr_classifier = LogisticRegression(C=C)
    lr_classifier.fit(X_train_rescaled, y_train)
    y_test_pred = lr_classifier.predict(X_test_rescaled)
    acc = metrics.accuracy_score(y_test, y_test_pred)
    mlflow.log_metric("accuracy", acc)

    # Store the fitted model and the scaler pickle with the run.
    mlflow.sklearn.log_model(lr_classifier, artifact_path="models")
    mlflow.log_artifact("models/standard_scaler.pkl")

In [14]:
# Gaussian naive-Bayes experiment tracked as a single MLflow run.
with mlflow.start_run():
    # Tags record who ran the experiment and which algorithm it used.
    for tag_key, tag_value in (("dev", "Tejus Kavishwar"), ("algo", "GaussianNB")):
        mlflow.set_tag(tag_key, tag_value)

    # The only explicit parameter is the dataset location; the model uses its defaults.
    mlflow.log_param("data-path", "data/dataset.csv")

    # Train on the scaled training split, then score on the scaled hold-out split.
    nb_classifier = GaussianNB()
    nb_classifier.fit(X_train_rescaled, y_train)
    y_test_pred = nb_classifier.predict(X_test_rescaled)
    acc = metrics.accuracy_score(y_test, y_test_pred)
    mlflow.log_metric("accuracy", acc)

    # Store the fitted model and the scaler pickle with the run.
    mlflow.sklearn.log_model(nb_classifier, artifact_path="models")
    mlflow.log_artifact("models/standard_scaler.pkl")

In [15]:
# Support-vector-classifier experiment tracked as a single MLflow run.
with mlflow.start_run():
    # Tags record who ran the experiment and which algorithm it used.
    for tag_key, tag_value in (("dev", "Tejus Kavishwar"), ("algo", "SVM")):
        mlflow.set_tag(tag_key, tag_value)

    # Parameters: dataset location and the value passed as SVC(C=...).
    C = 0.1
    for param_key, param_value in (("data-path", "data/dataset.csv"), ("C", C)):
        mlflow.log_param(param_key, param_value)

    # Train on the scaled training split, then score on the scaled hold-out split.
    sv_classifier = SVC(C=C)
    sv_classifier.fit(X_train_rescaled, y_train)
    y_test_pred = sv_classifier.predict(X_test_rescaled)
    acc = metrics.accuracy_score(y_test, y_test_pred)
    mlflow.log_metric("accuracy", acc)

    # Store the fitted model and the scaler pickle with the run.
    mlflow.sklearn.log_model(sv_classifier, artifact_path="models")
    mlflow.log_artifact("models/standard_scaler.pkl")

In [None]:
from sklearn.model_selection import GridSearchCV
# Enable automatic MLflow logging for scikit-learn runs. max_tuning_runs=None
# asks MLflow to create a child run for every searched parameter combination
# instead of only the best few.
mlflow.sklearn.autolog(max_tuning_runs=None)

try:
    with mlflow.start_run():
        # Search 1-10 neighbours with Minkowski power p=1 (Manhattan) and p=2 (Euclidean).
        tuned_parameters = [{'n_neighbors': list(range(1, 11)), 'p': [1, 2]}]
        clf = GridSearchCV(
            estimator=KNeighborsClassifier(),
            param_grid=tuned_parameters,
            scoring='accuracy',
            cv=3,
            return_train_score=True,
            verbose=1
        )
        # KNN is distance based, so fit on the standardised features, exactly as
        # the other models in this notebook do (the raw X_train was used before).
        clf.fit(X_train_rescaled, y_train)
finally:
    # Always switch autologging back off, even when the search fails or is
    # interrupted, so that later cells are not logged unintentionally.
    mlflow.sklearn.autolog(disable=True)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [17]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
# Enable automatic MLflow logging for scikit-learn runs. max_tuning_runs=None
# asks MLflow to create a child run for every searched parameter combination
# instead of only the best few.
mlflow.sklearn.autolog(max_tuning_runs=None)

try:
    with mlflow.start_run():
        # Grid of tree-shape and split-criterion settings to evaluate (3 * 3 * 3 * 2 = 54 candidates).
        params = {
            'max_depth': [2, 4, 6],
            'min_samples_split': [2, 4, 8],
            'min_samples_leaf': [1, 2, 4],
            'criterion': ['gini', 'entropy']
        }
        # Seed the tree so that repeated executions of this cell give the same
        # search results; 0 matches the seed used for the train/test split.
        clf = GridSearchCV(DecisionTreeClassifier(random_state=0), param_grid=params, cv=5)
        clf.fit(X_train_rescaled, y_train)
finally:
    # Always switch autologging back off, even when the search fails or is
    # interrupted, so that later cells are not logged unintentionally.
    mlflow.sklearn.autolog(disable=True)