In [None]:
import mlflow
from mlflow.models import infer_signature
import numpy as np
from sklearn.linear_model import LogisticRegression

# Minimal end-to-end MLflow check: fit a toy classifier, log it together with
# an inferred signature, then reload it through the generic pyfunc flavor.
with mlflow.start_run():
    # Six single-feature samples, written directly as a (6, 1) column.
    X = np.array([[-2], [-1], [0], [1], [2], [1]])
    y = np.array([0, 0, 1, 1, 1, 0])

    lr = LogisticRegression().fit(X, y)

    # Signature is inferred from the training inputs and the model's own
    # predictions on them.
    signature = infer_signature(X, lr.predict(X))

    model_info = mlflow.sklearn.log_model(
        sk_model=lr,
        artifact_path="model",
        signature=signature,
    )

# Load the logged model back by its URI and score a fresh batch of values.
sklearn_pyfunc = mlflow.pyfunc.load_model(model_uri=model_info.model_uri)
data = np.array([[-4], [1], [0], [10], [-2], [1]])
predictions = sklearn_pyfunc.predict(data)
predictions

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Read diabetes.csv straight from GitHub into a DataFrame and display it.
# The URL path is reproduced exactly as given.
url = (
    'https://raw.githubusercontent.com/digipodium/Datasets/main/classfication/diabetes.csv'
)
df = pd.read_csv(url)
df

In [13]:
# Baseline: train a scaled decision tree and record it as one MLflow run.
# No experiment is selected in this cell, so the run is stored under
# whichever experiment is currently active.

with mlflow.start_run():
    # 'Outcome' is the target column; every other column is a feature.
    X = df.drop('Outcome', axis=1)
    y = df['Outcome']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        # Seed the tree: scikit-learn randomly permutes features at each
        # split, so an unseeded tree can produce different metrics on every
        # re-run of this cell.
        ('model', DecisionTreeClassifier(random_state=42))
    ])
    pipe.fit(X_train, y_train)
    predictions = pipe.predict(X_test)
    print(classification_report(y_test, predictions, output_dict=True))
    # Log the model with its input/output schema, consistent with the other
    # log_model calls in this notebook.
    signature = infer_signature(X_test, predictions)
    mlflow.sklearn.log_model(pipe, "model", signature=signature)
    mlflow.log_param("model", "DecisionTreeClassifier")

{'0': {'precision': 0.8014705882352942, 'recall': 0.7218543046357616, 'f1-score': 0.759581881533101, 'support': 151.0}, '1': {'precision': 0.5578947368421052, 'recall': 0.6625, 'f1-score': 0.6057142857142858, 'support': 80.0}, 'accuracy': 0.7012987012987013, 'macro avg': {'precision': 0.6796826625386997, 'recall': 0.6921771523178808, 'f1-score': 0.6826480836236934, 'support': 231.0}, 'weighted avg': {'precision': 0.7171153150255317, 'recall': 0.7012987012987013, 'f1-score': 0.7062944024616499, 'support': 231.0}}




Comparing models with MLflow

In [12]:
# Shared hold-out split used by every candidate model (70/30, fixed seed).
X = df.drop('Outcome', axis=1)
y = df['Outcome']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# One scaler + classifier pipeline per candidate. The decision tree and the
# random forest are seeded because their fitting is randomized; without a
# seed the metrics logged for comparison would change on every run.
# KNeighborsClassifier and SVC are left at their defaults.
pipe1 = Pipeline([
    ('scaler', StandardScaler()),
    ('model', DecisionTreeClassifier(random_state=42)),
])
pipe2 = Pipeline([
    ('scaler', StandardScaler()),
    ('model', KNeighborsClassifier()),
])
pipe3 = Pipeline([
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(random_state=42)),
])
pipe4 = Pipeline([
    ('scaler', StandardScaler()),
    ('model', SVC()),
])

In [20]:
def train_n_log_model(model, model_name):
    """Fit ``model``, score it on the hold-out set, and log the result to MLflow.

    Each call opens a new run in the "Diabetes Classification" experiment.
    The function reads the notebook-level ``X_train``, ``y_train``,
    ``X_test`` and ``y_test``, so the cell that creates the split must have
    been executed first.

    Args:
        model: Object exposing ``fit`` and ``predict`` (the notebook passes
            scikit-learn pipelines).
        model_name: Label stored as the ``model`` param and also used as the
            artifact path of the logged model.

    Returns:
        None. All results are recorded in the MLflow run.
    """
    mlflow.set_experiment("Diabetes Classification")
    with mlflow.start_run():
        model.fit(X_train, y_train)
        test_pred = model.predict(X_test)
        model_signature = infer_signature(X_test, test_pred)

        report = classification_report(y_test, test_pred, output_dict=True)
        weighted = report['weighted avg']

        mlflow.log_param("model", model_name)

        # Metric name -> value, logged in this order. Apart from accuracy,
        # the values are the support-weighted averages from the report.
        scores = {
            "accuracy": report['accuracy'],
            "recall": weighted['recall'],
            "precision": weighted['precision'],
            "f1-score": weighted['f1-score'],
        }
        for metric_name, metric_value in scores.items():
            mlflow.log_metric(metric_name, metric_value)

        mlflow.sklearn.log_model(model, model_name, signature=model_signature)

In [21]:
# Train and log each candidate pipeline as its own MLflow run, in this order.
train_n_log_model(model=pipe1, model_name="DecisionTreeClassifier")
train_n_log_model(model=pipe2, model_name="KNeighborsClassifier")
train_n_log_model(model=pipe3, model_name="RandomForestClassifier")
train_n_log_model(model=pipe4, model_name="SVC")

2024/10/05 15:46:05 INFO mlflow.tracking.fluent: Experiment with name 'Diabetes Classification' does not exist. Creating a new experiment.
