In [None]:
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [None]:
# Read the diabetes-risk table from the CSV in the data folder
df = pd.read_csv("data/diabetes_risk_dataset.csv")

# The "at_risk_diabetes" column is the label (y); every other column is
# kept as a model input (X). df itself is left untouched.
y = df["at_risk_diabetes"]
X = df.drop(columns="at_risk_diabetes")

In [None]:
# Columns to expand into one-hot indicator columns.
categorical_features = ["physical_activity_level"]
# handle_unknown="ignore": a category not seen during fit does not raise
# at transform time.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

# Encode the categorical column(s); remainder="passthrough" forwards all
# other columns unchanged instead of dropping them.
preprocessor = ColumnTransformer(
    [("cat", categorical_transformer, categorical_features)],
    remainder="passthrough",
)

# Full model: preprocessing followed by logistic regression. The step names
# ("preprocessor", "classifier") are what hyperparameter keys must prefix.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000)),
])

In [None]:
# Hold out 20% of the rows as a test set.
# stratify=y keeps the class proportions of the target the same in the
# training and test partitions, so the metrics computed on the test set are
# not skewed by an uneven random draw of either class.
# random_state pins the shuffle so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Select the MLflow experiment that the run below is recorded under
mlflow.set_experiment("Diabetes Risk Prediction")

# Search space for the pipeline's "classifier" step. Only C has more than
# one candidate; penalty and solver are listed with a single value each.
param_grid = dict(
    classifier__C=[0.01, 0.1, 1, 10],
    classifier__penalty=["l2"],
    classifier__solver=["lbfgs"],
)

# Grid search over the whole pipeline: 3-fold cross-validation, ranked by F1
grid = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring="f1", cv=3)

# Everything logged inside this context is attached to a single MLflow run
with mlflow.start_run(run_name="LogReg_GridSearch"):
    # Fit on the training split, then predict the held-out test split
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)

    # Test-set scores for the selected configuration
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Record the winning hyperparameters and both test metrics
    mlflow.log_params(grid.best_params_)
    mlflow.log_metrics({"accuracy": acc, "f1_score": f1})

    # Store the fitted best pipeline as a run artifact named "model"
    mlflow.sklearn.log_model(grid.best_estimator_, "model")

print("Best Parameters:", grid.best_params_)