In [1]:
import pandas as pd
import mlflow
import os

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from scripts.hpo import read_from_s3


# --- Dataset and split settings ---------------------------------------------
SEED = 0
TARGET = "quality"
FEATURES = [
    "volatile acidity",
    "citric acid",
    "sulphates",
    "alcohol",
]
FILENAME = "winequality-red"

# --- Storage settings, all taken from the environment -----------------------
# os.getenv yields None for any variable that is not set.
S3_BUCKET_NAME = os.getenv("S3_BUCKET_NAME")
S3_ENDPOINT = os.getenv("MLFLOW_S3_ENDPOINT_URL")
MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY")
MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY")

# --- MLflow tracking --------------------------------------------------------
# The tracking URI also comes from the environment (MLFLOW_SITE_URL).
mlflow.set_tracking_uri(os.getenv("MLFLOW_SITE_URL"))
mlflow.set_experiment("red-wine-quality-prediction")

2023/07/28 19:19:56 INFO mlflow.tracking.fluent: Experiment with name 'red-wine-quality-prediction-test' does not exist. Creating a new experiment.


In [2]:
# Fetch the dataset through the project helper; the result is indexed by
# column name below, so it is presumably a pandas DataFrame (verify in
# scripts.hpo).
df = read_from_s3(bucket_name=S3_BUCKET_NAME, filename=FILENAME)

# Hold out a test split. No test_size is given, so scikit-learn's default
# proportion applies; SEED fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    df[FEATURES],
    df[TARGET],
    random_state=SEED,
)

In [3]:
def run_pipeline(pipeline):
    """Fit ``pipeline`` on the training split and log its test accuracy.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` and records the score with ``mlflow.log_metric`` under the
    key ``"accuracy"``.

    Args:
        pipeline: Object exposing ``fit`` and ``predict``; ``fit`` is called
            on the object that is passed in.

    Returns:
        None.
    """
    pipeline.fit(X_train, y_train)

    predictions = pipeline.predict(X_test)
    test_accuracy = accuracy_score(y_test, predictions)

    mlflow.log_metric("accuracy", test_accuracy)

In [4]:
def run_experiment(pipeline, run_name=None):
    """Train and evaluate ``pipeline`` inside its own MLflow run.

    Args:
        pipeline: Passed unchanged to ``run_pipeline``.
        run_name: Optional label forwarded to ``mlflow.start_run``. The
            default ``None`` leaves run naming to MLflow, matching the
            previous behaviour of this function.

    Returns:
        None.
    """
    # Enable autologging before the run is opened, so that a failure here
    # cannot leave an empty run behind.
    mlflow.autolog()

    # The run handle is never used, so it is not bound to a name.
    with mlflow.start_run(run_name=run_name):
        run_pipeline(pipeline)

In [5]:
# Linear SVM on standardised features. The estimator is seeded with SEED so
# that its internal random number generation is reproducible across re-runs
# of this notebook.
linear_svc_pipeline = make_pipeline(
    StandardScaler(),
    LinearSVC(random_state=SEED)
)

run_experiment(linear_svc_pipeline)

2023/07/28 19:20:22 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
  _warn_prf(average, modifier, msg_start, len(result))


In [6]:
# SVC with its default hyperparameters, fed standardised features.
svc_pipeline = make_pipeline(StandardScaler(), SVC())

# Fit, evaluate and log this pipeline as one MLflow run.
run_experiment(svc_pipeline)

2023/07/28 19:20:25 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
# k-nearest-neighbours classifier with its default hyperparameters, fed
# standardised features.
knn_pipeline = make_pipeline(StandardScaler(), KNeighborsClassifier())

# Fit, evaluate and log this pipeline as one MLflow run.
run_experiment(knn_pipeline)

2023/07/28 19:20:26 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
# Random forest on standardised features. The forest is seeded with SEED;
# left unseeded, the fitted model and the logged accuracy would differ on
# every re-run of this notebook.
rfc_pipeline = make_pipeline(
    StandardScaler(),
    RandomForestClassifier(random_state=SEED)
)

run_experiment(rfc_pipeline)

2023/07/28 19:20:28 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
