In [1]:
import joblib
import mlflow
import mlflow.data
import pandas as pd
from mlflow.data.pandas_dataset import PandasDataset
from mlflow.models import infer_signature
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [2]:
# MLflow configuration: tracking server address and the experiment that runs
# started later in this notebook are recorded under.
TRACKING_URI = "http://localhost:5000"
EXPERIMENT_NAME = "Lazada Reviews Classification"

mlflow.set_tracking_uri(TRACKING_URI)
# Kept as the last expression so the cell still displays the Experiment repr.
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='mlflow-artifacts:/2', creation_time=1718698326454, experiment_id='2', last_update_time=1718698326454, lifecycle_stage='active', name='Lazada Reviews Classification', tags={}>

In [3]:
# Load the persisted train/test artifacts, one pair at a time.
# NOTE(review): joblib.load unpickles its input, so these files must come from
# a trusted source.

# Vectorized feature matrices (what the model is fitted on).
x_train_vec, x_test_vec = (
    joblib.load(pickle_path)
    for pickle_path in (
        "../data/processed/x_train_vec_1.pkl",
        "../data/processed/x_test_vec_1.pkl",
    )
)

# Pre-vectorization inputs; their index is reused further down to align the
# predictions with the targets.
x_train, x_test = (
    joblib.load(pickle_path)
    for pickle_path in (
        "../data/interim/x_train_1.pkl",
        "../data/interim/x_test_1.pkl",
    )
)

# Target labels.
y_train, y_test = (
    joblib.load(pickle_path)
    for pickle_path in (
        "../data/processed/y_train_1.pkl",
        "../data/processed/y_test_1.pkl",
    )
)

In [4]:
# Load the pickled vectorizer object from the models directory.
# NOTE(review): presumably the vectorizer that produced x_train_vec / x_test_vec — confirm.
vectorizer_path = "../models/vectorizer_1.pkl"
vectorizer = joblib.load(vectorizer_path)

In [5]:
# Baseline classifier: scikit-learn's default LogisticRegression settings,
# fitted on the vectorized training features.
logreg = LogisticRegression()
logreg.fit(X = x_train_vec, y = y_train)

In [6]:
# Predict on the training features and wrap the result in a Series that shares
# x_train's index, so it lines up row-for-row with x_train when concatenated.
train_predictions = logreg.predict(x_train_vec)
y_pred = pd.Series(data = train_predictions, index = x_train.index)

In [7]:
# Flatten the per-class rows of the classification report into
# "<metric>_<label>" keys (e.g. "precision_1") suitable for mlflow.log_metrics.
# NOTE(review): y_pred is computed from x_train_vec, so these are training-set
# scores, not held-out performance.
metrics = classification_report(y_train, y_pred, output_dict = True)

mlflow_metrics = {}
for label, label_scores in metrics.items():
    # Skip the aggregate rows: "accuracy" is a bare float, and the averaged
    # rows ("macro avg", "weighted avg", "micro avg", ...) end in " avg".
    # Iterating the report's own keys avoids assuming the labels are exactly
    # "1".."N" or that there are always three aggregate rows.
    if not isinstance(label_scores, dict) or label.endswith(" avg"):
        continue
    for name, value in label_scores.items():
        mlflow_metrics[name + "_" + label] = value

In [8]:
# Human-readable version of the same report (computed on the training split).
report_text = classification_report(y_train, y_pred)
print(report_text)

              precision    recall  f1-score   support

           1       0.78      0.72      0.75      1434
           2       0.97      0.08      0.16       438
           3       0.84      0.18      0.30       860
           4       0.86      0.15      0.25      2135
           5       0.84      0.99      0.91     16538

    accuracy                           0.84     21405
   macro avg       0.86      0.42      0.47     21405
weighted avg       0.84      0.84      0.79     21405



In [9]:
# Record one MLflow run: input datasets, model hyperparameters, per-class
# metrics, the vectorizer pickle and the fitted model (also registered in the
# model registry).
with mlflow.start_run(run_name = "Minimum Effort"):
    # y_pred is an unnamed Series, so after concat its column would be the
    # integer 0. Give it an explicit name instead of relying on that default.
    prediction_column = "predicted_rating"

    # Training frame: inputs, true rating and the model's prediction per row.
    dataset: PandasDataset = mlflow.data.from_pandas(
        pd.concat([x_train, y_train, y_pred.rename(prediction_column)], axis = 1),
        source = "s3://mlops-lazada/dataset_review.csv",
        targets = "rating",
        name = "lazada reviews",
        predictions = prediction_column
    )

    # Held-out frame for the "testing" context. Previously the training frame
    # was logged under both contexts. No predictions column: the notebook never
    # predicts on the test split.
    # NOTE(review): assumes y_test is named "rating" like y_train — confirm.
    test_dataset: PandasDataset = mlflow.data.from_pandas(
        pd.concat([x_test, y_test], axis = 1),
        source = "s3://mlops-lazada/dataset_review.csv",
        targets = "rating",
        name = "lazada reviews"
    )

    model_params = logreg.get_params()

    mlflow.log_params(model_params)
    # NOTE(review): mlflow_metrics is built from training-set predictions.
    mlflow.log_metrics(mlflow_metrics)
    mlflow.log_input(dataset, "training")
    mlflow.log_input(test_dataset, "testing")
    mlflow.log_artifact("../models/vectorizer_1.pkl", "vectorizer")

    mlflow.sklearn.log_model(
        sk_model = logreg,
        artifact_path = "models",
        serialization_format = mlflow.sklearn.SERIALIZATION_FORMAT_CLOUDPICKLE,
        registered_model_name = "Untouch Logistic Regression Model",
        # A handful of rows is enough for an input example; passing the whole
        # test matrix would store all of it alongside the model.
        input_example = x_test_vec[:5]
    )

    mlflow.set_tags(
        {
            "dataset config": "review contents"
        }
    )
    

  return _dataset_source_registry.resolve(
Registered model 'Untouch Logistic Regression Model' already exists. Creating a new version of this model...
2024/06/20 21:58:32 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Untouch Logistic Regression Model, version 3
Created version '3' of model 'Untouch Logistic Regression Model'.
