# MLFLOW - Colomé

In [18]:
from sklearn.model_selection import train_test_split
import mlflow
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import mlflow.sklearn
from mlflow import log_metric, log_param
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import dagshub
from sklearn.linear_model import LogisticRegression

In [10]:
data = pd.read_csv("../data/cleaned_tickets.csv")

# Definir X e y
X_raw = data['complaint_what_happened']  # Característica textual
y = data['ticket_classification']       # Variable objetivo

# Vectorizar los textos (Transformación de texto a números)
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)  # Puedes ajustar los parámetros
X = vectorizer.fit_transform(X_raw).toarray()

In [11]:
dagshub.init(repo_owner='zapatacc', repo_name='final-exam-pcd2024-autumn', mlflow=True)

In [14]:
mlflow.set_tracking_uri("https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow") 
mlflow.set_experiment("colome-experiment")

2024/11/20 17:31:25 INFO mlflow.tracking.fluent: Experiment with name 'colome-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/ff62fd7bba484e129f79990a7cada3e8', creation_time=1732145485355, experiment_id='4', last_update_time=1732145485355, lifecycle_stage='active', name='colome-experiment', tags={}>

Random forest log

In [None]:
# Datos de ejemplo
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Configuración del experimento
mlflow.set_experiment("colome-random-forest")

with mlflow.start_run(run_name="RandomForest-experiment"):
    # Hiperparámetros
    params = {
        "n_estimators": 100,
        "max_depth": 10,
        "min_samples_split": 2,
        "random_state": 42
    }

    # Registrar parámetros en MLflow
    for param, value in params.items():
        log_param(param, value)

    # Entrenamiento del modelo
    model = RandomForestClassifier(**params)
    model.fit(X_train, y_train)

    # Predicción
    predictions = model.predict(X_test)

    # Evaluación
    accuracy = accuracy_score(y_test, predictions)
    log_metric("accuracy", accuracy)

    # Registrar el modelo
    mlflow.sklearn.log_model(model, "colome-random-forest-model")

    print(f"Modelo registrado con precisión: {accuracy}")



Successfully registered model 'colome-random-forest'.
2024/11/20 17:34:38 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: colome-random-forest, version 1
Created version '1' of model 'colome-random-forest'.


Modelo registrado con precisión: 0.48589506986554176
🏃 View run RandomForest-experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/5/runs/a54bf9a3dfad4f4993880acd41c914ba
🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/5


In [None]:
mlflow.end_run()

<!-- logistic regression log -->

In [None]:
mlflow.set_experiment("colome-logistic-regression")

with mlflow.start_run(run_name="LogisticRegression-experiment"):
    # Hiperparámetros
    params = {
        "C": 1.0,  # Regularización
        "solver": "liblinear"  # Solver adecuado para datasets pequeños
    }

    # Registrar parámetros en MLflow
    for param, value in params.items():
        log_param(param, value)

    # Crear el modelo de Regresión Logística
    model = LogisticRegression(**params)
    model.fit(X_train, y_train)

    # Predicción
    predictions = model.predict(X_test)

    # Evaluación
    accuracy = accuracy_score(y_test, predictions)
    log_metric("accuracy", accuracy)

    # Registrar el modelo
    mlflow.sklearn.log_model(model, "logistic-regression-model")

    print(f"Modelo registrado con precisión: {accuracy}")


2024/11/20 17:42:21 INFO mlflow.tracking.fluent: Experiment with name 'colome-logistic-regression' does not exist. Creating a new experiment.
Successfully registered model 'colome-logistic-regression'.
2024/11/20 17:42:53 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: colome-logistic-regression, version 1
Created version '1' of model 'colome-logistic-regression'.


Modelo registrado con precisión: 0.5713155813340364
🏃 View run LogisticRegression-experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/7/runs/3a84449d69ea4bd788525e1e64fbdc5f
🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/7


In [29]:
mlflow.end_run()

🏃 View run zealous-auk-384 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/7/runs/57aef99af7f342a4bc8e7db4bb94821f
🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/7


Hyperparameter tuning

Random forest

In [30]:
from sklearn.model_selection import GridSearchCV

mlflow.set_experiment("colome-random-forest")  # Configurar el experimento

with mlflow.start_run(run_name="coolme-RandomForest-Hyperparameter-Tuning"):  # Crear una nueva ejecución
    # Definir los parámetros para GridSearch
    param_grid = {
        "n_estimators": [50, 100, 200],
        "max_depth": [5, 10, 20]
    }

    # Crear el modelo y realizar la búsqueda de hiperparámetros
    grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, n_jobs=-1, verbose=1)
    grid_search.fit(X_train, y_train)

    # Obtener el mejor modelo y los parámetros óptimos
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Loggear los mejores parámetros
    print(f"Mejores parámetros: {best_params}")
    for param, value in best_params.items():
        mlflow.log_param(param, value)

    # Evaluar el mejor modelo
    accuracy = accuracy_score(y_test, best_model.predict(X_test))
    mlflow.log_metric("accuracy", accuracy)

    # Registrar el modelo en MLflow
    mlflow.sklearn.log_model(best_model, "colome-hypeparameter-random-forest")

    print(f"Modelo de Random Forest registrado con precisión: {accuracy}")



Fitting 3 folds for each of 9 candidates, totalling 27 fits




Mejores parámetros: {'max_depth': 20, 'n_estimators': 100}




Modelo de Random Forest registrado con precisión: 0.533350909570261
🏃 View run coolme-RandomForest-Hyperparameter-Tuning at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/5/runs/bb57ddfb7f3b49588002fcce72361bb6
🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/5


In [31]:
mlflow.end_run()

Logistic Regression

In [32]:
mlflow.set_experiment("colome-logistic-regression")  # Configurar el experimento

with mlflow.start_run(run_name="colome-LogisticRegression-Hyperparameter-Tuning"):  # Crear una nueva ejecución
    # Definir los parámetros de GridSearch para la Regresión Logística
    param_grid_lr = {
        'C': [0.01, 0.1, 1, 10],  # Parámetros de regularización
        'solver': ['liblinear', 'saga']  # Solvers disponibles
    }

    # Crear el modelo de Regresión Logística
    logreg = LogisticRegression()

    # Realizar la búsqueda de hiperparámetros con validación cruzada
    grid_search_lr = GridSearchCV(logreg, param_grid_lr, cv=3, n_jobs=-1, verbose=1)
    grid_search_lr.fit(X_train, y_train)

    # Obtener el mejor modelo y los parámetros
    best_lr_model = grid_search_lr.best_estimator_
    best_lr_params = grid_search_lr.best_params_

    # Loggear los mejores parámetros
    print(f"Mejores parámetros: {best_lr_params}")
    for param, value in best_lr_params.items():
        mlflow.log_param(param, value)

    # Evaluar el mejor modelo
    predictions_lr = best_lr_model.predict(X_test)
    accuracy_lr = accuracy_score(y_test, predictions_lr)

    # Loggear la métrica de precisión
    mlflow.log_metric("accuracy", accuracy_lr)

    # Registrar el modelo en MLflow
    mlflow.sklearn.log_model(best_lr_model, "colome-hyperparameter-logistic-regression")

    print(f"Modelo de Regresión Logística registrado con precisión: {accuracy_lr}")


Fitting 3 folds for each of 8 candidates, totalling 24 fits




Mejores parámetros: {'C': 1, 'solver': 'saga'}




Modelo de Regresión Logística registrado con precisión: 0.5810703928288954
🏃 View run colome-LogisticRegression-Hyperparameter-Tuning at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/7/runs/2c0776034a8740dd9d281feaa6c7b2a1
🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/7


In [33]:
mlflow.end_run()

Champion y Challenger

In [34]:
from mlflow.tracking import MlflowClient

client = MlflowClient()

# Nombre de los experimentos
experimento_logistic = "colome-logistic-regression"
experimento_rf = "colome-random-forest"

# Obtener ID de los experimentos
experiment_id_logistic = client.get_experiment_by_name(experimento_logistic).experiment_id
experiment_id_rf = client.get_experiment_by_name(experimento_rf).experiment_id

# Obtener las ejecuciones de los experimentos
runs_logistic = client.search_runs(
    experiment_ids=[experiment_id_logistic],
    filter_string="",
    order_by=["metrics.accuracy DESC"],  # Ordenar por mayor accuracy
    max_results=1
)
runs_rf = client.search_runs(
    experiment_ids=[experiment_id_rf],
    filter_string="",
    order_by=["metrics.accuracy DESC"],  # Ordenar por mayor accuracy
    max_results=1
)

# Extraer el mejor modelo y sus métricas
best_logistic_run = runs_logistic[0]
best_rf_run = runs_rf[0]

accuracy_logistic = best_logistic_run.data.metrics["accuracy"]
accuracy_rf = best_rf_run.data.metrics["accuracy"]

logistic_model_uri = f"runs:/{best_logistic_run.info.run_id}/logistic-regression-model"
rf_model_uri = f"runs:/{best_rf_run.info.run_id}/random-forest-model"

Registrar modelo

In [36]:
# Registrar el modelo Logistic Regression
logistic_model_name = "COLOME-logistic-regression"
rf_model_name = "COLOME-random-forest"

client.create_registered_model(logistic_model_name)  # Crear registro si no existe
logistic_model_version = client.create_model_version(
    name=logistic_model_name,
    source=logistic_model_uri,
    run_id=best_logistic_run.info.run_id
)

# Registrar el modelo Random Forest
client.create_registered_model(rf_model_name)  # Crear registro si no existe
rf_model_version = client.create_model_version(
    name=rf_model_name,
    source=rf_model_uri,
    run_id=best_rf_run.info.run_id
)


2024/11/20 21:51:58 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: COLOME-logistic-regression, version 1
2024/11/20 21:51:58 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: COLOME-random-forest, version 1


ALIASES

In [37]:
if accuracy_logistic > accuracy_rf:
    # Logistic Regression es el Champion
    client.transition_model_version_stage(
        name=logistic_model_name,
        version=logistic_model_version.version,
        stage="Production"
    )
    client.transition_model_version_stage(
        name=rf_model_name,
        version=rf_model_version.version,
        stage="Staging"
    )
    print("Champion: Logistic Regression")
    print("Challenger: Random Forest")
else:
    # Random Forest es el Champion
    client.transition_model_version_stage(
        name=rf_model_name,
        version=rf_model_version.version,
        stage="Production"
    )
    client.transition_model_version_stage(
        name=logistic_model_name,
        version=logistic_model_version.version,
        stage="Staging"
    )
    print("Champion: Random Forest")
    print("Challenger: Logistic Regression")


  client.transition_model_version_stage(
  client.transition_model_version_stage(


Champion: Logistic Regression
Challenger: Random Forest
