# MLFLOW - Colomé

In [1]:
from sklearn.model_selection import train_test_split
import mlflow
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import mlflow.sklearn
from mlflow import log_metric, log_param
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import dagshub
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the cleaned complaint tickets.
data = pd.read_csv("../data/cleaned_tickets.csv")

# Feature (free-text complaint) and target (ticket class).
X_raw = data["complaint_what_happened"]
y = data["ticket_classification"]

# Turn the text into TF-IDF features: English stop words removed and the
# vocabulary capped at 1000 terms (both are tunable).
# NOTE(review): the vectorizer is fitted on ALL rows before the train/test
# split done later in the notebook, so vocabulary and IDF weights also see
# the test rows — consider fitting on the training portion only.
vectorizer = TfidfVectorizer(stop_words="english", max_features=1000)
tfidf_matrix = vectorizer.fit_transform(X_raw)
X = tfidf_matrix.toarray()  # dense array, as the downstream cells expect

In [3]:
# Connect this session to the DagsHub repository, with MLflow integration enabled.
dagshub.init(
    repo_owner="zapatacc",
    repo_name="final-exam-pcd2024-autumn",
    mlflow=True,
)

In [4]:
# Send every run to the DagsHub-hosted MLflow server and select the experiment
# shared by all runs in this notebook.
mlflow.set_tracking_uri(uri="https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow")
mlflow.set_experiment(experiment_name="colome-experiment")

<Experiment: artifact_location='mlflow-artifacts:/ff62fd7bba484e129f79990a7cada3e8', creation_time=1732145485355, experiment_id='4', last_update_time=1732145485355, lifecycle_stage='active', name='colome-experiment', tags={}>

Random forest log

In [5]:
# Hold out 20% of the vectorized tickets for evaluation (fixed seed so the
# split is the same on every execution).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Make sure the run is stored under the notebook's experiment.
mlflow.set_experiment("colome-experiment")

with mlflow.start_run(run_name="RandomForest-colome"):
    # Baseline hyperparameters; each one is recorded on the run.
    params = dict(
        n_estimators=100,
        max_depth=10,
        min_samples_split=2,
        random_state=42,
    )
    for param, value in params.items():
        log_param(param, value)

    # Train the baseline forest on the training split.
    model = RandomForestClassifier(**params)
    model.fit(X_train, y_train)

    # Score on the held-out split and record the accuracy.
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    log_metric("accuracy", accuracy)

    # Store the fitted estimator as a run artifact.
    mlflow.sklearn.log_model(model, "colome-random-forest-model")

    print(f"Modelo registrado con precisión: {accuracy}")





Modelo registrado con precisión: 0.48589506986554176
🏃 View run RandomForest-colome at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/4/runs/8565de4d4a6d4ff4a1fbba3a6cb2ad61
🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/4


In [6]:
# Safety net: close any MLflow run that is still active. The RandomForest run
# above is opened in a `with` block, which is expected to end it on exit.
mlflow.end_run()

Logistic regression log

In [7]:
# Make sure the run is stored under the notebook's experiment.
mlflow.set_experiment("colome-experiment")

with mlflow.start_run(run_name="LogisticRegression-colome"):
    # Baseline hyperparameters: C is the inverse regularization strength.
    params = dict(C=1.0, solver="liblinear")

    # Record every hyperparameter on the run.
    for param, value in params.items():
        log_param(param, value)

    # Fit the baseline logistic regression on the training split.
    model = LogisticRegression(**params)
    model.fit(X_train, y_train)

    # Score on the held-out split and record the accuracy.
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    log_metric("accuracy", accuracy)

    # Store the fitted estimator as a run artifact.
    mlflow.sklearn.log_model(model, "logistic-regression-model")

    print(f"Modelo registrado con precisión: {accuracy}")




Modelo registrado con precisión: 0.5713155813340364
🏃 View run LogisticRegression-colome at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/4/runs/e80bc3b1a19847ee8eb722c3dc84e35c
🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/4


In [8]:
# Safety net: close any MLflow run that is still active. The LogisticRegression
# run above is opened in a `with` block, which is expected to end it on exit.
mlflow.end_run()

Hyperparameter tuning

Random forest

In [9]:
from sklearn.model_selection import GridSearchCV

# Make sure the run is stored under the notebook's experiment.
mlflow.set_experiment("colome-experiment")

with mlflow.start_run(run_name="RandomForest-colome-gridsearch"):
    # Search space: 3 x 3 = 9 candidate forests.
    param_grid = {
        "n_estimators": [50, 100, 200],
        "max_depth": [5, 10, 20],
    }

    # 3-fold cross-validated grid search over a seeded forest, using all cores.
    grid_search = GridSearchCV(
        RandomForestClassifier(random_state=42),
        param_grid,
        cv=3,
        n_jobs=-1,
        verbose=1,
    )
    grid_search.fit(X_train, y_train)

    # Winning configuration and the estimator fitted with it.
    best_params = grid_search.best_params_
    best_model = grid_search.best_estimator_

    # Record only the winning hyperparameters on the run.
    print(f"Mejores parámetros: {best_params}")
    for param, value in best_params.items():
        mlflow.log_param(param, value)

    # Score the winner on the held-out split and record the accuracy.
    accuracy = accuracy_score(y_test, best_model.predict(X_test))
    mlflow.log_metric("accuracy", accuracy)

    # Store the tuned estimator as a run artifact.
    mlflow.sklearn.log_model(best_model, "colome-gridsearch-random-forest")

    print(f"Modelo de Random Forest registrado con precisión: {accuracy}")



Fitting 3 folds for each of 9 candidates, totalling 27 fits




Mejores parámetros: {'max_depth': 20, 'n_estimators': 100}




Modelo de Random Forest registrado con precisión: 0.533350909570261
🏃 View run RandomForest-colome-gridsearch at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/4/runs/93c8f38a097a48ec96796cdec40b43d2
🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/4


In [10]:
# Safety net: close any MLflow run that is still active. The grid-search run
# above is opened in a `with` block, which is expected to end it on exit.
mlflow.end_run()

Logistic Regression

In [11]:
# Make sure the run is stored under the notebook's experiment.
mlflow.set_experiment("colome-experiment")

# Run name fixed ("gridearch" -> "gridsearch") so it matches the naming of the
# RandomForest grid-search run.
with mlflow.start_run(run_name="LogisticRegression-colome-gridsearch"):
    # Search space: 4 regularization strengths x 2 solvers = 8 candidates.
    param_grid_lr = {
        'C': [0.01, 0.1, 1, 10],         # inverse regularization strength
        'solver': ['liblinear', 'saga']  # solvers to compare
    }

    # Seed the estimator: 'saga' (and 'liblinear') shuffle the data internally,
    # so without a fixed random_state the selected model and its accuracy can
    # change from one execution to the next.
    logreg = LogisticRegression(random_state=42)

    # 3-fold cross-validated grid search using all available cores.
    grid_search_lr = GridSearchCV(logreg, param_grid_lr, cv=3, n_jobs=-1, verbose=1)
    grid_search_lr.fit(X_train, y_train)

    # Winning estimator and its hyperparameters.
    best_lr_model = grid_search_lr.best_estimator_
    best_lr_params = grid_search_lr.best_params_

    # Record only the winning hyperparameters on the run.
    print(f"Mejores parámetros: {best_lr_params}")
    for param, value in best_lr_params.items():
        mlflow.log_param(param, value)

    # Score the winner on the held-out split.
    predictions_lr = best_lr_model.predict(X_test)
    accuracy_lr = accuracy_score(y_test, predictions_lr)

    # Record the held-out accuracy.
    mlflow.log_metric("accuracy", accuracy_lr)

    # Store the tuned estimator as a run artifact.
    mlflow.sklearn.log_model(best_lr_model, "colome-gridsearch-logistic-regression")

    print(f"Modelo de Regresión Logística registrado con precisión: {accuracy_lr}")


Fitting 3 folds for each of 8 candidates, totalling 24 fits




Mejores parámetros: {'C': 1, 'solver': 'saga'}




Modelo de Regresión Logística registrado con precisión: 0.5810703928288954
🏃 View run LogisticRegression-colome-gridearch at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/4/runs/f1aec2099fdd465b9cdb10a91361727f
🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/4


In [12]:
# Safety net: close any MLflow run that is still active. The grid-search run
# above is opened in a `with` block, which is expected to end it on exit.
mlflow.end_run()

Champion y Challenger

In [13]:
from mlflow.tracking import MlflowClient

# Experiment whose runs are ranked (adjust if the experiment is named differently).
experiment_name = "colome-experiment"
client = MlflowClient()

# Resolve the experiment ID. get_experiment_by_name returns None for an unknown
# name, so fail with a clear message instead of an AttributeError on `.experiment_id`.
experiment = client.get_experiment_by_name(experiment_name)
if experiment is None:
    raise ValueError(
        f"Experiment '{experiment_name}' was not found on the tracking server."
    )
experiment_id = experiment.experiment_id

# Fetch the runs ordered from highest to lowest accuracy. The metric filter
# keeps only runs that actually logged an `accuracy` value, so interrupted or
# unrelated runs cannot break the ranking or be picked as candidates later.
runs = client.search_runs(
    experiment_ids=[experiment_id],
    filter_string="metrics.accuracy >= 0",
    order_by=["metrics.accuracy DESC"],
    max_results=10  # optional cap on the number of runs returned
)

# Show the ranking.
for run in runs:
    print(f"Run ID: {run.info.run_id}, Accuracy: {run.data.metrics['accuracy']}")


Run ID: f1aec2099fdd465b9cdb10a91361727f, Accuracy: 0.5810703928288954
Run ID: e80bc3b1a19847ee8eb722c3dc84e35c, Accuracy: 0.5713155813340364
Run ID: 93c8f38a097a48ec96796cdec40b43d2, Accuracy: 0.533350909570261
Run ID: 8565de4d4a6d4ff4a1fbba3a6cb2ad61, Accuracy: 0.48589506986554176


Registrar modelo

In [15]:
# Register the two best runs of the experiment in the Model Registry and tag
# them as Champion (best accuracy) and Challenger (second best).
model_registry_name = "colome-modelos"


def _logged_model_path(run_id):
    """Return the artifact sub-path under which `run_id` logged its model.

    Every training cell logs its model under a different artifact name, so the
    path cannot be hard-coded. It is discovered by looking for the top-level
    artifact directory that contains an `MLmodel` file.

    Raises:
        RuntimeError: if the run has no artifact directory holding an MLmodel file.
    """
    for entry in client.list_artifacts(run_id):
        if not entry.is_dir:
            continue
        children = client.list_artifacts(run_id, entry.path)
        if any(child.path.rsplit("/", 1)[-1] == "MLmodel" for child in children):
            return entry.path
    raise RuntimeError(f"No MLflow model found among the artifacts of run {run_id}.")


# Create the registered model only if the lookup fails with an MLflow error.
# A bare `except:` would also swallow KeyboardInterrupt and plain programming
# errors, hiding the real problem behind a misleading "created" message.
try:
    client.get_registered_model(model_registry_name)
    print(f"El registro de modelos '{model_registry_name}' ya existe.")
except mlflow.exceptions.MlflowException:
    client.create_registered_model(model_registry_name)
    print(f"Registro de modelos '{model_registry_name}' creado.")


# Assign Champion and Challenger (needs at least two ranked runs).
if len(runs) >= 2:
    # `runs` is ordered by accuracy, highest first.
    best_run = runs[0]
    second_best_run = runs[1]

    # Create one model version per run. The source must point at the artifact
    # path the model was really logged under; a fixed ".../model" suffix does
    # not exist for any of the runs produced by this notebook.
    best_model_version = client.create_model_version(
        name=model_registry_name,
        source=f"runs:/{best_run.info.run_id}/{_logged_model_path(best_run.info.run_id)}",
        run_id=best_run.info.run_id
    )

    second_best_model_version = client.create_model_version(
        name=model_registry_name,
        source=f"runs:/{second_best_run.info.run_id}/{_logged_model_path(second_best_run.info.run_id)}",
        run_id=second_best_run.info.run_id
    )

    # Champion: stage "Production" plus the "Champion" alias.
    # NOTE(review): stage transitions appear to be deprecated in recent MLflow
    # releases in favour of aliases — confirm before relying on stages.
    client.transition_model_version_stage(
        name=model_registry_name,
        version=best_model_version.version,
        stage="Production"
    )
    client.set_registered_model_alias(model_registry_name, "Champion", best_model_version.version)

    # Challenger: stage "Staging" plus the "Challenger" alias.
    client.transition_model_version_stage(
        name=model_registry_name,
        version=second_best_model_version.version,
        stage="Staging"
    )
    client.set_registered_model_alias(model_registry_name, "Challenger", second_best_model_version.version)

    print(f"Champion: Run ID {best_run.info.run_id}, Accuracy: {best_run.data.metrics['accuracy']}")
    print(f"Challenger: Run ID {second_best_run.info.run_id}, Accuracy: {second_best_run.data.metrics['accuracy']}")
else:
    print("No hay suficientes runs para asignar Champion y Challenger.")



2024/11/21 14:08:49 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: colome-modelos, version 1


Registro de modelos 'colome-modelos' creado.


2024/11/21 14:08:49 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: colome-modelos, version 2
  client.transition_model_version_stage(
  client.transition_model_version_stage(


Champion: Run ID f1aec2099fdd465b9cdb10a91361727f, Accuracy: 0.5810703928288954
Challenger: Run ID e80bc3b1a19847ee8eb722c3dc84e35c, Accuracy: 0.5713155813340364
