Importamos las librerias necesarias

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import mlflow
import mlflow.sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
import dagshub
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.pyll.base import scope
import pickle
import pathlib
from mlflow.tracking import MlflowClient

In [25]:
df = pd.read_csv("../Data/Clean_data/transformed_data.csv")

In [26]:
X = df['complaint_what_happened']  
y = df['ticket_classification']  

# Dividir los datos en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [28]:
dagshub.init(url="https://dagshub.com/zapatacc/final-exam-pcd2024-autumn", mlflow=True)

MLFLOW_TRACKING_URI = mlflow.get_tracking_uri()

print(MLFLOW_TRACKING_URI)


mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(experiment_name="arturo-torres-experiment")

https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow


<Experiment: artifact_location='mlflow-artifacts:/6be75566f49a45ea9188daef3c3fa463', creation_time=1732153439189, experiment_id='11', last_update_time=1732153439189, lifecycle_stage='active', name='arturo-torres-experiment', tags={}>

In [29]:
def objective_lr(params):
    with mlflow.start_run(nested=True):
        mlflow.set_tag("model_family", "LogisticRegression")
        mlflow.log_params(params)
        
        model = LogisticRegression(**params, random_state=42)
        model.fit(X_train, y_train)
        
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        
        mlflow.log_metric("accuracy", accuracy)
        mlflow.sklearn.log_model(model, "model-lr")
    
    return {'loss': accuracy, 'status': STATUS_OK}

In [30]:
search_space_lr = {
    'C': hp.loguniform('C', -4, 2),
    'solver': hp.choice('solver', ['liblinear', 'lbfgs'])
}

In [31]:
with mlflow.start_run(run_name="LogisticRegression Hyper-parameter Optimization"):
    best_params_lr = fmin(
        fn=objective_lr,
        space=search_space_lr,
        algo=tpe.suggest,
        max_evals=10,
        trials=Trials()
    )
    
    # Convertir parámetros al formato adecuado
    best_params_lr['solver'] = ['liblinear', 'lbfgs'][best_params_lr['solver']]
    mlflow.log_params(best_params_lr)

    # Entrenar modelo final de Regresión Logística
    best_model_lr = LogisticRegression(**best_params_lr, random_state=42)
    best_model_lr.fit(X_train, y_train)
    
    y_pred_lr = best_model_lr.predict(X_test)
    accuracy_lr = accuracy_score(y_test, y_pred_lr)
    mlflow.log_metric("accuracy", accuracy_lr)

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]




🏃 View run skittish-ant-968 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/9cc74add575c41f99cfc9ce8cc7d72e9

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 10%|█         | 1/10 [00:12<01:49, 12.22s/trial, best loss: 0.46752889378095763]




🏃 View run wistful-shrimp-814 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/6296bb8aa8424114876b37efb25d0442

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 20%|██        | 2/10 [00:35<02:30, 18.77s/trial, best loss: 0.46752889378095763]




🏃 View run amusing-grouse-232 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/18370b59afcd4674870ece856ec6fd0e

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 30%|███       | 3/10 [00:44<01:40, 14.43s/trial, best loss: 0.46752889378095763]




🏃 View run unique-asp-777 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/1ece69a5c75447ff912ae9797faf8a31

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 40%|████      | 4/10 [00:56<01:19, 13.28s/trial, best loss: 0.46752889378095763]




🏃 View run abrasive-foal-357 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/fb16b781361149aca5b2448cbc3868e6

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 50%|█████     | 5/10 [01:08<01:04, 12.83s/trial, best loss: 0.46752889378095763]




🏃 View run overjoyed-roo-107 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/556d77bb200244c2b784d2da892023d3

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 60%|██████    | 6/10 [01:21<00:51, 12.89s/trial, best loss: 0.46752889378095763]




🏃 View run sedate-wolf-318 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/85a60532fd4c4cb9b4eeafdba5b1ca39

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 70%|███████   | 7/10 [01:40<00:44, 14.83s/trial, best loss: 0.46752889378095763]




🏃 View run luxuriant-foal-78 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/d5d2164358414e169b99d6a500d1a3f0

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 80%|████████  | 8/10 [01:50<00:26, 13.48s/trial, best loss: 0.46752889378095763]




🏃 View run puzzled-ape-716 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/1c5e828fd18d4c309058740f426a2579

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 90%|█████████ | 9/10 [02:01<00:12, 12.51s/trial, best loss: 0.46752889378095763]




🏃 View run dazzling-stork-812 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/2fb8712e739246079678f3afaf85d1a7

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

100%|██████████| 10/10 [02:14<00:00, 13.42s/trial, best loss: 0.46752889378095763]
🏃 View run LogisticRegression Hyper-parameter Optimization at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/fd1a0a3a8b0649dfb1f299bc17c052c7
🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11


In [32]:
def objective_rf(params):
    with mlflow.start_run(nested=True):
        mlflow.set_tag("model_family", "RandomForest")
        mlflow.log_params(params)
        
        model = RandomForestClassifier(**params, random_state=42)
        model.fit(X_train, y_train)
        
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        
        mlflow.log_metric("accuracy", accuracy)
        mlflow.sklearn.log_model(model, artifact_path="model-rf")
    
    return {'loss': -accuracy, 'status': STATUS_OK}

In [33]:
search_space_rf = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 100, 500, 1)),
    'max_depth': scope.int(hp.quniform('max_depth', 5, 50, 1)),
    'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
    'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 4, 1)),
    'bootstrap': hp.choice('bootstrap', [True, False])
}


In [34]:
with mlflow.start_run(run_name="RandomForest Hyper-parameter Optimization"):
    best_params_rf = fmin(
        fn=objective_rf,
        space=search_space_rf,
        algo=tpe.suggest,
        max_evals=10,
        trials=Trials()
    )
    
    # Convertir parámetros al formato adecuado
    best_params_rf['n_estimators'] = int(best_params_rf['n_estimators'])
    best_params_rf['max_depth'] = int(best_params_rf['max_depth'])
    best_params_rf['min_samples_split'] = int(best_params_rf['min_samples_split'])
    best_params_rf['min_samples_leaf'] = int(best_params_rf['min_samples_leaf'])
    best_params_rf['bootstrap'] = bool(best_params_rf['bootstrap'])
    mlflow.log_params(best_params_rf)

    # Entrenar modelo final de Random Forest
    best_model_rf = RandomForestClassifier(**best_params_rf, random_state=42)
    best_model_rf.fit(X_train, y_train)
    
    y_pred_rf = best_model_rf.predict(X_test)
    accuracy_rf = accuracy_score(y_test, y_pred_rf)
    mlflow.log_metric("accuracy", accuracy_rf)


  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]




🏃 View run amazing-elk-916 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/b620a6b810154dfeb12692c59b020b00

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 10%|█         | 1/10 [04:15<38:18, 255.42s/trial, best loss: -0.5553109521188773]




🏃 View run tasteful-grouse-518 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/7b94977984fa42018d3fb60e3265d2d2

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 20%|██        | 2/10 [05:38<20:34, 154.26s/trial, best loss: -0.5553109521188773]




🏃 View run delightful-auk-44 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/d1087c80bd09478e8eb67f7d7acee99e

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 30%|███       | 3/10 [07:13<14:48, 126.99s/trial, best loss: -0.5553109521188773]




🏃 View run gregarious-hound-190 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/e25a484b82124cbd805e8e2476c91185

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 40%|████      | 4/10 [08:26<10:33, 105.63s/trial, best loss: -0.5553109521188773]




🏃 View run placid-perch-679 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/6d2c4c6f55b34f5a98ff9063636f50d4

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 50%|█████     | 5/10 [08:36<05:56, 71.22s/trial, best loss: -0.5553109521188773] 




🏃 View run youthful-ape-994 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/d10950bd8c97434c907c76eacc8ef3b3

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 60%|██████    | 6/10 [11:00<06:23, 95.82s/trial, best loss: -0.5575123830489819]




🏃 View run masked-wolf-360 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/a6e9fd8fe5f24c0d8832b973f6a81ff3

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 70%|███████   | 7/10 [12:07<04:19, 86.64s/trial, best loss: -0.5575123830489819]




🏃 View run painted-hawk-701 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/13bee2110d1d4fdbaec207d883900ffb

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 80%|████████  | 8/10 [12:30<02:12, 66.33s/trial, best loss: -0.5575123830489819]




🏃 View run zealous-grouse-845 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/cfa67592a9044d248d89ee19e8508c06

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 90%|█████████ | 9/10 [16:10<01:54, 114.32s/trial, best loss: -0.5588882773802972]




🏃 View run colorful-yak-385 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/4863e9b9c53f4291abba7404f30dcaf9

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

100%|██████████| 10/10 [16:42<00:00, 100.26s/trial, best loss: -0.5588882773802972]
🏃 View run RandomForest Hyper-parameter Optimization at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/6b149b2d302d448f99edd4834f3caf68
🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11


No me gustaron los resultados del grid search así que probare la regresión logistica sin gridsearch

In [35]:
with mlflow.start_run(run_name='LogisticRegression_NoTunning'):
    mlflow.set_tag("model_family", "LogisticRegression")
    model = LogisticRegression(max_iter=2000)
    model.fit(X_train, y_train)
        
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
        
    mlflow.log_metric("accuracy", accuracy)
    
    
    mlflow.sklearn.log_model(model, 'model-lr-wt')
    
    print(f"Accuracy: {accuracy}")



Accuracy: 0.596587782058338
🏃 View run LogisticRegression_NoTunning at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/4c23cde78c834f1d9f5232e53d11883e
🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11


Ya con todos los experimentos hechos, pasamos a la fase en la que elegimos los 2 mejores experimentos para que se conviertan en los modelos champion y challenger

In [36]:
from mlflow.tracking import MlflowClient


client = MlflowClient()

# Declaramos el experimento en el que estamos trabajando
experiment_name = "arturo-torres-experiment"


experiment = client.get_experiment_by_name(experiment_name)

# Buscamos las dos mejores ejecuciones en base al accuracy
top_runs = mlflow.search_runs(
    experiment_ids=[experiment.experiment_id],
    order_by=["metrics.accuracy DESC"],  # Cambia a ASC si buscas minimizar
    max_results=2  # Recuperar las dos mejores
)

# Obtenemos los IDs de las mejores ejecuciones
champion_run = top_runs.iloc[0]
challenger_run = top_runs.iloc[1]

# Obtenemos los IDs de las ejecuciones
champion_run_id = champion_run.run_id
challenger_run_id = challenger_run.run_id

champion_model_uri = f"runs:/{champion_run_id}/model"
challenger_model_uri = f"runs:/{challenger_run_id}/model"

# Declaramos el nombre del modelo registrado
model_name = "arturo-model"

# Registramos el Champion
champion_model_version = mlflow.register_model(champion_model_uri, model_name)
client.set_registered_model_alias(model_name, "champion", champion_model_version.version)

# Registramos el Challenger
challenger_model_version = mlflow.register_model(challenger_model_uri, model_name)
client.set_registered_model_alias(model_name, "challenger", challenger_model_version.version)

# Imprimimos resultados
print(f"Champion Model: Run ID {champion_run_id}, Version {champion_model_version.version}")
print(f"Challenger Model: Run ID {challenger_run_id}, Version {challenger_model_version.version}")

Successfully registered model 'arturo-model'.
2024/11/21 19:40:53 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: arturo-model, version 1
Created version '1' of model 'arturo-model'.
Registered model 'arturo-model' already exists. Creating a new version of this model...
2024/11/21 19:40:54 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: arturo-model, version 2
Created version '2' of model 'arturo-model'.


Champion Model: Run ID e4f42c04de4f44d2ba5f149739a943e8, Version 1
Challenger Model: Run ID 2fb8712e739246079678f3afaf85d1a7, Version 2
