Importamos las librerias necesarias

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import mlflow
import mlflow.sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
import dagshub
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from hyperopt.pyll.base import scope
import pickle
import pathlib
from mlflow.tracking import MlflowClient

In [2]:
df = pd.read_csv("../Data/Clean_data/transformed_data.csv")

In [4]:
X = df['complaint_what_happened']  # Cambia 'texto_queja' por el nombre real de tu columna
y = df['ticket_classification']  # Cambia 'categoria_queja' por el nombre real de tu columna

# Dividir los datos en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [8]:
dagshub.init(url="https://dagshub.com/zapatacc/final-exam-pcd2024-autumn", mlflow=True)

MLFLOW_TRACKING_URI = mlflow.get_tracking_uri()

print(MLFLOW_TRACKING_URI)


mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(experiment_name="arturo-torres-experiment")

https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow


2024/11/20 19:43:58 INFO mlflow.tracking.fluent: Experiment with name 'arturo-torres-experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/6be75566f49a45ea9188daef3c3fa463', creation_time=1732153439189, experiment_id='11', last_update_time=1732153439189, lifecycle_stage='active', name='arturo-torres-experiment', tags={}>

In [10]:
def objective_lr(params):
    with mlflow.start_run(nested=True):
        mlflow.set_tag("model_family", "LogisticRegression")
        mlflow.log_params(params)
        
        model = LogisticRegression(**params, random_state=42)
        model.fit(X_train, y_train)
        
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        
        mlflow.log_metric("accuracy", accuracy)
        mlflow.sklearn.log_model(model, "model-lr")
    
    return {'loss': accuracy, 'status': STATUS_OK}

In [11]:
search_space_lr = {
    'C': hp.loguniform('C', -4, 2),
    'solver': hp.choice('solver', ['liblinear', 'lbfgs'])
}

In [13]:
with mlflow.start_run(run_name="LogisticRegression Hyper-parameter Optimization"):
    best_params_lr = fmin(
        fn=objective_lr,
        space=search_space_lr,
        algo=tpe.suggest,
        max_evals=10,
        trials=Trials()
    )
    
    # Convertir parámetros al formato adecuado
    best_params_lr['solver'] = ['liblinear', 'lbfgs'][best_params_lr['solver']]
    mlflow.log_params(best_params_lr)

    # Entrenar modelo final de Regresión Logística
    best_model_lr = LogisticRegression(**best_params_lr, random_state=42)
    best_model_lr.fit(X_train, y_train)
    
    y_pred_lr = best_model_lr.predict(X_test)
    accuracy_lr = accuracy_score(y_test, y_pred_lr)
    mlflow.log_metric("accuracy", accuracy_lr)

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]




🏃 View run loud-yak-558 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/9eff031e3cb245bc8422c98741d6228f

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 10%|█         | 1/10 [00:09<01:23,  9.31s/trial, best loss: 0.5690698954320308]




🏃 View run rare-panda-324 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/f227ca8b7c73406888004617d43322c0

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 20%|██        | 2/10 [00:17<01:09,  8.68s/trial, best loss: 0.5013758943313154]




🏃 View run masked-stork-15 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/ef95ab328af9477db379d66d8992b654

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 30%|███       | 3/10 [00:30<01:14, 10.66s/trial, best loss: 0.5013758943313154]




🏃 View run learned-fly-333 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/407b8d6b10f04c53a5c968528cdaefd1

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 40%|████      | 4/10 [01:08<02:09, 21.59s/trial, best loss: 0.46642817831590533]




🏃 View run youthful-wolf-487 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/0db8bc41f75c412f9b267683ba8c1f2b

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 50%|█████     | 5/10 [01:17<01:23, 16.74s/trial, best loss: 0.46642817831590533]




🏃 View run puzzled-fox-329 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/6124726550054b6fbad61fb27a6196d1

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 60%|██████    | 6/10 [01:30<01:01, 15.47s/trial, best loss: 0.4488167308750688] 




🏃 View run shivering-stork-156 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/57c923252dec4711ad036e503cc3cc64

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 70%|███████   | 7/10 [02:03<01:03, 21.33s/trial, best loss: 0.4488167308750688]




🏃 View run worried-stag-67 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/e1a3ffa5299240af9eb6442d8b9c5c4f

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 80%|████████  | 8/10 [02:12<00:35, 17.56s/trial, best loss: 0.4488167308750688]




🏃 View run unleashed-snail-714 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/3fc3781fa08b40b68f4900d3431c404b

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 90%|█████████ | 9/10 [02:26<00:16, 16.24s/trial, best loss: 0.4488167308750688]




🏃 View run useful-crab-885 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/0bc16ecbae7f4a3bb0295dbc705cfd0a

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

100%|██████████| 10/10 [02:40<00:00, 16.07s/trial, best loss: 0.4488167308750688]
🏃 View run LogisticRegression Hyper-parameter Optimization at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/88e11f5f431d40ad9ed61bc316f16565
🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11


In [14]:
def objective_rf(params):
    with mlflow.start_run(nested=True):
        mlflow.set_tag("model_family", "RandomForest")
        mlflow.log_params(params)
        
        model = RandomForestClassifier(**params, random_state=42)
        model.fit(X_train, y_train)
        
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        
        mlflow.log_metric("accuracy", accuracy)
        mlflow.sklearn.log_model(model, artifact_path="model-rf")
    
    return {'loss': -accuracy, 'status': STATUS_OK}

In [15]:
search_space_rf = {
    'n_estimators': scope.int(hp.quniform('n_estimators', 100, 500, 1)),
    'max_depth': scope.int(hp.quniform('max_depth', 5, 50, 1)),
    'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
    'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 4, 1)),
    'bootstrap': hp.choice('bootstrap', [True, False])
}


In [17]:
with mlflow.start_run(run_name="RandomForest Hyper-parameter Optimization"):
    best_params_rf = fmin(
        fn=objective_rf,
        space=search_space_rf,
        algo=tpe.suggest,
        max_evals=10,
        trials=Trials()
    )
    
    # Convertir parámetros al formato adecuado
    best_params_rf['n_estimators'] = int(best_params_rf['n_estimators'])
    best_params_rf['max_depth'] = int(best_params_rf['max_depth'])
    best_params_rf['min_samples_split'] = int(best_params_rf['min_samples_split'])
    best_params_rf['min_samples_leaf'] = int(best_params_rf['min_samples_leaf'])
    best_params_rf['bootstrap'] = bool(best_params_rf['bootstrap'])
    mlflow.log_params(best_params_rf)

    # Entrenar modelo final de Random Forest
    best_model_rf = RandomForestClassifier(**best_params_rf, random_state=42)
    best_model_rf.fit(X_train, y_train)
    
    y_pred_rf = best_model_rf.predict(X_test)
    accuracy_rf = accuracy_score(y_test, y_pred_rf)
    mlflow.log_metric("accuracy", accuracy_rf)


  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]




🏃 View run handsome-toad-429 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/33acc0687f5845baa915784a0cf91c5b

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 10%|█         | 1/10 [00:42<06:26, 42.95s/trial, best loss: -0.4678040726472207]




🏃 View run unruly-koi-641 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/64c7822e04ce4c039fce5b77c7feb9c7

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 20%|██        | 2/10 [02:38<11:23, 85.45s/trial, best loss: -0.53852504127683]  




🏃 View run marvelous-bird-955 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/570cc35387184ed990240e8dcdb3629b

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 30%|███       | 3/10 [03:24<07:52, 67.50s/trial, best loss: -0.53852504127683]




🏃 View run resilient-donkey-613 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/ca821226bfe24d988874e4625fd528c0

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 40%|████      | 4/10 [05:30<09:05, 90.85s/trial, best loss: -0.5566868464501926]




🏃 View run nimble-bee-627 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/0f5a5bc38f8e4c268a760818315176c2

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 50%|█████     | 5/10 [07:14<07:56, 95.37s/trial, best loss: -0.5594386351128233]




🏃 View run angry-hound-989 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/eb8d5784793a4282882445febaec6eb7

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 60%|██████    | 6/10 [08:29<05:54, 88.52s/trial, best loss: -0.5594386351128233]




🏃 View run popular-croc-712 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/57275bca1e2645dab90a48cdfdd91459

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 70%|███████   | 7/10 [09:59<04:26, 88.97s/trial, best loss: -0.5594386351128233]




🏃 View run useful-stag-309 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/a3c51d1c945a4b848afbab76f2bc15c4

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 80%|████████  | 8/10 [10:51<02:34, 77.35s/trial, best loss: -0.5594386351128233]




🏃 View run puzzled-sow-858 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/dcfb54b139e84380b70c74251ad34fc6

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

 90%|█████████ | 9/10 [11:33<01:06, 66.27s/trial, best loss: -0.5594386351128233]




🏃 View run fearless-shark-897 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/7c79233bf5db43729d6021a4d272afdb

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11

100%|██████████| 10/10 [13:06<00:00, 78.61s/trial, best loss: -0.5594386351128233]
🏃 View run RandomForest Hyper-parameter Optimization at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/33544082c3584abaabd447779cbb2db2
🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11


No me gustaron los resultados del grid search así que probare la regresión logistica sin gridsearch

In [20]:
with mlflow.start_run(run_name='LogisticRegression_NoTunning'):
    mlflow.set_tag("model_family", "LogisticRegression")
    model = LogisticRegression(max_iter=2000)
    model.fit(X_train, y_train)
        
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
        
    mlflow.log_metric("accuracy", accuracy)
    
    
    mlflow.sklearn.log_model(model, 'model-lr-wt')
    
    print(f"Accuracy: {accuracy}")



Accuracy: 0.596587782058338
🏃 View run LogisticRegression_NoTunning at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11/runs/c9db5325e8e64eeda2831f7975881a8d
🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/11


Ya con todos los experimentos hechos, pasamos a la fase en la que elegimos los 2 mejores experimentos para que se conviertan en los modelos champion y challenger

In [23]:
from mlflow.tracking import MlflowClient


client = MlflowClient()

# Declaramos el experimento en el que estamos trabajando
experiment_name = "arturo-torres-experiment"


experiment = client.get_experiment_by_name(experiment_name)

# Buscamos las dos mejores ejecuciones en base al accuracy
top_runs = mlflow.search_runs(
    experiment_ids=[experiment.experiment_id],
    order_by=["metrics.accuracy DESC"],  # Cambia a ASC si buscas minimizar
    max_results=2  # Recuperar las dos mejores
)

# Obtenemos los IDs de las mejores ejecuciones
champion_run = top_runs.iloc[0]
challenger_run = top_runs.iloc[1]

# Obtenemos los IDs de las ejecuciones
champion_run_id = champion_run.run_id
challenger_run_id = challenger_run.run_id

champion_model_uri = f"runs:/{champion_run_id}/model"
challenger_model_uri = f"runs:/{challenger_run_id}/model"

# Declaramos el nombre del modelo registrado
model_name = "arturo-model"

# Registramos el Champion
champion_model_version = mlflow.register_model(champion_model_uri, model_name)
client.set_registered_model_alias(model_name, "champion", champion_model_version.version)

# Registramos el Challenger
challenger_model_version = mlflow.register_model(challenger_model_uri, model_name)
client.set_registered_model_alias(model_name, "challenger", challenger_model_version.version)

# Imprimimos resultados
print(f"Champion Model: Run ID {champion_run_id}, Version {champion_model_version.version}")
print(f"Challenger Model: Run ID {challenger_run_id}, Version {challenger_model_version.version}")

Successfully registered model 'arturo-model'.
2024/11/21 11:30:15 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: arturo-model, version 1
Created version '1' of model 'arturo-model'.
Registered model 'arturo-model' already exists. Creating a new version of this model...
2024/11/21 11:30:16 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: arturo-model, version 2


Champion Model: Run ID e4f42c04de4f44d2ba5f149739a943e8, Version 1
Challenger Model: Run ID 0bc16ecbae7f4a3bb0295dbc705cfd0a, Version 2


Created version '2' of model 'arturo-model'.
