## Entrenamiento, validación, evaluación y selección del modelo usando mlflow

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
import dagshub
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [7]:
import mlflow
import mlflow.sklearn

In [3]:
# Ruta 
file_path = "../data/processed/tickets_cleaned.csv"
df = pd.read_csv(file_path)


In [4]:
df

Unnamed: 0,complaint_what_happened,ticket_classification
0,good morning name appreciate could help put st...,Debt collection + Credit card debt
1,upgraded card //2018 told agent upgrade annive...,Credit card or prepaid card + General-purpose ...
2,"chase card reported //2019 . however , fraudul...","Credit reporting, credit repair services, or o..."
3,"//2018 , trying book ticket , came across offe...","Credit reporting, credit repair services, or o..."
4,grand son give check { $ 1600.00 } deposit cha...,Checking or savings account + Checking account
...,...,...
18958,husband passed away . chase bank put check hol...,Checking or savings account + Checking account
18959,"chase card customer well decade , offered mult...",Credit card or prepaid card + General-purpose ...
18960,"wednesday , // called chas , visa credit card ...",Credit card or prepaid card + General-purpose ...
18961,familiar pay understand great risk provides co...,Checking or savings account + Checking account


In [8]:
# Dividir el Df
X = df["complaint_what_happened"] 
y = df["ticket_classification"]

### Definir los Modelos y Experimentos

In [10]:
dagshub.init(url="https://dagshub.com/zapatacc/final-exam-pcd2024-autumn", mlflow=True)

MLFLOW_TRACKING_URI = mlflow.get_tracking_uri()
print(MLFLOW_TRACKING_URI)

https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow


In [11]:
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(experiment_name="Tinoco-RandomForest-prefect")

2024/11/22 15:21:39 INFO mlflow.tracking.fluent: Experiment with name 'Tinoco-RandomForest-prefect' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/3af24886fbed4b2f9b29660d4e196769', creation_time=1732310500912, experiment_id='46', last_update_time=1732310500912, lifecycle_stage='active', name='Tinoco-RandomForest-prefect', tags={}>

# Transformar los datos usando TF-IDF

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Vectorización del texto
tfidf = TfidfVectorizer(max_features=5000)  
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

### Entrenamineto y optimización 

In [14]:
## Modelo base (sencillo)
rf = RandomForestClassifier(random_state=42)

# Entrenar el modelo
rf.fit(X_train_tfidf, y_train)

In [15]:
y_pred = rf.predict(X_test_tfidf)
print("Reporte de clasificación para el modelo base:")
print(classification_report(y_test, y_pred))

Reporte de clasificación para el modelo base:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                                                                                                               precision    recall  f1-score   support

                                                        Bank account or service + (CD) Certificate of deposit       0.00      0.00      0.00         3
                                                 Bank account or service + Cashing a check without an account       0.00      0.00      0.00         7
                                                                   Bank account or service + Checking account       0.57      0.02      0.04       217
                                                         Bank account or service + Other bank product/service       0.00      0.00      0.00        55
                                                                    Bank account or service + Savings account       0.00      0.00      0.00        17
                                                    Checking or savings account + CD (Certifi

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [18]:
########## intento 2

In [10]:
import mlflow
import mlflow.sklearn
from hyperopt import fmin, tpe, hp, Trials
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Dividir el conjunto de datos
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X = tfidf_vectorizer.fit_transform(df['complaint_what_happened'])
y = df['ticket_classification']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [11]:
# Definir el espacio de búsqueda de hiperparámetros
search_space_rf = {
    'n_estimators': hp.choice('n_estimators', range(50, 301, 50)),
    'max_depth': hp.choice('max_depth', range(5, 31, 5)),
    'min_samples_split': hp.uniform('min_samples_split', 0.01, 0.5),
    'min_samples_leaf': hp.uniform('min_samples_leaf', 0.01, 0.5),
}


In [12]:
# Definir la función objetivo
def objective_rf(params):
    with mlflow.start_run(nested=True):
        # Crear el modelo con los parámetros actuales
        rf_model = RandomForestClassifier(
            n_estimators=int(params['n_estimators']),
            max_depth=int(params['max_depth']),
            min_samples_split=params['min_samples_split'],
            min_samples_leaf=params['min_samples_leaf'],
            random_state=42
        )
        
        # Entrenar el modelo
        rf_model.fit(X_train, y_train)
        
        # Evaluar en el conjunto de prueba
        y_pred = rf_model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        
        # Loggear los parámetros y métricas en MLflow
        mlflow.log_params(params)
        mlflow.log_metric("accuracy", accuracy)
        
        # Retornar el negativo de accuracy (Hyperopt minimiza)
        return -accuracy

In [13]:
# Ejecutar la optimización de hiperparámetros
with mlflow.start_run(run_name="Parent Random Forest"):
    trials = Trials()
    best_params_rf = fmin(
        fn=objective_rf,
        space=search_space_rf,
        algo=tpe.suggest,
        max_evals=10,  # Número de experimentos
        trials=trials
    )
print("Best Parameters:", best_params_rf)

🏃 View run treasured-worm-849 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/24/runs/79568e9e2fdb4955bf01c2ccfdc0b2b0

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/24

🏃 View run receptive-chimp-696 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/24/runs/760c364de6644dc39d1b478b7805403c

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/24

🏃 View run valuable-mink-427 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/24/runs/3e069a148ee84546926ffc12cc4a5311

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/24

🏃 View run capable-jay-135 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/24/runs/a5d9af54014c49dcb20e7d72253f9c00

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/exp

In [14]:
# Entrenar y registrar el modelo con los mejores parámetros
best_rf_model = RandomForestClassifier(
    n_estimators=int(best_params_rf['n_estimators']),
    max_depth=int(best_params_rf['max_depth']),
    min_samples_split=best_params_rf['min_samples_split'],
    min_samples_leaf=best_params_rf['min_samples_leaf'],
    random_state=42
)

In [22]:
best_params_rf

{'max_depth': np.int64(1),
 'min_samples_leaf': np.float64(0.039898821664180927),
 'min_samples_split': np.float64(0.089245932942373),
 'n_estimators': np.int64(5)}

## MODELO RF

In [30]:
# train, test, split
X = df['complaint_what_happened']
y = df['ticket_classification']

tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [31]:
mlflow.sklearn.autolog()

# Define objective function for Random Forest model
def objective_rf(params):
    with mlflow.start_run(nested=True):
        # Set model tag
        mlflow.set_tag("model_family", "random_forest")
        
        # Log parameters
        mlflow.log_params(params)
        
        # Create the Random Forest model with given parameters
        rf_model = RandomForestClassifier(
            n_estimators=int(params['n_estimators']),
            max_depth=int(params['max_depth']),
            min_samples_split=int(params['min_samples_split']),
            min_samples_leaf=int(params['min_samples_leaf']),
            random_state=42
        )
        
        # Fit the model
        rf_model.fit(X_train, y_train)
        
        # Make predictions
        y_pred = rf_model.predict(X_test)
        
        # Calculate accuracy
        accuracy = accuracy_score(y_test, y_pred)
        
        # Log accuracy metric
        mlflow.log_metric("accuracy", accuracy)
        
        return {'loss': -accuracy, 'status': STATUS_OK}

# Define search space for RandomForestClassifier
search_space_rf = {
    'n_estimators': hp.quniform('n_estimators', 50, 300, 1),
    'max_depth': hp.quniform('max_depth', 5, 30, 1),
    'min_samples_split': hp.quniform('min_samples_split', 2, 10, 1),
    'min_samples_leaf': hp.quniform('min_samples_leaf', 1, 5, 1),
}

# Run hyperparameter optimization
with mlflow.start_run(run_name="Parent Random Forest", nested=True):
    best_params_rf = fmin(
        fn=objective_rf,
        space=search_space_rf,
        algo=tpe.suggest,
        max_evals=10,
        trials=Trials()
    )
    
    # Log best parameters
    mlflow.log_params(best_params_rf)

# Evaluate the best model
y_pred = RandomForestClassifier(
    n_estimators=int(best_params_rf['n_estimators']),
    max_depth=int(best_params_rf['max_depth']),
    min_samples_split=int(best_params_rf['min_samples_split']),
    min_samples_leaf=int(best_params_rf['min_samples_leaf']),
    random_state=42
).fit(X_train, y_train).predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(report)

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




🏃 View run unequaled-sponge-127 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/24/runs/ba042d1151bf45848d5d5a2f16c251e3

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/24

 10%|█         | 1/10 [00:51<07:40, 51.20s/trial, best loss: -0.49248615871341944]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




🏃 View run mysterious-lark-794 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/24/runs/0c8dd2e109a74c798aa482779eae19ce

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/24

 20%|██        | 2/10 [01:40<06:41, 50.19s/trial, best loss: -0.4953862378064856] 


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




🏃 View run melodic-ram-697 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/24/runs/1d86b3ae318f428ab8c9da425cb761d1

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/24

 30%|███       | 3/10 [03:07<07:48, 66.94s/trial, best loss: -0.4953862378064856]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




🏃 View run capricious-doe-626 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/24/runs/19bfa3985928402b93ab41c20da15ed8

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/24

 40%|████      | 4/10 [04:09<06:30, 65.05s/trial, best loss: -0.4953862378064856]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




🏃 View run grandiose-slug-792 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/24/runs/09d429240b614ddca1f5884c17d7ccfd

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/24

 50%|█████     | 5/10 [04:31<04:06, 49.38s/trial, best loss: -0.4953862378064856]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




🏃 View run indecisive-robin-725 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/24/runs/a8e0d8f1932748c385710204c3f753ef

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/24

 60%|██████    | 6/10 [07:26<06:08, 92.10s/trial, best loss: -0.5040864750856842]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




🏃 View run dapper-gnu-26 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/24/runs/0f11ff7435de47c280edcd52f9d1832d

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/24

 70%|███████   | 7/10 [09:01<04:39, 93.14s/trial, best loss: -0.5040864750856842]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




🏃 View run judicious-doe-481 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/24/runs/c2a28f4929654ce8940f5d4b4d100b81

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/24

 80%|████████  | 8/10 [10:27<03:01, 90.68s/trial, best loss: -0.5225415238597416]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




🏃 View run skillful-wren-14 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/24/runs/7fc34c5d6bf34925a37c83e71639e187

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/24

 90%|█████████ | 9/10 [11:08<01:15, 75.28s/trial, best loss: -0.5225415238597416]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))




🏃 View run dazzling-crane-585 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/24/runs/620e774450914b979a08a484e3370b1c

🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/24

100%|██████████| 10/10 [12:59<00:00, 77.91s/trial, best loss: -0.5225415238597416]
🏃 View run Parent Random Forest at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/24/runs/7e5357174e254c279d7b0f106ad09941
🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/24


2024/11/22 00:25:36 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'bc76cf240e934aaa9caf0ede27fa5493', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


🏃 View run powerful-mouse-1 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/24/runs/bc76cf240e934aaa9caf0ede27fa5493
🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/24
Accuracy: 0.52
Classification Report:
                                                                                                               precision    recall  f1-score   support

                                                        Bank account or service + (CD) Certificate of deposit       0.00      0.00      0.00         3
                                                 Bank account or service + Cashing a check without an account       0.00      0.00      0.00         7
                                                                   Bank account or service + Checking account       0.00      0.00      0.00       217
                                                         Bank account or service + Other bank product/servi

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Registrar el modelo 

In [14]:
# Registrar modelo con MLflow
run_id = input("Ingrese el run_id: ")
run_uri = f"runs:/{run_id}/model"

result = mlflow.register_model(
    model_uri=run_uri,
    name="Tinoco-RandomForest-prefect"
)

print("Modelo registrado con éxito.")

Registered model 'Tinoco-RandomForest-prefect' already exists. Creating a new version of this model...
2024/11/22 15:23:30 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Tinoco-RandomForest-prefect, version 5


Modelo registrado con éxito.


Created version '5' of model 'Tinoco-RandomForest-prefect'.


### Asignarle alias champion

In [15]:
from datetime import datetime
from mlflow import MlflowClient

client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)
client.update_registered_model(
    name="Tinoco-RandomForest-prefect",
    description="",
)

new_alias = "champion"
date = datetime.today()
model_version = "5"

# create "champion" alias for version 5 of model RF
client.set_registered_model_alias(
    name="Tinoco-RandomForest-prefect",
    alias=new_alias,
    version=model_version
)

client.update_model_version(
    name="Tinoco-RandomForest-prefect",
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_alias} on {date}",
)

<ModelVersion: aliases=['champion'], creation_timestamp=1732310611139, current_stage='None', description='The model version 5 was transitioned to champion on 2024-11-22 15:24:10.135960', last_updated_timestamp=1732310651791, name='Tinoco-RandomForest-prefect', run_id='52753b13f6024da788d4b72d6b8d9e49', run_link='', source='mlflow-artifacts:/d93605c076ee444987bee63989aab2a2/52753b13f6024da788d4b72d6b8d9e49/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='5'>

### Registrar y asignar alias challenger

In [35]:
# Registrar modelo con MLflow
run_id = input("Ingrese el run_id: ")
run_uri = f"runs:/{run_id}/model"

result = mlflow.register_model(
    model_uri=run_uri,
    name="Tinoco-RandomForest"
)

print("Modelo registrado con éxito.")

Registered model 'Tinoco-RandomForest' already exists. Creating a new version of this model...
2024/11/22 01:41:02 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Tinoco-RandomForest, version 2


Modelo registrado con éxito.


Created version '2' of model 'Tinoco-RandomForest'.


In [36]:
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)
client.update_registered_model(
    name="Tinoco-RandomForest",
    description="",
)

new_alias = "challenger"
date = datetime.today()
model_version = "2"

# create "challenger" alias for version 2 of model RF

client.set_registered_model_alias(
    name="Tinoco-RandomForest",
    alias=new_alias,
    version=model_version
)

client.update_model_version(
    name="Tinoco-RandomForest",
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_alias} on {date}",
)

<ModelVersion: aliases=['challegner'], creation_timestamp=1732261262698, current_stage='None', description=('The model version 2 was transitioned to challegner on 2024-11-22 '
 '01:41:49.729976'), last_updated_timestamp=1732261310063, name='Tinoco-RandomForest', run_id='0662099bc77945f4a5158c2b1259f869', run_link='', source='mlflow-artifacts:/d93605c076ee444987bee63989aab2a2/0662099bc77945f4a5158c2b1259f869/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='2'>