In [21]:
import mlflow.sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
import pandas as pd
import dagshub
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from mlflow.tracking import MlflowClient

In [2]:
file_path = "../data/processed/tickets_cleaned.csv"
df = pd.read_csv(file_path)
df

Unnamed: 0,complaint_what_happened,ticket_classification
0,good morning name appreciate could help put st...,Debt collection + Credit card debt
1,upgraded card //2018 told agent upgrade annive...,Credit card or prepaid card + General-purpose ...
2,"chase card reported //2019 . however , fraudul...","Credit reporting, credit repair services, or o..."
3,"//2018 , trying book ticket , came across offe...","Credit reporting, credit repair services, or o..."
4,grand son give check { $ 1600.00 } deposit cha...,Checking or savings account + Checking account
...,...,...
18958,husband passed away . chase bank put check hol...,Checking or savings account + Checking account
18959,"chase card customer well decade , offered mult...",Credit card or prepaid card + General-purpose ...
18960,"wednesday , // called chas , visa credit card ...",Credit card or prepaid card + General-purpose ...
18961,familiar pay understand great risk provides co...,Checking or savings account + Checking account


In [3]:
X = df["complaint_what_happened"] 
y = df["ticket_classification"]

In [4]:
dagshub.init(url="https://dagshub.com/zapatacc/final-exam-pcd2024-autumn", mlflow=True)

MLFLOW_TRACKING_URI = mlflow.get_tracking_uri()
print(MLFLOW_TRACKING_URI)

https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow


In [7]:
X = df['complaint_what_happened']
y = df['ticket_classification']

# Dividir en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Hacer TF-IDF

In [8]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

X_test_tfidf = tfidf_vectorizer.transform(X_test)

## Establecer el nombre e iniciar mlflow

In [14]:
mlflow.set_tracking_uri('https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow')

In [9]:
# Establecer el experimento
mlflow.set_experiment('Tinoco-svc')

2024/11/22 10:11:37 INFO mlflow.tracking.fluent: Experiment with name 'Tinoco-svc' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/fbb81ddfdfe148dba7f3c002b583460d', creation_time=1732291899063, experiment_id='41', last_update_time=1732291899063, lifecycle_stage='active', name='Tinoco-svc', tags={}>

In [13]:
# Iniciar un run de mlflow
with mlflow.start_run(run_name='svc_model'):
    # Crear el modelo
    svc_model = SVC()
    
    # Entrenar el modelo
    svc_model.fit(X_train_tfidf, y_train)
    
    # Hacer predicciones
    y_pred = svc_model.predict(X_test_tfidf)
    
    # Calcular métricas
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    # Registrar métrica
    mlflow.log_metric('accuracy', accuracy)
    
    # Registrar el modelo
    mlflow.sklearn.log_model(svc_model, 'svc_model')
    
    # Imprimir resultados
    print(f"Accuracy: {accuracy}")
    print(classification_report(y_test, y_pred))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.5921434220933298
                                                                                                               precision    recall  f1-score   support

                                                        Bank account or service + (CD) Certificate of deposit       0.00      0.00      0.00         3
                                                 Bank account or service + Cashing a check without an account       0.00      0.00      0.00         7
                                                                   Bank account or service + Checking account       0.72      0.17      0.27       217
                                                         Bank account or service + Other bank product/service       0.00      0.00      0.00        55
                                                                    Bank account or service + Savings account       0.00      0.00      0.00        17
                                                    Checking or 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


🏃 View run svc_model at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/41/runs/e35f29ce6100484c8814f7d2b83e6475
🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/41


## Poner a prueba el gridsearch

In [15]:
# Definir los hiperparámetros
param_grid_svc = {
    'C': [0.1, 1],  
    'kernel': ['linear'], 
}

En este caso: 
 
- C: Controla la penalización por error en el margen. Valores más pequeños hacen que el modelo sea más permisivo (subajuste), mientras que valores más grandes generan un modelo más estricto (sobreajuste).


- kernel: Define el tipo de función del núcleo.

- linear: Clasificador lineal.

- 'rbf': Clasificador no lineal basado en funciones de base radial.


In [16]:
grid_search_svc = GridSearchCV(
    SVC(),
    param_grid_svc,
    cv=3,
    scoring='accuracy',
    n_jobs=-1
)

- param_grid_svc: El diccionario con las combinaciones de hiperparámetros a probar.

- cv=3: Realiza validación cruzada con 3 particiones (folds). Esto significa que el conjunto de datos se divide en 3 partes, y el modelo se entrena en 2 partes y se evalúa en la parte restante.

- scoring='accuracy': La métrica utilizada para evaluar el modelo.

- n_jobs=-1: Permite utilizar todos los núcleos disponibles de la CPU para paralelizar las evaluaciones.

In [17]:
with mlflow.start_run(run_name='svc_model'):
    # Entrenar con diferentes hiperparámetros
    grid_search_svc.fit(X_train_tfidf, y_train)
    best_svc_model = grid_search_svc.best_estimator_
    
    y_pred = best_svc_model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    mlflow.log_params(grid_search_svc.best_params_)
    mlflow.log_metric('accuracy', accuracy)
    
    mlflow.sklearn.log_model(best_svc_model, 'best_svc_model')
    
    print(f"Best Params: {grid_search_svc.best_params_}")
    print(f"Accuracy: {accuracy}")
    print(classification_report(y_test, y_pred))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best Params: {'C': 1, 'kernel': 'linear'}
Accuracy: 0.5989981544951226
                                                                                                               precision    recall  f1-score   support

                                                        Bank account or service + (CD) Certificate of deposit       0.00      0.00      0.00         3
                                                 Bank account or service + Cashing a check without an account       0.00      0.00      0.00         7
                                                                   Bank account or service + Checking account       0.57      0.26      0.35       217
                                                         Bank account or service + Other bank product/service       0.50      0.04      0.07        55
                                                                    Bank account or service + Savings account       0.00      0.00      0.00        17
                      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


🏃 View run svc_model at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/41/runs/db44ba774be24e248b0ebba448573a7f
🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/41


## Registro Champion y challenger SVC

In [19]:
# Sacar cliente y nombre experimento
client = MlflowClient()
experiment_name = "Tinoco-svc"
experiment = client.get_experiment_by_name(experiment_name)


In [22]:
# Verificar que el experimento existe
if experiment is None:
    print(f"El experimento '{experiment_name}' no se encontró.")
else:
    # Obtener el ID del experimento
    experiment_id = experiment.experiment_id

    # Buscar los dos mejores runs enbaseal accuracy
    top_runs = mlflow.search_runs(
        experiment_ids=[experiment_id],
        order_by=["metrics.accuracy DESC"],
        max_results=2
    )

    # Verificar que tenemos al menos dos runs
    if len(top_runs) < 2:
        print("No se encontraron suficientes runs para seleccionar Champion y Challenger.")
    else:
        # Obtenemos los Ids de los mejores runs
        champion_run = top_runs.iloc[0]
        challenger_run = top_runs.iloc[1]

        # Obtenemos los Ids de las ejecuciones
        champion_run_id = champion_run.run_id
        challenger_run_id = challenger_run.run_id

        # Obtenemos las urs de los modelos
        champion_model_uri = f"runs:/{champion_run_id}/model"
        challenger_model_uri = f"runs:/{challenger_run_id}/model"

        # Declaramos el nombre del modelo registrado
        model_name = "Tinoco-svc"

        # Registrar el modelo Champion
        champion_model_version = mlflow.register_model(
            model_uri=champion_model_uri,
            name=model_name
        )
        client.set_registered_model_alias(
            name=model_name,
            alias='champion',
            version=champion_model_version.version
        )

        # Registrar el modelo Challenger
        challenger_model_version = mlflow.register_model(
            model_uri=challenger_model_uri,
            name=model_name
        )
        client.set_registered_model_alias(
            name=model_name,
            alias='challenger',
            version=challenger_model_version.version
        )

        # Imprimir resultados
        print(f"Champion Model: Run ID {champion_run_id}, Version {champion_model_version.version}")
        print(f"Challenger Model: Run ID {challenger_run_id}, Version {challenger_model_version.version}")

Successfully registered model 'Tinoco-svc'.
2024/11/22 11:01:44 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Tinoco-svc, version 1
Created version '1' of model 'Tinoco-svc'.
Registered model 'Tinoco-svc' already exists. Creating a new version of this model...
2024/11/22 11:01:45 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Tinoco-svc, version 2
Created version '2' of model 'Tinoco-svc'.


Champion Model: Run ID db44ba774be24e248b0ebba448573a7f, Version 1
Challenger Model: Run ID e35f29ce6100484c8814f7d2b83e6475, Version 2
