In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
import dagshub
import mlflow 
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [3]:
file_path = "../data/processed/tickets_cleaned.csv"
df = pd.read_csv(file_path)
df

Unnamed: 0,complaint_what_happened,ticket_classification
0,good morning name appreciate could help put st...,Debt collection + Credit card debt
1,upgraded card //2018 told agent upgrade annive...,Credit card or prepaid card + General-purpose ...
2,"chase card reported //2019 . however , fraudul...","Credit reporting, credit repair services, or o..."
3,"//2018 , trying book ticket , came across offe...","Credit reporting, credit repair services, or o..."
4,grand son give check { $ 1600.00 } deposit cha...,Checking or savings account + Checking account
...,...,...
18958,husband passed away . chase bank put check hol...,Checking or savings account + Checking account
18959,"chase card customer well decade , offered mult...",Credit card or prepaid card + General-purpose ...
18960,"wednesday , // called chas , visa credit card ...",Credit card or prepaid card + General-purpose ...
18961,familiar pay understand great risk provides co...,Checking or savings account + Checking account


In [4]:
X = df["complaint_what_happened"] 
y = df["ticket_classification"]

In [5]:
dagshub.init(url="https://dagshub.com/zapatacc/final-exam-pcd2024-autumn", mlflow=True)

MLFLOW_TRACKING_URI = mlflow.get_tracking_uri()
print(MLFLOW_TRACKING_URI)

https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorización del texto
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  

In [7]:
# Dividir los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(
    df['complaint_what_happened'], 
    df['ticket_classification'], 
    test_size=0.2, 
    random_state=42
)

# Ajustar y transformar los datos con TF-IDF
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [8]:
# Probé 10 combinaciones diferentes en este caso de param grid y estas dos fueron las mejores que me salieron y por eso las dejé

param_grid = [
    {"max_iter": 300, "C": 2.0, "solver": "liblinear"},
    {"max_iter": 100, "C": 1.5, "solver": "sag"}, 
]

In [16]:
# Ejecutar cada combinación de hiperparámetros
for idx, params in enumerate(param_grid):
    with mlflow.start_run(run_name=f"Logistic Regression Run {idx+1}"):
        # Inicializar el modelo con los parámetros actuales
        logreg = LogisticRegression(
            max_iter=params["max_iter"], 
            C=params["C"], 
            solver=params["solver"], 
            random_state=42
        )
        
        # Entrenar el modelo
        logreg.fit(X_train_tfidf, y_train)
        
        # Predicciones
        y_pred = logreg.predict(X_test_tfidf)
        
        # Métrica de accuracy
        accuracy = accuracy_score(y_test, y_pred)
        
        # Loggear parámetros y métricas en MLflow
        mlflow.log_params(params)
        mlflow.log_metric("accuracy", accuracy)
        
        # Registrar el modelo en MLflow
        mlflow.sklearn.log_model(logreg, f"logistic-regression-model-run-{idx+1}")
        
        # Imprimir el reporte de clasificación
        print(f"Run {idx+1}:")
        print("Classification Report:\n", classification_report(y_test, y_pred))

# Confirmar que se registraron todos los experimentos en MLflow
print("All experiments finished and logged in MLflow.")



Run 1:
Classification Report:
                                                                                                                precision    recall  f1-score   support

                                                        Bank account or service + (CD) Certificate of deposit       0.00      0.00      0.00         3
                                                 Bank account or service + Cashing a check without an account       0.00      0.00      0.00         7
                                                                   Bank account or service + Checking account       0.47      0.27      0.35       217
                                                         Bank account or service + Other bank product/service       1.00      0.02      0.04        55
                                                                    Bank account or service + Savings account       0.00      0.00      0.00        17
                                                    Checking o

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


🏃 View run Logistic Regression Run 1 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/0/runs/cd211721847f4750bcf0c24f65e1ec5b
🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/0




Run 2:
Classification Report:
                                                                                                                precision    recall  f1-score   support

                                                        Bank account or service + (CD) Certificate of deposit       0.00      0.00      0.00         3
                                                 Bank account or service + Cashing a check without an account       0.00      0.00      0.00         7
                                                                   Bank account or service + Checking account       0.48      0.32      0.38       217
                                                         Bank account or service + Other bank product/service       1.00      0.02      0.04        55
                                                                    Bank account or service + Savings account       0.00      0.00      0.00        17
                                                    Checking o

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


🏃 View run Logistic Regression Run 2 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/0/runs/6b2964c3ad1546249f5d7da9d6524d15
🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/0
All experiments finished and logged in MLflow.


*  En esta parte estoy entrenando con los dos mejores parámetros que me salieron de los 10 intentos que hice  arriba para sacar mi champion y challenger 

## CHALLENGER LOGISTIC

In [9]:
  # Dividir los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Establecer el experimento en MLflow
mlflow.set_experiment("Tinoco-logistic-regression")

# Hiperparámetros óptimos seleccionados
best_params = {"max_iter": 300, "C": 2.0, "solver": "liblinear"}

# Entrenar el modelo con los mejores parámetros
with mlflow.start_run(run_name="Challenger Logistic Regression Model"):
    # Inicializar el modelo con los mejores parámetros
    logreg = LogisticRegression(
        max_iter=best_params["max_iter"], 
        C=best_params["C"], 
        solver=best_params["solver"], 
        random_state=42
    )
    
    # Entrenar el modelo
    logreg.fit(X_train_tfidf, y_train)
    
    # Predicciones
    y_pred = logreg.predict(X_test_tfidf)
    
    # Métrica de accuracy
    accuracy = accuracy_score(y_test, y_pred)
    
    # Loggear parámetros y métricas en MLflow
    mlflow.log_params(best_params)
    mlflow.log_metric("accuracy", accuracy)
    
    # Registrar el modelo en MLflow
    mlflow.sklearn.log_model(logreg, "challenger-logistic-regression-model")
    
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print(f"Accuracy: {accuracy:.4f}")

print("Best model finished and logged in MLflow.")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report:
                                                                                                                precision    recall  f1-score   support

                                                        Bank account or service + (CD) Certificate of deposit       0.00      0.00      0.00         3
                                                 Bank account or service + Cashing a check without an account       0.00      0.00      0.00         7
                                                                   Bank account or service + Checking account       0.47      0.27      0.35       217
                                                         Bank account or service + Other bank product/service       1.00      0.02      0.04        55
                                                                    Bank account or service + Savings account       0.00      0.00      0.00        17
                                                    Checking or savin

## CHAMPION LOGISTIC

In [10]:
# Dividir los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Establecer el experimento en MLflow
mlflow.set_experiment("Tinoco-logistic-regression")

# Hiperparámetros óptimos seleccionados
best_params = {"max_iter": 100, "C": 1.5, "solver": "sag"}

# Entrenar el modelo con los mejores parámetros
with mlflow.start_run(run_name="Best Logistic Regression Model"):
    # Inicializar el modelo con los mejores parámetros
    logreg = LogisticRegression(
        max_iter=best_params["max_iter"], 
        C=best_params["C"], 
        solver=best_params["solver"], 
        random_state=42
    )
    
    # Entrenar el modelo
    logreg.fit(X_train_tfidf, y_train)
    
    # Predicciones
    y_pred = logreg.predict(X_test_tfidf)
    
    # Métrica de accuracy
    accuracy = accuracy_score(y_test, y_pred)
    
    # Loggear parámetros y métricas en MLflow
    mlflow.log_params(best_params)
    mlflow.log_metric("accuracy", accuracy)
    
    # Registrar el modelo en MLflow
    mlflow.sklearn.log_model(logreg, "best-logistic-regression-model")
    
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print(f"Accuracy: {accuracy:.4f}")

print("Best model finished and logged in MLflow.")


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Classification Report:
                                                                                                                precision    recall  f1-score   support

                                                        Bank account or service + (CD) Certificate of deposit       0.00      0.00      0.00         3
                                                 Bank account or service + Cashing a check without an account       0.00      0.00      0.00         7
                                                                   Bank account or service + Checking account       0.48      0.32      0.38       217
                                                         Bank account or service + Other bank product/service       1.00      0.02      0.04        55
                                                                    Bank account or service + Savings account       0.00      0.00      0.00        17
                                                    Checking or savin

In [11]:
# Establecer el experimento en MLflow
mlflow.set_experiment('Tinoco-logistic-regression')

2024/11/21 19:06:40 INFO mlflow.tracking.fluent: Experiment with name 'Tinoco-logistic-regression' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/74c7ab95b6764c0c9934dbaa112073d9', creation_time=1732237600169, experiment_id='29', last_update_time=1732237600169, lifecycle_stage='active', name='Tinoco-logistic-regression', tags={}>

## Registrar el modelo 

In [34]:
# Registrar modelo con MLflow
run_id = input("Ingrese el run_id: ")
run_uri = f"runs:/{run_id}/model"

result = mlflow.register_model(
    model_uri=run_uri,
    name="Tinoco-modelo"
)

print("Modelo registrado con éxito.")

Registered model 'Tinoco-modelo' already exists. Creating a new version of this model...
2024/11/22 03:16:45 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Tinoco-modelo, version 3


Modelo registrado con éxito.


Created version '3' of model 'Tinoco-modelo'.


### Asirgnarle alias champion

In [35]:
from datetime import datetime
from mlflow import MlflowClient

client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)
client.update_registered_model(
    name="Tinoco-modelo",
    description="",
)

new_alias = "champion"
date = datetime.today()
model_version = "3"

# create "champion" alias for version 3 of model LR
client.set_registered_model_alias(
    name="Tinoco-modelo",
    alias=new_alias,
    version=model_version
)

client.update_model_version(
    name="Tinoco-modelo",
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_alias} on {date}",
)

<ModelVersion: aliases=['champion'], creation_timestamp=1732267004964, current_stage='None', description='The model version 3 was transitioned to champion on 2024-11-22 03:17:12.802446', last_updated_timestamp=1732267033518, name='Tinoco-modelo', run_id='8af6ab21fffa4f96ae42527daba41d84', run_link='', source='mlflow-artifacts:/55cba0c4d60444db9d22a85ca8bd8434/8af6ab21fffa4f96ae42527daba41d84/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='3'>

### Asignar alias challenger

In [30]:
# Registrar modelo con MLflow
run_id = input("Ingrese el run_id: ")
run_uri = f"runs:/{run_id}/model"

result = mlflow.register_model(
    model_uri=run_uri,
    name="Tinoco-modelo2"
)

print("Modelo registrado con éxito.")

Registered model 'Tinoco-modelo2' already exists. Creating a new version of this model...
2024/11/22 02:34:34 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Tinoco-modelo2, version 2


Modelo registrado con éxito.


Created version '2' of model 'Tinoco-modelo2'.


In [31]:
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)
client.update_registered_model(
    name="Tinoco-modelo",
    description="",
)

new_alias = "challenger"
date = datetime.today()
model_version = "2"

# create "challenger" alias for version 2 of model LR
client.set_registered_model_alias(
    name="Tinoco-modelo",
    alias=new_alias,
    version=model_version
)

client.update_model_version(
    name="Tinoco-modelo",
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_alias} on {date}",
)

<ModelVersion: aliases=['challenger'], creation_timestamp=1732264474501, current_stage='None', description=('The model version 2 was transitioned to challenger on 2024-11-22 '
 '02:34:48.660230'), last_updated_timestamp=1732264488937, name='Tinoco-modelo2', run_id='1644066e40c7405385b48920011ad5ae', run_link='', source='mlflow-artifacts:/55cba0c4d60444db9d22a85ca8bd8434/1644066e40c7405385b48920011ad5ae/artifacts/model', status='READY', status_message='', tags={}, user_id='', version='2'>