# Primero en local

## Genera los runs

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import mlflow
import mlflow.sklearn

# Load the cleaned complaints dataset.
df = pd.read_csv("../data/clean_data/cleaned.csv")

# Features are the complaint texts; the target is the ticket category.
X = df['complaint_what_happened']
y = df['ticket_classification']

# Encode the target classes as integer ids.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Stratified 60/40 split so both partitions keep the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.4, random_state=17, stratify=y_encoded
)

# Defensive filter: drop any test row whose class never appears in the
# training partition, so metrics are only computed on classes the model saw.
train_classes = set(y_train)
valid_indices = [i for i, label in enumerate(y_test) if label in train_classes]

if len(valid_indices) < len(y_test):
    print(f"Filtrando {len(y_test) - len(valid_indices)} instancias de prueba con clases desconocidas.")
X_test = X_test.iloc[valid_indices]
y_test = y_test[valid_indices]

# Fit the TF-IDF vocabulary on the training texts only, then reuse it for test.
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Point MLflow at the local file-based tracking store.
mlflow.set_tracking_uri("file:///tmp/mlruns")
mlflow.set_experiment("patricio-villanueva-experiments")

# Candidate estimators and the hyper-parameter grid searched for each one.
models_and_params = {
    "Logistic Regression": (LogisticRegression(max_iter=1000), {
        "C": [0.1, 1, 10],
        "penalty": ["l2"]
    }),
    # Fixed seed so repeated executions of this cell produce comparable runs.
    "Random Forest": (RandomForestClassifier(random_state=17), {
        "n_estimators": [100, 200],
        "max_depth": [None, 10, 20],
        "min_samples_split": [2, 5],
        "min_samples_leaf": [1, 2]
    }),
    # XGBoost is currently disabled:
    # "XGBoost": (XGBClassifier(use_label_encoder=False, eval_metric='logloss'), {
    #     "n_estimators": [100, 200],
    #     "max_depth": [3, 6],
    #     "learning_rate": [0.01, 0.1],
    #     "subsample": [0.8, 1.0]
    # })
}

# One MLflow run per model family: grid-search, evaluate on the held-out
# test split, then log parameters, metrics and the fitted estimator.
for model_name, (model, params) in models_and_params.items():
    grid_search = GridSearchCV(model, params, scoring='accuracy', cv=3, n_jobs=-1)

    with mlflow.start_run(run_name=f"GridSearch - {model_name}"):
        # 3-fold cross-validated search over the grid.
        grid_search.fit(X_train_tfidf, y_train)
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test_tfidf)

        # zero_division=0 yields the same values as the default but without
        # the UndefinedMetricWarning for classes that are never predicted.
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(
            y_test, y_pred, output_dict=True, zero_division=0
        )

        # Log the winning hyper-parameters and the test-set metrics.
        mlflow.log_param("model_type", model_name)
        mlflow.log_params(grid_search.best_params_)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", report["weighted avg"]["precision"])
        mlflow.log_metric("recall", report["weighted avg"]["recall"])
        mlflow.log_metric("f1_score", report["weighted avg"]["f1-score"])

        # Use a fixed, space-free artifact path so the model can be addressed
        # as "runs:/<run_id>/model" when it is registered later. The model
        # family is still available through the run name and "model_type".
        mlflow.sklearn.log_model(best_model, artifact_path="model")

        print(f"Model: {model_name}, Best Params: {grid_search.best_params_}, Accuracy: {accuracy}")


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model: Logistic Regression, Best Params: {'C': 10, 'penalty': 'l2'}, Accuracy: 0.5995364057812926




## Selecciona el mejor modelo

In [None]:
import mlflow
from mlflow.tracking import MlflowClient

def _find_model_artifact_path(mlflow_client, run_id):
    """Return the run-relative directory that holds a logged model, or None.

    A logged model directory is recognised by the "MLmodel" file MLflow
    writes into it. Only top-level artifact directories are inspected.
    """
    for entry in mlflow_client.list_artifacts(run_id):
        if not entry.is_dir:
            continue
        children = mlflow_client.list_artifacts(run_id, entry.path)
        if any(child.path.rsplit("/", 1)[-1] == "MLmodel" for child in children):
            return entry.path
    return None


mlflow.set_tracking_uri("file:///tmp/mlruns")

# Experiment to inspect.
experiment_name = "patricio-villanueva-experiments"
client = MlflowClient()

# Resolve the experiment id, failing early if the experiment does not exist.
experiment = client.get_experiment_by_name(experiment_name)
if experiment is None:
    raise ValueError(f"No se encontró el experimento con nombre: {experiment_name}")
experiment_id = experiment.experiment_id

# Fetch the active runs of the experiment (capped at 1000).
runs = client.search_runs(
    experiment_ids=[experiment_id],
    filter_string="",
    run_view_type=mlflow.entities.ViewType.ACTIVE_ONLY,
    max_results=1000,
)

# Pick the run with the highest logged accuracy. Runs without the metric are
# ignored; on ties the first run returned by the search wins.
scored_runs = [run for run in runs if "accuracy" in run.data.metrics]
if not scored_runs:
    raise ValueError("No se encontraron runs con la métrica 'accuracy'.")
best_run = max(scored_runs, key=lambda run: run.data.metrics["accuracy"])
best_accuracy = best_run.data.metrics["accuracy"]

print(f"El mejor run es: {best_run.info.run_id} con accuracy: {best_accuracy}")

# Build the model URI from the artifact path the run really used. Hard-coding
# "model" registers a version that points at a non-existent artifact whenever
# the training code logged the model under a different path.
artifact_path = _find_model_artifact_path(client, best_run.info.run_id)
if artifact_path is None:
    # Nothing discoverable under the run's artifacts: keep the conventional
    # default so the behaviour is never worse than the hard-coded path.
    artifact_path = "model"
    print(f"Aviso: no se encontró un modelo en los artefactos del run; se usa '{artifact_path}'.")
model_uri = f"runs:/{best_run.info.run_id}/{artifact_path}"
model_name = "patricio-model"

# Register the model in the MLflow Model Registry.
registered_model = mlflow.register_model(model_uri=model_uri, name=model_name)

# Use the version that was just created, rather than asking the registry for
# "the latest version in stage None", which may be a different version or
# may not exist at all.
model_version = registered_model.version
client.set_registered_model_alias(
    name=model_name,
    alias="champion",
    version=model_version
)

# Move the version to the "Production" stage, archiving previous ones.
# NOTE(review): model stages are deprecated in recent MLflow releases (this
# call emits a FutureWarning); the "champion" alias is the supported handle.
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage="Production",
    archive_existing_versions=True
)

print(f"El modelo ha sido registrado como '{model_name}' con alias 'champion' y etapa 'Production'.")


El mejor run es: 40364ba68a9f4545a3fe6fe25ba05102 con accuracy: 0.5825960649676483
El modelo ha sido registrado como 'patricio-model' con alias 'champion' y etapa 'Production'.


Successfully registered model 'patricio-model'.
Created version '1' of model 'patricio-model'.
  model_version = client.get_latest_versions(name=model_name, stages=["None"])[0].version
  client.transition_model_version_stage(


# En DagsHub

In [None]:
import os

import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# DagsHub configuration: remote tracking server plus the credentials MLflow
# reads from the environment. Replace the placeholders before running and
# avoid committing a real token to version control.
mlflow.set_tracking_uri("https://dagshub.com/<usuario>/<repo>.mlflow")
os.environ["MLFLOW_TRACKING_USERNAME"] = "<tu_usuario>"
os.environ["MLFLOW_TRACKING_PASSWORD"] = "<tu_token>"

df = pd.read_csv("../data/clean_data/cleaned.csv")

# Features are the complaint texts; the target is the ticket category.
X = df['complaint_what_happened']
y = df['ticket_classification']

# Encode the classes as integers 0..k-1. XGBClassifier rejects string labels,
# and the scikit-learn models work equally well with the encoded target.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Stratified train/test split: every class appears in the training partition
# (XGBoost needs the full 0..k-1 label range) and proportions are preserved.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Text vectorization: fit on the training texts only, reuse for test.
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Select (or create) the MLflow experiment.
mlflow.set_experiment("ticket_classification_experiments")

# Models to evaluate.
models = {
    # Raised iteration cap, matching the local experiment, so the solver can
    # converge on the high-dimensional TF-IDF features.
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

# Train each model inside its own MLflow run and log the results.
for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        # Fit and predict on the held-out split.
        model.fit(X_train_tfidf, y_train)
        y_pred = model.predict(X_test_tfidf)

        # zero_division=0 yields the same values as the default but without
        # the UndefinedMetricWarning for classes that are never predicted.
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(
            y_test, y_pred, output_dict=True, zero_division=0
        )

        # Log the model family and the weighted-average test metrics.
        mlflow.log_param("model_type", model_name)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", report["weighted avg"]["precision"])
        mlflow.log_metric("recall", report["weighted avg"]["recall"])
        mlflow.log_metric("f1_score", report["weighted avg"]["f1-score"])

        # Fixed, space-free artifact path so any run can be registered as
        # "runs:/<run_id>/model"; the family is recorded in "model_type".
        mlflow.sklearn.log_model(model, artifact_path="model")

        print(f"Model: {model_name}, Accuracy: {accuracy}")
