## Model Training LogReg

Para la selección de modelos, decidí entrenar un modelo de Linear Support Vector Machine y otro de Regresión Logística

En este notebook se encuentra el entrenamiento de LogReg.

A continuación, aplicaremos CountVectorizer y TF-IDF Transformer.

CountVectorizer convierte el texto de cada registro a una matriz, en la cuál cada renglón representa un documento (que es un registro de la columna de quejas) y cada columna es una palabra del vocabulario del mismo.

TF-IDF Transformer convierte el conteo de palabras a un score de TF-IDF, que normaliza la importancia de cada palabra basada en su frecuencia en cada documento y a través de todos los documentos. Nos ayuda a enfocarnos en las palabras más importantes, que no son tan comunes, pero sobre todo convierte el texto en formato numérico para poder entrenar nuestro modelo.

In [6]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.linear_model import LogisticRegression
import dagshub
import mlflow
import mlflow.sklearn
from hyperopt import fmin, tpe, hp, Trials
from hyperopt.pyll import scope
import pickle
import pathlib

In [2]:
df = pd.read_csv('../data/processed.csv')
X = df.complaint_what_happened
y = df.ticket_classification
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

In [3]:
dagshub.init(url="https://dagshub.com/zapatacc/final-exam-pcd2024-autumn", mlflow=True)

MLFLOW_TRACKING_URI = mlflow.get_tracking_uri()

print(MLFLOW_TRACKING_URI)

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(experiment_name="erick-machuca-logreg")

https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow


2024/11/21 14:25:51 INFO mlflow.tracking.fluent: Experiment with name 'erick-machuca-logreg' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/2821e7ade72746f79102bef93a693c9e', creation_time=1732220751696, experiment_id='22', last_update_time=1732220751696, lifecycle_stage='active', name='erick-machuca-logreg', tags={}>

In [4]:
# Start logging the experiment
with mlflow.start_run() as run:
    # Log model parameters
    mlflow.log_param("n_jobs", 1)
    mlflow.log_param("C", 1e5)
    mlflow.log_param("max_iter", 1000)
    mlflow.log_param("class_weight", "balanced")

    # Define and train the logreg pipeline
    logreg = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', LogisticRegression(n_jobs=1, C=1e5, max_iter=1000, class_weight='balanced', random_state=42)),
    ])
    logreg.fit(X_train, y_train)

    # Make predictions
    y_pred = logreg.predict(X_test)

    # Calculate and log metrics
    accuracy = accuracy_score(y_pred, y_test)
    recall = recall_score(y_pred, y_test, average='macro')
    precision = precision_score(y_pred, y_test, average='macro')

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("precision", precision)

    # Print a success message
    print(f"logreg Classifier logged in MLflow with accuracy: {accuracy:.2f}, recall: {recall:.2f}, precision: {precision:.2f}")

logreg Classifier logged in MLflow with accuracy: 0.57, recall: 0.44, precision: 0.34
🏃 View run peaceful-whale-722 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/22/runs/cbe050a610c3492f8f7cb0175f3e5f45
🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/22


In [7]:
# 1. Define the objective function for Logistic Regression hyperparameter tuning
def objective(params):
    # Extract parameters from the search space
    C = params['C']
    max_iter = int(params['max_iter'])

    # Build and train the Logistic Regression pipeline
    logreg = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', LogisticRegression(
            n_jobs=1,
            C=C,
            max_iter=max_iter,
            class_weight='balanced',
            random_state=42))
    ])
    logreg.fit(X_train, y_train)

    # Make predictions and calculate the objective metric (e.g., negative accuracy for minimization)
    y_pred = logreg.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    return -accuracy  # Return negative because fmin minimizes by default

# 2. Set up the search space for hyperparameters
from hyperopt import fmin, tpe, hp, Trials, scope
search_space = {
    'C': hp.loguniform('C', -3, 3),  # Regularization strength on a log scale
    'max_iter': scope.int(hp.quniform('max_iter', 100, 1000, 100))  # Number of iterations
}

# 3. Start MLflow run for hyperparameter optimization
with mlflow.start_run(run_name="LogReg Hyper-parameter Optimization", nested=True):
    # 4. Optimize Logistic Regression parameters using hyperopt
    trials = Trials()
    best_params = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=10,  # Adjust for more evaluations
        trials=trials
    )

    # Convert parameters to usable types
    best_params['C'] = float(best_params['C'])
    best_params['max_iter'] = int(best_params['max_iter'])

    # Log the best parameters to MLflow
    mlflow.log_params(best_params)

    # 5. Set experiment tags for tracking
    mlflow.set_tags({
        "project": "Text Classification with Logistic Regression",
        "optimizer_engine": "hyper-opt",
        "model_family": "Logistic Regression",
        "feature_set_version": 1,
    })

    # 6. Train the Logistic Regression model with the best parameters
    logreg = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', LogisticRegression(
            n_jobs=1,
            C=best_params['C'],
            max_iter=best_params['max_iter'],
            class_weight='balanced',
            random_state=42))
    ])
    logreg.fit(X_train, y_train)

    # Make predictions and calculate metrics
    y_pred = logreg.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred, average='macro')
    precision = precision_score(y_test, y_pred, average='macro')

    # Log metrics to MLflow
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("precision", precision)

    # 7. Save the trained Logistic Regression pipeline using mlflow.sklearn
    mlflow.sklearn.log_model(logreg, "model")

    # Print out a success message
    print(f"Best Logistic Regression model logged with accuracy: {accuracy:.2f}, recall: {recall:.2f}, precision: {precision:.2f}")


ImportError: cannot import name 'scope' from 'hyperopt' (/Users/erick/Library/CloudStorage/OneDrive-ITESO/Proyecto_Ciencia_datos/final-exam-pcd2024-autumn/.venv/lib/python3.11/site-packages/hyperopt/__init__.py)