In [3]:
# Importar librerías para preprocesamiento y modelado
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Importar mlflow para tracking de experimentos
import mlflow
import mlflow.sklearn

# Importar warnings para ignorar advertencias innecesarias
import warnings
warnings.filterwarnings('ignore')
import pandas as pd

In [2]:
import dagshub

# Initialise the DagsHub integration for the course repository.
# NOTE(review): with mlflow=True this presumably points MLflow tracking at the
# DagsHub-hosted server for this repo; confirm against the dagshub docs.
# This cell must run before any mlflow.* call below.
dagshub.init(
    repo_owner='zapatacc',
    repo_name='final-exam-pcd2024-autumn',
    mlflow=True,
)

In [4]:
# Load the preprocessed tickets dataset (path is relative to the notebook's folder).
processed_tickets_path = '../data/processed_data/processed_tickets.csv'
df = pd.read_csv(processed_tickets_path)

# Preview the first rows as the cell's displayed output.
df.head()

Unnamed: 0,complaint_what_happened,ticket_classification
0,good morning name appreciate could help put st...,Debt collection + Credit card debt
1,upgraded card //2018 told agent upgrade annive...,Credit card or prepaid card + General-purpose ...
2,"//2018 , trying book ticket , came across offe...","Credit reporting, credit repair services, or o..."
3,grand son give check { $ 1600.00 } deposit cha...,Checking or savings account + Checking account
4,please remove inquiry,"Credit reporting, credit repair services, or o..."


In [5]:
# Feature (complaint text) and target (ticket class) columns.
X, y = df['complaint_what_happened'], df['ticket_classification']

# Hold out 20% of the rows for evaluation. The split is seeded for
# reproducibility and stratified on the target so both partitions keep the
# same class proportions.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [6]:
# TF-IDF vectoriser with the vocabulary capped at `tfidf_vocab_size` terms.
tfidf_vocab_size = 5000
tfidf_vectorizer = TfidfVectorizer(max_features=tfidf_vocab_size)

# Learn the vocabulary/IDF weights from the training texts only, then encode them.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Encode the test texts with the already-fitted vectoriser (no refitting,
# so nothing from the test set leaks into the features).
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [7]:
# Select the MLflow experiment that the following runs are recorded under.
# Kept as the last expression so the cell displays the returned Experiment.
experiment_name = 'juan-blanco-logistic-regression'
mlflow.set_experiment(experiment_name)


2024/11/20 18:56:49 INFO mlflow.tracking.fluent: Experiment with name 'juan-blanco-logistic-regression' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/33a06b90f9704c8da6f10c2af583fb9e', creation_time=1732150609288, experiment_id='9', last_update_time=1732150609288, lifecycle_stage='active', name='juan-blanco-logistic-regression', tags={}>

In [8]:
# Baseline run: fit a logistic regression on the TF-IDF features, evaluate it
# on the held-out split, and record metrics plus the fitted model in MLflow.
with mlflow.start_run(run_name='LogisticRegression_Baseline'):
    lr_model = LogisticRegression(max_iter=1000)

    # Train the model
    lr_model.fit(X_train_tfidf, y_train)

    # Predict on the held-out test set
    y_pred = lr_model.predict(X_test_tfidf)

    # Compute metrics. zero_division=0 makes the handling of classes that are
    # never predicted explicit (score reported as 0) instead of depending on
    # the global warning filter to hide the resulting warning.
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(
        y_test, y_pred, output_dict=True, zero_division=0
    )

    # Log metrics. Accuracy alone is misleading when classes are imbalanced,
    # so the averaged F1 scores from the report are tracked as well.
    mlflow.log_metric('accuracy', accuracy)
    mlflow.log_metrics({
        'macro_f1': report['macro avg']['f1-score'],
        'weighted_f1': report['weighted avg']['f1-score'],
    })

    # Log the fitted model as a run artifact
    mlflow.sklearn.log_model(lr_model, 'logistic_regression_model')

    # Print results
    print(f"Accuracy: {accuracy}")
    print(classification_report(y_test, y_pred, zero_division=0))




Accuracy: 0.6144678492239468
                                                                                                 precision    recall  f1-score   support

                                                     Bank account or service + Checking account       0.66      0.22      0.34       249
                                           Bank account or service + Other bank product/service       0.00      0.00      0.00        60
                                                 Checking or savings account + Checking account       0.60      0.89      0.72       758
                                 Checking or savings account + Other banking product or service       0.00      0.00      0.00        48
                                                  Checking or savings account + Savings account       0.60      0.13      0.21        46
                                                                   Consumer Loan + Vehicle loan       0.67      0.06      0.11        35
           