In [None]:
import pandas as pd
import numpy as np
import mlflow
import pickle
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer  # Use TfidfVectorizer instead of CountVectorizer for efficiency
import dagshub

In [None]:
# Point MLflow at the DagsHub-hosted tracking server and select the experiment
dagshub.init(repo_owner='zapatacc', repo_name='final-exam-pcd2024-autumn', mlflow=True)
mlflow.set_experiment("renata-orozco-modelos")

# Load the dataset (path is relative to the current working directory)
df = pd.read_csv(r"../Renata/datos/cleaned.csv")

# Feature: the complaint text; target: the ticket class
X = df['complaint_what_happened']
y = df['ticket_classification']

# Hold out 30% of the rows for testing; fixed seed keeps the split reproducible
text_train, text_test, label_train, label_test = train_test_split(X, y, test_size=0.30, random_state=7)

# Encode labels as integers. The class -> code assignment is learned from the
# training labels ONLY and then reused for the test labels. Calling
# pd.factorize separately on each split would number the classes by order of
# first appearance in that split, so the same class could get different codes
# in train and test and every test metric would be computed on misaligned labels.
encoded_labels_train, label_classes = pd.factorize(label_train)
# Labels that never appear in the training split are encoded as -1
encoded_labels_test = label_classes.get_indexer(label_test)

# TF-IDF vectorizer that turns raw text into features, dropping NLTK's English stop words
# NOTE(review): stopwords.words('english') needs the NLTK stopwords corpus to be available locally
tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))

# Pipeline: TF-IDF vectorization followed by logistic regression
logreg_pipeline = Pipeline([
    ("vectorizer", tfidf_vectorizer),
    ("logreg", LogisticRegression(max_iter=500))
])

# Hyperparameter grid: only the regularization strength C varies
param_grid = {
    'logreg__C': [0.5, 0.15, 0.8],
    'logreg__penalty': ['l2'],
    'logreg__solver': ['lbfgs'],
}

# 5-fold cross-validated grid search scored on accuracy, run in a single process
grid_search = GridSearchCV(logreg_pipeline, param_grid, scoring='accuracy', cv=5, n_jobs=1, verbose=1)

# Fit, evaluate and log the tuned pipeline inside a single MLflow run
with mlflow.start_run(run_name="Logreg Pipeline"):
    # The pipeline vectorizes internally, so it is fitted on the raw text
    grid_search.fit(text_train, encoded_labels_train)

    # Best pipeline found by the grid search
    best_logreg_model = grid_search.best_estimator_

    # Predict on the held-out test set
    predictions = best_logreg_model.predict(text_test)

    # Overall accuracy plus a per-class report (as a dict, for the weighted averages)
    accuracy = accuracy_score(encoded_labels_test, predictions)
    report = classification_report(encoded_labels_test, predictions, output_dict=True)

    # Log the winning hyperparameters and the test-set metrics to MLflow
    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", report["weighted avg"]["precision"])
    mlflow.log_metric("recall", report["weighted avg"]["recall"])
    mlflow.log_metric("f1_score", report["weighted avg"]["f1-score"])

    # Log the best model
    mlflow.sklearn.log_model(best_logreg_model, artifact_path="logreg_pipeline_model")

    # Save and log the code -> label mapping so predictions can be decoded later.
    # The training codes come from pd.factorize(label_train), so the mapping is
    # built from factorize's own uniques. pd.unique would keep a missing value
    # as an entry (factorize drops it and uses code -1), which would shift every
    # later code onto the wrong class name.
    label_mapping = dict(enumerate(pd.factorize(label_train)[1]))
    with open("label_mapping.pkl", "wb") as f:
        pickle.dump(label_mapping, f)
    mlflow.log_artifact("label_mapping.pkl")
