In [17]:
import pickle
import pandas as pd
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import mlflow
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

In [18]:
df = pd.read_csv('../data/clean_data/cleaned_corpus.csv')

In [19]:
#Target y predictor
X = df['complaint_what_happened']
y = df['ticket_classification']

# Mapear categorías
labelmapping = LabelEncoder()
y_encoded = labelmapping.fit_transform(y)

In [20]:
# Vectorizamos usando TFIDF
vectorizer = TfidfVectorizer(stop_words = stopwords.words('english'))
X_TF = vectorizer.fit_transform(X).toarray()

In [21]:
import dagshub

dagshub.init(repo_owner='zapatacc', repo_name='final-exam-pcd2024-autumn', mlflow=True)
mlflow.set_experiment("jesus-carbajal-logreg-label-encoder")

<Experiment: artifact_location='mlflow-artifacts:/4b1db8acf1e34ea9af430f57180ec8de', creation_time=1732170344340, experiment_id='17', last_update_time=1732170344340, lifecycle_stage='active', name='jesus-carbajal-logreg-label-encoder', tags={}>

In [22]:
# Tfid
text_train, text_test, sent_train, sent_test = train_test_split(X, y, test_size=0.20, random_state=309)

pipeline = Pipeline([
    ("vectorizer", TfidfVectorizer()),
    ("logreg", LogisticRegression(max_iter=500))
])

params_grid = {
    'logreg__C': [0.01, 0.1, 1],
    'logreg__penalty': ['l2'],
    'logreg__solver': ['lbfgs'],
}

grid_search = GridSearchCV(pipeline, params_grid, scoring='accuracy', cv=5, n_jobs=1, verbose=1)

with mlflow.start_run(run_name="Logreg Pipeline"):
    grid_search.fit(text_train, sent_train)
    best_model = grid_search.best_estimator_
    
    y_pred = best_model.predict(text_test)
    
    # calcular métricas
    accuracy = accuracy_score(sent_test, y_pred)
    report = classification_report(sent_test, y_pred, output_dict=True)
    
    # Loggear el mejor modelo
    mlflow.log_params(grid_search.best_params_)
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", report["weighted avg"]["precision"])
    mlflow.log_metric("recall", report["weighted avg"]["recall"])
    mlflow.log_metric("f1_score", report["weighted avg"]["f1-score"])
    
    mlflow.sklearn.log_model(best_model, artifact_path="pipeline_model")

    with open("labelmapping.pkl", "wb") as f:
        pickle.dump(labelmapping, f)
    mlflow.log_artifact("labelmapping.pkl")

Fitting 5 folds for each of 3 candidates, totalling 15 fits


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


🏃 View run Logreg Pipeline at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/17/runs/8ccf9f0120494d78a04c99aa0b113d82
🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/17
