In [1]:
import pickle
import pandas as pd
from sklearn.metrics import  root_mean_squared_error
from sklearn.feature_extraction import  DictVectorizer
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import mlflow
from sklearn.ensemble import RandomForestClassifier


In [2]:
# Load the cleaned complaints corpus (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='../data/clean_data/cleaned_corpus.csv')

In [3]:
# Predictor (free-text complaint) and target (ticket category)
X = df['complaint_what_happened']
y = df['ticket_classification']

# Encode categories as integers, numbered in order of first appearance in the column
unique_categories = y.unique()
category_mapping = dict(zip(unique_categories, range(len(unique_categories))))
y_mapped = y.map(category_mapping)
# From here on `y` is a plain Python list of integer codes, not a Series
y = y_mapped.to_list()
# Parallel lists: label_names[i] is the category whose integer code is labels_list[i]
label_names = [*category_mapping]
labels_list = [*category_mapping.values()]

In [4]:
# Vectorize the complaint text with TF-IDF, dropping NLTK's English stop words.
vectorizer = TfidfVectorizer(stop_words = stopwords.words('english'))
# Keep the result as a SciPy sparse matrix instead of calling .toarray():
# a dense (n_documents x vocabulary_size) float array is almost entirely zeros
# and makes every downstream fit far slower and more memory-hungry.
# train_test_split, LogisticRegression and RandomForestClassifier all accept
# sparse input directly.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split, so IDF statistics from the test rows leak into training.
# Consider fitting inside a Pipeline on the training split only.
X_TF = vectorizer.fit_transform(X)

# Model selection: Logistic Regression and Random Forest (GridSearchCV + MLflow)

In [None]:
# Hold out 20% of the TF-IDF features / encoded labels for the final evaluation.
# The fixed random_state keeps the split reproducible across runs.
text_train, text_test, sent_train, sent_test = train_test_split(X_TF, y, test_size = 0.20, random_state = 309)

# Candidate estimators and their hyper-parameter grids: name -> (estimator, grid).
params_grid = {
    # The default 'lbfgs' solver only supports the 'l2' penalty; pairing it with
    # 'elasticnet' makes every such fit raise, and GridSearchCV then scores those
    # candidates as NaN, so elastic-net was never really evaluated. The grid is
    # split into two sub-grids so elastic-net runs with the 'saga' solver (the
    # only solver that supports it) and an explicit l1_ratio.
    # max_iter is raised from the default 100 because saga typically needs more
    # iterations to converge on high-dimensional TF-IDF features.
    "logreg" : (LogisticRegression(max_iter=1000), [
        {'C': [0.01, 0.1, 1],
         'penalty': ['l2']},
        {'C': [0.01, 0.1, 1],
         'penalty': ['elasticnet'],
         'solver': ['saga'],
         'l1_ratio': [0.5]},
    ]),

    # Seeded so the forest (and therefore the selected hyper-parameters and the
    # logged metrics) are reproducible.
    "RF" : (RandomForestClassifier(random_state=309),{
        "max_depth":[5, 10],
        "n_estimators": [100, 200]})
}

# Store runs in a local file-based MLflow tracking directory.
mlflow.set_tracking_uri("file:///tmp/mlruns")
mlflow.set_experiment("Jesus-Carbajal-experiment")

# Human-readable class names for the printed report (str() guards against
# non-string category values such as NaN).
report_target_names = [str(name) for name in label_names]

for model_name, (model, param_grid) in params_grid.items():
    print(f"Running GridSearchCV for {model_name}...")
    # 3-fold cross-validated search over the grid, selecting on accuracy.
    grid_search = GridSearchCV(
        model,
        param_grid,
        cv=3,
        scoring='accuracy',
        n_jobs=1,
        verbose=2
    )

    grid_search.fit(text_train, sent_train)

    # Log the winning configuration of this model family as one MLflow run.
    with mlflow.start_run(run_name=f"{model_name}_grid_search"):
        # best_estimator_ is already refit on the whole training split.
        best_model = grid_search.best_estimator_

        # Predictions on the held-out test split
        y_pred = best_model.predict(text_test)

        # Compute evaluation metrics
        accuracy = accuracy_score(sent_test, y_pred)
        report = classification_report(sent_test, y_pred, output_dict=True)
        report_text = classification_report(
            sent_test,
            y_pred,
            labels=labels_list,
            target_names=report_target_names,
        )

        # Log the best hyper-parameters and the metrics
        mlflow.log_params(grid_search.best_params_)
        # Mean cross-validated accuracy of the selected candidate, kept next to
        # the held-out metrics so the two can be compared.
        mlflow.log_metric("cv_accuracy", grid_search.best_score_)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", report["weighted avg"]["precision"])
        mlflow.log_metric("recall", report["weighted avg"]["recall"])
        mlflow.log_metric("f1_score", report["weighted avg"]["f1-score"])

        # Persist the fitted best estimator as a run artifact
        mlflow.sklearn.log_model(best_model, artifact_path=f"best_model_{model_name}")

        print(f"\nModel: {model_name}")
        print(f"Best Parameters: {grid_search.best_params_}")
        print(f"Accuracy: {accuracy}")
        print("Classification Report:\n", report_text)

Running GridSearchCV for logreg...
Fitting 3 folds for each of 6 candidates, totalling 18 fits




[CV] END ...................C=0.01, max_iter=100, penalty=l2; total time= 1.6min
[CV] END ...................C=0.01, max_iter=100, penalty=l2; total time= 1.9min
[CV] END ...................C=0.01, max_iter=100, penalty=l2; total time= 1.9min
[CV] END ...................C=0.01, max_iter=200, penalty=l2; total time= 1.6min
[CV] END ...................C=0.01, max_iter=200, penalty=l2; total time= 1.9min
[CV] END ...................C=0.01, max_iter=200, penalty=l2; total time= 1.9min
[CV] END ....................C=0.1, max_iter=100, penalty=l2; total time= 1.9min


KeyboardInterrupt: 