In [1]:
import pickle
import pandas as pd
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import mlflow
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv('../data/clean_data/cleaned_corpus.csv')

In [3]:
#Target y predictor
X = df['complaint_what_happened']
y = df['ticket_classification']

# Mapear categorías
category_mapping = {category: index for index, category in enumerate(y.unique())}
y_mapped = y.map(category_mapping)
y = y_mapped.tolist()
label_names = list(category_mapping.keys())
labels_list = list(category_mapping.values())

In [6]:
# Vectorizamos usando TFIDF
vectorizer = TfidfVectorizer(stop_words = stopwords.words('english'))
X_TF = vectorizer.fit_transform(X).toarray()

In [14]:
import dagshub

dagshub.init(repo_owner='zapatacc', repo_name='final-exam-pcd2024-autumn', mlflow=True)
mlflow.set_experiment("jesus-carbajal-logreg-rf")

<Experiment: artifact_location='mlflow-artifacts:/84a5ce7783a145f4885894900ed57256', creation_time=1732137494418, experiment_id='2', last_update_time=1732137494418, lifecycle_stage='active', name='jesus-carbajal-logreg-rf', tags={}>

In [15]:
# Tfid
text_train, text_test, sent_train, sent_test = train_test_split(X_TF, y, test_size = 0.20, random_state = 309)

#Param grid
params_grid = {
    "logreg" : (LogisticRegression(),{
        'C': [0.01, 0.1, 1],
        #'max_iter': [100, 200],
        'penalty': ['l2', 'elasticnet']}),

    "RF" : (RandomForestClassifier(),{
        "max_depth":[5, 10],
        "n_estimators": [100, 200]})
}


for model_name, (model, param_grid) in params_grid.items():
    print(f"Running GridSearchCV for {model_name}...")
    grid_search = GridSearchCV(
        model,
        param_grid,
        cv=3,
        scoring='accuracy',
        n_jobs=1,
        verbose=2
    )
    
    grid_search.fit(text_train, sent_train)
    
    # Loggear en mlflow
    with mlflow.start_run(run_name=f"{model_name}_grid_search"):
        best_model = grid_search.best_estimator_
        
        # predicciones
        y_pred = best_model.predict(text_test)
        
        # calcular métricas
        accuracy = accuracy_score(sent_test, y_pred)
        report = classification_report(sent_test, y_pred, output_dict=True)
        
        # Loggear el mejor modelo
        mlflow.log_params(grid_search.best_params_)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", report["weighted avg"]["precision"])
        mlflow.log_metric("recall", report["weighted avg"]["recall"])
        mlflow.log_metric("f1_score", report["weighted avg"]["f1-score"])
        
    
        mlflow.sklearn.log_model(best_model, artifact_path=f"jesus-carbajal-{model_name}")
        
        print(f"\nModel: {model_name}")
        print(f"Best Parameters: {grid_search.best_params_}")
        print(f"Accuracy: {accuracy}")
        print("Classification Report:\n", classification_report(sent_test, y_pred))

Running GridSearchCV for logreg...
Fitting 3 folds for each of 6 candidates, totalling 18 fits




[CV] END .................................C=0.01, penalty=l2; total time= 1.5min
[CV] END .................................C=0.01, penalty=l2; total time= 2.1min
[CV] END .................................C=0.01, penalty=l2; total time= 1.4min
[CV] END .........................C=0.01, penalty=elasticnet; total time=   1.5s
[CV] END .........................C=0.01, penalty=elasticnet; total time=   1.4s
[CV] END .........................C=0.01, penalty=elasticnet; total time=   1.4s
[CV] END ..................................C=0.1, penalty=l2; total time= 1.4min
[CV] END ..................................C=0.1, penalty=l2; total time= 2.0min
[CV] END ..................................C=0.1, penalty=l2; total time= 1.5min
[CV] END ..........................C=0.1, penalty=elasticnet; total time=   1.5s
[CV] END ..........................C=0.1, penalty=elasticnet; total time=   1.4s
[CV] END ..........................C=0.1, penalty=elasticnet; total time=   1.4s
[CV] END ...................

9 fits failed out of a total of 18.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
9 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\dosju\Downloads\LabExamen\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\dosju\Downloads\LabExamen\.venv\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\dosju\Downloads\LabExamen\.venv\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1194, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
         


Model: logreg
Best Parameters: {'C': 1, 'penalty': 'l2'}
Accuracy: 0.5713155813340364
Classification Report:
               precision    recall  f1-score   support

           0       0.42      0.22      0.29        88
           1       0.65      0.91      0.76       996
           2       0.00      0.00      0.00        14
           3       0.61      0.65      0.63       407
           4       0.56      0.84      0.67       733
           5       0.51      0.52      0.52        61
           6       0.59      0.22      0.32        59
           7       0.67      0.15      0.24        41
           8       0.38      0.55      0.45       209
           9       0.71      0.14      0.23        36
          10       0.00      0.00      0.00         7
          11       0.00      0.00      0.00        20
          12       0.78      0.19      0.31        36
          13       0.44      0.30      0.36        92
          14       0.22      0.06      0.10        32
          15       0.00 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Running GridSearchCV for RF...
Fitting 3 folds for each of 4 candidates, totalling 12 fits




[CV] END ......................max_depth=5, n_estimators=100; total time=  10.2s
[CV] END ......................max_depth=5, n_estimators=100; total time=  10.5s
[CV] END ......................max_depth=5, n_estimators=100; total time=   7.8s
[CV] END ......................max_depth=5, n_estimators=200; total time=  12.8s
[CV] END ......................max_depth=5, n_estimators=200; total time=  12.9s
[CV] END ......................max_depth=5, n_estimators=200; total time=  12.8s
[CV] END .....................max_depth=10, n_estimators=100; total time=  12.6s
[CV] END .....................max_depth=10, n_estimators=100; total time=  12.3s
[CV] END .....................max_depth=10, n_estimators=100; total time=  12.5s
[CV] END .....................max_depth=10, n_estimators=200; total time=  22.2s
[CV] END .....................max_depth=10, n_estimators=200; total time=  22.6s
[CV] END .....................max_depth=10, n_estimators=200; total time=  22.2s


  _data = np.array(data, dtype=dtype, copy=copy,
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Model: RF
Best Parameters: {'max_depth': 10, 'n_estimators': 200}
Accuracy: 0.3822831531769048
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        88
           1       0.33      0.98      0.49       996
           2       0.00      0.00      0.00        14
           3       0.93      0.10      0.19       407
           4       0.55      0.59      0.57       733
           5       0.00      0.00      0.00        61
           6       0.00      0.00      0.00        59
           7       0.00      0.00      0.00        41
           8       1.00      0.01      0.02       209
           9       0.00      0.00      0.00        36
          10       0.00      0.00      0.00         7
          11       0.00      0.00      0.00        20
          12       0.00      0.00      0.00        36
          13       0.00      0.00      0.00        92
          14       0.00      0.00      0.00        32
          15   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
