In [38]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

In [39]:
data = pd.read_csv("../data/clean/processed_data.csv")

In [40]:
def preprocess_text(text):
    # minusculas
    text = text.lower()
    
    # Eliminar puntuación
    text = ''.join([char for char in text if char not in string.punctuation])

    # Eliminar cadenas con más de dos 'X' consecutivas
    text = re.sub(r'x{2,}', '', text)

    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    
    # Tokenización
    words = nltk.word_tokenize(text)
    
    # Eliminar stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    
    # Lematización
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    
    # Unir las palabras preprocesadas
    return ' '.join(words)

In [41]:
data['complaint_what_happened'] = data['complaint_what_happened'].apply(preprocess_text)
data['ticket_classification'] = data['ticket_classification'].apply(preprocess_text)


In [42]:
X = data['complaint_what_happened']
y = data['ticket_classification']

# Dividir en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [43]:
# vectorizador TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

In [44]:
# transformar los datos de entrenamiento
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)


In [45]:
# transformar los datos de prueba
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [46]:
import dagshub
import mlflow

In [47]:
dagshub.init(repo_owner='zapatacc', repo_name='final-exam-pcd2024-autumn', mlflow=True)

In [48]:
mlflow.set_experiment("mariapaula-perez-logisticRegression")

<Experiment: artifact_location='mlflow-artifacts:/325c6ccf10f6419fa3a07d45f3c20ef2', creation_time=1732160772548, experiment_id='12', last_update_time=1732160772548, lifecycle_stage='active', name='mariapaula-perez-logisticRegression', tags={}>

Logistic Regression

In [49]:
with mlflow.start_run(run_name='Regression Logistica'):
    lr_model = LogisticRegression(max_iter=2000)
    
    #fit al modelo
    lr_model.fit(X_train_tfidf, y_train)
    
    # hacer predict
    y_pred = lr_model.predict(X_test_tfidf)
    
    # metricas de performace
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    mlflow.log_metric('accuracy', accuracy)
    
    # registrar el modelo
    mlflow.sklearn.log_model(lr_model, 'logistic_regression_model')
    
    print(f"Accuracy: {accuracy}")
    print(classification_report(y_test, y_pred))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.5921434220933298
                                                                                          precision    recall  f1-score   support

                                                bank account service bank productservice       0.00      0.00      0.00        60
                                      bank account service cashing check without account       0.00      0.00      0.00         4
                                             bank account service cd certificate deposit       0.00      0.00      0.00         4
                                                   bank account service checking account       0.57      0.34      0.43       249
                                                     bank account service saving account       0.00      0.00      0.00        14
                                         checking saving account banking product service       0.00      0.00      0.00        48
                                          checking saving ac

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
2024/11/21 14:48:45 INFO mlflow.tracking._tracking_service.client: 🏃 View run Regression Logistica at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/12/runs/013384fc119745069d059bfcd95de104.
2024/11/21 14:48:45 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/12.


Accuracy de 59%. Intentar mejorar haciendo gridsearch

In [50]:
parameters = {
    'C': [0.1, 1, 5],          #regularización inversa
    'penalty': ['l1', 'l2'],    # Tipos de penalización (l1, l2)
    'solver': ['liblinear']   
}

# GridSearchCV
grid_search_lr = GridSearchCV(
    LogisticRegression(max_iter=1000),
    parameters,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

In [51]:
#hacer el run de mlflow
with mlflow.start_run(run_name='LogisticRegression Mejorar'):
    # intentar diferentes hiperparámetros
    grid_search_lr.fit(X_train_tfidf, y_train)
    
    # Mejor modelo
    better_model = grid_search_lr.best_estimator_
    
    # predicciones
    y_pred = better_model.predict(X_test_tfidf)
    
    # Calcular métricas
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    # Registrar métricas y parámetros
    mlflow.log_params(grid_search_lr.best_params_)
    mlflow.log_metric('accuracy', accuracy)
    
    mlflow.sklearn.log_model(better_model, 'better_logistic_regression_mp')
    
    print(f"Best Params: {grid_search_lr.best_params_}")
    print(f"Accuracy: {accuracy}")
    print(classification_report(y_test, y_pred))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Best Params: {'C': 5, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy: 0.5974162931716319
                                                                                          precision    recall  f1-score   support

                                                bank account service bank productservice       0.25      0.02      0.03        60
                                      bank account service cashing check without account       0.00      0.00      0.00         4
                                             bank account service cd certificate deposit       0.00      0.00      0.00         4
                                                   bank account service checking account       0.50      0.39      0.44       249
                                                     bank account service saving account       1.00      0.07      0.13        14
                                         checking saving account banking product service       0.33      0.02      0.04        4

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
2024/11/21 14:51:14 INFO mlflow.tracking._tracking_service.client: 🏃 View run LogisticRegression Mejorar at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/12/runs/31d7515902d84b68ba08155a68c8deda.
2024/11/21 14:51:14 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/12.


Subio a 60. :)

# Modelo 2: Suport Vector Classifier

In [52]:
mlflow.set_experiment('mariapaula-perez-SVClassifier')

<Experiment: artifact_location='mlflow-artifacts:/bfdaea6ea41c4c0fa7411726fadd8133', creation_time=1732168497880, experiment_id='16', last_update_time=1732168497880, lifecycle_stage='active', name='mariapaula-perez-SVClassifier', tags={}>

In [53]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

In [54]:
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [55]:
with mlflow.start_run(run_name='SVC_MP'):
    # Crear un pipeline 
    svc_model = Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000)),
        ('svc', SVC(kernel='linear', C=1))
    ])
    
    # Entrenar el pipeline
    svc_model.fit(X_train, y_train_encoded)
    
    y_pred = svc_model.predict(X_test)
    
    accuracy = accuracy_score(y_test_encoded, y_pred)
    report = classification_report(y_test_encoded, y_pred, output_dict=True)
    
    # Registrar metricas
    mlflow.log_metric('accuracy', accuracy)
    
    # Registrar el modelo 
    mlflow.sklearn.log_model(svc_model, 'svc_model')
    
    print(f"Accuracy: {accuracy}")
    print(classification_report(y_test_encoded, y_pred))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.6092802530978118
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        60
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00         4
           3       0.59      0.39      0.46       249
           4       1.00      0.07      0.13        14
           5       0.00      0.00      0.00        48
           6       0.25      0.20      0.22         5
           7       0.61      0.88      0.72       758
           8       0.78      0.15      0.25        46
           9       0.00      0.00      0.00         7
          10       0.00      0.00      0.00         5
          11       0.50      0.14      0.22        35
          12       0.76      0.89      0.82       984
          13       1.00      0.12      0.22         8
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00         2
          16       0.00      0.00      0.00         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
2024/11/21 14:55:31 INFO mlflow.tracking._tracking_service.client: 🏃 View run SVC_MP at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/16/runs/5407b23cfc464a7593f52ffa833f2899.
2024/11/21 14:55:31 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/16.


Accuracy de 61, mejoro un poco. 

In [56]:
# from mlflow.tracking import MlflowClient

# client = MlflowClient()

# run_id_champion = "c0946a5c2dc44f379a398377d97244c5"
# run_id_challenger = "c3c0702311004d98984b4f17d6cce6cf"

# # Registra el Champion
# mlflow.register_model(
#     model_uri=f"runs:/{run_id_champion}/model",
#     name="Champion-Model"
# )

# # Registra el Challenger
# mlflow.register_model(
#     model_uri=f"runs:/{run_id_challenger}/model",
#     name="Challenger-Model"
# )
