In [23]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
import pickle
import pathlib
from sklearn.feature_extraction import DictVectorizer

In [24]:
data = pd.read_csv("../data/clean/processed_data.csv")

In [25]:
def preprocess_text(text):
    # minusculas
    text = text.lower()
    
    # Eliminar puntuaci칩n
    text = ''.join([char for char in text if char not in string.punctuation])

    # Eliminar cadenas con m치s de dos 'X' consecutivas
    text = re.sub(r'x{2,}', '', text)

    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    
    # Tokenizaci칩n
    words = nltk.word_tokenize(text)
    
    # Eliminar stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    
    # Lematizaci칩n
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    
    # Unir las palabras preprocesadas
    return ' '.join(words)

In [26]:
def preprocess_data(data):
    processed_data = [preprocess_text(text) for text in data]
    
    # Convertir el texto procesado en una representaci칩n de caracter칤sticas usando DictVectorizer
    dv = DictVectorizer(sparse=False)
    
    # Convertir los textos procesados a un formato de diccionario 
    data_processed = dv.fit_transform([{'word_' + word: 1 for word in text.split()} for text in processed_data])
    
    return data_processed, dv

In [27]:
data['complaint_what_happened'] = data['complaint_what_happened'].apply(preprocess_text)
data['ticket_classification'] = data['ticket_classification'].apply(preprocess_text)


In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Crear el vectorizador TF-IDF
vectorizer = TfidfVectorizer()

# Ajustar y transformar la columna 'complaint_what_happened'
tfidf_matrix = vectorizer.fit_transform(data['complaint_what_happened'])

# Convertir la matriz en un DataFrame y asignar nombres a las columnas
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Concatenar las nuevas columnas al DataFrame original
data = pd.concat([data, tfidf_df], axis=1)

# Eliminar la columna original si ya no la necesitas
data.drop(columns=['complaint_what_happened'], inplace=True)

print(data.head())


                               ticket_classification   00  000  0000  00000  \
0                   debt collection credit card debt  0.0  0.0   0.0    0.0   
1  credit card prepaid card generalpurpose credit...  0.0  0.0   0.0    0.0   
2  credit reporting credit repair service persona...  0.0  0.0   0.0    0.0   
3  credit reporting credit repair service persona...  0.0  0.0   0.0    0.0   
4           checking saving account checking account  0.0  0.0   0.0    0.0   

   0000000  000remarks  001  0015946  002  ...  zip  zipcode  zipcodeaddress  \
0      0.0         0.0  0.0      0.0  0.0  ...  0.0      0.0             0.0   
1      0.0         0.0  0.0      0.0  0.0  ...  0.0      0.0             0.0   
2      0.0         0.0  0.0      0.0  0.0  ...  0.0      0.0             0.0   
3      0.0         0.0  0.0      0.0  0.0  ...  0.0      0.0             0.0   
4      0.0         0.0  0.0      0.0  0.0  ...  0.0      0.0             0.0   

   zipped  zombie  zone  zoned  zoo  zoom  z

In [34]:
from sklearn.preprocessing import LabelEncoder

# Crear un objeto LabelEncoder
label_encoder = LabelEncoder()

# Aplicar LabelEncoder a la columna 'ticket_classification'
data['ticket_classification'] = label_encoder.fit_transform(data['ticket_classification'])

print(data.head())


   ticket_classification   00  000  0000  00000  0000000  000remarks  001  \
0                     24  0.0  0.0   0.0    0.0      0.0         0.0  0.0   
1                     12  0.0  0.0   0.0    0.0      0.0         0.0  0.0   
2                     20  0.0  0.0   0.0    0.0      0.0         0.0  0.0   
3                     19  0.0  0.0   0.0    0.0      0.0         0.0  0.0   
4                      7  0.0  0.0   0.0    0.0      0.0         0.0  0.0   

   0015946  002  ...  zip  zipcode  zipcodeaddress  zipped  zombie  zone  \
0      0.0  0.0  ...  0.0      0.0             0.0     0.0     0.0   0.0   
1      0.0  0.0  ...  0.0      0.0             0.0     0.0     0.0   0.0   
2      0.0  0.0  ...  0.0      0.0             0.0     0.0     0.0   0.0   
3      0.0  0.0  ...  0.0      0.0             0.0     0.0     0.0   0.0   
4      0.0  0.0  ...  0.0      0.0             0.0     0.0     0.0   0.0   

   zoned  zoo  zoom  zoomed  
0    0.0  0.0   0.0     0.0  
1    0.0  0.0   0.0 

In [32]:
data

Unnamed: 0,complaint_what_happened,ticket_classification
0,good morning name appreciate could help put st...,debt collection credit card debt
1,upgraded card 2018 told agent upgrade annivers...,credit card prepaid card generalpurpose credit...
2,chase card reported 2019 however fraudulent ap...,credit reporting credit repair service persona...
3,2018 trying book ticket came across offer 3000...,credit reporting credit repair service persona...
4,grand son give check 160000 deposit chase acco...,checking saving account checking account
...,...,...
18958,husband passed away chase bank put check hold ...,checking saving account checking account
18959,chase card customer well decade offered multip...,credit card prepaid card generalpurpose credit...
18960,wednesday called chas visa credit card provide...,credit card prepaid card generalpurpose credit...
18961,familiar pay understand great risk provides co...,checking saving account checking account


In [35]:
# TF-IDF para las columnas preprocesadas
vectorizer = TfidfVectorizer(max_features=5000) 

# Aplicar TF-IDF 
X_complaint = vectorizer.fit_transform(data['complaint_what_happened'])
X_classification = vectorizer.fit_transform(data['ticket_classification'])

print(f"Forma de la matriz de 'complaint_what_happened': {X_complaint.shape}")
print(f"Forma de la matriz de 'ticket_classification': {X_classification.shape}")

KeyError: 'complaint_what_happened'

In [36]:

# Dividir los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_complaint, data['ticket_classification'], test_size=0.2, random_state=42)

print(f"Tama침o de entrenamiento: {X_train.shape}, Tama침o de prueba: {X_test.shape}")

Tama침o de entrenamiento: (15170, 5000), Tama침o de prueba: (3793, 5000)


In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sup칩n que 'text_column' es una columna con texto
vectorizer = TfidfVectorizer()

# Ajusta y transforma la columna de texto
tfidf_matrix = vectorizer.fit_transform(data['text_column'])

# Convierte la matriz TF-IDF en un DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Concatenar las nuevas columnas al DataFrame original
data = pd.concat([data, tfidf_df], axis=1)

# Eliminar la columna original de texto si ya no la necesitas
data.drop(columns=['text_column'], inplace=True)

print(data.head())


KeyError: 'text_column'

In [30]:
import pandas as pd

# Verifica si hay valores de tipo texto (str) en todo el DataFrame
text_columns = data.applymap(lambda x: isinstance(x, str))

# Muestra las filas y columnas que contienen cadenas de texto
text_in_dataframe = data[text_columns]

# Para ver qu칠 columnas contienen texto
columns_with_text = text_columns.any()

print("Columnas con texto:", columns_with_text)

# Si quieres ver las filas con texto, puedes imprimir una muestra
print(text_in_dataframe.head())


Columnas con texto: complaint_what_happened    True
ticket_classification      True
dtype: bool
                             complaint_what_happened  \
0  good morning name appreciate could help put st...   
1  upgraded card 2018 told agent upgrade annivers...   
2  chase card reported 2019 however fraudulent ap...   
3  2018 trying book ticket came across offer 3000...   
4  grand son give check 160000 deposit chase acco...   

                               ticket_classification  
0                   debt collection credit card debt  
1  credit card prepaid card generalpurpose credit...  
2  credit reporting credit repair service persona...  
3  credit reporting credit repair service persona...  
4           checking saving account checking account  


  text_columns = data.applymap(lambda x: isinstance(x, str))


In [8]:
import dagshub
import mlflow

In [9]:
dagshub.init(repo_owner='zapatacc', repo_name='final-exam-pcd2024-autumn', mlflow=True)

In [12]:
mlflow.set_experiment("mariapaula-perez-logisticRegression")

2024/11/20 21:46:12 INFO mlflow.tracking.fluent: Experiment with name 'mariapaula-perez-logisticRegression' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/325c6ccf10f6419fa3a07d45f3c20ef2', creation_time=1732160772548, experiment_id='12', last_update_time=1732160772548, lifecycle_stage='active', name='mariapaula-perez-logisticRegression', tags={}>

In [13]:
from hyperopt import STATUS_OK 

def objective(params):
    with mlflow.start_run(nested=True):
         
        # Etiqueta el modelo para identificar la familia
        mlflow.set_tag("model_family", "logistic_regression")
        
        # Log de par치metros
        mlflow.log_params(params)
        
        # usar los parametris
        model = LogisticRegression(params)
        
        # Entrenar el modelo
        model.fit(X_train, y_train)
        
        # Loguear el modelo entrenado con MLflow
        mlflow.sklearn.log_model(model, artifact_path="model")
         
        # Realizar predicciones sobre el conjunto de prueba
        y_pred = model.predict(X_test)
        
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        
        accuracy = accuracy_score(y_test, y_pred)
        
        # Log de las m칠tricas
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("accuracy", accuracy)
        
    return {'loss': rmse, 'status': STATUS_OK}


In [14]:
import mlflow
import mlflow.sklearn
from hyperopt import fmin, tpe, hp, Trials
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, mean_squared_error
import pathlib
import pickle

# Iniciar un run en MLflow
with mlflow.start_run(run_name="Logistic Regression Optimization", nested=True):
    
    # Definir el espacio de b칰squeda para los hiperpar치metros
    search_space = {
        'C': hp.loguniform('C', -3, 3),  # Regularizaci칩n
        'solver': hp.choice('solver', ['liblinear', 'lbfgs', 'saga']),  # Algoritmos de optimizaci칩n
        'max_iter': hp.choice('max_iter', [100, 200, 300]),  # N칰mero m치ximo de iteraciones
        'penalty': hp.choice('penalty', ['l2', 'elasticnet']),  # Tipo de regularizaci칩n
        'random_state': 42  # Estado aleatorio para la reproducibilidad
    }
    
    # Optimizaci칩n de los hiperpar치metros con Hyperopt
    best_params = fmin(
        fn=objective,  # La funci칩n objetivo a minimizar
        space=search_space,  # El espacio de b칰squeda de los par치metros
        algo=tpe.suggest,  # Algoritmo TPE para optimizaci칩n
        max_evals=10,  # N칰mero m치ximo de evaluaciones
        trials=Trials()  # Instancia para almacenar todas las evaluaciones
    )
    
    # Log de los mejores par치metros obtenidos despu칠s de la optimizaci칩n
    mlflow.log_params(best_params)

    # Etiquetas para el seguimiento del modelo
    mlflow.set_tags({
        "project": "Examen final MP",   
        "optimizer_engine": "hyper-opt",
        "model_family": "logistic_regression",
        "feature_set_version": 1
    })

    # Crear y entrenar el modelo final con los mejores par치metros encontrados
    model = LogisticRegression(**best_params)
    model.fit(X_train, y_train)
    
    # Log del modelo final en MLflow
    mlflow.sklearn.log_model(model, artifact_path="final_model")

    # Verifica si 'dv' (preprocesador) existe y es v치lido antes de guardarlo
    if 'dv' in locals() or 'dv' in globals():
        pathlib.Path("models").mkdir(exist_ok=True)  # Crear el directorio si no existe
        with open("models/preprocessor.pkl", "wb") as f_out:
            pickle.dump(dv, f_out)
        
        # Log del preprocesador como artefacto en MLflow
        mlflow.log_artifact("models/preprocessor.pkl", artifact_path="preprocessor")
    
    # Realizar predicciones sobre el conjunto de prueba
    y_pred = model.predict(X_test)

    # Calcular m칠tricas finales
    accuracy = accuracy_score(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred, squared=False)

    # Log de las m칠tricas finales
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("rmse", rmse)


  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]

2024/11/20 21:49:58 INFO mlflow.tracking._tracking_service.client: 游끢 View run inquisitive-toad-802 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/12/runs/87cba906477b462db10439f47c8d4e36.

2024/11/20 21:49:58 INFO mlflow.tracking._tracking_service.client: 游빍 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/12.

job exception: The 'penalty' parameter of LogisticRegression must be a str among {'elasticnet', 'l2', 'l1'} or None. Got {'C': 1.3268706049125965, 'max_iter': 200, 'penalty': 'elasticnet', 'random_state': 42, 'solver': 'liblinear'} instead.



  0%|          | 0/10 [00:01<?, ?trial/s, best loss=?]

2024/11/20 21:49:59 INFO mlflow.tracking._tracking_service.client: 游끢 View run Logistic Regression Optimization at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/12/runs/762b4a2525214d33a1907d75fb13b26a.





2024/11/20 21:49:59 INFO mlflow.tracking._tracking_service.client: 游빍 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/12.


InvalidParameterError: The 'penalty' parameter of LogisticRegression must be a str among {'elasticnet', 'l2', 'l1'} or None. Got {'C': 1.3268706049125965, 'max_iter': 200, 'penalty': 'elasticnet', 'random_state': 42, 'solver': 'liblinear'} instead.

In [15]:
def objective(params):
    with mlflow.start_run(nested=True):
        # Etiqueta el modelo para identificar la familia
        mlflow.set_tag("model_family", "logistic_regression")
        
        # Log de par치metros
        mlflow.log_params(params)
        
        # Crear el modelo con los par치metros
        model = LogisticRegression(
            penalty=params['penalty'],
            C=params['C'],
            solver=params['solver'],
            max_iter=1000,  # Asegurar convergencia
            random_state=42
        )
        
        # Entrenar el modelo
        model.fit(X_train, y_train)
        
        # Loguear el modelo entrenado con MLflow
        mlflow.sklearn.log_model(model, artifact_path="model")
         
        # Realizar predicciones sobre el conjunto de prueba
        y_pred = model.predict(X_test)
        
        # Calcular m칠tricas
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        accuracy = accuracy_score(y_test, y_pred)
        
        # Log de las m칠tricas
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("accuracy", accuracy)
        
        print(f"Par치metros: {params} | RMSE: {rmse:.4f} | Accuracy: {accuracy:.4f}")
        
    return {'loss': rmse, 'status': STATUS_OK}


In [19]:
search_space = {
    'penalty': hp.choice('penalty', ['l2', 'none']),
    'C': hp.loguniform('C', -4, 2),
    'solver': hp.choice('solver', ['lbfgs', 'liblinear']),
    'max_iter': hp.quniform('max_iter', 100, 1000, 100),  
}


In [20]:
def objective(params):
    with mlflow.start_run(nested=True):
        
        # Convertir max_iter a entero
        params['max_iter'] = int(params['max_iter'])
        
        # Tag model
        mlflow.set_tag("model_family", "logistic_regression")
        
        # Log parameters
        mlflow.log_params(params)
        
        # Crear el modelo con los par치metros
        model = LogisticRegression(
            penalty=params['penalty'],
            C=params['C'],
            solver=params['solver'],
            max_iter=params['max_iter'],
            random_state=42
        )
        
        # Train model
        model.fit(X_train, y_train)
        
        # Log LogisticRegression model
        mlflow.sklearn.log_model(model, artifact_path="model")
         
        # Predict on the test dataset
        y_pred = model.predict(X_test)
        
        # Calculate metrics
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        accuracy = accuracy_score(y_test, y_pred)
        
        # Log performance metrics
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("accuracy", accuracy)
        
        print(f"Par치metros: {params} | RMSE: {rmse:.4f} | Accuracy: {accuracy:.4f}")
    
    return {'loss': rmse, 'status': STATUS_OK}

In [21]:
with mlflow.start_run(run_name="Logistic Regression Hyper-parameter Optimization"):
    trials = Trials()
    
    # Optimizaci칩n de los hiperpar치metros
    best_params = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=10,
        trials=trials
    )
    
    # Convertir 'max_iter' a entero antes de registrar
    best_params['max_iter'] = int(best_params['max_iter'])
    
    # Loguear los mejores par치metros en MLflow
    mlflow.log_params(best_params)
    
    # Registrar el tag para identificar el tipo de modelo
    mlflow.set_tags({
        "project": "Logistic Regression Optimization",
        "optimizer_engine": "hyperopt",
        "model_family": "logistic_regression"
    })
    
    # Mostrar los mejores par치metros encontrados
    print(f"Mejores par치metros encontrados: {best_params}")

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



2024/11/20 22:05:41 INFO mlflow.tracking._tracking_service.client: 游끢 View run nosy-loon-559 at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/12/runs/045b26c277674a72b34a820da0a7768f.

2024/11/20 22:05:41 INFO mlflow.tracking._tracking_service.client: 游빍 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/12.

job exception: could not convert string to float: 'vehicle loan lease loan'



  0%|          | 0/10 [00:39<?, ?trial/s, best loss=?]


2024/11/20 22:05:42 INFO mlflow.tracking._tracking_service.client: 游끢 View run Logistic Regression Hyper-parameter Optimization at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/12/runs/350a27399038460199718966ac3c0d4f.
2024/11/20 22:05:42 INFO mlflow.tracking._tracking_service.client: 游빍 View experiment at: https://dagshub.com/zapatacc/final-exam-pcd2024-autumn.mlflow/#/experiments/12.


ValueError: could not convert string to float: 'vehicle loan lease loan'