In [2]:
import pickle

import pandas as pd
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Lasso, LinearRegression, LogisticRegression, Ridge
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    root_mean_squared_error,
)
from sklearn.model_selection import GridSearchCV, train_test_split


In [3]:
# Read the cleaned corpus CSV from disk and preview its first rows.
corpus_csv = '../data/clean_data/cleaned_corpus.csv'
df = pd.read_csv(corpus_csv)
df.head()

Unnamed: 0,complaint_what_happened,ticket_classification
0,good morning name appreciate could help put st...,Debt collection + Credit card debt
1,upgraded 2018 told agent upgrade anniversary d...,Credit card or prepaid card + General-purpose ...
2,reported 2019 however fraudulent application s...,"Credit reporting, credit repair services, or o..."
3,2018 trying book ticket came across offer 300 ...,"Credit reporting, credit repair services, or o..."
4,grand son give check 1600 deposit fund clear c...,Checking or savings account + Checking account


In [4]:
# Predictor (complaint text) and target (ticket category)
X = df['complaint_what_happened']
y = df['ticket_classification']

# Encode each distinct category as an integer, numbered in the order
# in which y.unique() returns them.
categories = y.unique()
category_mapping = dict(zip(categories, range(len(categories))))
y_mapped = y.map(category_mapping)
# From here on `y` is a plain list of integer codes, not a Series.
y = y_mapped.tolist()
# Parallel lists: label_names[i] is the category whose code is labels_list[i].
label_names = list(category_mapping)
labels_list = list(category_mapping.values())

In [5]:
# Vectorize the complaint text with TF-IDF, dropping NLTK's English stop words.
vectorizer = TfidfVectorizer(stop_words = stopwords.words('english'))
# Keep the sparse matrix returned by fit_transform instead of calling .toarray():
# with an unrestricted vocabulary the dense (n_documents x n_terms) float array
# is mostly zeros and can exhaust memory, while train_test_split and
# LogisticRegression both accept sparse input directly.
X_TF = vectorizer.fit_transform(X)

# Logistic Regression

In [None]:
# Hold out 20% of the TF-IDF rows for the final evaluation.
# NOTE(review): X_TF was fitted on the whole corpus before this split, so the
# IDF weights have already seen the test documents — consider fitting the
# vectorizer on the training split only (e.g. inside a Pipeline).
text_train, text_test, sent_train, sent_test = train_test_split(X_TF, y, test_size = 0.20, random_state = 309)

# Hyperparameter grid.
# The grid is a list of two sub-grids because the penalty constrains the solver:
# the default 'lbfgs' solver only supports 'l2', so searching 'elasticnet' with
# it makes every such fit fail (GridSearchCV then records NaN scores). Elastic
# net needs solver='saga' plus an explicit l1_ratio.
regularization_strengths = [0.01, 0.1, 1]  # 'C' is the inverse regularization strength
iteration_budgets = [100, 200]  # Maximum solver iterations

param_grid = [
    {
        'penalty': ['l2'],
        'C': regularization_strengths,
        'max_iter': iteration_budgets,
    },
    {
        'penalty': ['elasticnet'],
        'solver': ['saga'],
        'l1_ratio': [0.5],  # Equal mix of L1 and L2
        'C': regularization_strengths,
        'max_iter': iteration_budgets,
    },
]


grid_search = GridSearchCV(
    LogisticRegression(random_state=309),
    param_grid,
    cv=3,  # 3-fold cross-validation
    scoring='accuracy',  # Metric to optimize
    n_jobs=2,
    verbose=2  # Higher verbosity for detailed output
)

# Training the model
grid_search.fit(text_train, sent_train)

# Best parameters and model (refit on the full training split by GridSearchCV)
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print(f"Best Parameters: {best_params}")

# Predictions and evaluation on the held-out split
sent_pred = best_model.predict(text_test)
cm = confusion_matrix(sent_test, sent_pred)
print("Confusion Matrix:\n", cm)
# zero_division=0 matches auto_metrics_multiclass: classes that are never
# predicted score 0 without emitting an UndefinedMetricWarning per class.
print("Classification Report:\n", classification_report(sent_test, sent_pred, zero_division=0))

Fitting 3 folds for each of 12 candidates, totalling 36 fits




In [30]:
#METRICS
def auto_metrics_multiclass(sent_test, sent_pred, average='macro'):
    """Summarise multiclass classification quality as a two-column table.

    Parameters
    ----------
    sent_test : array-like
        True class labels.
    sent_pred : array-like
        Predicted class labels, in the same order as ``sent_test``.
    average : str, default 'macro'
        Averaging strategy forwarded to ``precision_score``, ``recall_score``
        and ``f1_score`` (for example 'macro', 'micro' or 'weighted'). The
        default reproduces the previous hard-coded behaviour.

    Returns
    -------
    pandas.DataFrame
        Columns ``Metric`` and ``Value`` with one row each for Accuracy,
        Classification Error (``1 - accuracy``), Precision, Recall and
        F1-score, in that order.
    """
    accuracy = accuracy_score(sent_test, sent_pred)
    classification_error = 1 - accuracy

    # zero_division=0: a class with no predicted (or no true) samples
    # contributes 0 to the average instead of triggering a warning.
    precision = precision_score(sent_test, sent_pred, average=average, zero_division=0)
    recall = recall_score(sent_test, sent_pred, average=average, zero_division=0)
    f1 = f1_score(sent_test, sent_pred, average=average, zero_division=0)

    # Table with metrics
    metrics_df = pd.DataFrame({
        'Metric': ['Accuracy', 'Classification Error', 'Precision', 'Recall', 'F1-score'],
        'Value': [accuracy, classification_error, precision, recall, f1]
    })

    return metrics_df

# Build the metrics table from the current `sent_test` / `sent_pred` and show it
# as the cell output (bare last expression).
logistic_metrics = auto_metrics_multiclass(sent_test, sent_pred)
logistic_metrics

Unnamed: 0,Metric,Value
0,Accuracy,0.571579
1,Classification Error,0.428421
2,Precision,0.189042
3,Recall,0.102761
4,F1-score,0.113318


# Random Forest

In [None]:
# Random Forest baseline on a 2000-term TF-IDF representation.
# RandomForestClassifier and ConfusionMatrixDisplay are imported in the
# notebook's import cell.

# This cell previously read `corpus` and `df_corpus['category']`, which no cell
# in this notebook defines, so it failed on a fresh kernel. It now uses the same
# `df` columns as the logistic-regression section.
# NOTE(review): confirm these are the intended text / label columns.
X_list = df['complaint_what_happened']
y = df['ticket_classification']

# The TF-IDF matrix is kept sparse; the forest accepts sparse input directly.
# NOTE(review): the vectorizer is fitted before the split, so IDF weights see
# the test documents — consider fitting on the training split only.
vectorizer = TfidfVectorizer(max_features = 2000, stop_words = stopwords.words('english'))
X_TF = vectorizer.fit_transform(X_list)


# Map categories into integers (codes follow the order of y.unique())
category_mapping = {category: index for index, category in enumerate(y.unique())}
y_mapped = y.map(category_mapping)
y = y_mapped.tolist()
label_names = list(category_mapping.keys())
labels_list = list(category_mapping.values())

# Train / test split (same test_size and seed as the logistic-regression cell)
text_train, text_test, sent_train, sent_test = train_test_split(X_TF, y, test_size = 0.20, random_state = 309)

# Training the Random Forest
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=309)
rf_classifier.fit(text_train, sent_train)

# Predicts
sent_pred = rf_classifier.predict(text_test)

# Classes that actually occur in the true or predicted test labels, ascending
unique_classes = sorted(set(sent_test).union(set(sent_pred)))

# Integer codes and readable names for those classes, in the same order
class_labels = [labels_list[i] for i in unique_classes]
class_names = [label_names[i] for i in unique_classes]

# Pass the label order explicitly so matrix rows/columns are guaranteed to line
# up with `class_names` on the plot.
cm = confusion_matrix(sent_test, sent_pred, labels=class_labels)

cm_display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)

# Plot. The display creates its own figure/axes, so no direct pyplot calls are
# needed; the notebook's inline backend renders the figure when the cell ends.
cm_display.plot(cmap='Blues', xticks_rotation='vertical')
fig, ax = cm_display.figure_, cm_display.ax_
ax.set_title('Confusion Matrix');