In [10]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re

In [11]:
data = pd.read_csv("processed_data.csv")

In [12]:
def preprocess_text(text):
    # minusculas
    text = text.lower()
    
    # Eliminar puntuación
    text = ''.join([char for char in text if char not in string.punctuation])

    # Eliminar cadenas con más de dos 'X' consecutivas
    text = re.sub(r'x{2,}', '', text)

    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    
    # Tokenización
    words = nltk.word_tokenize(text)
    
    # Eliminar stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    
    # Lematización
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    
    # Unir las palabras preprocesadas
    return ' '.join(words)

In [13]:
data['complaint_what_happened'] = data['complaint_what_happened'].apply(preprocess_text)
data['ticket_classification'] = data['ticket_classification'].apply(preprocess_text)


In [14]:
data

Unnamed: 0,complaint_what_happened,ticket_classification
0,good morning name appreciate could help put st...,debt collection credit card debt
1,upgraded card 2018 told agent upgrade annivers...,credit card prepaid card generalpurpose credit...
2,chase card reported 2019 however fraudulent ap...,credit reporting credit repair service persona...
3,2018 trying book ticket came across offer 3000...,credit reporting credit repair service persona...
4,grand son give check 160000 deposit chase acco...,checking saving account checking account
...,...,...
18958,husband passed away chase bank put check hold ...,checking saving account checking account
18959,chase card customer well decade offered multip...,credit card prepaid card generalpurpose credit...
18960,wednesday called chas visa credit card provide...,credit card prepaid card generalpurpose credit...
18961,familiar pay understand great risk provides co...,checking saving account checking account


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF para las columnas preprocesadas
vectorizer = TfidfVectorizer(max_features=5000) 

# Aplicar TF-IDF 
X_complaint = vectorizer.fit_transform(data['complaint_what_happened'])
X_classification = vectorizer.fit_transform(data['ticket_classification'])

print(f"Forma de la matriz de 'complaint_what_happened': {X_complaint.shape}")
print(f"Forma de la matriz de 'ticket_classification': {X_classification.shape}")


Forma de la matriz de 'complaint_what_happened': (18963, 5000)
Forma de la matriz de 'ticket_classification': (18963, 78)


In [None]:
from sklearn.model_selection import train_test_split

# Dividir los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X_complaint, data['ticket_classification'], test_size=0.2, random_state=42)

print(f"Tamaño de entrenamiento: {X_train.shape}, Tamaño de prueba: {X_test.shape}")


Tamaño de entrenamiento: (15170, 5000), Tamaño de prueba: (3793, 5000)


In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# regresión logística
model = LogisticRegression(max_iter=1000)

# Entrenar el modelo
model.fit(X_train, y_train)

# Predecir las categorías para el conjunto de prueba
y_pred = model.predict(X_test)

# Evaluar el modelo
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"Reporte de clasificación:\n{classification_report(y_test, y_pred)}")


Accuracy: 0.5929343527550751
Reporte de clasificación:
                                                                                          precision    recall  f1-score   support

                                                bank account service bank productservice       1.00      0.02      0.04        55
                                      bank account service cashing check without account       0.00      0.00      0.00         7
                                             bank account service cd certificate deposit       0.00      0.00      0.00         3
                                                   bank account service checking account       0.57      0.36      0.44       217
                                                     bank account service saving account       0.00      0.00      0.00        17
                                         checking saving account banking product service       0.00      0.00      0.00        36
                                  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [19]:
X = data["complaint_what_happened"]
y = data["ticket_classification"]

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [21]:

# Vectorizar los datos de texto
tfidf = TfidfVectorizer(max_features=5000, stop_words='english') 
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [22]:
# Entrenar un modelo Random Forest
rf_model = RandomForestClassifier(
    n_estimators=200,  
    random_state=42,
    class_weight="balanced" 
)
rf_model.fit(X_train_tfidf, y_train)

# Hacer predicciones
y_pred = rf_model.predict(X_test_tfidf)

# Evaluar el modelo
print("Reporte de clasificación:")
print(classification_report(y_test, y_pred))


Reporte de clasificación:
                                                                                          precision    recall  f1-score   support

                                                bank account service bank productservice       0.00      0.00      0.00        60
                                      bank account service cashing check without account       0.00      0.00      0.00         4
                                             bank account service cd certificate deposit       0.00      0.00      0.00         4
                                                   bank account service checking account       0.33      0.00      0.01       249
                                                     bank account service saving account       1.00      0.07      0.13        14
                                         checking saving account banking product service       0.00      0.00      0.00        48
                                          checking saving accou

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
