# Data reading and transformation

In [173]:
#Importamos librerías
import pandas as pd
import json

In [174]:
#Abrimos el json, lo cargamos y normalizamos
file_path = "../data/raw_data/tickets_classification_eng.json"
with open(file_path, "r") as file:  
    datos = json.load(file)

df = pd.json_normalize(datos)

In [175]:
#Seleccionamos sólo las variables de interés
df = df[['_source.complaint_what_happened', '_source.product', '_source.sub_product']]

#Renombramos para facilitar el manejo de las columnas
df = df.rename(columns={'_source.complaint_what_happened':'complaint_what_happened',
                '_source.product':'category',
                '_source.sub_product':'sub_product'
                })

#Creamos la nueva columna de clasificación
df['ticket_classification'] = df['category'] + " + " + df['sub_product']


In [176]:
df['category'].unique()

array(['Debt collection', 'Credit card or prepaid card', 'Mortgage',
       'Checking or savings account',
       'Credit reporting, credit repair services, or other personal consumer reports',
       'Vehicle loan or lease',
       'Money transfer, virtual currency, or money service',
       'Student loan', 'Consumer Loan', 'Credit card',
       'Bank account or service',
       'Payday loan, title loan, or personal loan', 'Money transfers',
       'Credit reporting', 'Payday loan', 'Prepaid card',
       'Other financial service'], dtype=object)

In [177]:

#Dropeamos columnas redundantes
df = df.drop(['category', 'sub_product'], axis=1)

#Llenamos los registros vacíos con nulos de pandas
df['complaint_what_happened'] = df['complaint_what_happened'].replace("", pd.NA)

#Dropeamos los nulos
df = df.dropna()

#Reseteamos el index
df = df.reindex()

In [178]:
#Checamos que todo haya salido bien
df.head()

Unnamed: 0,complaint_what_happened,ticket_classification
1,Good morning my name is XXXX XXXX and I apprec...,Debt collection + Credit card debt
2,I upgraded my XXXX XXXX card in XX/XX/2018 and...,Credit card or prepaid card + General-purpose ...
10,Chase Card was reported on XX/XX/2019. However...,"Credit reporting, credit repair services, or o..."
11,"On XX/XX/2018, while trying to book a XXXX XX...","Credit reporting, credit repair services, or o..."
14,my grand son give me check for {$1600.00} i de...,Checking or savings account + Checking account


In [None]:
#checamos si hay errores gramaticales o pares de categorías para lo mismo
sorted(df.ticket_classification.unique())

['Bank account or service + (CD) Certificate of deposit',
 'Bank account or service + Cashing a check without an account',
 'Bank account or service + Checking account',
 'Bank account or service + Other bank product/service',
 'Bank account or service + Savings account',
 'Checking or savings account + CD (Certificate of Deposit)',
 'Checking or savings account + Checking account',
 'Checking or savings account + Other banking product or service',
 'Checking or savings account + Personal line of credit',
 'Checking or savings account + Savings account',
 'Consumer Loan + Installment loan',
 'Consumer Loan + Pawn loan',
 'Consumer Loan + Title loan',
 'Consumer Loan + Vehicle lease',
 'Consumer Loan + Vehicle loan',
 'Credit card or prepaid card + General-purpose credit card or charge card',
 'Credit card or prepaid card + General-purpose prepaid card',
 'Credit card or prepaid card + Gift card',
 'Credit card or prepaid card + Government benefit card',
 'Credit card or prepaid card + 

In [159]:
#Guardamos en el directorio de data transformada
df.to_csv('../data/transformed_data/preprocessed.csv', index=False)

# Data Cleaning

In [120]:
import re
import pandas as pd
import contractions
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

stop_words = set(stopwords.words('english'))

### Vamos a eliminar las palabras cuya frecuencia sobrepase el treshold, ya que existen palabras que se repiten mucho y no son precisamente stopwords, en especial chase, jp, y otros términos del banco que son casi constantes, por lo que no representan nada para el modelo y es mejor eliminarlas


In [121]:

def delete_frequent_words(corpus, threshold=0.75):
    # Creamos un CountVectorizer para obtener el conteo 
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform(corpus)
    
    # Obtener la frecuencia de cada palabra
    word_counts = X.sum(axis=0).A1  #Convertimos en un array unidimensional
    word_list = vectorizer.get_feature_names_out()
    
    # Calculamos el treshold en base al porcentaje y el tamaño del df
    threshold_count = len(corpus) * threshold
    
    # Obtenemos las palabras que superen el treshold
    frequent_words = {word_list[i] for i, count in enumerate(word_counts) if count > threshold_count}
    
    
    filtered_corpus = []
    for doc in corpus:
        # Eliminamos esas palabras del corpus
        filtered_doc = ' '.join([word for word in doc.split() if word not in frequent_words])
        filtered_corpus.append(filtered_doc)
    
    return filtered_corpus

### Aquí vamos a hacer una pequeña limpieza, estirando contracciones a su forma base, convertir a minúsculas, eliminar todos los caracteres no alfanuméricos, las censuras y por último los stopwords

In [122]:
#Limpiamos otras cosillas con regex descontraemos y quitamos stopwords
def clean_complaint(complaint):

    #Convertimos a minúsculas
    complaint = complaint.lower()

    #Descontraemos
    complaint = contractions.fix(complaint)

    #Quitamos donde haya dos o más x
    complaint = re.sub(r'xx+', '', complaint)
    
    #Eliminar números
    #complaint = re.sub(r'\d', '', complaint)

    #Dejamos sólo alfanuméricos
    complaint = re.sub(r'\W', ' ', complaint)

    # Tokenizamos y quitamos stopwords
    complaint_tokens = word_tokenize(complaint)
    complaint = ' '.join([word for word in complaint_tokens if word not in stop_words])

    return complaint

In [123]:
# Procesamiento por batches
batch_size = 1000
cleaned_corpus = []

for start in range(0, len(df), batch_size):
    end = start + batch_size
    batch_complaints = df['complaint_what_happened'][start:end]
    
    # Limpiar cada complaint en cada batch
    cleaned_batch = batch_complaints.apply(clean_complaint)
    
    # Appendear a la lista
    cleaned_corpus.extend(cleaned_batch)

cleaned_corpus = delete_frequent_words(cleaned_corpus, threshold=0.80)

# Actualizar el df
df['complaint_what_happened'] = cleaned_corpus

In [124]:
#Checamos que todo esté en orden
df.head(5)

Unnamed: 0,complaint_what_happened,ticket_classification
1,good morning name appreciate could help put st...,Debt collection + Credit card debt
2,upgraded 2018 told agent upgrade anniversary d...,Credit card or prepaid card + General-purpose ...
10,reported 2019 however fraudulent application s...,"Credit reporting, credit repair services, or o..."
11,2018 trying book ticket came across offer 300 ...,"Credit reporting, credit repair services, or o..."
14,grand son give check 1600 deposit fund clear c...,Checking or savings account + Checking account


In [125]:
# Añadimos el treshold de rows mínimos por categoría, véase el EDA
counts = df['ticket_classification'].value_counts()
todelete = counts[counts < 10]
df = df[~df['ticket_classification'].isin(todelete)]

In [126]:
df.to_csv('../data/clean_data/cleaned_corpus.csv', index=False)