## Data wrangling

### Importamos las librerias necesarias

In [2]:
import json
import pandas as pd

### Cargamos el archivo json con el que estaremos trabajando

In [5]:
file_path = "../Data/raw_data/tickets_classification_eng.json"

with open(file_path, "r") as file:
    datos = json.load(file)

### Leemos los datos en un dataframe

In [10]:
df = pd.json_normalize(datos)

### Hacemos todo el preprocesmiento que se nos sugiere en las hints

In [11]:
# Seleccionamos las columnas
df = df[['_source.complaint_what_happened', '_source.product', '_source.sub_product']]
# Renombramos las columnas
df.rename(columns={
    '_source.complaint_what_happened': 'complaint_what_happened',
    '_source.product': 'category',
    '_source.sub_product': 'sub_product'
}, inplace=True)
# Creamos la  nueva columa
df['ticket_classification'] = df['category'] + ' + ' + df['sub_product']
# Eliminamos las columnas de sobra
df.drop(columns=['category', 'sub_product'], inplace=True)
# Nos aseguramos de que no haya nulos   
df['complaint_what_happened'].replace('', pd.NA, inplace=True)
# Eliminamos las filas con datos faltantes
df.dropna(subset=['complaint_what_happened', 'ticket_classification'], inplace=True)
# Reiniciamos el indice
df.reset_index(drop=True, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['complaint_what_happened'].replace('', pd.NA, inplace=True)


In [12]:
# Vemos que todo se hizo de manera correcta
df

Unnamed: 0,complaint_what_happened,ticket_classification
0,Good morning my name is XXXX XXXX and I apprec...,Debt collection + Credit card debt
1,I upgraded my XXXX XXXX card in XX/XX/2018 and...,Credit card or prepaid card + General-purpose ...
2,Chase Card was reported on XX/XX/2019. However...,"Credit reporting, credit repair services, or o..."
3,"On XX/XX/2018, while trying to book a XXXX XX...","Credit reporting, credit repair services, or o..."
4,my grand son give me check for {$1600.00} i de...,Checking or savings account + Checking account
...,...,...
18958,My husband passed away. Chase bank put check o...,Checking or savings account + Checking account
18959,After being a Chase Card customer for well ove...,Credit card or prepaid card + General-purpose ...
18960,"On Wednesday, XX/XX/XXXX I called Chas, my XXX...",Credit card or prepaid card + General-purpose ...
18961,I am not familiar with XXXX pay and did not un...,Checking or savings account + Checking account


In [13]:
# Finalmente guardamos el dataframe en un archivo CSV
output_file_path = "../Data/Clean_data/clean_data.csv"
df.to_csv(output_file_path, index=False)

### Hare una breve limpieza de texto con el fin de que el modelo tenga mejores resultados

In [15]:
import nltk
from nltk.corpus import stopwords
import re
import contractions

In [16]:
stop_words = set(stopwords.words('english'))

def preprocesar_titulo(texto):
    # Expandir contracciones
    texto = contractions.fix(texto)
    
    # Eliminamos 'X' mayúsculas porque representan datos confindenciales y no dan valor
    texto = re.sub(r'X+', '', texto)
    
    # Convertimos a minúsculas
    texto = texto.lower()
    
    # Tokenizamos el texto
    palabras = nltk.word_tokenize(texto)
    
    # Eliminamos las stopwords
    palabras = [palabra for palabra in palabras if palabra not in stop_words]
    
    # Unimos las palabras en un solo string
    texto_limpio = ' '.join(palabras)
    
    return texto_limpio

In [17]:
df['complaint_what_happened'] = df['complaint_what_happened'].apply(preprocesar_titulo)

### Así queda el dataframe final con el texto preprocesado para su posterior modelado

In [18]:
df

Unnamed: 0,complaint_what_happened,ticket_classification
0,good morning name appreciate could help put st...,Debt collection + Credit card debt
1,upgraded card //2018 told agent upgrade annive...,Credit card or prepaid card + General-purpose ...
2,"chase card reported //2019 . however , fraudul...","Credit reporting, credit repair services, or o..."
3,"//2018 , trying book ticket , came across offe...","Credit reporting, credit repair services, or o..."
4,grand son give check { $ 1600.00 } deposit cha...,Checking or savings account + Checking account
...,...,...
18958,husband passed away . chase bank put check hol...,Checking or savings account + Checking account
18959,"chase card customer well decade , offered mult...",Credit card or prepaid card + General-purpose ...
18960,"wednesday , // called chas , visa credit card ...",Credit card or prepaid card + General-purpose ...
18961,familiar pay understand great risk provides co...,Checking or savings account + Checking account


In [19]:
output_file_path = "../Data/Clean_data/transformed_data.csv"
df.to_csv(output_file_path, index=False)