## Data Preparation and Data Wrangling

In [36]:
import json 
import pandas as pd
import os

In [37]:
file_path = '../data/raw_data/tickets_classification_eng.json'

In [38]:
if not os.path.exists(file_path):
    print(f"El archivo {file_path} no existe")
else:
    with open(file_path, "r") as file:
        datos = json.load(file)

In [39]:
df = pd.json_normalize(datos)

In [40]:
df_clean = df.copy()

Renombramos columnas

In [41]:
df_clean = df_clean[['_source.complaint_what_happened', '_source.product', '_source.sub_product']]
df_clean.rename(columns={
        '_source.complaint_what_happened': 'complaint_what_happened',
        '_source.product': 'category',
        '_source.sub_product': 'sub_product'
    }, inplace=True)
df_clean

Unnamed: 0,complaint_what_happened,category,sub_product
0,,Debt collection,Credit card debt
1,Good morning my name is XXXX XXXX and I apprec...,Debt collection,Credit card debt
2,I upgraded my XXXX XXXX card in XX/XX/2018 and...,Credit card or prepaid card,General-purpose credit card or charge card
3,,Mortgage,Conventional home mortgage
4,,Credit card or prepaid card,General-purpose credit card or charge card
...,...,...,...
78308,,Checking or savings account,Checking account
78309,"On Wednesday, XX/XX/XXXX I called Chas, my XXX...",Credit card or prepaid card,General-purpose credit card or charge card
78310,I am not familiar with XXXX pay and did not un...,Checking or savings account,Checking account
78311,I have had flawless credit for 30 yrs. I've ha...,Credit card or prepaid card,General-purpose credit card or charge card


In [42]:
df_clean['ticket_classification'] = df_clean['category'] + ' + ' + df_clean['sub_product']

In [43]:
df_clean.drop(['category', 'sub_product'], axis=1, inplace=True)

In [45]:
df_clean['complaint_what_happened'].replace('', pd.NA, inplace=True)
df_clean

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_clean['complaint_what_happened'].replace('', pd.NA, inplace=True)


Unnamed: 0,complaint_what_happened,ticket_classification
0,,Debt collection + Credit card debt
1,Good morning my name is XXXX XXXX and I apprec...,Debt collection + Credit card debt
2,I upgraded my XXXX XXXX card in XX/XX/2018 and...,Credit card or prepaid card + General-purpose ...
3,,Mortgage + Conventional home mortgage
4,,Credit card or prepaid card + General-purpose ...
...,...,...
78308,,Checking or savings account + Checking account
78309,"On Wednesday, XX/XX/XXXX I called Chas, my XXX...",Credit card or prepaid card + General-purpose ...
78310,I am not familiar with XXXX pay and did not un...,Checking or savings account + Checking account
78311,I have had flawless credit for 30 yrs. I've ha...,Credit card or prepaid card + General-purpose ...


In [48]:
df_clean.dropna(subset=['complaint_what_happened', 'ticket_classification'], inplace=True)

In [49]:
df_clean

Unnamed: 0,complaint_what_happened,ticket_classification
1,Good morning my name is XXXX XXXX and I apprec...,Debt collection + Credit card debt
2,I upgraded my XXXX XXXX card in XX/XX/2018 and...,Credit card or prepaid card + General-purpose ...
10,Chase Card was reported on XX/XX/2019. However...,"Credit reporting, credit repair services, or o..."
11,"On XX/XX/2018, while trying to book a XXXX XX...","Credit reporting, credit repair services, or o..."
14,my grand son give me check for {$1600.00} i de...,Checking or savings account + Checking account
...,...,...
78301,My husband passed away. Chase bank put check o...,Checking or savings account + Checking account
78303,After being a Chase Card customer for well ove...,Credit card or prepaid card + General-purpose ...
78309,"On Wednesday, XX/XX/XXXX I called Chas, my XXX...",Credit card or prepaid card + General-purpose ...
78310,I am not familiar with XXXX pay and did not un...,Checking or savings account + Checking account


Reseteamos index 

In [51]:
df_clean.reset_index(drop=True, inplace=True)

In [52]:
df_clean

Unnamed: 0,complaint_what_happened,ticket_classification
0,Good morning my name is XXXX XXXX and I apprec...,Debt collection + Credit card debt
1,I upgraded my XXXX XXXX card in XX/XX/2018 and...,Credit card or prepaid card + General-purpose ...
2,Chase Card was reported on XX/XX/2019. However...,"Credit reporting, credit repair services, or o..."
3,"On XX/XX/2018, while trying to book a XXXX XX...","Credit reporting, credit repair services, or o..."
4,my grand son give me check for {$1600.00} i de...,Checking or savings account + Checking account
...,...,...
18958,My husband passed away. Chase bank put check o...,Checking or savings account + Checking account
18959,After being a Chase Card customer for well ove...,Credit card or prepaid card + General-purpose ...
18960,"On Wednesday, XX/XX/XXXX I called Chas, my XXX...",Credit card or prepaid card + General-purpose ...
18961,I am not familiar with XXXX pay and did not un...,Checking or savings account + Checking account


In [53]:
clean_data_dir = '../data/clean_data'
os.makedirs(clean_data_dir, exist_ok=True)

In [54]:
output_path = os.path.join(clean_data_dir, 'clean_tickets.csv')
df_clean.to_csv(output_path, index=False)

print(f"Datos limpios guardados en {output_path}")

Datos limpios guardados en ../data/clean_data/clean_tickets.csv


## Data Wrangling

Importamos las librerías