In [1]:
import pandas as pd
import os
import regex as re
import json
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
# Directory that holds the raw JSON input, relative to the notebook location.
BASE_DIR = "../../data/json"

# Read the file inside a context manager so the handle is closed
# deterministically instead of being left to the garbage collector.
with open(os.path.join(BASE_DIR, "complaints_full.json"), "r") as complaints_file:
    results = json.load(complaints_file)

In [3]:
# Build the working frame from the loaded JSON content and keep only the
# columns used by the rest of the notebook.
columns_to_keep = ["odiNumber", "dateComplaintFiled", "components", "summary"]
df = pd.DataFrame(results)[columns_to_keep]

In [None]:
# Frequency of each distinct value in the "components" column
# (shown as the cell output).
component_counts = df["components"].value_counts()
component_counts

In [None]:
# Parse the filing date (MM/DD/YYYY) so rows can be selected by year.
df['dateComplaintFiled'] = pd.to_datetime(df['dateComplaintFiled'], format="%m/%d/%Y")

filing_year = df['dateComplaintFiled'].dt.year

# Complaints filed from 2014 through 2024 go to `df_filtrado`; complaints
# filed before 2014 go to `df_test`.
# Rows with any missing value are dropped, then rows with a repeated
# `summary` are dropped (first occurrence is kept).
# The calls are chained and non-inplace: mutating the result of a boolean
# selection with `inplace=True` triggers SettingWithCopyWarning and makes the
# cell harder to re-run safely.
df_filtrado = (
    df[(filing_year >= 2014) & (filing_year <= 2024)]
    .dropna()
    .drop_duplicates(subset=['summary'])
)
df_test = (
    df[filing_year < 2014]
    .dropna()
    .drop_duplicates(subset=['summary'])
)

In [6]:
# Label name -> integer class id, numbered in declaration order starting at 0.
classes = {
    name: index
    for index, name in enumerate(
        (
            "ELECTRICAL SYSTEM",
            "AIR BAGS",
            "STRUCTURE",
            "SERVICE BRAKES",
            "OTHER",
        )
    )
}

In [None]:
def _to_label(component: str) -> str:
    """Return the stripped component name if it is a known class, else "OTHER"."""
    name = component.strip()
    return name if name in classes.keys() else "OTHER"


# Derive the `label` column from `components` for both frames.
df_filtrado = df_filtrado.assign(label=df_filtrado["components"].apply(_to_label))
df_test = df_test.assign(label=df_test["components"].apply(_to_label))

In [8]:
# Undersample every class to the same size: half of the rarest class count, rounded.
# NOTE: this cell rebinds `df_filtrado`, so re-running it without re-running
# the previous cells shrinks the dataset again.
threshold_balanced = round(np.min(df_filtrado['label'].value_counts())/2)
labels = list(classes.keys())
# Fixed seed so the balanced subset (and therefore train/eval) is reproducible,
# consistent with the seeded test sampling below and the seeded train/eval split.
dados_filtrados = [
    df_filtrado[df_filtrado['label'] == label].sample(n=threshold_balanced, random_state=42)
    for label in labels
]
df_filtrado = pd.concat(dados_filtrados)

# Take a number of test samples equal to 20% of the dataset that will be used for training/evaluation.
num_amostras_test = round(len(df_filtrado) * 0.2)
df_test = df_test.sample(n=num_amostras_test, random_state=42)

In [9]:
def clean_text(text: str) -> str:
    """
    Normalize a free-text string.

    Steps, in order:
      1. Replace a fixed set of unwanted symbols (bullets, underscore, tab, ...) with a space.
      2. Collapse runs of hyphens into a single hyphen.
      3. Collapse runs of periods into a single period.
      4. Collapse runs of whitespace into a single space and trim both ends.
      5. Remove a whitespace character that sits right before a final period.
      6. Lowercase the result.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned, lowercased text.
    """
    # Replace unwanted characters with a space.
    text = re.sub(r'[•●▪•_·□»«#£¢¿&^~´`¨\t]', ' ', text)
    # Collapse repeated hyphens.
    text = re.sub(r'-+', '-', text)
    # Collapse repeated periods.
    text = re.sub(r'\.+', '.', text)
    # Collapse consecutive whitespace, then trim. Trimming must happen BEFORE
    # the end-of-sentence fix below; otherwise trailing whitespace hides the
    # final " ." from the `$` anchor and the stray space survives.
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove the whitespace before a final period.
    text = re.sub(r'\s\.$', '.', text)
    return text.lower()

In [10]:
# Normalize the `summary` text of both frames with `clean_text`.
df_filtrado = df_filtrado.assign(summary=df_filtrado['summary'].apply(clean_text))
df_test = df_test.assign(summary=df_test['summary'].apply(clean_text))

In [11]:
# Stratified 80/20 split of the balanced data, seeded for reproducibility.
# NOTE(review): the name `eval` shadows Python's built-in `eval`; it is kept
# here because the export cell refers to it.
train, eval = train_test_split(
    df_filtrado,
    test_size=0.2,
    random_state=42,
    stratify=df_filtrado['label'],
)

# Give every output frame a fresh 0..n-1 index, discarding the old one.
train = train.reset_index(drop=True)
eval = eval.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [12]:
# Write each split to its CSV file, without the index column.
for frame, csv_path in (
    (train, "../../data/csv/train.csv"),
    (eval, "../../data/csv/eval.csv"),
    (df_test, "../../data/csv/test.csv"),
):
    frame.to_csv(csv_path, index=False)