In [None]:
import json
import pandas as pd
from tqdm.auto import tqdm
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import STOPWORDS
from nltk.tokenize import word_tokenize, sent_tokenize

In [2]:
# Load the raw complaints JSON. The context manager guarantees the file handle
# is closed once parsing finishes (the bare open() call left it dangling), and
# the explicit encoding avoids depending on the platform default.
with open("../../data/json/complaints_full.json", encoding="utf-8") as complaints_file:
    dados_complaints = json.load(complaints_file)

In [None]:
# Build the working DataFrame from the parsed JSON payload and display it.
df = pd.DataFrame(data=dados_complaints)
df

In [None]:
# Keep one row per distinct summary, discard rows whose summary is missing,
# and renumber the index from zero.
df = (
    df
    .drop_duplicates(subset=['summary'])
    .dropna(subset=['summary'])
    .reset_index(drop=True)
)
df

In [None]:
# Keep only the columns used by the rest of the notebook.
# "odiNumber" has to survive this projection: a later cell filters the frame
# with df['odiNumber'].isin(...), which raises KeyError if the column is dropped here.
# A list (rather than a tuple) is the unambiguous way to select several columns with .loc.
df = df.loc[
    :,
    [
        "odiNumber",
        "manufacturer",
        "crash",
        "fire",
        "numberOfInjuries",
        "numberOfDeaths",
        "dateComplaintFiled",
        "components",
        "summary",
    ],
]
df

In [None]:
# Read the train / test / eval splits and stack them into a single frame.
train_df, test_df, val_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/var/projetos/Jupyterhubstorage/victor.silva/NHTSA-Complaint-Classifier/data/csv/train.csv",
        "/var/projetos/Jupyterhubstorage/victor.silva/NHTSA-Complaint-Classifier/data/csv/test.csv",
        "/var/projetos/Jupyterhubstorage/victor.silva/NHTSA-Complaint-Classifier/data/csv/eval.csv",
    )
)

dados_csv = pd.concat([train_df, test_df, val_df])
dados_csv

In [6]:
# Collect the odiNumber values present in the combined CSV splits as a plain list.
ids_selecionados = dados_csv.loc[:, "odiNumber"].to_list()

In [None]:
# Restrict the complaints to the records whose odiNumber appears in the CSV splits.
# .copy() detaches the result from the frame it was sliced from, so the feature
# columns assigned to df further down do not raise SettingWithCopyWarning.
df = df[df['odiNumber'].isin(ids_selecionados)].copy()
df

In [None]:
!pip install wordcloud

In [None]:
!pip install textstat

In [11]:
def unique_words(text):
    """Count the distinct alphabetic tokens in ``text``, ignoring case.

    The text is lower-cased and split with ``word_tokenize``; tokens for which
    ``str.isalpha`` is false (punctuation, numbers, mixed tokens) are skipped.

    Returns:
        int: number of different alphabetic tokens.
    """
    vocabulary = {token for token in word_tokenize(text.lower()) if token.isalpha()}
    return len(vocabulary)

def average_length(text):
    """Return the mean character length of the alphabetic tokens in ``text``.

    The text is split with ``word_tokenize`` and only tokens for which
    ``str.isalpha`` is true contribute to the average.

    Returns:
        float: the mean token length, or NaN when the text contains no
        alphabetic token.
    """
    lengths = [len(token) for token in word_tokenize(text) if token.isalpha()]
    if not lengths:
        # np.mean([]) also yields NaN but emits a "Mean of empty slice"
        # RuntimeWarning; return the same value explicitly and quietly.
        return np.nan
    return np.mean(lengths)

def stopword_proportion(text):
    """Return the fraction of tokens in ``text`` that are stopwords.

    The text is lower-cased and split with ``word_tokenize``. Every token
    (punctuation included) counts towards the denominator; a token counts as a
    stopword when it is a member of ``STOPWORDS``.

    Returns:
        The stopword share of all tokens, or 0 when the text yields no tokens.
    """
    tokens = word_tokenize(text.lower())
    if not tokens:
        return 0
    stop_set = set(STOPWORDS)
    hits = sum(1 for token in tokens if token in stop_set)
    return hits / len(tokens)

def sentence_count(text):
    """Return how many sentences ``sent_tokenize`` finds in ``text``."""
    sentences = sent_tokenize(text)
    return len(sentences)

In [None]:
# Register progress_apply on pandas objects so each pass shows a progress bar.
tqdm.pandas()

# New column name -> function computing it from a single summary string.
feature_extractors = {
    'unique_words': unique_words,
    'average_word_length': average_length,
    'stopword_proportion': stopword_proportion,
    'sentence_count': sentence_count,
}

for feature_name, extractor in feature_extractors.items():
    df[feature_name] = df['summary'].progress_apply(extractor)

In [None]:
sns.set(style="whitegrid")

fig, axes = plt.subplots(2, 2, figsize=(18, 12))

# One entry per panel, in row-major order: (column, color, title, x-axis label).
panel_specs = [
    ('unique_words', 'orange',
     'Distribuição do Número de Palavras Únicas', 'Número de Palavras Únicas'),
    ('average_word_length', 'green',
     'Distribuição do Comprimento Médio das Palavras', 'Comprimento Médio das Palavras'),
    ('stopword_proportion', 'red',
     'Distribuição da Proporção de Stopwords', 'Proporção de Stopwords'),
    ('sentence_count', 'purple',
     'Distribuição do Número de Sentenças', 'Número de Sentenças'),
]

# axes.flat walks the grid as [0, 0], [0, 1], [1, 0], [1, 1], matching panel_specs.
for panel_ax, (column, color, title, xlabel) in zip(axes.flat, panel_specs):
    sns.histplot(df[column], kde=True, color=color, bins=30, ax=panel_ax)
    panel_ax.set_title(title, fontsize=16)
    panel_ax.set_xlabel(xlabel, fontsize=12)
    panel_ax.set_ylabel('Frequência', fontsize=12)

plt.tight_layout()

plt.show()

In [None]:
# Frequency table of the 'components' column as a two-column frame:
# 'Component' holds the value, 'Count' holds how many rows carry it.
component_counts = (
    df['components']
    .value_counts()
    .rename_axis('Component')
    .reset_index(name='Count')
)

print(component_counts)


In [None]:
# Keep only the components that occur more than 50 times.
component_counts = component_counts.loc[component_counts["Count"].gt(50)]
component_counts

In [None]:
sns.set(style="whitegrid")

plt.figure(figsize=(12, 8))

# Horizontal bars: one bar per component, bar length is its count.
ax = sns.barplot(data=component_counts, y='Component', x='Count', palette='viridis')

ax.set_title('Frequencia do Component no conjunto de dados', fontsize=18, fontweight='bold')
ax.set_xlabel('Contagem', fontsize=14)
ax.set_ylabel('Component', fontsize=14)

# Flip the vertical axis direction of the bar chart.
ax.invert_yaxis()

plt.xticks(fontsize=12)
plt.yticks(fontsize=12)

plt.show()