In [1]:
# ===============================
# Engenharia de Features - NLP (Bloco 2 revisado)
# ===============================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Load the unified dataset whose text column has already been cleaned.
caminho_entrada = r'C:\Users\vbitu\projects\fake-news-etl-project\data\processed\dados_unificados_aosfatos_limpo.csv'
df = pd.read_csv(caminho_entrada)

# ===============================
# Word and character counts
# ===============================

# Missing texts are replaced by an empty string before counting. Without
# this, str(NaN) becomes the literal "nan" and a missing document would be
# reported as 1 word / 3 characters, which also disagrees with the
# vectorizers below (they read the same column through fillna("")).
texto_para_contagem = df['texto_limpo'].fillna("").astype(str)

# Word count: whitespace-separated tokens per document (0 for empty text).
df['qtde_palavras'] = texto_para_contagem.str.split().str.len()

# Character count: length of the cleaned text (0 for empty text).
df['qtde_caracteres'] = texto_para_contagem.str.len()

# Summary statistics of both new columns.
print(df[['qtde_palavras', 'qtde_caracteres']].describe())

# ===============================
# Bag of Words (CountVectorizer)
# ===============================

# Vectorizer limited to 100 vocabulary terms (max_features=100).
vectorizer_bow = CountVectorizer(max_features=100)

# Learn the vocabulary and build the document-term count matrix;
# missing texts are fed to the vectorizer as empty strings.
textos_bow = df['texto_limpo'].fillna("")
bow_matrix = vectorizer_bow.fit_transform(textos_bow)

# Dense DataFrame with one column per vocabulary term.
termos_bow = vectorizer_bow.get_feature_names_out()
bow_df = pd.DataFrame(data=bow_matrix.toarray(), columns=termos_bow)

# Place the term-count columns next to the original columns; both indexes
# are reset so the rows are aligned by position.
partes_bow = [df.reset_index(drop=True), bow_df.reset_index(drop=True)]
df_bow = pd.concat(partes_bow, axis="columns")

# ===============================
# TF-IDF (Term Frequency - Inverse Document Frequency)
# ===============================

# Vectorizer limited to 100 vocabulary terms (max_features=100).
vectorizer_tfidf = TfidfVectorizer(max_features=100)

# Learn the vocabulary and build the TF-IDF weighted document-term matrix;
# missing texts are fed to the vectorizer as empty strings.
textos_tfidf = df['texto_limpo'].fillna("")
tfidf_matrix = vectorizer_tfidf.fit_transform(textos_tfidf)

# Dense DataFrame with one column per vocabulary term.
termos_tfidf = vectorizer_tfidf.get_feature_names_out()
tfidf_df = pd.DataFrame(data=tfidf_matrix.toarray(), columns=termos_tfidf)

# Place the TF-IDF columns next to the original columns; both indexes are
# reset so the rows are aligned by position.
partes_tfidf = [df.reset_index(drop=True), tfidf_df.reset_index(drop=True)]
df_tfidf = pd.concat(partes_tfidf, axis="columns")

# ===============================
# Saving the processed datasets
# ===============================

# Each enriched table is written to its own CSV, without the index column.
# The order (BoW first, then TF-IDF) matches the original sequence of writes.
tabelas_e_destinos = (
    (df_bow, r'C:\Users\vbitu\projects\fake-news-etl-project\data\processed\dados_aosfatos_bow.csv'),
    (df_tfidf, r'C:\Users\vbitu\projects\fake-news-etl-project\data\processed\dados_aosfatos_tfidf.csv'),
)
for tabela, destino in tabelas_e_destinos:
    tabela.to_csv(destino, index=False)


       qtde_palavras  qtde_caracteres
count    4418.000000      4418.000000
mean      263.995926      2140.355591
std       198.352336      1627.999295
min        33.000000       242.000000
25%       169.000000      1363.250000
50%       210.500000      1704.000000
75%       278.000000      2250.250000
max      4561.000000     37789.000000
