### EDA and Preprocessing

In [None]:
import rag_chatbot.data.filter as fl
import rag_chatbot.preprocessing.cleaning as cl
from rag_chatbot.core.settings import settings
from rag_chatbot.data.handler import DataHandler
from rag_chatbot.data.validation import validate_rag_ready
import rag_chatbot.viz.plots as viz

In [None]:
# Column and filter configuration read from the project settings object.
COLS = settings.get('columns')
MAPPING_COLS = COLS["mapping"]

# Fetch the filters section once and reuse it for both lookups.
FILTER_SETTINGS = settings.get('filters')
FILTERS = FILTER_SETTINGS["allowed_product_categories"]
CAT_MAPPING = FILTER_SETTINGS["product_category_mapping"]

# The required columns are the values of the column mapping. An earlier
# assignment from COLS['required'] was overwritten before any use, so it is
# dropped here; the effective value is unchanged.
REQUIRED_COLS = set(MAPPING_COLS.values())


### Load Complaints Data

In [None]:
# Resolve the raw complaints file through the DataHandler registry, then load it.
raw_handler = DataHandler.from_registry("DATA", "raw_dir", "complaints.csv")
df_raw = raw_handler.load()

EDA

In [None]:
# Display the shape of the raw data as the cell output.
df_raw.shape

In [None]:
# Count how many complaints fall under each value of the "Product" column.
product_column = df_raw["Product"]
product_counts = product_column.value_counts()
product_counts


In [None]:
# Word count of each complaint narrative (whitespace-separated tokens).
# Missing narratives are replaced with empty text first so they count as
# 0 words; casting NaN straight to str would produce the literal "nan" and
# report a length of 1.
df_raw["narrative_length"] = (
    df_raw["Consumer complaint narrative"]
    .fillna("")
    .astype(str)
    .str.split()
    .str.len()
)

In [None]:
# Histogram of the per-complaint narrative word counts, using 50 bins.
df_raw["narrative_length"].hist(bins=50)

In [None]:
# Boolean mask of rows that have a narrative; its inverse marks the rows
# where the narrative is missing.
narrative_present = df_raw["Consumer complaint narrative"].notna()
with_narrative = narrative_present.sum()
without_narrative = (~narrative_present).sum()

with_narrative, without_narrative

In [None]:
# Percentage of missing values per column, rounded to two decimals.
# The computation was commented out, so the header printed with no data.
print("\nMissing values (%):")
print((df_raw.isna().mean() * 100).round(2))

In [None]:
# Show the first rows of the raw data under a labelled header.
print("\nSample rows:")
sample_rows = df_raw.head()
display(sample_rows)

In [None]:
# Print the dtype of every column in the raw data.
print("\nData types:")
column_types = df_raw.dtypes
print(column_types)

In [None]:
# Plot the complaint distribution of the raw data with the project's viz
# helper (exact chart contents are defined in rag_chatbot.viz.plots).
viz.plot_complaint_distribution(df_raw)

Cleaning and Saving Data

In [None]:
# Select and rename the configured columns from the raw data; the mapping and
# the required-column set both come from the settings constants.
selection_options = {
    "column_mapping": MAPPING_COLS,
    "required_columns": REQUIRED_COLS,
}
df = cl.clean_and_select_columns(df_raw, **selection_options)

In [None]:
# Display the result of the column selection step as the cell output.
df

In [None]:
# Step 1: normalize product categories and keep only the allowed ones.
df_allowed_products = fl.normalize_and_filter_products(
    df,
    allowed_products=FILTERS,
    category_mapping=CAT_MAPPING,
    product_column="product_category",
)

# Step 2: apply the non-empty-narrative filter to the product-filtered rows.
df = fl.filter_non_empty_narratives(df_allowed_products)

In [None]:
# Run the project's text-cleaning step on the filtered data; the cleaning
# rules themselves live in rag_chatbot.preprocessing.cleaning.
df_clean = cl.apply_text_cleaning(df)

In [None]:
# Display the cleaned data as the cell output.
df_clean

In [None]:
# Check the cleaned data with validate_rag_ready before saving.
# NOTE(review): presumably raises or reports on invalid data — confirm in
# rag_chatbot.data.validation.
validate_rag_ready(df_clean)

In [None]:
# Resolve the output location through the DataHandler registry, write the
# cleaned data there, then confirm on stdout.
clean_handler = DataHandler.from_registry(
    section="DATA",
    path_key="interim_dir",
    filename="complaints_clean.parquet",
)
clean_handler.save(df_clean)
print("Cleaned Data is successfully save in parquet format")

In [None]:
# Plot the complaint distribution again.
# NOTE(review): this repeats the earlier plot on df_raw; after cleaning, the
# intent may have been to plot df_clean — confirm which columns the helper
# expects before changing the argument.
viz.plot_complaint_distribution(df_raw)