In [None]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Locate the project root so that `src` is importable and data paths resolve.
# Take the nearest directory (the working directory or one of its ancestors)
# that contains a `src/` folder. If none is found, fall back to the parent of
# the working directory, which is the right answer when the kernel starts in a
# first-level subfolder of the repo.
_start_dir = Path.cwd()
project_root = next(
    (
        candidate
        for candidate in (_start_dir, *_start_dir.parents)
        if (candidate / "src").is_dir()
    ),
    _start_dir.parent,
)

# Put the root on sys.path only once, so re-running this cell does not keep
# stacking duplicate entries.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# local helpers (these imports rely on the sys.path entry added above)
from src.file_handling import load_raw_data, save_processed_data
import src.eda as eda_mod
from src.preprocess import preprocess_data

# Plot appearance
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")

# Display settings: no cap on the number of columns shown, and up to 200
# characters per cell before truncation.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 200)

print("✓ Imports complete!")
print(f"Project root: {project_root}")

In [None]:
# Default location of the CFPB complaints CSV inside the repo: data/raw/complaints.csv
raw_data_path = project_root.joinpath("data", "raw", "complaints.csv")

# Report up front whether the CSV is where we expect it.
# This is only a status message; loading is attempted either way.
if raw_data_path.exists():
    print(f"✓ Found data file at: {raw_data_path}")
else:
    print(f"❌ File not found at: {raw_data_path}")
    print("\nPlease place the CFPB complaints CSV at this location (data/raw/complaints.csv)")

# Read the file through the project's file helper.
# NOTE(review): the helper is expected to report a missing file clearly — confirm in src/file_handling.py.
df_raw = load_raw_data(raw_data_path)

# Quick sanity summary: row count and the column names that came back.
print(f"✓ Loaded {len(df_raw):,} complaints")
print(f"Columns: {df_raw.columns.tolist()}")

## 1. Initial EDA

### Analyze the distribution of complaints across different Products.

In [None]:
# Hand the raw frame to the project's EDA helper for the Product distribution plot.
# NOTE(review): what exactly is drawn is defined in src/eda.py — confirm there.
eda_mod.plot_product_distribution(df_raw)

### Identify the number of complaints with and without narratives.

In [None]:
# Ask the project's EDA helper for the narrative-presence summary of the raw frame
# and print whatever it returns.
# NOTE(review): presumably counts of complaints with vs. without a narrative; the
# return type is defined in src/eda.py — confirm there.
presence = eda_mod.narrative_presence_analysis(df_raw)
print(f"Narrative Presence Analysis:\n{presence}")

## 2. Preprocessing & Deep Dive

### Filter, Clean, and Analyze Lengths
We will now apply the full preprocessing pipeline, which includes:
- Filtering for the specified products: Credit card; Personal loan; Buy Now, Pay Later (BNPL); Savings account; and Money transfers
- Removing records with empty Consumer complaint narrative fields
- Dropping PII columns
- Cleaning text (lowercasing, boilerplate removal, special characters, whitespace normalization)

In [None]:
# Run the project's preprocessing pipeline on the raw frame and keep the result
# under its own name, then report the resulting (rows, columns) shape.
# NOTE(review): whether preprocess_data modifies df_raw in place is not visible here — confirm in src/preprocess.py.
df_processed = preprocess_data(df_raw)
print(f"\nFinal processed dataset shape: {df_processed.shape}")

### Calculate and visualize the length (word count) of the Consumer complaint narrative.

In [None]:
# Hand the processed frame to the project's EDA helper for the narrative-length plot.
# NOTE(review): which column is measured and how length is computed live in src/eda.py — confirm there.
eda_mod.plot_narrative_length_distribution(df_processed)

### Sample of Cleaned Narratives

In [None]:
# Guard first: with zero rows there is no first record to show.
if len(df_processed) == 0:
    print("No records found after filtering!")
else:
    # Show the first record's narrative from both columns, capped at 500 characters each.
    print("Raw Narrative Sample:")
    print(df_processed['Consumer complaint narrative'].iloc[0][:500])
    print("\nCleaned Narrative Sample:")
    print(df_processed['clean_narrative'].iloc[0][:500])

In [None]:
# Save processed data for RAG
# No output path is passed here, so the destination is decided inside the helper.
# NOTE(review): confirm the output location and format in src/file_handling.py.
save_processed_data(df_processed)