In [None]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Make the project root importable so the `src.*` helper modules resolve.
# The root is taken to be the parent of the current working directory, i.e.
# the kernel is expected to start in a direct subfolder of the project.
project_root = Path.cwd().parent
# Guard the insert so that re-running this cell does not stack duplicate
# entries at the front of sys.path.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# local helpers
from src.file_handling import load_raw_data, save_processed_data
import src.eda as eda_mod
from src.preprocess import preprocess_data, create_stratified_sample

# Plot appearance: apply the matplotlib style sheet first, then the seaborn
# palette (both touch the colour cycle, so the order is kept as-is).
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")

# pandas display options: show every column and allow long text cells
# (up to 200 characters) so complaint narratives are not cut off early.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_colwidth', 200),
):
    pd.set_option(option_name, option_value)

# Status lines confirming the setup cell ran and which root was resolved.
print(
    "✓ Imports complete!",
    f"Project root: {project_root}",
    sep="\n",
)

In [None]:
# Location of the raw complaints CSV, relative to the project root.
raw_data_path = project_root / "data" / "raw" / "complaints.csv"

# Fail fast when the file is missing. Previously this branch only printed a
# message and execution still fell through to load_raw_data() with a path
# known not to exist.
if not raw_data_path.exists():
    raise FileNotFoundError(f"❌ File not found at: {raw_data_path}")
print(f"✓ Found data file at: {raw_data_path}")

# Load through the project helper and report how many rows came back.
df_raw = load_raw_data(raw_data_path)
print(f"✓ Loaded {len(df_raw):,} complaints")

## 1. Initial EDA

### Analyze product distribution.

In [None]:
# Runs on the raw frame (df_raw), i.e. before any preprocessing or sampling.
# NOTE(review): presumably draws complaint counts per product — confirm
# against src.eda.plot_product_distribution.
eda_mod.plot_product_distribution(df_raw)

### Narrative presence analysis.

In [None]:
# Compute the narrative-presence summary on the raw frame, then print it
# under a heading line. The helper's return type is not visible here; it is
# rendered through its default string form.
presence = eda_mod.narrative_presence_analysis(df_raw)
presence_report = f"Narrative Presence Analysis:\n{presence}"
print(presence_report)

## 2. Preprocessing & Sampling

### Full Preprocessing Pipeline

In [None]:
# Run the project preprocessing pipeline on the raw frame. The result is
# kept under a new name so df_raw stays available for re-runs.
df_processed = preprocess_data(df_raw)

# Report the (rows, columns) of the processed frame.
processed_shape = df_processed.shape
print(f"\nFinal processed dataset shape: {processed_shape}")

### Stratified Sampling
Draw a stratified sample of 10,000–15,000 complaints (the cell below targets 12,000) so that each stratum keeps roughly the same share in the sample as it has in the full processed dataset.

In [None]:
# Target number of complaints to keep in the stratified sample.
SAMPLE_TARGET_SIZE = 12_000

# Draw the sample from the processed frame via the project helper.
# NOTE(review): no seed is passed here — confirm create_stratified_sample
# fixes its own random state so the sample is reproducible across runs.
df_sampled = create_stratified_sample(
    df_processed,
    target_size=SAMPLE_TARGET_SIZE,
)

print(f"\nSampled dataset shape: {df_sampled.shape}")

### Narrative length distribution (on cleaned narratives).

In [None]:
# Runs on the stratified sample (df_sampled), not the full processed frame.
# NOTE(review): presumably reads the cleaned-narrative length/word-count
# columns added by preprocessing — confirm against src.eda.
eda_mod.plot_narrative_length_distribution(df_sampled)

### Sample narrative: raw vs. cleaned

In [None]:
# Take the first row of the sample and show its narrative before and after
# cleaning, followed by the stored word count.
sample_row = df_sampled.iloc[0]

# (heading, column) pairs, printed in this order.
for heading, column in (
    ("RAW NARRATIVE:", 'Consumer complaint narrative'),
    ("\nCLEANED NARRATIVE:", 'clean_narrative'),
):
    print(heading)
    print(sample_row[column])

print(f"\nWord Count: {sample_row['narrative_word_count']}")

## 3. Save Processed Data

In [None]:
# Destination for the sampled, preprocessed complaints.
save_path = project_root / "data" / "processed" / "filtered_complaints.csv"

# Ensure the output directory exists before saving; it may be absent on a
# fresh checkout. This is a no-op when the directory is already there.
save_path.parent.mkdir(parents=True, exist_ok=True)

save_processed_data(df_sampled, save_path)
print(f"✓ Sampled data saved to {save_path}")