In [None]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Add src to path so modules located in ../src can be imported.
# Guarded so that re-running this cell does not append duplicate entries.
src_dir = str(Path.cwd().parent / "src")
if src_dir not in sys.path:
    sys.path.append(src_dir)

from data_loader import DataLoader
from data_cleaner import DataCleaner
from eda import EDA
from preprocessor import PreprocessingPipeline

# Set plotting style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
%matplotlib inline


## Load and Clean Data


In [None]:
# Build the project helpers used by the rest of the notebook
loader = DataLoader(data_dir="../data/raw")
cleaner = DataCleaner(imputation_strategy="mean")
eda = EDA(output_dir="eda_outputs")

# Read the fraud dataset. If the CSV is missing, fraud_df is set to None so
# that later cells, which test `fraud_df is not None`, skip their work.
try:
    fraud_df = loader.load_csv("Fraud_Data.csv")
except FileNotFoundError as missing_err:
    print(f"Data file not found: {missing_err}")
    print("Please ensure Fraud_Data.csv is in the data/raw directory")
    fraud_df = None
else:
    # Quick sanity summary of what was loaded
    print(f"Loaded data: {fraud_df.shape}")
    print(f"\nColumns: {list(fraud_df.columns)}")


In [None]:
if fraud_df is not None:
    # Pass only the timestamp columns that actually exist in the loaded frame.
    # Checking "purchase_time" alone would request "signup_time" even when it
    # is absent, and would skip "signup_time" when it is the only one present.
    date_columns_present = [
        col for col in ("purchase_time", "signup_time") if col in fraud_df.columns
    ]

    # Clean data (None for date_columns when neither timestamp column exists)
    fraud_df_cleaned = cleaner.clean(
        fraud_df,
        handle_missing=True,
        remove_dups=True,
        correct_types=True,
        date_columns=date_columns_present or None,
    )
    print(f"After cleaning: {fraud_df_cleaned.shape}")
else:
    # Keep the name defined even when loading failed, so the notebook state is
    # explicit on a fresh-kernel run (later cells guard on `fraud_df` first).
    fraud_df_cleaned = None


## Class Distribution Analysis


In [None]:
# Run only when data was loaded and the cleaned frame has the "class" target
if fraud_df is not None:
    if "class" in fraud_df_cleaned.columns:
        class_stats = eda.analyze_class_distribution(
            fraud_df_cleaned, target_column="class", plot=True
        )
        # class_stats is indexed by key; 'imbalance_ratio' is shown to 2 decimals
        print(f"\nImbalance ratio: {class_stats['imbalance_ratio']:.2f}")


## Univariate Analysis


In [None]:
if fraud_df is not None:
    # Numeric feature columns of the cleaned frame, excluding the target
    numeric_cols = fraud_df_cleaned.select_dtypes(include=[np.number]).columns.tolist()
    try:
        numeric_cols.remove("class")
    except ValueError:
        pass  # "class" is not among the numeric columns

    if len(numeric_cols) > 0:
        # Only the first 10 numeric columns (in frame order) are analyzed
        univariate_stats = eda.univariate_analysis(
            fraud_df_cleaned, columns=numeric_cols[:10], plot=True
        )
        display(univariate_stats)


## Bivariate Analysis


In [None]:
# Run only when data was loaded and the cleaned frame has the "class" target
if fraud_df is not None:
    if "class" in fraud_df_cleaned.columns:
        bivariate_stats = eda.bivariate_analysis(
            fraud_df_cleaned, target_column="class", plot=True
        )
        # Shows the first 10 rows; presumably the result is already ordered by
        # correlation with the target -- TODO confirm against eda.bivariate_analysis
        print("Top 10 features correlated with target:")
        display(bivariate_stats.head(10))


## Generate Summary Report


In [None]:
# Run only when data was loaded and the cleaned frame has the "class" target
if fraud_df is not None:
    if "class" in fraud_df_cleaned.columns:
        # Build the summary report for the cleaned frame and echo it here
        report = eda.generate_summary_report(
            fraud_df_cleaned, target_column="class", output_file="eda_summary_report.txt"
        )
        print(report)


# EDA - Fraud Data

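Exploratory data analysis of the fraud dataset: load and clean the data, then examine the class distribution, run univariate and bivariate analyses, and generate a summary report.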
