# Complete Preprocessing Pipeline

This notebook runs the complete preprocessing pipeline end to end using the `PreprocessingPipeline` class, which is imported from the `preprocessor` module under `src/`.


In [None]:
import sys
import traceback
from pathlib import Path

import numpy as np
import pandas as pd

# Make the project's src/ directory importable.
# The location is resolved relative to the current working directory, so this
# assumes the notebook is started from a folder that sits next to src/.
# The membership check keeps the cell idempotent: re-running it does not pile
# up duplicate entries in sys.path.
SRC_DIR = str(Path.cwd().parent / "src")
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

from preprocessor import PreprocessingPipeline

print("Preprocessing pipeline imported successfully!")


## Initialize Pipeline


In [None]:
# Build the pipeline object used by the cells below.
# Both locations are relative paths, so they resolve against the notebook's
# working directory.
RAW_DATA_DIR = "../data/raw"
PROCESSED_DATA_DIR = "../data/processed"

pipeline = PreprocessingPipeline(data_dir=RAW_DATA_DIR, output_dir=PROCESSED_DATA_DIR)


## Process Fraud Data

This will execute the complete pipeline:
1. Load data
2. Clean data (handle missing values, remove duplicates, correct types)
3. Integrate geolocation (IP to country mapping)
4. Engineer features (time-based, transaction frequency, velocity)
5. Perform EDA
6. Transform data (scaling and encoding)
7. Handle class imbalance (SMOTE)


In [None]:
# Run the fraud dataset through the complete preprocessing pipeline.
#
# On success this binds two names for the following cells:
#   processed_df - the processed data (its .shape is reported below)
#   metadata     - a mapping that is read here for its 'steps_completed' entry
#
# Failures are reported instead of raised so the notebook keeps running:
# a missing input file gets a short hint, anything else gets the message plus
# a full traceback. `traceback` is imported in the notebook's import cell.
try:
    processed_df, metadata = pipeline.process_fraud_data(
        fraud_data_file="Fraud_Data.csv",
        ip_country_file="IpAddress_to_Country.csv",  # Optional
        target_column="class",
        user_column="user_id",
        purchase_datetime="purchase_time",
        signup_datetime="signup_time",
        ip_column="ip_address",
        # NOTE(review): the three flags below presumably toggle the EDA,
        # class-imbalance and save-to-disk steps - confirm in preprocessor.
        perform_eda=True,
        handle_imbalance=True,
        save_processed=True
    )

    # The summary stays inside the try block on purpose: a malformed result
    # (e.g. a missing 'steps_completed' key) is reported by the generic
    # handler below rather than aborting the cell.
    print(f"\nFinal processed data shape: {processed_df.shape}")
    print(f"\nSteps completed: {metadata['steps_completed']}")

except FileNotFoundError as e:
    print(f"Data file not found: {e}")
    print("Please ensure the data files are in the data/raw directory")
except Exception as e:
    print(f"Error in preprocessing: {e}")
    traceback.print_exc()


## View Processed Data


In [None]:
# Preview the processed data: shape, column names, first rows and dtypes.
# `processed_df` only exists if the processing cell above succeeded, so its
# presence is checked first. At notebook top level locals() is the same
# namespace as globals().
if 'processed_df' in locals():
    print("Processed Data Info:")
    print(f"Shape: {processed_df.shape}")
    print(f"\nColumns ({len(processed_df.columns)}):")
    print(list(processed_df.columns))
    print("\nFirst few rows:")
    # display() gives the rich table rendering for the head of the frame.
    display(processed_df.head())
    print("\nData types:")
    print(processed_df.dtypes)
else:
    # Say so explicitly instead of silently showing nothing, so a failed
    # preprocessing step is not overlooked during Run All.
    print("processed_df is not defined - run the 'Process Fraud Data' cell above and resolve any errors it reported.")
