In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import fraud_detection.data.ip_geolocation as ip
from fraud_detection.data.loader import DataHandler
from fraud_detection.core.settings import settings
from fraud_detection.data.cleaning import DataCleaning
from fraud_detection.viz.plots import Visualizer
from fraud_detection.features.pipeline import build_feature_pipeline
from imblearn.over_sampling import SMOTE

## Data I/O

Load the raw credit-card, fraud, and IP-to-country tables.

In [None]:
# Registry section and path key shared by every raw input file.
RAW_LOCATION = ("DATA", "raw_dir")

# Credit-card dataset
credit_df = DataHandler.from_registry(*RAW_LOCATION, "creditcard.csv").load()

# Fraud dataset
fraud_df = DataHandler.from_registry(*RAW_LOCATION, "Fraud_Data.csv").load()

# IP-address -> country reference table
ip_df = DataHandler.from_registry(*RAW_LOCATION, "IpAddress_to_Country.csv").load()

In [None]:
# Preview only the first rows instead of echoing the whole frame as cell output.
credit_df.head()

In [None]:
# Column groups of Fraud_Data.csv, handed to the DataCleaning configuration.

# Passed to the cleaner as `datetime_columns`.
datetime_cols = [
    "signup_time",
    "purchase_time",
]

# Passed to the cleaner as `numeric_columns`.
numeric_cols = [
    "purchase_value",
    "age",
]

# Passed to the cleaner as `protected_string_columns` (do not strip).
protected_cols = [
    "user_id",
    "device_id",
    "ip_address",
]

In [None]:
# Cleaning configuration for the fraud dataset, collected in one mapping so the
# options can be inspected before the cleaner is constructed.
cleaning_options = {
    # Duplicate removal, keyed on the business-safe columns below.
    "drop_duplicates": True,
    "duplicate_subset": ["user_id", "purchase_time", "purchase_value"],
    # Strip text columns, except the protected identifier columns.
    "strip_strings": True,
    "protected_string_columns": protected_cols,
    "empty_string_as_nan": True,
    # Columns the cleaner is told to treat as datetimes / numerics.
    "datetime_columns": datetime_cols,
    "numeric_columns": numeric_cols,
    "verbose": True,
}

cleaner = DataCleaning(**cleaning_options)

In [None]:
# Run the configured cleaner over the raw fraud frame and report the resulting shape.
cleaned_df = cleaner.clean(fraud_df)

cleaned_shape = cleaned_df.shape
print(f"Cleaned shape: {cleaned_shape}")

In [None]:
# Enrich the cleaned transactions with a country derived from their IP address.

# 1. Clean the IP-range -> country reference table.
ip_country_df = ip.clean_ip_country_table(ip_df)

# 2. Normalize transaction IPs on the *cleaned* frame and keep the result under
#    a new name. Previously the raw `fraud_df` was normalized (and overwritten)
#    while step 3 mapped `cleaned_df`, so the normalization was never used and
#    re-running the cleaning cell would start from an altered `fraud_df`.
normalized_df = ip.normalize_ip_column(cleaned_df, ip_col="ip_address")

# 3. Map the normalized IPs to countries.
df = ip.map_ip_to_country(normalized_df, ip_country_df)

In [None]:
# Preview only the first rows: the frame carries user, device and IP identifiers,
# so avoid echoing more of it than needed into the saved notebook output.
df.head()

In [None]:
# Exploratory plots; every call below receives the same frame `df`.
viz = Visualizer()
# 1. Check Class Imbalance
viz.analyze_class_distribution(df)


# 2. Compare Numerical Distributions (Age and Purchase Value)
viz.plot_fraud_distributions(df)

# 3. Deep dive into Purchase Value Outliers
viz.plot_purchase_value_boxplot(df)

In [None]:
# Feature configuration read from the project settings under the "features" key.
FEATURES = settings.get("features")

# Target column name plus the numeric and categorical feature column lists.
TARGET, NUM_COLS, CAT_COLS = (
    FEATURES["target"],
    FEATURES["numeric"],
    FEATURES["categorical"],
)

In [None]:
# Separate the label column from the feature columns.
y = df[TARGET]
X = df.drop(columns=[TARGET])

# 80/20 split, stratified on the label and seeded for reproducibility.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Build the feature pipeline from the configured numeric and categorical column lists.
feature_pipeline = build_feature_pipeline(NUM_COLS, CAT_COLS)

# Fit on the training split only, then apply the already-fitted pipeline to the
# test split so nothing is learned from held-out rows.
X_train_transformed = feature_pipeline.fit_transform(X_train)
X_test_transformed = feature_pipeline.transform(X_test)


In [None]:
# Echo the pipeline object as the cell output to inspect its steps.
feature_pipeline

In [None]:
# Seeded SMOTE oversampler.
# NOTE(review): the features were already transformed by the pipeline; if they
# include encoded categoricals, confirm plain SMOTE is the intended choice.
smote = SMOTE(random_state=42)

# Resample the training split only; the test split is left untouched.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_transformed, y_train)

# Report each class's share of the resampled training labels as a percentage.
print("\n--- Class Distribution AFTER SMOTE ---")
resampled_class_share = y_train_resampled.value_counts(normalize=True)
print(resampled_class_share.map("{:.2%}".format))

In [None]:
# 4. Geographic Analysis (Volume vs Risk)
# Both calls receive `df`, the frame the features and target were taken from,
# not the resampled training data.
viz.plot_top_countries(df)
viz.plot_fraud_rate_by_country(df)

In [None]:
# 5. Temporal Analysis (When does fraud happen?)
# Receives `df`, the frame the features and target were taken from.
viz.plot_time_series(df)

In [None]:
# Resolve column names for the transformed feature matrices.
preprocessor = feature_pipeline.named_steps['preprocessing']

numeric_features = NUM_COLS
# Despite the variable name, this holds every output name the preprocessing step reports.
categorical_features = preprocessor.get_feature_names_out()

# Default to the names reported by the preprocessing step itself.
feature_names = list(categorical_features)
if hasattr(preprocessor, 'transformers_'):
    # The step exposes fitted sub-transformers: take the column list of the
    # first entry and the encoder-expanded names of the second entry.
    # NOTE(review): assumes entry 0 is the numeric branch and entry 1 the
    # categorical branch with an 'encoder' step; verify against the pipeline.
    num_cols = preprocessor.transformers_[0][2]
    cat_encoder = preprocessor.transformers_[1][1].named_steps['encoder']
    cat_cols = cat_encoder.get_feature_names_out(CAT_COLS)
    feature_names = [*num_cols, *cat_cols]

# Labelled frames: features plus the target column. The labels' index is reset
# so they align row-by-row with the freshly built frames.
train_df = pd.DataFrame(X_train_resampled, columns=feature_names).assign(
    **{TARGET: y_train_resampled.reset_index(drop=True)}
)

test_df = pd.DataFrame(X_test_transformed, columns=feature_names).assign(
    **{TARGET: y_test.reset_index(drop=True)}
)

In [None]:
# Registry location shared by both processed outputs.
processed_location = {"section": "DATA", "path_key": "processed_dir"}

# Training frame (after resampling)
train_handler = DataHandler.from_registry(
    **processed_location, filename="train_resampled.parquet"
)
train_handler.save(train_df)

# Test frame (not resampled)
test_handler = DataHandler.from_registry(
    **processed_location, filename="test_original.parquet"
)
test_handler.save(test_df)

print("\n--- Data saved successfully ---")