In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import fraud_detection.viz.plots  as viz
import fraud_detection.analysis.eda as eda
from fraud_detection.data.loader import DataHandler
from fraud_detection.core.settings import settings
from fraud_detection.data.cleaning import DataCleaning
from fraud_detection.features.preprocessing import build_preprocessing_pipeline
from imblearn.over_sampling import SMOTE

### Data I/O

In [None]:
# Load the raw card data via the DataHandler registry lookup.
# NOTE(review): later cells read the columns 'class', 'age', 'purchase_value'
# and 'purchase_time' — confirm that creditcard.csv actually provides them.
raw_handler = DataHandler.from_registry("DATA", "raw_dir", "creditcard.csv")
df = raw_handler.load()

In [None]:
# Preview only the first rows instead of rendering the whole frame
df.head()

In [None]:
# Tally the target column, both as absolute counts and as percentage shares
labels = df['class']
class_counts = labels.value_counts()
class_pct = labels.value_counts(normalize=True) * 100

# Label 0 is reported as non-fraud and label 1 as fraud
n_legit, n_fraud = class_counts[0], class_counts[1]

print("Class Distribution:")
print(f"  Non-Fraud (0): {n_legit:,} ({class_pct[0]:.2f}%)")
print(f"  Fraud (1):     {n_fraud:,} ({class_pct[1]:.2f}%)")
print(f"\nImbalance Ratio: 1:{n_legit/n_fraud:.1f}")

In [None]:

# 1. Class imbalance: plot the counts computed in the previous cell
viz.plot_class_distribution(class_counts)

# 2. Distributions of the numeric columns `age` and `purchase_value`
for numeric_col in ('age', 'purchase_value'):
    viz.plot_numeric_distribution(df, numeric_col)

# 3. `purchase_value` broken down by class
viz.plot_numeric_by_class(df, 'purchase_value')

In [None]:
# Per-country fraud statistics computed from `df`.
# NOTE(review): presumably requires 'country' and 'class' columns in `df` —
# confirm against eda.get_country_fraud_stats.
country_stats = eda.get_country_fraud_stats(df)

# Top-10 tables: one ranked by fraud count, one by fraud rate (the latter
# restricted via min_transactions=50).
top_count = eda.get_top_countries_by_fraud_count(country_stats, top_n=10)
top_rate = eda.get_top_countries_by_fraud_rate(
    country_stats, min_transactions=50, top_n=10)

# Mean of the 0/1 class column, expressed as a percentage
overall_rate = df['class'].mean() * 100

# Kept as the final expression so the notebook actually renders the table
# (a bare expression in the middle of a cell produces no output).
country_stats

In [None]:
# Display the top-10 table built by eda.get_top_countries_by_fraud_count
top_count

In [None]:
# Display the top-10 table built by eda.get_top_countries_by_fraud_rate
# (computed with min_transactions=50)
top_rate

In [None]:
# 4. Geographic Analysis (Volume vs Risk)
# First figure: presumably transaction volume per country, limited to 15 via top_n.
viz.plot_country_transactions(country_stats, top_n=15)
# Second figure: fraud overview fed with the full stats, the `top_rate` table
# and the overall fraud rate.
# NOTE(review): `top_rate` was built with min_transactions=50, while 200 is
# passed here — confirm which threshold is intended so table and figure agree.
viz.plot_country_fraud_overview(
    country_stats=country_stats,
    country_stats_filtered=top_rate,
    overall_rate=overall_rate,
    top_n=10,
    min_transactions=200,
)

In [None]:
# Build the feature frame that the remaining cells operate on.
# NOTE(review): `add_fraud_features` is not among the imports in the first
# cell, so this line raises NameError on a fresh kernel — import it from its
# defining module before running Restart & Run All.
df_features = add_fraud_features(df)

In [None]:
# Preview only the first rows instead of rendering the whole frame
df_features.head()

In [None]:
# Fraud over time, first at daily ('D') and then at weekly ('W') resolution
for resample_freq in ('D', 'W'):
    viz.plot_fraud_over_time(
        df_features,
        time_col='purchase_time',
        freq=resample_freq,
    )

# Hour-of-day and day-of-week patterns
viz.plot_fraud_by_hour_day(df_features)

### Feature Engineering and Resampling of Imbalanced Data

In [None]:

# Column roles (target / numeric / categorical) are read from the project settings
FEATURES = settings.get("features")
TARGET, NUM_COLS, CAT_COLS = (
    FEATURES["target"],
    FEATURES["numeric"],
    FEATURES["categorical"],
)

# --- 1. Separate the target from the predictors ---
y = df_features[TARGET]
X = df_features.drop(columns=[TARGET])

# Stratified hold-out split with a fixed seed so the split is repeatable
TEST_SIZE = 0.2
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    stratify=y,
    random_state=RANDOM_STATE,
)

# --- 2. Preprocessing: fit on the training split only, then reuse on test ---
preprocessor = build_preprocessing_pipeline(NUM_COLS, CAT_COLS)
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

In [None]:
# Display the fitted preprocessing object returned by build_preprocessing_pipeline
preprocessor

In [None]:
def _frame_with_target(matrix, target):
    """Return `matrix` as a DataFrame labelled with `feature_names`, plus a TARGET column."""
    frame = pd.DataFrame(matrix, columns=feature_names)
    # Drop the original index so the labels line up positionally with the new frame
    frame[TARGET] = target.reset_index(drop=True)
    return frame


def _save_processed(frame, filename):
    """Save `frame` as `filename` through the DATA / processed_dir registry entry."""
    handler = DataHandler.from_registry(
        section="DATA",
        path_key="processed_dir",
        filename=filename,
    )
    return handler.save(frame)


print("\n--- Class Distribution BEFORE SMOTE (Train) ---")
train_share = y_train.value_counts(normalize=True)
print(train_share.map("{:.2%}".format))

# --- 3. Name the transformed columns and rebuild labelled frames ---
feature_names = preprocessor.get_feature_names_out()
X_train_df = _frame_with_target(X_train_transformed, y_train)
X_test_df = _frame_with_target(X_test_transformed, y_test)

# --- 4. Apply SMOTE (training split only; the test split is left untouched) ---
# adjust ratio if needed
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(
    X_train_transformed, y_train
)

# Keep the resampled data in the same frame layout as the other splits
X_train_res_df = _frame_with_target(X_train_resampled, y_train_resampled)

print("\n--- Class Distribution AFTER SMOTE ---")
resampled_share = y_train_resampled.value_counts(normalize=True)
print(resampled_share.map("{:.2%}".format))

# --- 5. Save datasets: original train, resampled train, original test ---
train_original_df = _frame_with_target(X_train_transformed, y_train)

# Holds whatever .save() returns, exactly as before
train_original_handler = _save_processed(train_original_df, "train_original.parquet")
_save_processed(X_train_res_df, "train_resampled.parquet")
_save_processed(X_test_df, "test_original.parquet")

print("[INFO] Preprocessing and SMOTE complete. Data saved.")