Import

In [None]:
import sys
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

sys.path.append("../src")
from data_pipeline import DataPipeline

Load Data

In [None]:
# Point the pipeline at the sample CSV, naming `is_fraud` as the target column,
# then read the raw table into `df`.
pipeline = DataPipeline(target_col="is_fraud", filepath="../data/sample.csv")
df = pipeline.load_data()

# First look at the data: dimensions, leading rows, dtypes, per-column null counts.
frame_shape = df.shape
print("Dataset Shape:", frame_shape)
print(df.head())
column_types = df.dtypes
print("\nColumn Types:\n", column_types)
null_counts = df.isnull().sum()
print("\nMissing Values:\n", null_counts)

# The class balance is reported only when the label column is actually present.
has_label = 'is_fraud' in df.columns
if has_label:
    class_share = df['is_fraud'].value_counts(normalize=True)
    print("\nFraud Class Distribution:\n", class_share)

Visualize Data

In [None]:
# Both label-based charts need the `is_fraud` column, so check for it once.
label_present = 'is_fraud' in df.columns

if label_present:
    # Number of rows per class.
    sns.countplot(data=df, x='is_fraud')
    plt.title("Fraud vs Non-Fraud Distribution")
    plt.show()

    # The amount comparison additionally needs the `amount` column.
    if 'amount' in df.columns:
        sns.boxplot(data=df, x='is_fraud', y='amount')
        plt.title("Transaction Amount by Fraud Status")
        plt.show()

# Pairwise correlations of the numeric columns, drawn on a wider canvas
# and without per-cell annotations.
plt.figure(figsize=(10, 6))
correlations = df.corr(numeric_only=True)
sns.heatmap(correlations, annot=False, cmap='coolwarm')
plt.title("Feature Correlation Heatmap")
plt.show()

Preprocess

In [None]:
# Pass the loaded frame through the pipeline's preprocess step and keep the
# returned value under its own name for the following cells.
# NOTE(review): whether preprocess() also mutates `df` in place is not visible
# from this notebook -- confirm in DataPipeline before reusing `df` afterwards.
df_clean = pipeline.preprocess(df)

Balance & Scale

In [None]:
# balance_and_scale() returns a (features, labels) pair.
# NOTE(review): the resampling and scaling methods live in DataPipeline and
# are not visible here -- confirm there.
X_scaled, y_res = pipeline.balance_and_scale(df_clean)

# Count the rows per class in the returned labels.
balanced_counts = pd.Series(y_res).value_counts()
print("\nBalanced Class Distribution:\n", balanced_counts)

Train/Test Split

In [None]:
# split_data() returns four pieces, unpacked in train/test order for
# features and then labels.
X_train, X_test, y_train, y_test = pipeline.split_data(X_scaled, y_res)

# Report the shape of each piece, one line per split.
print("\nShapes:")
for caption, part in (
    ("X_train:", X_train),
    ("X_test:", X_test),
    ("y_train:", y_train),
    ("y_test:", y_test),
):
    print(caption, part.shape)

Save Processed Data

In [None]:
# Hand all four splits to the pipeline for saving.
pipeline.save_splits(X_train, X_test, y_train, y_test)
# NOTE(review): the directory in this message is a hard-coded literal, not a
# value returned by save_splits() -- confirm it matches where DataPipeline
# actually writes.
print("\nPreprocessing complete. Data saved to ../data/processed/")