# 🧑‍💻 Fraud Detection Exploration Notebook

This notebook demonstrates an end-to-end mini workflow:
- Generate synthetic fraud dataset (1000 rows)
- Perform EDA
- Train XGBoost model with SageMaker
- Evaluate model
- Deploy to real-time endpoint
- Invoke endpoint with sample transaction

In [None]:
import boto3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sagemaker
from sagemaker import get_execution_role
from sagemaker.deserializers import CSVDeserializer
from sagemaker.estimator import Estimator
from sagemaker.inputs import TrainingInput
from sagemaker.serializers import CSVSerializer
from sagemaker.xgboost.estimator import XGBoost
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Shared SageMaker context reused by the upload, training and deployment cells.
prefix = "fraud-detection"         # S3 key prefix for the uploaded CSVs and model output
session = sagemaker.Session()
bucket = session.default_bucket()  # whichever bucket the session reports as its default
role = get_execution_role()        # execution role later handed to the training job

print(f"Using S3 bucket: {bucket}")

## Generate Synthetic Fraud Dataset (1000 rows)

In [None]:
# Seed NumPy's legacy global generator so every draw below is reproducible.
np.random.seed(42)
n = 1000

# The columns consume the seeded global stream in the order they are listed,
# so reordering them would change the generated data.
data = dict(
    txn_id=np.arange(1, n + 1),
    amount=np.random.exponential(scale=100, size=n).round(2),
    time_delta=np.random.exponential(scale=60, size=n).round(2),
    device_score=np.random.uniform(0, 1, n).round(3),
    geo_distance=np.random.exponential(scale=50, size=n).round(2),
    num_prev_txns=np.random.poisson(lam=3, size=n),
    is_night=np.random.choice([0, 1], size=n, p=[0.7, 0.3]),
)
df = pd.DataFrame(data)

# Additive per-row fraud probability: each risk indicator that fires adds a
# fixed weight (0.2 for a large amount, a long geo distance or a night-time
# transaction; 0.1 for a low device score).
fraud_prob = 0.2 * df["amount"].gt(500).astype(int)
fraud_prob = fraud_prob + 0.2 * df["geo_distance"].gt(100).astype(int)
fraud_prob = fraud_prob + 0.2 * df["is_night"].eq(1).astype(int)
fraud_prob = fraud_prob + 0.1 * df["device_score"].lt(0.3).astype(int)
fraud_prob = fraud_prob.clip(0, 1)

# A row is labelled fraud when its uniform draw falls below its probability.
uniform_draws = np.random.rand(n)
df["fraud_label"] = fraud_prob.gt(uniform_draws).astype(int)

print(df.head())
df.to_csv("synthetic_fraud_data.csv", index=False)
print("✅ synthetic_fraud_data.csv generated with", n, "rows")

## Basic EDA

In [None]:
# Class balance of the target: how many rows carry each fraud_label value.
print(df['fraud_label'].value_counts())

# Distribution of transaction amounts. An explicit Axes plus a title and axis
# labels lets the figure stand on its own when the notebook is skimmed.
fig, ax = plt.subplots()
df['amount'].hist(bins=50, ax=ax)
ax.set(
    title="Distribution of transaction amounts",
    xlabel="amount",
    ylabel="number of transactions",
)
plt.show()

## Train/Test Split & Upload to S3

In [None]:
# Features are every column except the label and the row identifier.
X = df.drop(['fraud_label','txn_id'], axis=1)
y = df['fraud_label']

# Split BEFORE scaling: fitting the scaler on the full dataset would leak the
# held-out rows' mean/variance into the training features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Learn the scaling statistics from the training rows only, then apply the
# same transform to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Write header-less CSVs with the label as the first column, followed by the
# scaled features. `.values` drops the original index so the label lines up
# with the positional index of the feature frame.
train = pd.concat([pd.Series(y_train.values), pd.DataFrame(X_train)], axis=1)
test = pd.concat([pd.Series(y_test.values), pd.DataFrame(X_test)], axis=1)
assert len(train) + len(test) == len(df), "split must cover every row exactly once"

train_file = 'train.csv'
test_file = 'test.csv'
train.to_csv(train_file, header=False, index=False)
test.to_csv(test_file, header=False, index=False)

# Upload both files under the shared prefix; upload_data returns the S3 URIs.
train_s3 = session.upload_data(train_file, bucket=bucket, key_prefix=prefix)
test_s3 = session.upload_data(test_file, bucket=bucket, key_prefix=prefix)
print(train_s3, test_s3)

## Train XGBoost Model

In [None]:
# Image of the built-in XGBoost algorithm, resolved for the region of the
# SageMaker session created at the top of the notebook (the same session that
# owns the bucket holding the training data).
container = sagemaker.image_uris.retrieve("xgboost", session.boto_region_name, version="1.5-1")

# Train with the built-in algorithm through the generic Estimator. The XGBoost
# framework estimator is for script mode and needs a training script as
# entry_point, so it cannot be used with entry_point=None; the hyperparameters
# below (e.g. num_round) are those of the built-in algorithm.
xgb = Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    output_path=f"s3://{bucket}/{prefix}/output",
    sagemaker_session=session,
    # Managed spot training: max_wait (total time including waiting for spot
    # capacity) must be at least max_run (training time limit), both in seconds.
    use_spot_instances=True,
    max_run=3600,
    max_wait=7200,
    hyperparameters={
        "max_depth": 5,
        "eta": 0.2,
        "gamma": 4,
        "min_child_weight": 6,
        "subsample": 0.8,
        "objective": "binary:logistic",
        "num_round": 200,
    },
)

# The uploaded files are header-less CSVs with the label in the first column,
# so each channel must declare text/csv explicitly.
xgb.fit({
    "train": TrainingInput(train_s3, content_type="text/csv"),
    "validation": TrainingInput(test_s3, content_type="text/csv"),
})

## Deploy Endpoint

In [None]:
# Host the trained model on a single-instance real-time endpoint.
# The XGBoost container accepts text/csv requests, so configure the predictor
# to encode array-like feature rows as CSV and to parse the CSV response.
predictor = xgb.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.large",
    serializer=CSVSerializer(),
    deserializer=CSVDeserializer(),
)
print("Endpoint deployed:", predictor.endpoint_name)

## Test Endpoint

In [None]:
# Score one held-out transaction. X_test already holds scaled features, so the
# first row can be sent to the endpoint as-is (kept 2-D via the 0:1 slice).
sample = X_test[0:1]
result = predictor.predict(sample)
print("Fraud score:", result)

# A real-time endpoint keeps running (and billing) until it is deleted, so
# tear it down once the smoke test is done. Set the flag to False to keep it.
DELETE_ENDPOINT_AFTER_TEST = True
if DELETE_ENDPOINT_AFTER_TEST:
    endpoint_name = predictor.endpoint_name
    predictor.delete_endpoint()
    print("Endpoint deleted:", endpoint_name)