# SnappTech Real-Time Fraud & Abuse Detection Demo

This Jupyter Notebook provides an interactive demonstration of the Snapp Real-Time Fraud & Abuse Detection system. It covers data simulation, model inference, and basic interpretability.

**Before running this notebook:**
1.  Ensure your local environment is set up as per `docs/03_SNAPPTECH_DEMO_GUIDE.md`.
2.  Run the data generators:
    ```bash
    python data_vault/fraud_pattern_simulator/generate_abuse_scenarios.py
    python data_vault/graph_topology_data/generate_collusion_graph.py
    ```
3.  Run the batch and graph feature processors:
    ```bash
    python src/feature_forge/batch_features.py
    python src/feature_forge/graph_features.py
    ```
4.  Train the LightGBM model:
    ```bash
    python src/model_arsenal/train_lightgbm.py --env dev
    ```
5.  Start the FastAPI prediction engine and Kafka consumer in separate terminals:
    ```bash
    uvicorn src.prediction_engine.fraud_detection_api:app --host 0.0.0.0 --port 8000
    python src/ingestion_stream/kafka_event_consumer.py --env dev
    ```
    *Note: The Kafka consumer's `handle_incoming_event` placeholder currently only logs the event. For a full loop, it would need to push each event to a processing queue that feeds features to Redis for the API to pull.*
    *For this demo, we will directly call the InferenceEngine for simplicity and immediate feedback, bypassing Kafka for the `predict` step, but acknowledging Kafka's role in the real system.*

In [1]:
import requests
import json
import time
import pandas as pd
from datetime import datetime, timedelta
import numpy as np
import uuid
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Import local modules: put the project root on sys.path and make it the
# working directory so that `src.*` imports and relative paths resolve.
import sys
import os


def _find_project_root(start, marker="src"):
    """Return the nearest directory at or above *start* that contains *marker*.

    Args:
        start: Directory to begin the upward search from.
        marker: Name of a sub-directory that identifies the project root.

    Returns:
        The first directory (checking *start* itself, then each ancestor) that
        has a *marker* sub-directory. When none does, the parent of *start* is
        returned, which is what ``Path('../').resolve()`` used to yield.
    """
    start = Path(start).resolve()
    for candidate in (start, *start.parents):
        if (candidate / marker).is_dir():
            return candidate
    return start.parent


# Resolving '../' directly is not idempotent: this cell changes the working
# directory below, so every re-run would climb one more level and break the
# imports. Searching for the `src` directory gives the same answer each time.
project_root = _find_project_root(Path.cwd())
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Ensure we're in the correct working directory for imports
os.chdir(project_root)

from src.prediction_engine.inference_logic import InferenceEngine
from src.interpretability_module.explanation_generator import ExplanationGenerator
from src.utils.common_helpers import load_config

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Initialize Inference Engine and Explanation Generator (for direct calls)
config_directory = project_root / "conf"
inference_engine = InferenceEngine(config_directory, env='dev')
explanation_generator = ExplanationGenerator(config_directory, env='dev')

# Set model and background data for ExplanationGenerator.
# The background frame is built from the synthetic events and must contain
# exactly the feature columns the LightGBM model reports in `feature_name_`.
lgbm_model = inference_engine.lightgbm_model
if lgbm_model is not None and hasattr(lgbm_model, 'feature_name_'):
    mock_events_df = pd.read_csv(project_root / "data_vault" / "synthetic_fraud_events.csv")
    mock_events_df["event_timestamp"] = pd.to_datetime(mock_events_df["event_timestamp"])

    # Derived per-event features; zero denominators are replaced by a tiny
    # value so the ratios stay finite.
    mock_events_df['hour_of_day'] = mock_events_df['event_timestamp'].dt.hour
    mock_events_df['day_of_week'] = mock_events_df['event_timestamp'].dt.dayofweek
    mock_events_df['distance_per_duration'] = mock_events_df['distance_km'] / (mock_events_df['duration_min'].replace(0, 1e-6))
    mock_events_df['fare_per_km'] = mock_events_df['fare_amount'] / (mock_events_df['distance_km'].replace(0, 1e-6))

    try:
        user_batch_features = pd.read_csv(project_root / "data_vault" / "batch_user_features.csv")
        driver_batch_features = pd.read_csv(project_root / "data_vault" / "batch_driver_features.csv")
        mock_events_df = mock_events_df.merge(user_batch_features, on='user_id', how='left')
        mock_events_df = mock_events_df.merge(driver_batch_features, on='driver_id', how='left')
    except FileNotFoundError:
        print("Batch feature files not found. Skipping merge for explanation generator.")
    # Unmatched rows from the left merges become 0 rather than NaN.
    mock_events_df = mock_events_df.fillna(0)

    lgbm_feature_names = lgbm_model.feature_name_

    # Plain column selection raises KeyError when a model feature is absent
    # (for example after the merge above was skipped). Reindexing keeps the
    # model's column order and fills absent features with 0, consistent with
    # the fillna(0) applied to unmatched rows.
    missing_features = [name for name in lgbm_feature_names if name not in mock_events_df.columns]
    if missing_features:
        print(f"Features missing from background data (filled with 0): {missing_features}")
    background_data_for_shap = mock_events_df.reindex(columns=lgbm_feature_names, fill_value=0)

    explanation_generator.set_model_and_features(lgbm_model, lgbm_feature_names, background_data_for_shap)
else:
    print("LightGBM model not loaded or feature names not accessible for explanation generator.")

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Notebook\\Desktop\\Snapp\\2. Real-Time Fraud & Abuse Detection\\conf\\environments\\dev.yaml'

## 1. Simulate and Predict a Single Event

Let's create a synthetic event and send it to the local prediction API (or directly to the `InferenceEngine` for a faster demo).

We'll simulate two types of events: a normal ride and a potentially fraudulent one.

In [3]:
API_URL = "http://localhost:8000/predict"

def create_synthetic_event(user_id, driver_id, event_type, is_fraud_scenario=False):
    """Build one synthetic ride event as a plain dict.

    Args:
        user_id: Identifier stored under ``"user_id"``.
        driver_id: Identifier stored under ``"driver_id"``.
        event_type: Value stored under ``"event_type"``.
        is_fraud_scenario: When truthy, draw fare/distance/duration from the
            small "suspicious" ranges, attach the ``"FAKE_PROMO"`` code and
            switch the payment method to cash. Otherwise draw from the
            ordinary ranges with no promo code.

    Returns:
        A dict with random identifiers, the current timestamp in ISO format,
        jittered start/end coordinates, payment/device/IP fields and the
        scenario-dependent fare, distance, duration and promo fields.
    """
    def _jitter(centre):
        # Small Gaussian noise around a fixed coordinate.
        return centre + np.random.normal(0, 0.01)

    # Keyword order fixes both the key order and the order of random draws.
    event = dict(
        event_id=str(uuid.uuid4()),
        event_timestamp=datetime.now().isoformat(),
        event_type=event_type,
        user_id=user_id,
        driver_id=driver_id,
        ride_id=str(uuid.uuid4())[:8],
        start_location_lat=_jitter(35.72),
        start_location_lon=_jitter(51.42),
        end_location_lat=_jitter(35.75),
        end_location_lon=_jitter(51.45),
        payment_method="credit_card",
        device_info="android_12",
        ip_address=f"192.168.1.{np.random.randint(1, 255)}",
    )

    # (low, high) bounds for the uniform draws of each scenario.
    if is_fraud_scenario:
        fare_bounds = (20000, 30000)    # unusually low fare
        distance_bounds = (0.5, 2)      # very short distance
        duration_bounds = (2, 5)        # very short duration
    else:
        fare_bounds = (50000, 150000)
        distance_bounds = (5, 25)
        duration_bounds = (10, 45)

    event["fare_amount"] = np.random.uniform(*fare_bounds)
    event["distance_km"] = np.random.uniform(*distance_bounds)
    event["duration_min"] = np.random.uniform(*duration_bounds)
    event["promo_code_used"] = "FAKE_PROMO" if is_fraud_scenario else None

    if is_fraud_scenario:
        # Cash is treated as the typical payment method for fake rides.
        event["payment_method"] = "cash"

    return event

In [4]:
# --- Normal Event ---
# Build an ordinary ride and score it by calling the InferenceEngine directly.
normal_event = create_synthetic_event(
    "user_demo_1",
    "driver_demo_A",
    "ride_completed",
    is_fraud_scenario=False,
)
print("--- Predicting Normal Event (via InferenceEngine direct call) ---")
if not inference_engine.is_ready():
    print("Inference Engine not ready.")
else:
    normal_prediction = inference_engine.run_inference(normal_event)
    print(json.dumps(normal_prediction, indent=2))

    # Explain the prediction only when it carries an 'explanation' entry and
    # the explanation generator has a model attached.
    if 'explanation' in normal_prediction and explanation_generator.model is not None:
        print("\n--- Explanation for Normal Event ---")
        # The SHAP explanation is computed from the features the inference
        # engine extracts for this event (a real API call would return the
        # explanation directly).
        extracted_features = inference_engine._extract_features(normal_event)
        explanation = explanation_generator.generate_shap_explanation(extracted_features)
        print(json.dumps(explanation, indent=2))

--- Predicting Normal Event (via InferenceEngine direct call) ---


NameError: name 'inference_engine' is not defined

In [5]:
# --- Fraudulent Event ---
# Build a ride using the fraud scenario settings and score it by calling the
# InferenceEngine directly.
fraud_event = create_synthetic_event(
    "user_high_risk",
    "driver_high_risk",
    "ride_completed",
    is_fraud_scenario=True,
)
print("\n--- Predicting Fraudulent Event (via InferenceEngine direct call) ---")
if not inference_engine.is_ready():
    print("Inference Engine not ready.")
else:
    fraud_prediction = inference_engine.run_inference(fraud_event)
    print(json.dumps(fraud_prediction, indent=2))

    # Explain the prediction only when it carries an 'explanation' entry and
    # the explanation generator has a model attached.
    if 'explanation' in fraud_prediction and explanation_generator.model is not None:
        print("\n--- Explanation for Fraudulent Event ---")
        extracted_features = inference_engine._extract_features(fraud_event)
        explanation = explanation_generator.generate_shap_explanation(extracted_features)
        print(json.dumps(explanation, indent=2))


--- Predicting Fraudulent Event (via InferenceEngine direct call) ---


NameError: name 'inference_engine' is not defined

## 2. Real-time Monitoring Simulation

Let's simulate a stream of events over time, including some fraudulent ones, and visualize the fraud scores.
This helps to observe the system's behavior and detect patterns.

In [None]:
num_simulated_events = 50
fraud_injection_rate = 0.2  # fraction of events generated with the fraud scenario

all_predictions = []

print(f"\n--- Simulating {num_simulated_events} Events ---")
for i in range(num_simulated_events):
    # Pick a random rider/driver pair and decide whether to inject fraud.
    user = f"user_{np.random.randint(1, 100)}"
    driver = f"driver_{np.random.randint(1, 50)}"
    is_fraud = np.random.rand() < fraud_injection_rate

    current_event = create_synthetic_event(user, driver, "ride_completed", is_fraud_scenario=is_fraud)

    # Stop the whole simulation as soon as the engine reports not ready.
    if not inference_engine.is_ready():
        print("Inference Engine not ready, skipping simulation.")
        break

    prediction = inference_engine.run_inference(current_event)
    # Attach the event time and the ground-truth label for plotting.
    prediction['timestamp'] = datetime.fromisoformat(current_event['event_timestamp'])
    prediction['is_actual_fraud'] = is_fraud
    all_predictions.append(prediction)

    time.sleep(0.1)  # pause between events to mimic a live stream

if all_predictions:
    # One row per prediction, indexed by event time.
    predictions_df = (
        pd.DataFrame(all_predictions)
        .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
        .set_index('timestamp')
    )

    # (config key, line colour, legend label) for each decision threshold.
    threshold_lines = (
        ("manual_review_queue_score", 'orange', 'Manual Review Threshold'),
        ("auto_block_score", 'red', 'Auto Block Threshold'),
    )

    # Fraud score of every simulated event over time.
    plt.figure(figsize=(14, 7))
    sns.lineplot(data=predictions_df, x=predictions_df.index, y='fraud_score', hue='is_actual_fraud', marker='o', alpha=0.7)
    action_triggers = inference_engine.config["thresholds"]["action_triggers"]
    for trigger_key, line_colour, line_label in threshold_lines:
        plt.axhline(y=action_triggers[trigger_key], color=line_colour, linestyle='--', label=line_label)
    plt.title('Real-time Fraud Scores Over Time')
    plt.xlabel('Time')
    plt.ylabel('Fraud Score')
    plt.ylim(-0.05, 1.05)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # Distribution of scores
    plt.figure(figsize=(10, 6))
    sns.histplot(predictions_df, x='fraud_score', hue='is_actual_fraud', kde=True, bins=20)
    plt.title('Distribution of Fraud Scores')
    plt.xlabel('Fraud Score')
    plt.ylabel('Count')
    for trigger_key, line_colour, line_label in threshold_lines:
        plt.axvline(x=action_triggers[trigger_key], color=line_colour, linestyle='--', label=line_label)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

## 3. Human Review Integration Simulation

This section demonstrates how events might be added to a human review queue and subsequently processed with feedback.

In [None]:
from src.feedback_loop.human_review_integration import HumanReviewSystem

review_system = HumanReviewSystem(config_directory, env='dev')

# Step 1: score a fraud-scenario event and offer it to the review queue.
print("--- Adding an event to human review queue ---")
high_score_event = create_synthetic_event(
    "user_high_risk",
    "driver_high_risk",
    "ride_completed",
    is_fraud_scenario=True,
)
if inference_engine.is_ready():
    prediction_for_review = inference_engine.run_inference(high_score_event)
    # A falsy return from add_to_review_queue is reported as "did not meet
    # review threshold".
    if not review_system.add_to_review_queue(prediction_for_review, high_score_event):
        print(f"Event {prediction_for_review['event_id']} did not meet review threshold (Score: {prediction_for_review['fraud_score']:.2f}).")
    else:
        print(f"Event {prediction_for_review['event_id']} added to queue.")

# Step 2: list whatever is currently waiting for review.
print("\n--- Retrieving pending reviews ---")
pending_reviews = review_system.get_pending_reviews()
if not pending_reviews:
    print("No pending reviews to demonstrate.")
else:
    print(f"Found {len(pending_reviews)} pending reviews.")
    for review in pending_reviews:
        print(f"  Event ID: {review['event_id']}, Predicted Score: {review['predicted_score']:.2f}, Suggested Action: {review['suggested_action']}")

    # Step 3: simulate an analyst confirming the first pending item as fraud.
    first_review_id = pending_reviews[0]['event_id']
    print(f"\n--- Submitting human feedback for {first_review_id} (Confirmed Fraud) ---")
    review_system.submit_human_feedback(
        first_review_id,
        human_decision=True,
        reviewer_id="analyst_demo_1",
        comments="Confirmed suspicious fare/distance for a new user.",
    )

    # Step 4: query the queue again and report how many items are pending.
    print("\n--- Checking pending reviews after feedback ---")
    print(f"Remaining pending reviews: {len(review_system.get_pending_reviews())}")