 # Fraud Detection System

 This notebook demonstrates a complete fraud detection pipeline including:
 - Data loading and exploration
 - Data preprocessing
 - Model training and evaluation
 - Model explainability
 - Model saving

 ## 1. Setup and Imports

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import logging
import sys
import os
from pathlib import Path

# First fix package versions to avoid import conflicts
try:
    from sklearn.utils._param_validation import _MissingValues
    from imblearn.over_sampling import SMOTE
except ImportError:
    # Auto-fix version conflicts
    import subprocess
    subprocess.run(["pip", "install", "scikit-learn==1.3.0", "imbalanced-learn==0.10.1"], check=True)
    print("⚠️ Package versions adjusted. Please restart the kernel and run again.")
    exit()

# Configure project paths
try:
    # Get the ABSOLUTE path to the project root
    PROJECT_ROOT = Path(__file__).parent.parent if "__file__" in locals() else Path.cwd()
    
    # Special handling for Jupyter notebooks
    if "ipykernel" in sys.modules:
        PROJECT_ROOT = Path(os.getcwd()).parent  # Assumes notebook is in a subfolder
    
    sys.path.insert(0, str(PROJECT_ROOT))
    
    # Verify the src package exists
    if not (PROJECT_ROOT / "src").exists():
        raise ImportError("src package not found in project root")
        
except Exception as e:
    print(f"❌ Path configuration failed: {e}")
    print(f"Current working directory: {os.getcwd()}")
    print(f"PROJECT_ROOT: {PROJECT_ROOT}")
    raise

# Now import project modules
try:
    from src.data.preprocessing import FraudDataPreprocessor
    from src.models.train import FraudDetectionModel
    from src.models.explainability import ModelExplainer
    from src.config import MODEL_CONFIG, DATA_PATHS, FEATURES
    from src.utils import load_data, save_data
    print("✅ All imports working!")
except ImportError as e:
    print(f"❌ Import failed: {e}")
    print("\nDebugging info:")
    print(f"Python path: {sys.path}")
    print(f"Files in src directory: {os.listdir(PROJECT_ROOT / 'src')}")
    raise

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("fraud_detection.log"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

# Set visualization style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 12

⚠️ Package versions adjusted. Please restart the kernel and run again.
❌ Import failed: cannot import name '_MissingValues' from 'sklearn.utils._param_validation' (C:\Users\tefer\desktop\fraud-detection-system\venv\lib\site-packages\sklearn\utils\_param_validation.py)

Debugging info:
Python path: ['C:\\Users\\tefer\\desktop\\fraud-detection-system', 'C:\\Users\\tefer\\AppData\\Local\\Programs\\Python\\Python310\\python310.zip', 'C:\\Users\\tefer\\AppData\\Local\\Programs\\Python\\Python310\\DLLs', 'C:\\Users\\tefer\\AppData\\Local\\Programs\\Python\\Python310\\lib', 'C:\\Users\\tefer\\AppData\\Local\\Programs\\Python\\Python310', 'C:\\Users\\tefer\\desktop\\fraud-detection-system\\venv', '', 'C:\\Users\\tefer\\desktop\\fraud-detection-system\\venv\\lib\\site-packages', 'C:\\Users\\tefer\\desktop\\fraud-detection-system\\venv\\lib\\site-packages\\win32', 'C:\\Users\\tefer\\desktop\\fraud-detection-system\\venv\\lib\\site-packages\\win32\\lib', 'C:\\Users\\tefer\\desktop\\fraud-detectio

ImportError: cannot import name '_MissingValues' from 'sklearn.utils._param_validation' (C:\Users\tefer\desktop\fraud-detection-system\venv\lib\site-packages\sklearn\utils\_param_validation.py)

 ## 2. Data Loading and Exploration

In [None]:
# Load data
logger.info("Loading data...")
df = load_data(DATA_PATHS['raw_data'])

In [None]:
# Initial data exploration
print("=== Data Shape ===")
print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}\n")

print("=== Data Types ===")
print(df.dtypes.value_counts())
print("\nCategorical Features:")
print(FEATURES['categorical_features'])
print("\nNumeric Features:")
print(FEATURES['numeric_features'])

print("\n=== Missing Values ===")
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0])

print("\n=== Class Distribution ===")
class_dist = df[FEATURES['target']].value_counts(normalize=True)
print(class_dist)
class_dist.plot(kind='bar', title='Class Distribution')
plt.show()

 ## 3. Data Preprocessing

In [None]:
logger.info("Preprocessing data...")
preprocessor = FraudDataPreprocessor(
    numeric_features=FEATURES['numeric_features'],
    categorical_features=FEATURES['categorical_features']
)

X = df.drop(FEATURES['target'], axis=1)
y = df[FEATURES['target']]

# Fit and transform the data
X_processed = preprocessor.fit_transform(X)
feature_names = preprocessor.get_feature_names()

In [None]:
# Save processed data
processed_df = pd.DataFrame.sparse.from_spmatrix(X_processed, columns=feature_names)
processed_df[FEATURES['target']] = y.values
save_data(processed_df, DATA_PATHS['processed_data'])

print("Processed data shape:", X_processed.shape)
print("Feature names:", feature_names[:5], "...")

 ## 4. Model Training

In [None]:
logger.info("Training models...")
fraud_model = FraudDetectionModel(MODEL_CONFIG)

# Split data
X_train, X_test, y_train, y_test = fraud_model.train_test_split(
    X_processed, y, test_size=0.2, random_state=42
)

In [None]:
# Train ensemble model
best_model = fraud_model.train_ensemble(X_train, y_train, optimize=True)

In [None]:
# Evaluate on test set
test_metrics = fraud_model.evaluate_model(best_model, X_test, y_test)
logger.info(f"Test ROC AUC: {test_metrics['roc_auc']:.4f}")
logger.info(f"Test Precision: {test_metrics['precision']:.4f}")
logger.info(f"Test Recall: {test_metrics['recall']:.4f}")

# Plot confusion matrix
fraud_model.plot_confusion_matrix(best_model, X_test, y_test)

 ## 5. Model Explainability

In [None]:
logger.info("Generating explanations...")
explainer = ModelExplainer(best_model, preprocessor, feature_names)

In [None]:
# SHAP analysis
shap_values = explainer.shap_analysis(X_train)
explainer.plot_shap_summary(shap_values)

In [None]:
# LIME explanation for a specific instance
lime_exp = explainer.lime_explanation(X_train, instance_idx=0)
lime_exp.show_in_notebook()

In [None]:
# Permutation importance
perm_importance = explainer.permutation_importance(X_test, y_test)
explainer.plot_feature_importance(perm_importance.importances_mean)

 ## 6. Save Model

In [None]:
logger.info("Saving model...")
fraud_model.save_model(best_model, "models/fraud_detection_model.pkl")

# Also save the preprocessor
joblib.dump(preprocessor, "models/preprocessor.pkl")
logger.info("Model and preprocessor saved successfully.")

 ## 7. Conclusion

 The fraud detection model has been successfully trained with good performance metrics. The explainability tools help understand the model's decision-making process, which is crucial for fraud detection systems.

In [None]:
# Final metrics summary
print("=== Final Model Performance ===")
print(f"ROC AUC Score: {test_metrics['roc_auc']:.4f}")
print(f"Precision: {test_metrics['precision']:.4f}")
print(f"Recall: {test_metrics['recall']:.4f}")
print(f"F1 Score: {test_metrics['f1']:.4f}")