In [3]:
!pip install lime

Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/275.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m266.2/275.7 kB[0m [31m8.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283834 sha256=39856e1c025c4f3681054841e3f0c6f2bf059b50b77756a0f6652254ab805f49
  Stored in directory: /root/.cache/pip/wheels/fd/a2/af/9ac0a1a85a27f314a06b39e1f492bee1547d52549a4606ed89
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1


In [6]:
# Fraud Detection Model Explainability using SHAP and LIME

# Import necessary libraries
import pandas as pd
import numpy as np
import shap
import lime
import lime.lime_tabular
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# ------------------------------
# Section 1: Data Loading
# ------------------------------
# Load datasets
fraud_data = pd.read_csv('Fraud_Data.csv')
creditcard_data = pd.read_csv('creditcard.csv')

# Display the first few rows of the e-commerce fraud data
print("E-commerce Fraud Data:")
print(fraud_data.head())


E-commerce Fraud Data:
   user_id          signup_time        purchase_time  purchase_value  \
0    22058  2015-02-24 22:55:49  2015-04-18 02:47:11              34   
1   333320  2015-06-07 20:39:50  2015-06-08 01:38:54              16   
2     1359  2015-01-01 18:52:44  2015-01-01 18:52:45              15   
3   150084  2015-04-28 21:13:25  2015-05-04 13:54:50              44   
4   221365  2015-07-21 07:09:52  2015-09-09 18:40:53              39   

       device_id source browser sex  age    ip_address  class  
0  QVPSPJUOCKZAR    SEO  Chrome   M   39  7.327584e+08      0  
1  EOGFQPIZPYXFZ    Ads  Chrome   F   53  3.503114e+08      0  
2  YSSKYOSJHPPLJ    SEO   Opera   M   53  2.621474e+09      1  
3  ATGTXKYKUDUQN    SEO  Safari   M   41  3.840542e+09      0  
4  NAUITBZFJKHWW    Ads  Safari   M   45  4.155831e+08      0  


In [7]:
# ------------------------------
# Section 2: Data Preprocessing
# ------------------------------
# Convert timestamps to datetime
fraud_data['signup_time'] = pd.to_datetime(fraud_data['signup_time'])
fraud_data['purchase_time'] = pd.to_datetime(fraud_data['purchase_time'])

# Drop columns not needed for modeling
X = fraud_data.drop(columns=['class', 'user_id', 'device_id', 'signup_time', 'purchase_time'])
y = fraud_data['class']

# One-hot encode categorical features
X = pd.get_dummies(X, drop_first=True)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Preprocessing completed. Training and test sets are ready.")


Preprocessing completed. Training and test sets are ready.


In [8]:
# ------------------------------
# Section 3: Model Training
# ------------------------------
# Train a Random Forest model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print("Model Training and Evaluation Report:")
print(report)


Model Training and Evaluation Report:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97     19657
           1       0.87      0.53      0.66      2030

    accuracy                           0.95     21687
   macro avg       0.91      0.76      0.82     21687
weighted avg       0.95      0.95      0.94     21687



In [None]:
# ------------------------------
# Section 4: SHAP Explainability
# ------------------------------
# Initialize SHAP explainer
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# SHAP Summary Plot
plt.figure(figsize=(10, 6))
shap.summary_plot(shap_values[1], X_test)
plt.title('SHAP Summary Plot')
plt.show()

# SHAP Force Plot for a single prediction
shap.initjs()
sample_index = 0  # Change this index to visualize different samples
shap.force_plot(explainer.expected_value[1], shap_values[1][sample_index], X_test.iloc[sample_index])
plt.show()

# SHAP Dependence Plot for a specific feature
feature_name = 'purchase_value'  # Change as needed
plt.figure(figsize=(10, 6))
shap.dependence_plot(feature_name, shap_values[1], X_test)
plt.title(f'SHAP Dependence Plot for {feature_name}')
plt.show()


In [None]:
# ------------------------------
# Section 5: LIME Explainability
# ------------------------------
# Initialize LIME explainer
lime_explainer = lime.lime_tabular.LimeTabularExplainer(
    training_data=np.array(X_train),
    mode='classification',
    feature_names=X_train.columns,
    class_names=['Non-Fraud', 'Fraud'],
    discretize_continuous=True
)

# Explain a single prediction
sample_index = 0  # Change this index to visualize different samples
lime_explanation = lime_explainer.explain_instance(
    data_row=X_test.iloc[sample_index],
    predict_fn=model.predict_proba
)

# LIME Feature Importance Plot
lime_explanation.as_pyplot_figure()
plt.title('LIME Feature Importance')
plt.show()
