In [1]:
import shap
import lime
from lime.lime_tabular import LimeTabularExplainer
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import joblib  # To load saved models
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import warnings

warnings.filterwarnings('ignore')



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the three raw CSV inputs from the working directory.
fraud_data = pd.read_csv('Fraud_Data.csv')
ip_data = pd.read_csv('IpAddress_to_Country.csv')  # loaded here; not used further in this cell
creditcard_data = pd.read_csv('creditcard.csv')

# Parse both timestamp columns and expand each one into year/month/day/hour
# features (signup_year ... signup_hour, purchase_year ... purchase_hour).
for time_prefix in ('signup', 'purchase'):
    parsed_times = pd.to_datetime(fraud_data[f'{time_prefix}_time'])
    fraud_data[f'{time_prefix}_time'] = parsed_times
    for date_part in ('year', 'month', 'day', 'hour'):
        fraud_data[f'{time_prefix}_{date_part}'] = getattr(parsed_times.dt, date_part)

# Columns that feed the model; everything else in fraud_data is left out.
columns_to_keep = ['purchase_value', 'device_id', 'source', 'browser', 'sex', 'age', 'signup_year', 'signup_month', 'signup_day', 'signup_hour', 'purchase_year', 'purchase_month', 'purchase_day', 'purchase_hour']
categorical_cols = ['device_id', 'source', 'browser', 'sex']
numerical_cols = ['purchase_value', 'age', 'signup_year', 'signup_month', 'signup_day', 'signup_hour', 'purchase_year', 'purchase_month', 'purchase_day', 'purchase_hour']

# Build the cleaned frame as a NEW object in one chained expression instead of
# assigning into a slice of fraud_data (the SettingWithCopy pattern) and
# filling in place. Order matters: categoricals become strings first, so the
# final fillna(0) only touches the numeric columns.
column_dtypes = {
    **{col: str for col in categorical_cols},
    **{col: float for col in numerical_cols},
}
fraud_data_clean = (
    fraud_data[columns_to_keep]
    .astype(column_dtypes)
    .fillna(0)
)

# Verify the columns and their types
print(fraud_data_clean.dtypes)

# One-hot encode the categorical features and standardize the numeric ones.
# The transformed matrix contains the 'cat' columns first, then the 'num' columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', StandardScaler(), numerical_cols)
    ]
)

# Separate features and target
X_fraud = fraud_data_clean
y_fraud = fraud_data['class']

# NOTE(review): the preprocessor (and the credit-card scaler below) is fitted on
# ALL rows before the train/test split, so test rows influence the encoder
# vocabulary and the scaling statistics. Left unchanged here because the saved
# models presumably expect exactly this feature layout -- confirm before moving
# the fit after the split.
X_fraud_transformed = preprocessor.fit_transform(X_fraud)

# Standardize credit card data
X_creditcard = StandardScaler().fit_transform(creditcard_data.drop(columns=['Class']).values)
y_creditcard = creditcard_data['Class']

# Split the data into train and test sets (80/20, fixed seed for repeatability)
from sklearn.model_selection import train_test_split

X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test = train_test_split(X_fraud_transformed, y_fraud, test_size=0.2, random_state=42)
X_creditcard_train, X_creditcard_test, y_creditcard_train, y_creditcard_test = train_test_split(X_creditcard, y_creditcard, test_size=0.2, random_state=42)


purchase_value    float64
device_id          object
source             object
browser            object
sex                object
age               float64
signup_year       float64
signup_month      float64
signup_day        float64
signup_hour       float64
purchase_year     float64
purchase_month    float64
purchase_day      float64
purchase_hour     float64
dtype: object


In [3]:
import joblib

# Locations of the pickled estimators (presumably MLflow run artifacts, given
# the mlruns/<experiment>/<run>/artifacts layout -- verify against the tracking store).
logistic_regression_model_path = 'mlruns/901575089113022840/ff9e67a75ca546cc9ed7f1c819e8f834/artifacts/Logistic Regression/model.pkl'
random_forest_model_path = 'mlruns/901575089113022840/46bc3d1e00ce4070bd98466edca7034e/artifacts/Random Forest/model.pkl'
gradient_boosting_model_path = 'mlruns/901575089113022840/e58badcec43a4e06b8ba78efc3c626d4/artifacts/Gradient Boosting/model.pkl'

# Deserialize the three models in the same order as the paths above.
# NOTE: joblib.load unpickles arbitrary Python objects; only point it at
# artifacts from a trusted source.
(
    logistic_regression_model,
    random_forest_model,
    gradient_boosting_model,
) = map(
    joblib.load,
    (
        logistic_regression_model_path,
        random_forest_model_path,
        gradient_boosting_model_path,
    ),
)


In [4]:
import scipy
import scipy.sparse  # import the submodule explicitly; a bare `import scipy` does not guarantee it is loaded

# Use a sample of the data to avoid memory issues
sample_size = 1000  # Adjust sample size based on available memory

# Take the first `sample_size` rows and densify them if the preprocessor
# produced a sparse matrix (LIME is given plain arrays below).
X_fraud_train_sample = X_fraud_train[:sample_size].toarray() if scipy.sparse.issparse(X_fraud_train) else X_fraud_train[:sample_size]
X_fraud_test_sample = X_fraud_test[:sample_size].toarray() if scipy.sparse.issparse(X_fraud_test) else X_fraud_test[:sample_size]

# Feature names must follow the column order of the transformed matrix.
# The ColumnTransformer lists the 'cat' (one-hot) transformer before the 'num'
# (scaler) transformer, so the one-hot names come FIRST and the numeric names
# LAST. Listing the numeric names first would attach every label to the wrong column.
feature_names = (
    list(preprocessor.named_transformers_['cat'].get_feature_names_out())
    + list(numerical_cols)
)

# Guard against silent label/column misalignment.
assert len(feature_names) == X_fraud_train_sample.shape[1], (
    f"feature_names has {len(feature_names)} entries but the transformed data "
    f"has {X_fraud_train_sample.shape[1]} columns"
)


In [None]:
# LIME explains a prediction by randomly perturbing the instance; a fixed seed
# makes the explanations repeatable across runs.
LIME_RANDOM_STATE = 42


def build_fraud_lime_explainer(training_data, feature_names, random_state=LIME_RANDOM_STATE):
    """Create a tabular LIME explainer for the binary fraud classifiers.

    Parameters
    ----------
    training_data : 2-D array
        Dense sample of the transformed training features.
    feature_names : list of str
        One name per column of ``training_data``, in column order.
    random_state : int, optional
        Seed passed to LIME so repeated runs give the same explanation.

    Returns
    -------
    LimeTabularExplainer
        Classification-mode explainer with classes 'Not Fraud' / 'Fraud'
        and continuous features discretized.
    """
    return LimeTabularExplainer(
        training_data,
        mode='classification',
        feature_names=feature_names,
        class_names=['Not Fraud', 'Fraud'],
        discretize_continuous=True,
        random_state=random_state,
    )


i = 0  # Index of the instance to explain
instance_to_explain = X_fraud_test_sample[i]

# Logistic Regression: build the explainer, explain one prediction, render it.
explainer_lr = build_fraud_lime_explainer(X_fraud_train_sample, feature_names)
exp_lr = explainer_lr.explain_instance(instance_to_explain, logistic_regression_model.predict_proba, num_features=10)
exp_lr.show_in_notebook(show_table=True)

# Random Forest: same instance, same seed, so the explanations are comparable.
explainer_rf = build_fraud_lime_explainer(X_fraud_train_sample, feature_names)
exp_rf = explainer_rf.explain_instance(instance_to_explain, random_forest_model.predict_proba, num_features=10)
exp_rf.show_in_notebook(show_table=True)

# Gradient Boosting
explainer_gb = build_fraud_lime_explainer(X_fraud_train_sample, feature_names)
exp_gb = explainer_gb.explain_instance(instance_to_explain, gradient_boosting_model.predict_proba, num_features=10)
exp_gb.show_in_notebook(show_table=True)


In [5]:
import lime
from lime.lime_tabular import LimeTabularExplainer
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression


def save_dummy_logistic_regression_explanation(instance_index=0,
                                               output_path='explanation_logistic_regression.html'):
    """Fit a logistic regression on random dummy data and save a LIME explanation.

    This is a self-contained smoke test of the LIME workflow: the data are
    uniform random numbers with random 0/1 labels, so the saved explanation
    says nothing about the real fraud model.

    All intermediate objects are local on purpose. The notebook-level
    ``X_fraud_train_sample``, ``X_fraud_test_sample``, ``y_fraud_train``,
    ``y_fraud_test``, ``feature_names`` and ``logistic_regression_model`` are
    NOT rebound to dummy objects by running this cell.

    Parameters
    ----------
    instance_index : int
        Row of the dummy test matrix to explain.
    output_path : str
        HTML file the explanation is written to.

    Returns
    -------
    tuple
        ``(explainer, explanation)`` as produced by LIME.
    """
    np.random.seed(0)  # seeds NumPy's global generator, as the demo always did
    demo_X_train = np.random.rand(100, 10)
    demo_X_test = np.random.rand(10, 10)
    demo_y_train = np.random.randint(2, size=100)
    # Test labels are not needed below; the draw is kept so the global random
    # stream consumed by the following steps stays the same.
    np.random.randint(2, size=10)
    demo_feature_names = [f'Feature {k}' for k in range(10)]

    demo_model = LogisticRegression()
    demo_model.fit(demo_X_train, demo_y_train)

    explainer = LimeTabularExplainer(demo_X_train,
                                     mode='classification',
                                     feature_names=demo_feature_names,
                                     class_names=['Not Fraud', 'Fraud'],
                                     discretize_continuous=True)
    explanation = explainer.explain_instance(demo_X_test[instance_index],
                                             demo_model.predict_proba,
                                             num_features=10)

    # Save the explanation to an HTML file
    explanation.save_to_file(output_path)
    return explainer, explanation


i = 0  # Index of the instance to explain
explainer_lr, exp_lr = save_dummy_logistic_regression_explanation(instance_index=i)


In [6]:
import lime
from lime.lime_tabular import LimeTabularExplainer
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier


def save_dummy_random_forest_explanation(instance_index=0,
                                         output_path='explanation_random_forest.html'):
    """Fit a random forest on random dummy data and save a LIME explanation.

    This is a self-contained smoke test of the LIME workflow: the data are
    uniform random numbers with random 0/1 labels, so the saved explanation
    says nothing about the real fraud model.

    All intermediate objects are local on purpose. The notebook-level
    ``X_fraud_train_sample``, ``X_fraud_test_sample``, ``y_fraud_train``,
    ``y_fraud_test``, ``feature_names`` and ``random_forest_model`` are NOT
    rebound to dummy objects by running this cell.

    Parameters
    ----------
    instance_index : int
        Row of the dummy test matrix to explain.
    output_path : str
        HTML file the explanation is written to.

    Returns
    -------
    tuple
        ``(explainer, explanation)`` as produced by LIME.
    """
    np.random.seed(0)  # seeds NumPy's global generator, as the demo always did
    demo_X_train = np.random.rand(100, 10)
    demo_X_test = np.random.rand(10, 10)
    demo_y_train = np.random.randint(2, size=100)
    # Test labels are not needed below; the draw is kept so the global random
    # stream consumed by the following steps stays the same.
    np.random.randint(2, size=10)
    demo_feature_names = [f'Feature {k}' for k in range(10)]

    demo_model = RandomForestClassifier()
    demo_model.fit(demo_X_train, demo_y_train)

    explainer = LimeTabularExplainer(demo_X_train,
                                     mode='classification',
                                     feature_names=demo_feature_names,
                                     class_names=['Not Fraud', 'Fraud'],
                                     discretize_continuous=True)
    explanation = explainer.explain_instance(demo_X_test[instance_index],
                                             demo_model.predict_proba,
                                             num_features=10)

    # Save the explanation to an HTML file
    explanation.save_to_file(output_path)
    return explainer, explanation


i = 0  # Index of the instance to explain
explainer_rf, exp_rf = save_dummy_random_forest_explanation(instance_index=i)


In [None]:
import lime
from lime.lime_tabular import LimeTabularExplainer
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier


def save_dummy_gradient_boosting_explanation(instance_index=0,
                                             output_path='explanation_gradient_boosting.html'):
    """Fit a gradient-boosting model on random dummy data and save a LIME explanation.

    This is a self-contained smoke test of the LIME workflow: the data are
    uniform random numbers with random 0/1 labels, so the saved explanation
    says nothing about the real fraud model.

    All intermediate objects are local on purpose. The notebook-level
    ``X_fraud_train_sample``, ``X_fraud_test_sample``, ``y_fraud_train``,
    ``y_fraud_test``, ``feature_names`` and ``gradient_boosting_model`` are
    NOT rebound to dummy objects by running this cell.

    Parameters
    ----------
    instance_index : int
        Row of the dummy test matrix to explain.
    output_path : str
        HTML file the explanation is written to.

    Returns
    -------
    tuple
        ``(explainer, explanation)`` as produced by LIME.
    """
    np.random.seed(0)  # seeds NumPy's global generator, as the demo always did
    demo_X_train = np.random.rand(100, 10)
    demo_X_test = np.random.rand(10, 10)
    demo_y_train = np.random.randint(2, size=100)
    # Test labels are not needed below; the draw is kept so the global random
    # stream consumed by the following steps stays the same.
    np.random.randint(2, size=10)
    demo_feature_names = [f'Feature {k}' for k in range(10)]

    demo_model = GradientBoostingClassifier()
    demo_model.fit(demo_X_train, demo_y_train)

    explainer = LimeTabularExplainer(demo_X_train,
                                     mode='classification',
                                     feature_names=demo_feature_names,
                                     class_names=['Not Fraud', 'Fraud'],
                                     discretize_continuous=True)
    explanation = explainer.explain_instance(demo_X_test[instance_index],
                                             demo_model.predict_proba,
                                             num_features=10)

    # Save the explanation to an HTML file
    explanation.save_to_file(output_path)
    return explainer, explanation


i = 0  # Index of the instance to explain
explainer_gb, exp_gb = save_dummy_gradient_boosting_explanation(instance_index=i)
