# Model Building and Training

In [10]:
import pandas as pd

fraud_data=pd.read_csv('../data/processed/fraud_data.csv')
credit_data=pd.read_csv('../data/processed/creditcard_data.csv')

In [23]:
fraud_data.columns

Index(['Unnamed: 0', 'user_id', 'signup_time', 'purchase_time',
       'purchase_value', 'device_id', 'source', 'browser', 'sex', 'age',
       'ip_address', 'class', 'ip_address_int', 'country', 'transaction_freq',
       'transaction_velocity', 'hour_of_day', 'day_of_week', 'source_encoded',
       'browser_encoded', 'sex_encoded'],
      dtype='object')

In [25]:
fraud_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151112 entries, 0 to 151111
Data columns (total 21 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Unnamed: 0            151112 non-null  int64  
 1   user_id               151112 non-null  int64  
 2   signup_time           151112 non-null  object 
 3   purchase_time         151112 non-null  object 
 4   purchase_value        151112 non-null  float64
 5   device_id             151112 non-null  object 
 6   source                151112 non-null  object 
 7   browser               151112 non-null  object 
 8   sex                   151112 non-null  object 
 9   age                   151112 non-null  float64
 10  ip_address            151112 non-null  float64
 11  class                 151112 non-null  int64  
 12  ip_address_int        151112 non-null  int64  
 13  country               151112 non-null  object 
 14  transaction_freq      151112 non-null  float64
 15  

In [11]:
credit_data.columns

Index(['Unnamed: 0', 'Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8',
       'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18',
       'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28',
       'Amount', 'Class'],
      dtype='object')

In [12]:
credit_data.head()

Unnamed: 0.1,Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


## Feature and Target Separation

In [18]:
# Fraud data target and features separation
X_fraud = fraud_data[['purchase_value', 'transaction_freq', 'transaction_velocity', 'hour_of_day', 'day_of_week', 'source_encoded', 'browser_encoded', 'sex_encoded']]
y_fraud = fraud_data['class']


In [19]:
# Credit card data target and features separation
X_credit = credit_data.drop({'Class','Unnamed: 0','Time'}, axis=1)
y_credit = credit_data['Class']

In [15]:
X_credit.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'],
      dtype='object')

## Train-Test Split

In [20]:
from sklearn.model_selection import train_test_split

# Split Fraud_Data into training and test sets
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(X_fraud, y_fraud, test_size=0.3, random_state=42)

# Split Creditcard data into training and test sets
X_train_credit, X_test_credit, y_train_credit, y_test_credit = train_test_split(X_credit, y_credit, test_size=0.3, random_state=42)


## Model Selection

### Logistic Regression

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Initialize Logistic Regression model
lr_model = LogisticRegression()

# Train the model on Fraud_Data
lr_model.fit(X_train_fraud, y_train_fraud)

# Evaluate the model
y_pred_fraud_lr = lr_model.predict(X_test_fraud)
print("Fraud Data Logistic Regression Report:")
print(classification_report(y_test_fraud, y_pred_fraud_lr))

# Train the model on Creditcard data
lr_model.fit(X_train_credit, y_train_credit)

# Evaluate the model
y_pred_credit_lr = lr_model.predict(X_test_credit)
print("Credit Card Data Logistic Regression Report:")
print(classification_report(y_test_credit, y_pred_credit_lr))


Fraud Data Logistic Regression Report:
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     41117
           1       0.00      0.00      0.00      4217

    accuracy                           0.91     45334
   macro avg       0.45      0.50      0.48     45334
weighted avg       0.82      0.91      0.86     45334



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Credit Card Data Logistic Regression Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     84984
           1       0.80      0.50      0.61       134

    accuracy                           1.00     85118
   macro avg       0.90      0.75      0.81     85118
weighted avg       1.00      1.00      1.00     85118



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Decision Tree

In [6]:
from sklearn.tree import DecisionTreeClassifier

# Initialize Decision Tree model
dt_model = DecisionTreeClassifier()

# Train the model on Fraud_Data
dt_model.fit(X_train_fraud, y_train_fraud)

# Evaluate the model
y_pred_fraud_dt = dt_model.predict(X_test_fraud)
print("Fraud Data Decision Tree Report:")
print(classification_report(y_test_fraud, y_pred_fraud_dt))

# Train the model on Creditcard data
dt_model.fit(X_train_credit, y_train_credit)

# Evaluate the model
y_pred_credit_dt = dt_model.predict(X_test_credit)
print("Credit Card Data Decision Tree Report:")
print(classification_report(y_test_credit, y_pred_credit_dt))


Fraud Data Decision Tree Report:
              precision    recall  f1-score   support

           0       0.95      0.94      0.95     41117
           1       0.50      0.56      0.53      4217

    accuracy                           0.91     45334
   macro avg       0.73      0.75      0.74     45334
weighted avg       0.91      0.91      0.91     45334

Credit Card Data Decision Tree Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     84984
           1       0.71      0.78      0.74       134

    accuracy                           1.00     85118
   macro avg       0.86      0.89      0.87     85118
weighted avg       1.00      1.00      1.00     85118



### Random Forest

In [21]:
import os
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import mlflow
import mlflow.sklearn

# Ensure the models directory exists
os.makedirs('../models', exist_ok=True)

# Initialize Random Forest model
rf_model = RandomForestClassifier()

# Function to train, evaluate, and save the model
def train_and_evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    with mlflow.start_run():
        # Train the model
        model.fit(X_train, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test)
        
        # Evaluate the model
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)
        
        # Log parameters and metrics with MLflow
        mlflow.log_param("model", model_name)
        mlflow.log_metric("accuracy", accuracy)
        
        # Log the model with MLflow
        mlflow.sklearn.log_model(model, "model")
        
        print(f"Model training completed for {model_name}.")
        print(f"Accuracy: {accuracy}")
        print(f"Classification Report:\n{report}")
        
        # Save the trained model to the models folder using joblib
        model_path = f'../models/{model_name}_model.pkl'
        joblib.dump(model, model_path)
        
        print(f"Trained model saved to {model_path}")
        
        # Verify that the model can be reloaded successfully
        loaded_model = joblib.load(model_path)
        if isinstance(loaded_model, RandomForestClassifier):
            print(f"Model {model_name} loaded successfully as a RandomForestClassifier instance.")
        else:
            print(f"Warning: Model {model_name} did not load as RandomForestClassifier.")

# Train and evaluate model on Fraud_Data
train_and_evaluate_model(rf_model, X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud, 'fraud_detection_rf')

# Train and evaluate model on Creditcard data
train_and_evaluate_model(rf_model, X_train_credit, X_test_credit, y_train_credit, y_test_credit, 'creditcard_fraud_rf')




Model training completed for fraud_detection_rf.
Accuracy: 0.9569638681784092
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     41117
           1       0.99      0.54      0.70      4217

    accuracy                           0.96     45334
   macro avg       0.97      0.77      0.84     45334
weighted avg       0.96      0.96      0.95     45334

Trained model saved to ../models/fraud_detection_rf_model.pkl
Model fraud_detection_rf loaded successfully as a RandomForestClassifier instance.




Model training completed for creditcard_fraud_rf.
Accuracy: 0.9994948189572124
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     84984
           1       0.95      0.72      0.82       134

    accuracy                           1.00     85118
   macro avg       0.98      0.86      0.91     85118
weighted avg       1.00      1.00      1.00     85118

Trained model saved to ../models/creditcard_fraud_rf_model.pkl
Model creditcard_fraud_rf loaded successfully as a RandomForestClassifier instance.


### Gradient Boosting

In [8]:
from sklearn.ensemble import GradientBoostingClassifier

# Initialize Gradient Boosting model
gb_model = GradientBoostingClassifier()

# Train the model on Fraud_Data
gb_model.fit(X_train_fraud, y_train_fraud)

# Evaluate the model
y_pred_fraud_gb = gb_model.predict(X_test_fraud)
print("Fraud Data Gradient Boosting Report:")
print(classification_report(y_test_fraud, y_pred_fraud_gb))

# Train the model on Creditcard data
gb_model.fit(X_train_credit, y_train_credit)

# Evaluate the model
y_pred_credit_gb = gb_model.predict(X_test_credit)
print("Credit Card Data Gradient Boosting Report:")
print(classification_report(y_test_credit, y_pred_credit_gb))


Fraud Data Gradient Boosting Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     41117
           1       1.00      0.54      0.70      4217

    accuracy                           0.96     45334
   macro avg       0.98      0.77      0.84     45334
weighted avg       0.96      0.96      0.95     45334

Credit Card Data Gradient Boosting Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     84984
           1       0.86      0.48      0.62       134

    accuracy                           1.00     85118
   macro avg       0.93      0.74      0.81     85118
weighted avg       1.00      1.00      1.00     85118



### Multi-Layer Perceptron (MLP)

In [9]:
from sklearn.neural_network import MLPClassifier

# Initialize Multi-Layer Perceptron model
mlp_model = MLPClassifier(max_iter=500)

# Train the model on Fraud_Data
mlp_model.fit(X_train_fraud, y_train_fraud)

# Evaluate the model
y_pred_fraud_mlp = mlp_model.predict(X_test_fraud)
print("Fraud Data MLP Report:")
print(classification_report(y_test_fraud, y_pred_fraud_mlp))

# Train the model on Creditcard data
mlp_model.fit(X_train_credit, y_train_credit)

# Evaluate the model
y_pred_credit_mlp = mlp_model.predict(X_test_credit)
print("Credit Card Data MLP Report:")
print(classification_report(y_test_credit, y_pred_credit_mlp))


Fraud Data MLP Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97     41117
           1       0.90      0.55      0.68      4217

    accuracy                           0.95     45334
   macro avg       0.93      0.77      0.83     45334
weighted avg       0.95      0.95      0.95     45334

Credit Card Data MLP Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     84984
           1       0.37      0.43      0.40       134

    accuracy                           1.00     85118
   macro avg       0.69      0.72      0.70     85118
weighted avg       1.00      1.00      1.00     85118



In [7]:
import shap
import lime
import lime.lime_tabular
import matplotlib.pyplot as plt
import joblib
import os
from sklearn.ensemble import RandomForestClassifier

# Load the trained models
with open('../models/fraud_detection_rf_model.pkl', 'rb') as model_file:
    rf_model_fraud = joblib.load(model_file)
    print(type(rf_model_fraud))  # Debugging line

with open('../models/creditcard_fraud_rf_model.pkl', 'rb') as model_file:
    rf_model_credit = joblib.load(model_file)
    print(type(rf_model_credit))  # Debugging line

# Ensure the models are RandomForestClassifier objects
if not isinstance(rf_model_fraud, RandomForestClassifier):
    raise TypeError("rf_model_fraud is not a RandomForestClassifier")

if not isinstance(rf_model_credit, RandomForestClassifier):
    raise TypeError("rf_model_credit is not a RandomForestClassifier")

# Ensure the explainability directory exists
os.makedirs('../explainability', exist_ok=True)

# SHAP Explainability
def shap_explain(model, X, model_name):
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X)

    # Summary Plot
    shap.summary_plot(shap_values[1], X, show=False)
    plt.savefig(f'../explainability/{model_name}_shap_summary_plot.png')
    plt.clf()

    # Force Plot (for the first instance)
    shap.force_plot(explainer.expected_value[1], shap_values[1][0], X.iloc[0], show=False, matplotlib=True)
    plt.savefig(f'../explainability/{model_name}_shap_force_plot.png')
    plt.clf()

    # Dependence Plot (for the first feature)
    shap.dependence_plot(0, shap_values[1], X, show=False)
    plt.savefig(f'../explainability/{model_name}_shap_dependence_plot.png')
    plt.clf()

# LIME Explainability
def lime_explain(model, X, model_name):
    explainer = lime.lime_tabular.LimeTabularExplainer(X.values, feature_names=X.columns, class_names=['Non-Fraud', 'Fraud'], discretize_continuous=True)
    exp = explainer.explain_instance(X.iloc[0], model.predict_proba, num_features=10)

    # Feature Importance Plot
    exp.as_pyplot_figure()
    plt.savefig(f'../explainability/{model_name}_lime_feature_importance.png')
    plt.clf()

# # Explain Fraud Detection Model
# shap_explain(rf_model_fraud, X_test_fraud, 'fraud_detection_rf')
# lime_explain(rf_model_fraud, X_test_fraud, 'fraud_detection_rf')

# # Explain Credit Card Fraud Detection Model
# shap_explain(rf_model_credit, X_test_credit, 'creditcard_fraud_rf')
# lime_explain(rf_model_credit, X_test_credit, 'creditcard_fraud_rf')

# print("Model explainability completed and plots saved in the 'explainability' folder.")


  from .autonotebook import tqdm as notebook_tqdm


<class 'sklearn.ensemble._forest.RandomForestClassifier'>
<class 'sklearn.ensemble._forest.RandomForestClassifier'>


AssertionError: The shape of the shap_values matrix does not match the shape of the provided data matrix.

In [22]:
import os
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
import mlflow
import mlflow.sklearn

# Ensure the models directory exists
os.makedirs('../models', exist_ok=True)

# Function to train, evaluate, and save the model
def train_and_evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    with mlflow.start_run():
        # Train the model
        model.fit(X_train, y_train)
        
        # Make predictions
        y_pred = model.predict(X_test)
        
        # Evaluate the model
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')
        report = classification_report(y_test, y_pred)
        
        # Log parameters and metrics with MLflow
        mlflow.log_param("model", model_name)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)
        
        # Log the model with MLflow
        mlflow.sklearn.log_model(model, "model")
        
        print(f"Model training completed for {model_name}.")
        print(f"Accuracy: {accuracy}")
        print(f"Classification Report:\n{report}")
        
        # Save the trained model to the models folder using joblib
        model_path = f'../models/{model_name}_model.pkl'
        joblib.dump(model, model_path)
        
        print(f"Trained model saved to {model_path}")
        
        # Verify that the model can be reloaded successfully
        loaded_model = joblib.load(model_path)
        if isinstance(loaded_model, type(model)):
            print(f"Model {model_name} loaded successfully as a {type(model).__name__} instance.")
        else:
            print(f"Warning: Model {model_name} did not load as {type(model).__name__}.")

# Initialize models
rf_model_fraud = RandomForestClassifier()
rf_model_credit = RandomForestClassifier()
dt_model_fraud = DecisionTreeClassifier()
dt_model_credit = DecisionTreeClassifier()

# Train and evaluate Random Forest model on Fraud_Data
train_and_evaluate_model(rf_model_fraud, X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud, 'fraud_detection_rf')

# Train and evaluate Random Forest model on Creditcard data
train_and_evaluate_model(rf_model_credit, X_train_credit, X_test_credit, y_train_credit, y_test_credit, 'creditcard_fraud_rf')

# Train and evaluate Decision Tree model on Fraud_Data
train_and_evaluate_model(dt_model_fraud, X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud, 'fraud_detection_dt')

# Train and evaluate Decision Tree model on Creditcard data
train_and_evaluate_model(dt_model_credit, X_train_credit, X_test_credit, y_train_credit, y_test_credit, 'creditcard_fraud_dt')




Model training completed for fraud_detection_rf.
Accuracy: 0.9569418096792695
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     41117
           1       0.99      0.54      0.70      4217

    accuracy                           0.96     45334
   macro avg       0.97      0.77      0.84     45334
weighted avg       0.96      0.96      0.95     45334

Trained model saved to ../models/fraud_detection_rf_model.pkl
Model fraud_detection_rf loaded successfully as a RandomForestClassifier instance.




Model training completed for creditcard_fraud_rf.
Accuracy: 0.9995183157499001
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     84984
           1       0.95      0.73      0.83       134

    accuracy                           1.00     85118
   macro avg       0.98      0.87      0.91     85118
weighted avg       1.00      1.00      1.00     85118

Trained model saved to ../models/creditcard_fraud_rf_model.pkl
Model creditcard_fraud_rf loaded successfully as a RandomForestClassifier instance.




Model training completed for fraud_detection_dt.
Accuracy: 0.9066484316407112
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.94      0.95     41117
           1       0.50      0.56      0.53      4217

    accuracy                           0.91     45334
   macro avg       0.73      0.75      0.74     45334
weighted avg       0.91      0.91      0.91     45334

Trained model saved to ../models/fraud_detection_dt_model.pkl
Model fraud_detection_dt loaded successfully as a DecisionTreeClassifier instance.




Model training completed for creditcard_fraud_dt.
Accuracy: 0.9991071218778637
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     84984
           1       0.69      0.78      0.73       134

    accuracy                           1.00     85118
   macro avg       0.85      0.89      0.87     85118
weighted avg       1.00      1.00      1.00     85118

Trained model saved to ../models/creditcard_fraud_dt_model.pkl
Model creditcard_fraud_dt loaded successfully as a DecisionTreeClassifier instance.
