# Model Building and Training

In [2]:
import pandas as pd

fraud_data=pd.read_csv('../data/processed/fraud_data.csv')
credit_data=pd.read_csv('../data/processed/creditcard_data.csv')

## Feature and Target Separation

In [3]:
# Fraud data target and features separation
X_fraud = fraud_data[['purchase_value', 'transaction_freq', 'transaction_velocity', 'hour_of_day', 'day_of_week', 'source_encoded', 'browser_encoded', 'sex_encoded']]
y_fraud = fraud_data['class']


In [4]:
# Credit card data target and features separation
X_credit = credit_data.drop('Class', axis=1)
y_credit = credit_data['Class']

## Train-Test Split

In [5]:
from sklearn.model_selection import train_test_split

# Split Fraud_Data into training and test sets
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(X_fraud, y_fraud, test_size=0.3, random_state=42)

# Split Creditcard data into training and test sets
X_train_credit, X_test_credit, y_train_credit, y_test_credit = train_test_split(X_credit, y_credit, test_size=0.3, random_state=42)


## Model Selection

### Logistic Regression

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Initialize Logistic Regression model
lr_model = LogisticRegression()

# Train the model on Fraud_Data
lr_model.fit(X_train_fraud, y_train_fraud)

# Evaluate the model
y_pred_fraud_lr = lr_model.predict(X_test_fraud)
print("Fraud Data Logistic Regression Report:")
print(classification_report(y_test_fraud, y_pred_fraud_lr))

# Train the model on Creditcard data
lr_model.fit(X_train_credit, y_train_credit)

# Evaluate the model
y_pred_credit_lr = lr_model.predict(X_test_credit)
print("Credit Card Data Logistic Regression Report:")
print(classification_report(y_test_credit, y_pred_credit_lr))


Fraud Data Logistic Regression Report:
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     41117
           1       0.00      0.00      0.00      4217

    accuracy                           0.91     45334
   macro avg       0.45      0.50      0.48     45334
weighted avg       0.82      0.91      0.86     45334



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Credit Card Data Logistic Regression Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     84984
           1       0.80      0.50      0.61       134

    accuracy                           1.00     85118
   macro avg       0.90      0.75      0.81     85118
weighted avg       1.00      1.00      1.00     85118



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Decision Tree

In [7]:
from sklearn.tree import DecisionTreeClassifier

# Initialize Decision Tree model
dt_model = DecisionTreeClassifier()

# Train the model on Fraud_Data
dt_model.fit(X_train_fraud, y_train_fraud)

# Evaluate the model
y_pred_fraud_dt = dt_model.predict(X_test_fraud)
print("Fraud Data Decision Tree Report:")
print(classification_report(y_test_fraud, y_pred_fraud_dt))

# Train the model on Creditcard data
dt_model.fit(X_train_credit, y_train_credit)

# Evaluate the model
y_pred_credit_dt = dt_model.predict(X_test_credit)
print("Credit Card Data Decision Tree Report:")
print(classification_report(y_test_credit, y_pred_credit_dt))


Fraud Data Decision Tree Report:
              precision    recall  f1-score   support

           0       0.95      0.94      0.95     41117
           1       0.50      0.56      0.53      4217

    accuracy                           0.91     45334
   macro avg       0.73      0.75      0.74     45334
weighted avg       0.91      0.91      0.91     45334

Credit Card Data Decision Tree Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     84984
           1       0.71      0.79      0.75       134

    accuracy                           1.00     85118
   macro avg       0.86      0.90      0.87     85118
weighted avg       1.00      1.00      1.00     85118



### Random Forest

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Initialize Random Forest model
rf_model = RandomForestClassifier()

# Train the model on Fraud_Data
rf_model.fit(X_train_fraud, y_train_fraud)

# Evaluate the model
y_pred_fraud_rf = rf_model.predict(X_test_fraud)
print("Fraud Data Random Forest Report:")
print(classification_report(y_test_fraud, y_pred_fraud_rf))

# Train the model on Creditcard data
rf_model.fit(X_train_credit, y_train_credit)

# Evaluate the model
y_pred_credit_rf = rf_model.predict(X_test_credit)
print("Credit Card Data Random Forest Report:")
print(classification_report(y_test_credit, y_pred_credit_rf))


Fraud Data Random Forest Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     41117
           1       0.99      0.54      0.70      4217

    accuracy                           0.96     45334
   macro avg       0.97      0.77      0.84     45334
weighted avg       0.96      0.96      0.95     45334

Credit Card Data Random Forest Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     84984
           1       0.96      0.73      0.83       134

    accuracy                           1.00     85118
   macro avg       0.98      0.87      0.92     85118
weighted avg       1.00      1.00      1.00     85118



### Gradient Boosting

In [9]:
from sklearn.ensemble import GradientBoostingClassifier

# Initialize Gradient Boosting model
gb_model = GradientBoostingClassifier()

# Train the model on Fraud_Data
gb_model.fit(X_train_fraud, y_train_fraud)

# Evaluate the model
y_pred_fraud_gb = gb_model.predict(X_test_fraud)
print("Fraud Data Gradient Boosting Report:")
print(classification_report(y_test_fraud, y_pred_fraud_gb))

# Train the model on Creditcard data
gb_model.fit(X_train_credit, y_train_credit)

# Evaluate the model
y_pred_credit_gb = gb_model.predict(X_test_credit)
print("Credit Card Data Gradient Boosting Report:")
print(classification_report(y_test_credit, y_pred_credit_gb))


Fraud Data Gradient Boosting Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     41117
           1       1.00      0.54      0.70      4217

    accuracy                           0.96     45334
   macro avg       0.98      0.77      0.84     45334
weighted avg       0.96      0.96      0.95     45334

Credit Card Data Gradient Boosting Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     84984
           1       0.76      0.49      0.59       134

    accuracy                           1.00     85118
   macro avg       0.88      0.74      0.80     85118
weighted avg       1.00      1.00      1.00     85118



### Multi-Layer Perceptron (MLP)

In [10]:
from sklearn.neural_network import MLPClassifier

# Initialize Multi-Layer Perceptron model
mlp_model = MLPClassifier(max_iter=500)

# Train the model on Fraud_Data
mlp_model.fit(X_train_fraud, y_train_fraud)

# Evaluate the model
y_pred_fraud_mlp = mlp_model.predict(X_test_fraud)
print("Fraud Data MLP Report:")
print(classification_report(y_test_fraud, y_pred_fraud_mlp))

# Train the model on Creditcard data
mlp_model.fit(X_train_credit, y_train_credit)

# Evaluate the model
y_pred_credit_mlp = mlp_model.predict(X_test_credit)
print("Credit Card Data MLP Report:")
print(classification_report(y_test_credit, y_pred_credit_mlp))


Fraud Data MLP Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97     41117
           1       0.89      0.55      0.68      4217

    accuracy                           0.95     45334
   macro avg       0.92      0.77      0.83     45334
weighted avg       0.95      0.95      0.95     45334

Credit Card Data MLP Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     84984
           1       0.60      0.16      0.25       134

    accuracy                           1.00     85118
   macro avg       0.80      0.58      0.62     85118
weighted avg       1.00      1.00      1.00     85118

