In [1]:
import os, sys
notebook_dir = os.getcwd()
sys.path.append(os.path.abspath(os.path.join(notebook_dir,'..')))

In [2]:
from scripts.data_load import load_data
from scripts.preprocess import preprocess_data
from scripts.data_visualization import *
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
import ipaddress
import joblib

In [3]:
# Load data
fraud_data = load_data('../data/cleaned_fraud_data.csv')
credit_data = load_data('../data/creditcard.csv')
ip_country = load_data('../data/IpAddress_to_Country.csv')

In [4]:
# Fraud_Data.csv
X_fraud = fraud_data.drop(columns=['device_id', 'class', 'country'])
y_fraud = fraud_data['class']

# creditcard.csv
X_credit = credit_data.drop(columns=['Class'])
y_credit = credit_data['Class']

In [5]:

# Fraud_Data.csv
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(
    X_fraud, y_fraud, test_size=0.2, random_state=42, stratify=y_fraud
)

# creditcard.csv
X_train_credit, X_test_credit, y_train_credit, y_test_credit = train_test_split(
    X_credit, y_credit, test_size=0.2, random_state=42, stratify=y_credit
)

X_train_fraud['purchase_time'] = pd.to_datetime(X_train_fraud['purchase_time'])
X_train_fraud['purchase_time'] = X_train_fraud['purchase_time'].astype(int) / 10**9  # Convert to seconds

X_test_fraud['purchase_time'] = pd.to_datetime(X_test_fraud['purchase_time'])
X_test_fraud['purchase_time'] = X_test_fraud['purchase_time'].astype(int) / 10**9  # Convert to seconds


X_train_fraud['signup_time'] = pd.to_datetime(X_train_fraud['signup_time'])
X_train_fraud['signup_time'] = X_train_fraud['signup_time'].astype(int) / 10**9  # Convert to seconds

X_test_fraud['signup_time'] = pd.to_datetime(X_test_fraud['signup_time'])
X_test_fraud['signup_time'] = X_test_fraud['signup_time'].astype(int) / 10**9  # Convert to seconds


In [6]:
print(X_train_fraud.dtypes)

user_id                    int64
signup_time              float64
purchase_time            float64
purchase_value             int64
age                        int64
ip_address               float64
transaction_count          int64
time_diff                float64
hour_of_day                int64
day_of_week                int64
purchase_value_scaled    float64
source_Direct               bool
source_SEO                  bool
browser_FireFox             bool
browser_IE                  bool
browser_Opera               bool
browser_Safari              bool
sex_M                       bool
dtype: object


In [25]:
from sklearn.impute import SimpleImputer

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X_train_fraud_imputed = imputer.fit_transform(X_train_fraud)
X_test_fraud_imputed = imputer.transform(X_test_fraud)

# Train Logistic Regression
model = LogisticRegression(class_weight='balanced', random_state=42)
model.fit(X_train_fraud_imputed, y_train_fraud)

# Evaluate
y_pred = model.predict(X_test_fraud_imputed)
print(classification_report(y_test_fraud, y_pred))
print("ROC-AUC:", roc_auc_score(y_test_fraud, model.predict_proba(X_test_fraud_imputed)[:, 1]))

joblib.dump(model, "../models/logistic_regression_model.pkl")





              precision    recall  f1-score   support

           0       0.95      0.64      0.76     27393
           1       0.17      0.69      0.27      2830

    accuracy                           0.64     30223
   macro avg       0.56      0.67      0.52     30223
weighted avg       0.88      0.64      0.72     30223

ROC-AUC: 0.7575597387019124


['../models/logistic_regression_model.pkl']

In [26]:
from sklearn.tree import DecisionTreeClassifier

# Train
model = DecisionTreeClassifier(class_weight='balanced', random_state=42)
model.fit(X_train_fraud, y_train_fraud)

# Evaluate
y_pred = model.predict(X_test_fraud)
print("Decision Tree:")
print(classification_report(y_test_fraud, y_pred))
print("ROC-AUC:", roc_auc_score(y_test_fraud, y_pred))

joblib.dump(model, "../models/decision_tree_model.pkl")

Decision Tree:
              precision    recall  f1-score   support

           0       0.95      0.95      0.95     27393
           1       0.53      0.55      0.54      2830

    accuracy                           0.91     30223
   macro avg       0.74      0.75      0.75     30223
weighted avg       0.91      0.91      0.91     30223

ROC-AUC: 0.7514647225523428


['../models/decision_tree_model.pkl']

In [27]:
from sklearn.ensemble import RandomForestClassifier

# Train
model = RandomForestClassifier(class_weight='balanced', random_state=42)
model.fit(X_train_fraud, y_train_fraud)

# Evaluate
y_pred = model.predict(X_test_fraud)
print("Random Forest:")
print(classification_report(y_test_fraud, y_pred))
print("ROC-AUC:", roc_auc_score(y_test_fraud, y_pred))

joblib.dump(model, "../models/random_forest_model.pkl")

Random Forest:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98     27393
           1       0.97      0.53      0.68      2830

    accuracy                           0.95     30223
   macro avg       0.96      0.76      0.83     30223
weighted avg       0.96      0.95      0.95     30223

ROC-AUC: 0.7630690580335773


['../models/random_forest_model.pkl']

In [28]:
from xgboost import XGBClassifier

# Train
model = XGBClassifier(scale_pos_weight=len(y_train_fraud[y_train_fraud == 0]) / len(y_train_fraud[y_train_fraud == 1]))
model.fit(X_train_fraud, y_train_fraud)

# Evaluate
y_pred = model.predict(X_test_fraud)
print("Gradient Boosting:")
print(classification_report(y_test_fraud, y_pred))
print("ROC-AUC:", roc_auc_score(y_test_fraud, y_pred))

model.save_model("../models/xgboost_model.json")

Gradient Boosting:
              precision    recall  f1-score   support

           0       0.95      0.98      0.97     27393
           1       0.74      0.54      0.62      2830

    accuracy                           0.94     30223
   macro avg       0.85      0.76      0.79     30223
weighted avg       0.93      0.94      0.93     30223

ROC-AUC: 0.7581157601455789


In [29]:
from sklearn.neural_network import MLPClassifier
from sklearn.impute import SimpleImputer

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X_train_fraud_imputed = imputer.fit_transform(X_train_fraud)
X_test_fraud_imputed = imputer.transform(X_test_fraud)

# Train
model = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)
model.fit(X_train_fraud_imputed, y_train_fraud)

# Evaluate
y_pred = model.predict(X_test_fraud_imputed)
print("MLP:")
print(classification_report(y_test_fraud, y_pred))
print("ROC-AUC:", roc_auc_score(y_test_fraud, y_pred))

joblib.dump(model, "../models/mlp_classifier_model.pkl")



MLP:
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     27393
           1       0.00      0.00      0.00      2830

    accuracy                           0.91     30223
   macro avg       0.45      0.50      0.48     30223
weighted avg       0.82      0.91      0.86     30223

ROC-AUC: 0.5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


['../models/mlp_classifier_model.pkl']

In [30]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense

# Convert data to numeric type and reshape for CNN
X_train_cnn = X_train_fraud.astype(float).values.reshape(X_train_fraud.shape[0], X_train_fraud.shape[1], 1)
X_test_cnn = X_test_fraud.astype(float).values.reshape(X_test_fraud.shape[0], X_test_fraud.shape[1], 1)

# Build CNN
model = Sequential([
    Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train_cnn.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(50, activation='relu'),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train
model.fit(X_train_cnn, y_train_fraud, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate
y_pred = (model.predict(X_test_cnn) > 0.5).astype(int)
print("CNN:")
print(classification_report(y_test_fraud, y_pred))
print("ROC-AUC:", roc_auc_score(y_test_fraud, y_pred))

model.save("../models/cnn_model.h5")  # Saves in HDF5 format

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.9062 - loss: nan - val_accuracy: 0.9075 - val_loss: nan
Epoch 2/10
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.9053 - loss: nan - val_accuracy: 0.9075 - val_loss: nan
Epoch 3/10
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.9066 - loss: nan - val_accuracy: 0.9075 - val_loss: nan
Epoch 4/10
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 7ms/step - accuracy: 0.9050 - loss: nan - val_accuracy: 0.9075 - val_loss: nan
Epoch 5/10
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - accuracy: 0.9056 - loss: nan - val_accuracy: 0.9075 - val_loss: nan
Epoch 6/10
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 4ms/step - accuracy: 0.9071 - loss: nan - val_accuracy: 0.9075 - val_loss: nan
Epoch 7/10
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


CNN:
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     27393
           1       0.00      0.00      0.00      2830

    accuracy                           0.91     30223
   macro avg       0.45      0.50      0.48     30223
weighted avg       0.82      0.91      0.86     30223

ROC-AUC: 0.5


In [31]:
from tensorflow.keras.layers import SimpleRNN, Dense
from tensorflow.keras.models import Sequential

# Convert data to numeric type and reshape for RNN
X_train_rnn = X_train_fraud.astype(float).values.reshape(X_train_fraud.shape[0], X_train_fraud.shape[1], 1)
X_test_rnn = X_test_fraud.astype(float).values.reshape(X_test_fraud.shape[0], X_test_fraud.shape[1], 1)

# Build RNN
model = Sequential([
    SimpleRNN(50, input_shape=(X_train_rnn.shape[1], 1)),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train
model.fit(X_train_rnn, y_train_fraud, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate
y_pred = (model.predict(X_test_rnn) > 0.5).astype(int)
print("RNN:")
print(classification_report(y_test_fraud, y_pred))
print("ROC-AUC:", roc_auc_score(y_test_fraud, y_pred))

model.save("../models/rnn_model.h5")  # Saves in HDF5 format

  super().__init__(**kwargs)


Epoch 1/10
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step - accuracy: 0.9070 - loss: nan - val_accuracy: 0.9075 - val_loss: nan
Epoch 2/10
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - accuracy: 0.9079 - loss: nan - val_accuracy: 0.9075 - val_loss: nan
Epoch 3/10
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step - accuracy: 0.9068 - loss: nan - val_accuracy: 0.9075 - val_loss: nan
Epoch 4/10
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - accuracy: 0.9066 - loss: nan - val_accuracy: 0.9075 - val_loss: nan
Epoch 5/10
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 5ms/step - accuracy: 0.9072 - loss: nan - val_accuracy: 0.9075 - val_loss: nan
Epoch 6/10
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 6ms/step - accuracy: 0.9070 - loss: nan - val_accuracy: 0.9075 - val_loss: nan
Epoch 7/10
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


RNN:
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     27393
           1       0.00      0.00      0.00      2830

    accuracy                           0.91     30223
   macro avg       0.45      0.50      0.48     30223
weighted avg       0.82      0.91      0.86     30223

ROC-AUC: 0.5


In [35]:
from tensorflow.keras.layers import LSTM

# Convert boolean columns to integers
X_train_fraud = X_train_fraud.astype({col: 'int' for col in X_train_fraud.select_dtypes('bool').columns})
X_test_fraud = X_test_fraud.astype({col: 'int' for col in X_test_fraud.select_dtypes('bool').columns})

# Reshape data for LSTM
X_train_lstm = X_train_fraud.values.reshape(X_train_fraud.shape[0], X_train_fraud.shape[1], 1)
X_test_lstm = X_test_fraud.values.reshape(X_test_fraud.shape[0], X_test_fraud.shape[1], 1)

# Build LSTM
model = Sequential([
    LSTM(50, input_shape=(X_train_lstm.shape[1], 1)),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train
model.fit(X_train_lstm, y_train_fraud, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate
y_pred = (model.predict(X_test_lstm) > 0.5).astype(int)
print("LSTM:")
print(classification_report(y_test_fraud, y_pred))
print("ROC-AUC:", roc_auc_score(y_test_fraud, y_pred))

model.save("../models/lstm_model.h5")  # Saves in HDF5 format

Epoch 1/10


  super().__init__(**kwargs)


[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 6ms/step - accuracy: 0.9064 - loss: nan - val_accuracy: 0.9075 - val_loss: nan
Epoch 2/10
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 8ms/step - accuracy: 0.9051 - loss: nan - val_accuracy: 0.9075 - val_loss: nan
Epoch 3/10
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 6ms/step - accuracy: 0.9072 - loss: nan - val_accuracy: 0.9075 - val_loss: nan
Epoch 4/10
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 4ms/step - accuracy: 0.9055 - loss: nan - val_accuracy: 0.9075 - val_loss: nan
Epoch 5/10
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 4ms/step - accuracy: 0.9064 - loss: nan - val_accuracy: 0.9075 - val_loss: nan
Epoch 6/10
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 4ms/step - accuracy: 0.9062 - loss: nan - val_accuracy: 0.9075 - val_loss: nan
Epoch 7/10
[1m3023/3023[0m [32m━━━━━━━━━━━━━━━━━━━━[0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


LSTM:
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     27393
           1       0.00      0.00      0.00      2830

    accuracy                           0.91     30223
   macro avg       0.45      0.50      0.48     30223
weighted avg       0.82      0.91      0.86     30223

ROC-AUC: 0.5


In [None]:
import mlflow
import mlflow.sklearn
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X_train_fraud_imputed = imputer.fit_transform(X_train_fraud)
X_test_fraud_imputed = imputer.transform(X_test_fraud)

# Track Logistic Regression
with mlflow.start_run():
    mlflow.log_param("model", "LogisticRegression")
    model = LogisticRegression(class_weight='balanced', random_state=42)
    model.fit(X_train_fraud_imputed, y_train_fraud)
    y_pred = model.predict(X_test_fraud_imputed)
    f1 = f1_score(y_test_fraud, y_pred)
    roc_auc = roc_auc_score(y_test_fraud, y_pred)
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("roc_auc", roc_auc)
    mlflow.sklearn.log_model(model, "logistic_regression_model")

joblib.dump(model, "../models/l_model.pkl")




Create a Meta Model

In [14]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.impute import SimpleImputer
import numpy as np

# Ensure X_train_fraud is not empty
if X_train_fraud.empty:
	# Load data again or handle the empty dataframe case
	fraud_data = load_data('../data/cleaned_fraud_data.csv')
	X_fraud = fraud_data.drop(columns=['device_id', 'class', 'country'])
	y_fraud = fraud_data['class']
	X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(
		X_fraud, y_fraud, test_size=0.2, random_state=42, stratify=y_fraud
	)
	X_train_fraud['purchase_time'] = pd.to_datetime(X_train_fraud['purchase_time'])
	X_train_fraud['purchase_time'] = X_train_fraud['purchase_time'].astype(int) / 10**9  # Convert to seconds
	X_train_fraud['signup_time'] = pd.to_datetime(X_train_fraud['signup_time'])
	X_train_fraud['signup_time'] = X_train_fraud['signup_time'].astype(int) / 10**9  # Convert to seconds

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X_train_fraud_imputed = imputer.fit_transform(X_train_fraud)

# Base models
xgb_model = XGBClassifier(random_state=42)
rf_model = RandomForestClassifier(random_state=42)
lr_model = LogisticRegression(random_state=42)

# Get predictions using cross-validation
xgb_preds = cross_val_predict(xgb_model, X_train_fraud_imputed, y_train_fraud, method='predict_proba', cv=5)
rf_preds = cross_val_predict(rf_model, X_train_fraud_imputed, y_train_fraud, method='predict_proba', cv=5)
lr_preds = cross_val_predict(lr_model, X_train_fraud_imputed, y_train_fraud, method='predict_proba', cv=5)

# Stack predictions
stacked_features = np.column_stack((xgb_preds[:, 1], rf_preds[:, 1], lr_preds[:, 1]))



In [15]:
# Meta-model
meta_model = LogisticRegression(random_state=42)

# Train meta-model on stacked features
meta_model.fit(stacked_features, y_train_fraud)

In [19]:
# Convert datetime columns to numeric values
X_train_fraud['signup_time'] = pd.to_datetime(X_train_fraud['signup_time']).astype(int) / 10**9  # Convert to seconds
X_train_fraud['purchase_time'] = pd.to_datetime(X_train_fraud['purchase_time']).astype(int) / 10**9  # Convert to seconds

X_test_fraud['signup_time'] = pd.to_datetime(X_test_fraud['signup_time']).astype(int) / 10**9  # Convert to seconds
X_test_fraud['purchase_time'] = pd.to_datetime(X_test_fraud['purchase_time']).astype(int) / 10**9  # Convert to seconds

# Handle missing values
X_train_fraud_imputed = imputer.fit_transform(X_train_fraud)
X_test_fraud_imputed = imputer.transform(X_test_fraud)

# Train base models on full training data
xgb_model.fit(X_train_fraud_imputed, y_train_fraud)
rf_model.fit(X_train_fraud_imputed, y_train_fraud)
lr_model.fit(X_train_fraud_imputed, y_train_fraud)

# Get test predictions
xgb_test_preds = xgb_model.predict_proba(X_test_fraud_imputed)[:, 1]
rf_test_preds = rf_model.predict_proba(X_test_fraud_imputed)[:, 1]
lr_test_preds = lr_model.predict_proba(X_test_fraud_imputed)[:, 1]

# Stack test predictions
stacked_test_features = np.column_stack((xgb_test_preds, rf_test_preds, lr_test_preds))

# Final predictions
final_preds = meta_model.predict(stacked_test_features)



In [20]:
from sklearn.metrics import f1_score, roc_auc_score

print("Stacked Model F1-Score:", f1_score(y_test_fraud, final_preds))
print("Stacked Model ROC-AUC:", roc_auc_score(y_test_fraud, final_preds))

Stacked Model F1-Score: 0.0014005602240896359
Stacked Model ROC-AUC: 0.4999152887708668


In [22]:
# Handle missing values
X_train_fraud_imputed = imputer.fit_transform(X_train_fraud)

# Train base models
xgb_model.fit(X_train_fraud_imputed, y_train_fraud)
rf_model.fit(X_train_fraud_imputed, y_train_fraud)
lr_model.fit(X_train_fraud_imputed, y_train_fraud)



In [23]:
# Example weights (adjust based on model performance)
weights = {
    'xgb': 0.5,
    'rf': 0.3,
    'lr': 0.2
}

In [26]:
# Ensure test data has the same columns as training data
X_test_fraud = X_test_fraud[X_train_fraud.columns]

# Handle missing values
X_test_fraud_imputed = imputer.transform(X_test_fraud)

# Get predictions
xgb_preds = xgb_model.predict_proba(X_test_fraud_imputed)[:, 1]
rf_preds = rf_model.predict_proba(X_test_fraud_imputed)[:, 1]
lr_preds = lr_model.predict_proba(X_test_fraud_imputed)[:, 1]

# Weighted average
weighted_preds = (
    weights['xgb'] * xgb_preds +
    weights['rf'] * rf_preds +
    weights['lr'] * lr_preds
)

# Convert probabilities to binary predictions
final_preds = (weighted_preds >= 0.5).astype(int)



In [27]:
print("Weighted Average F1-Score:", f1_score(y_test_fraud, final_preds))
print("Weighted Average ROC-AUC:", roc_auc_score(y_test_fraud, final_preds))

Weighted Average F1-Score: 0.0
Weighted Average ROC-AUC: 0.49994524148505093


In [28]:
from scipy.optimize import minimize

# Define objective function
def objective(weights):
    weighted_preds = (
        weights[0] * xgb_preds +
        weights[1] * rf_preds +
        weights[2] * lr_preds
    )
    return -roc_auc_score(y_test_fraud, weighted_preds)  # Minimize negative ROC-AUC

# Initial weights
initial_weights = [0.5, 0.3, 0.2]

# Optimize weights
result = minimize(objective, initial_weights, bounds=[(0, 1), (0, 1), (0, 1)])
optimized_weights = result.x

print("Optimized Weights:", optimized_weights)

Optimized Weights: [0.5 0.3 0.2]


In [30]:
joblib.dump(meta_model, "../models/meta_model.pkl")

['../models/meta_model.pkl']