# KAIM Week 8 and 9 Challenges

## **Task 2: MOdel Building**

## Import Necessary Libraries

In [44]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten, SimpleRNN, LSTM
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import warnings

warnings.filterwarnings('ignore')

# Set plot style for better visuals
sns.set(style="whitegrid")

## Load Datasets

In [45]:
# Load the datasets
fraud_data = pd.read_csv('../data/cleaned_data_1.csv')
credit_data = pd.read_csv('../data/cleaned_data_2.csv')

In [46]:
fraud_data.shape, credit_data.shape

((138846, 15), (283726, 31))

In [47]:
fraud_data.columns, credit_data.columns

(Index(['user_id', 'purchase_value', 'source', 'browser', 'sex', 'age',
        'ip_address', 'Class', 'country', 'lower_bound_ip_addres',
        'upper_bound_ip_adress', 'signup_purchase_diff', 'transaction_count',
        'hour_of_day', 'day_of_week'],
       dtype='object'),
 Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
        'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
        'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
        'Class'],
       dtype='object'))

## Model Building

In [48]:
# Prepare data for the model (e-commerce)
X1 = fraud_data.drop(columns=['Class'])
X2 = credit_data.drop(columns=['Class'])

y1 = fraud_data['Class']
y2 = credit_data['Class']

print(np.unique(y1, return_counts=True))
print(np.unique(y2, return_counts=True))

(array([0, 1]), array([125849,  12997]))
(array([0, 1]), array([283253,    473]))


In [49]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.2, random_state=42)

X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, random_state=42)


### Logistic Regression

In [8]:
log_reg = LogisticRegression(C=1, solver='liblinear')
log_reg.fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)
print(classification_report(y_test, y_pred_log_reg))

              precision    recall  f1-score   support

           0       0.91      1.00      0.95     25193
           1       0.00      0.00      0.00      2577

    accuracy                           0.91     27770
   macro avg       0.45      0.50      0.48     27770
weighted avg       0.82      0.91      0.86     27770



### Random Forest

In [9]:
rf_clf = RandomForestClassifier(n_estimators=30, max_depth=3)
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_test)
print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98     25193
           1       1.00      0.55      0.71      2577

    accuracy                           0.96     27770
   macro avg       0.98      0.77      0.84     27770
weighted avg       0.96      0.96      0.95     27770



### XGBoost 

In [10]:
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_clf.fit(X_train, y_train)
y_pred_xgb = xgb_clf.predict(X_test)
print(classification_report(y_test, y_pred_xgb))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98     25193
           1       0.94      0.55      0.69      2577

    accuracy                           0.96     27770
   macro avg       0.95      0.77      0.84     27770
weighted avg       0.95      0.96      0.95     27770



#### Class Balancing using SMOTE

In [32]:
from imblearn.combine import SMOTETomek

smt = SMOTETomek(random_state=42)
X_train_res1, y_train_res1 = smt.fit_resample(X_train1, y_train1)
X_train_res2, y_train_res2 = smt.fit_resample(X_train2, y_train2)

print(np.unique(y_train_res1, return_counts=True))
print(np.unique(y_train_res2, return_counts=True))

(array([0, 1]), array([88947, 88947]))
(array([0, 1]), array([225994, 225994]))


In [12]:
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_clf.fit(X_train_res, y_train_res)
y_pred_xgb = xgb_clf.predict(X_test)
print(classification_report(y_test, y_pred_xgb))

              precision    recall  f1-score   support

           0       0.96      0.99      0.97     25193
           1       0.81      0.56      0.66      2577

    accuracy                           0.95     27770
   macro avg       0.88      0.77      0.82     27770
weighted avg       0.94      0.95      0.94     27770



## Experiments

In [50]:
datasets = {
    "fraud_data": (X_train1, y_train1, X_test1, y_test1),
    "credit_data": (X_train2, y_train2, X_test2, y_test2),
}

In [51]:
'''
models = [
    (
        "Logistic Regression", 
        LogisticRegression(C=1, solver='liblinear'), 
        (X_train, y_train),
        (X_test, y_test)
    ),
    (
        "Random Forest", 
        RandomForestClassifier(n_estimators=30, max_depth=3), 
        (X_train, y_train),
        (X_test, y_test)
    ),
    (
        "XGBClassifier",
        XGBClassifier(use_label_encoder=False, eval_metric='logloss'), 
        (X_train, y_train),
        (X_test, y_test)
    ),
    (
        "XGBClassifier With SMOTE",
        XGBClassifier(use_label_encoder=False, eval_metric='logloss'), 
        (X_train_res1, y_train_res1),
        (X_test1, y_test1)
    )
]
'''

'\nmodels = [\n    (\n        "Logistic Regression", \n        LogisticRegression(C=1, solver=\'liblinear\'), \n        (X_train, y_train),\n        (X_test, y_test)\n    ),\n    (\n        "Random Forest", \n        RandomForestClassifier(n_estimators=30, max_depth=3), \n        (X_train, y_train),\n        (X_test, y_test)\n    ),\n    (\n        "XGBClassifier",\n        XGBClassifier(use_label_encoder=False, eval_metric=\'logloss\'), \n        (X_train, y_train),\n        (X_test, y_test)\n    ),\n    (\n        "XGBClassifier With SMOTE",\n        XGBClassifier(use_label_encoder=False, eval_metric=\'logloss\'), \n        (X_train_res1, y_train_res1),\n        (X_test1, y_test1)\n    )\n]\n'

In [53]:
# Defining each model
models = [
    # Logistic Regression
    (
        "Logistic Regression", 
        LogisticRegression(C=1, solver='liblinear')
    ),
    
    # Random Forest
    (
        "Random Forest", 
        RandomForestClassifier(n_estimators=30, max_depth=3)
    ),
    
    # XGBoost
    (
        "XGBClassifier", 
        XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    ),

    # Decision Tree
    (
        "Decision Tree", 
        DecisionTreeClassifier(max_depth=5)
    ),
    
    # Multi-Layer Perceptron (MLP)
    (
        "MLP Classifier", 
        MLPClassifier(hidden_layer_sizes=(100,), max_iter=300)
    ),
    
    # Convolutional Neural Network (CNN)
    (
        "CNN", 
        Sequential([
            Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(28, 28, 1)),
            Flatten(),
            Dense(128, activation='relu'),
            Dense(1 , activation='sigmoid')
        ])
    ),
    
    # Recurrent Neural Network (RNN)
    (
        "RNN", 
        Sequential([
            SimpleRNN(50, input_shape=(100, 1), activation='relu'),
            Dense(1, activation='sigmoid')
        ])
    ),
    
    # Long Short-Term Memory (LSTM)
    (
        "LSTM", 
        Sequential([
            LSTM(50, input_shape=(100, 1), activation='relu'),
            Dense(1, activation='sigmoid')
        ])
    )
]


ValueError: object __array__ method not producing an array

In [54]:
models = [
    (
        "Logistic Regression", 
        LogisticRegression(C=1, solver='liblinear')
    ),
    (
        "Random Forest", 
        RandomForestClassifier(n_estimators=30, max_depth=3)
    ),
    (
        "XGBClassifier",
        XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    )
]

In [16]:
# Initialize MLflow
mlflow.set_experiment("Fraud Detection Models - Single Dataset")
mlflow.set_tracking_uri("http://localhost:5000")

reports = []

for model_name, model, train_set, test_set in models:
    X_train = train_set[0]
    y_train = train_set[1]
    X_test = test_set[0]
    y_test = test_set[1]
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    reports.append(report)

for i, element in enumerate(models):
    model_name = element[0]
    model = element[1]
    report = reports[i]
    
    with mlflow.start_run(run_name=model_name):        
        mlflow.log_param("model", model_name)
        mlflow.log_metric('accuracy', report['accuracy'])
        mlflow.log_metric('recall_class_1', report['1']['recall'])
        mlflow.log_metric('recall_class_0', report['0']['recall'])
        mlflow.log_metric('f1_score_macro', report['macro avg']['f1-score'])        
        
        if "XGB" in model_name:
            mlflow.xgboost.log_model(model, "model")
        else:
            mlflow.sklearn.log_model(model, "model")  

2024/10/20 13:43:16 INFO mlflow.tracking._tracking_service.client: 🏃 View run Logistic Regression at: http://localhost:5000/#/experiments/935012191474515353/runs/66416527b5ad44a0be0886ed9e43b658.
2024/10/20 13:43:16 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/935012191474515353.
2024/10/20 13:43:21 INFO mlflow.tracking._tracking_service.client: 🏃 View run Random Forest at: http://localhost:5000/#/experiments/935012191474515353/runs/3c3923af256d4ca4947318c3e70a6fbf.
2024/10/20 13:43:21 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/935012191474515353.
2024/10/20 13:43:25 INFO mlflow.tracking._tracking_service.client: 🏃 View run XGBClassifier at: http://localhost:5000/#/experiments/935012191474515353/runs/019f76c5419a4e24a15a654ad2358b1c.
2024/10/20 13:43:25 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/935012191

In [55]:
# Initialize MLflow
mlflow.set_experiment("Fraud Detection Models - 2 Datasets")
mlflow.set_tracking_uri("http://localhost:5000")

# Iterate through datasets
for dataset_name, (X_train, y_train, X_test, y_test) in datasets.items():
    reports = []

    # Train each model on the current dataset
    for model_name, model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        report = classification_report(y_test, y_pred, output_dict=True)
        reports.append(report)

    # Log each model's performance metrics to MLflow
    for i, (model_name, model) in enumerate(models):
        report = reports[i]

        with mlflow.start_run(run_name=f"{model_name}_{dataset_name}"):
            mlflow.log_param("model", model_name)
            mlflow.log_param("dataset", dataset_name)
            mlflow.log_metric('accuracy', report['accuracy'])
            mlflow.log_metric('recall_class_1', report['1']['recall'])
            mlflow.log_metric('recall_class_0', report['0']['recall'])
            mlflow.log_metric('f1_score_macro', report['macro avg']['f1-score'])

            # Log the model using the appropriate MLflow method
            if "XGBoost" in model_name:
                mlflow.xgboost.log_model(model, "model")
            else:
                mlflow.sklearn.log_model(model, "model")


2024/10/20 20:16:21 INFO mlflow.tracking._tracking_service.client: 🏃 View run Logistic Regression_fraud_data at: http://localhost:5000/#/experiments/478268722598582565/runs/8273f777699647a39535d873391b9d13.
2024/10/20 20:16:21 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/478268722598582565.
2024/10/20 20:16:24 INFO mlflow.tracking._tracking_service.client: 🏃 View run Random Forest_fraud_data at: http://localhost:5000/#/experiments/478268722598582565/runs/65913aaab7ce49948b90f6752b771312.
2024/10/20 20:16:24 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost:5000/#/experiments/478268722598582565.
2024/10/20 20:16:28 INFO mlflow.tracking._tracking_service.client: 🏃 View run XGBClassifier_fraud_data at: http://localhost:5000/#/experiments/478268722598582565/runs/e262b2c32c8442b2a720a086f36a2f63.
2024/10/20 20:16:28 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://local

In [None]:

# Load the datasets
fraud_data = pd.read_csv('../data/cleaned_data_1.csv')
credit_data = pd.read_csv('../data/cleaned_data_2.csv')

# Prepare data for the model (e-commerce)
X1 = fraud_data.drop(columns=['Class'])
X2 = credit_data.drop(columns=['Class'])

y1 = fraud_data['Class']
y2 = credit_data['Class']

print(np.unique(y1, return_counts=True))
print(np.unique(y2, return_counts=True))

# Train-test split
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.2, random_state=42)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, random_state=42)


# Datasets
datasets = {
    "fraud_data": (X_train1, y_train1, X_test1, y_test1),
    "credit_data": (X_train2, y_train2, X_test2, y_test2),
}


# Defining each model
models = [
    # Logistic Regression
    (
        "Logistic Regression", 
        LogisticRegression(C=1, solver='liblinear')
    ),
    
    # Random Forest
    (
        "Random Forest", 
        RandomForestClassifier(n_estimators=30, max_depth=3)
    ),
    
    # XGBoost
    (
        "XGBClassifier", 
        XGBClassifier(use_label_encoder=False, eval_metric='logloss')
    ),

    # Decision Tree
    (
        "Decision Tree", 
        DecisionTreeClassifier(max_depth=5)
    ),
    
    # Multi-Layer Perceptron (MLP)
    (
        "MLP Classifier", 
        MLPClassifier(hidden_layer_sizes=(100,), max_iter=300)
    ),
    
    # Convolutional Neural Network (CNN)
    (
        "CNN", 
        Sequential([
            Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(28, 28, 1)),
            Flatten(),
            Dense(128, activation='relu'),
            Dense(1 , activation='sigmoid')
        ])
    ),
    
    # Recurrent Neural Network (RNN)
    (
        "RNN", 
        Sequential([
            SimpleRNN(50, input_shape=(100, 1), activation='relu'),
            Dense(1, activation='sigmoid')
        ])
    ),
    
    # Long Short-Term Memory (LSTM)
    (
        "LSTM", 
        Sequential([
            LSTM(50, input_shape=(100, 1), activation='relu'),
            Dense(1, activation='sigmoid')
        ])
    )
]



# Initialize MLflow
mlflow.set_experiment("Fraud Detection Models - 2 Datasets")
mlflow.set_tracking_uri("http://localhost:5000")

# Iterate through datasets
for dataset_name, (X_train, y_train, X_test, y_test) in datasets.items():
    reports = []

    # Train each model on the current dataset
    for model_name, model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        report = classification_report(y_test, y_pred, output_dict=True)
        reports.append(report)

    # Log each model's performance metrics to MLflow
    for i, (model_name, model) in enumerate(models):
        report = reports[i]

        with mlflow.start_run(run_name=f"{model_name}_{dataset_name}"):
            mlflow.log_param("model", model_name)
            mlflow.log_param("dataset", dataset_name)
            mlflow.log_metric('accuracy', report['accuracy'])
            mlflow.log_metric('recall_class_1', report['1']['recall'])
            mlflow.log_metric('recall_class_0', report['0']['recall'])
            mlflow.log_metric('f1_score_macro', report['macro avg']['f1-score'])

            # Log the model using the appropriate MLflow method
            if "XGBoost" in model_name:
                mlflow.xgboost.log_model(model, "model")
            else:
                mlflow.sklearn.log_model(model, "model")
