## Setup Environment <br>

Python 3.8 <br>
Numpy: 1.23.0 <br>
Pandas: 1.5.3 <br>
matplotlib: 3.7.1 <br>
seaborn: 0.10.1 <br>
Scikit-Learn: 1.1.3 <br>
MLFlow: 1.30.0 <br>

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib #
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn #
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, plot_roc_curve, confusion_matrix
from sklearn.model_selection import KFold 
import mlflow
import mlflow.sklearn


In [None]:
# Report the installed version of each library listed in the setup section.
for label, module in (
    ("Numpy", np),
    ("Pandas", pd),
    ("matplotlib", matplotlib),
    ("seaborn", sns),
    ("Scikit-Learn", sklearn),
    ("MLFlow", mlflow),
):
    print(f"{label}: {module.__version__}")

### Load the Data

Download the dataset from this link: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud

In [3]:
data_path = "creditcard.csv"
# The "Time" column is removed right after loading because it was found
# not to be helpful.
df = pd.read_csv(data_path).drop(columns="Time")

In [None]:
# Display the first rows of the loaded DataFrame as a quick sanity check.
df.head()

## Data Preparation

#### Split data to normal and anomaly

In [5]:
# Normal transactions (Class == 0): keep a reproducible random half and
# renumber the rows from zero.
normal = (
    df.loc[df["Class"] == 0]
    .sample(frac=0.5, random_state=2020)
    .reset_index(drop=True)
)
# Anomalous transactions (Class == 1): keep every row.
anomaly = df.loc[df["Class"] == 1]

In [None]:
# Show how many rows and columns each class subset contains.
print("Normal: {}".format(normal.shape))
print("Anomaly: {}".format(anomaly.shape))

#### Split into train/validation/test sets

In [7]:
# Each class is split on its own with the same seed: 20% is held out for
# testing first, then 25% of the remainder is held out for validation.

# Normal transactions.
normal_train, normal_test = train_test_split(
    normal, test_size=0.2, random_state=2020
)
normal_train, normal_validate = train_test_split(
    normal_train, test_size=0.25, random_state=2020
)

# Anomalous transactions.
anomaly_train, anomaly_test = train_test_split(
    anomaly, test_size=0.2, random_state=2020
)
anomaly_train, anomaly_validate = train_test_split(
    anomaly_train, test_size=0.25, random_state=2020
)


#### Concatenate the sets to create X-Y sets

In [8]:
# Stack the normal and anomalous rows of each split into one frame, then
# move the "Class" column out of the features and into a label array.

x_train = pd.concat([normal_train, anomaly_train])
y_train = np.array(x_train.pop("Class"))

x_test = pd.concat([normal_test, anomaly_test])
y_test = np.array(x_test.pop("Class"))

x_validate = pd.concat([normal_validate, anomaly_validate])
y_validate = np.array(x_validate.pop("Class"))


In [None]:
# Print the feature/label shapes of every split.
print(f"Training sets:\nx_train: {x_train.shape} \ny_train:{y_train.shape}")
print(f"\nTesting sets:\nx_test: {x_test.shape} \ny_test:{y_test.shape}")
print(f"\nValidation sets:\nx_validate: {x_validate.shape} \ny_validate: {y_validate.shape}")

#### Scale the data

In [10]:
# Fit the scaler on the training features only. Fitting it on all of the data
# (as was done before) lets the mean/variance of the validation and test rows
# leak into preprocessing, which makes held-out scores optimistic.
scaler = StandardScaler()
scaler.fit(x_train)

# Apply the same fitted transformation to every split. After this step the
# feature sets are NumPy arrays rather than DataFrames.
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
x_validate = scaler.transform(x_validate)

## Train and Evaluate

Train function

In [11]:
def train(sk_model, x_train, y_train):
    """Fit ``sk_model`` on the training data and report its training accuracy.

    The accuracy returned by the model's ``score`` method is logged to MLflow
    as the metric "train_acc" and printed as a percentage. The model is
    fitted in place; nothing is returned.

    Args:
        sk_model: Estimator exposing ``fit`` and ``score``.
        x_train: Training feature matrix.
        y_train: Training labels.
    """
    fitted_model = sk_model.fit(x_train, y_train)
    accuracy = fitted_model.score(x_train, y_train)
    mlflow.log_metric("train_acc", accuracy)
    print("Train Accuracy: {:.3%}".format(accuracy))

Evaluate function

In [12]:
def evaluate(sk_model, x_test, y_test):
    """Score a model on held-out data and record the results in MLflow.

    Logs the accuracy ("eval_acc") and the ROC AUC ("auc_score") as MLflow
    metrics, prints both, then saves a ROC curve and a confusion-matrix
    heatmap as PNG files in the working directory and logs them as MLflow
    artifacts.

    Note: the AUC is computed from the hard class predictions returned by
    ``predict``, not from probabilities or decision scores.

    Args:
        sk_model: Estimator exposing ``score`` and ``predict``.
        x_test: Feature matrix to evaluate on.
        y_test: True labels for ``x_test``.
    """
    predictions = sk_model.predict(x_test)
    accuracy = sk_model.score(x_test, y_test)
    auc = roc_auc_score(y_test, predictions)

    # Record both evaluation metrics with MLflow.
    mlflow.log_metric("eval_acc", accuracy)
    mlflow.log_metric("auc_score", auc)
    print(f"Auc Score: {auc:.3%}")
    print(f"Eval Accuracy: {accuracy:.3%}")

    # ROC curve: save to disk before showing, then clear the current figure
    # so the heatmap below starts from an empty canvas.
    plot_roc_curve(sk_model, x_test, y_test, name='Scikit-learn ROC Curve')
    plt.savefig("sklearn_roc_plot.png")
    plt.show()
    plt.clf()

    # Confusion-matrix heatmap with both axes inverted.
    heatmap_ax = sns.heatmap(
        confusion_matrix(y_test, predictions), annot=True, fmt='g'
    )
    heatmap_ax.invert_xaxis()
    heatmap_ax.invert_yaxis()
    heatmap_ax.set_ylabel('Actual')
    heatmap_ax.set_xlabel('Predicted')
    heatmap_ax.set_title("Confusion Matrix")
    plt.savefig("sklearn_conf_matrix.png")

    # Attach the plots generated by matplotlib and by seaborn to the run.
    for artifact_path in ("sklearn_roc_plot.png", "sklearn_conf_matrix.png"):
        mlflow.log_artifact(artifact_path)

## Log and View MLFlow Runs

In [13]:
# Baseline classifier: logistic regression with the newton-cg solver, capped
# at 400 iterations, with no fixed random seed and no class weighting.
sk_model = LogisticRegression(random_state=None, max_iter=400, solver='newton-cg')

In [None]:
# Point MLflow at the tracking server BEFORE selecting the experiment:
# set_experiment() looks the experiment up (or creates it) in whatever store
# is configured at the time of the call. The URI needs an explicit scheme;
# a tracking server must be reachable at this address (for example one
# started with `mlflow server --port 5000`).
mlflow.set_tracking_uri("http://localhost:5000")
# puts the run under experiment name
mlflow.set_experiment("Credit_Card_Fraud_experiment")
# chunk all code under the context of one MLFlow run; the context manager
# ends the run on exit, so no explicit end_run() call is needed.
with mlflow.start_run() as run:
    train(sk_model, x_train, y_train)
    evaluate(sk_model, x_test, y_test)
    # save the model
    mlflow.sklearn.log_model(sk_model, "log_reg_model")
    # The printed ID is what a later "runs:/<run_id>/log_reg_model" URI needs.
    print("Model run: ", run.info.run_id)

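In a terminal, run `mlflow ui -p 1234` to start the MLflow UI,
then open http://127.0.0.1:1234 in a browser.

The UI lists only the runs stored in the backend it is started against, so make sure it matches the tracking URI used in the logging cell.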

## Loading a Logged Model

In [16]:
# Load the model that was logged under the artifact path "log_reg_model".
# NOTE: the run ID in this URI is hard-coded; replace it with the ID printed
# by the logging cell ("Model run: ...") for your own run.
loaded_model = mlflow.sklearn.load_model("runs:/d96759e62aaf44dba44dc50575de258a/log_reg_model")

In [None]:
# Score the reloaded model on the test split to confirm it loaded correctly.
loaded_model.score(x_test, y_test)

## MLFlow Parameter Tuning

### Broad Search

In [18]:
# Candidate class weights for the anomaly class (label 1) in the broad search.
anomaly_weights = [1, 5, 10, 15]
num_folds = 5
# Shuffled k-fold splitter with a fixed seed so the folds are reproducible.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=2020)

In [None]:
# Broad search: for every candidate anomaly weight, run k-fold cross-validation
# on the validation set and log each fold as its own MLflow run.
mlflow.set_experiment("sklearn_creditcard_broad_search")
logs = []
for weight in anomaly_weights:
    accuracies = []
    auc_scores = []
    # The index arrays are named train_idx/test_idx so that the loop does not
    # overwrite the train() function defined earlier in the notebook.
    for fold, (train_idx, test_idx) in enumerate(kfold.split(x_validate, y_validate), start=1):
        x_fold_train, y_fold_train = x_validate[train_idx], y_validate[train_idx]
        x_fold_test, y_fold_test = x_validate[test_idx], y_validate[test_idx]

        # The context manager ends the run on exit; no explicit end_run() needed.
        with mlflow.start_run():
            mlflow.log_param("anomaly_weight", weight)
            class_weights = {0: 1, 1: weight}
            sk_model = LogisticRegression(
                random_state=None,
                max_iter=400,
                solver='newton-cg',
                class_weight=class_weights,
            ).fit(x_fold_train, y_fold_train)

            print('-' * 40)
            print(f"fold {fold}\nAnomaly Weight: {weight}")

            train_acc = sk_model.score(x_fold_train, y_fold_train)
            mlflow.log_metric("train_acc", train_acc)
            eval_acc = sk_model.score(x_fold_test, y_fold_test)
            preds = sk_model.predict(x_fold_test)
            mlflow.log_metric("eval_acc", eval_acc)

            try:
                auc_score = roc_auc_score(y_fold_test, preds)
            except ValueError:
                # roc_auc_score raises ValueError when the fold's labels
                # contain a single class; -1 marks the AUC as undefined.
                auc_score = -1
            mlflow.log_metric("auc_score", auc_score)
            print(f"AUC: {auc_score}\neval_acc: {eval_acc}")

            accuracies.append(eval_acc)
            auc_scores.append(auc_score)
            # Keep the model and its fold data for later inspection.
            logs.append([sk_model, x_fold_test, y_fold_test, preds])
            mlflow.sklearn.log_model(sk_model, f"anom_weight_{weight}_fold_{fold}")

    # Per-weight summary across all folds.
    print("\nAverages: ")
    print("Accuracy: ", np.mean(accuracies))
    print("AUC: ", np.mean(auc_scores))
    print("Best: ")
    print("Accuracy: ", np.max(accuracies))
    print("AUC: ", np.max(auc_scores))

### Guided Search

In [20]:
# Candidate class weights for the anomaly class (label 1) in the guided search.
anomaly_weights = [10, 50, 150, 200]
num_folds = 5
# Shuffled k-fold splitter with a fixed seed so the folds are reproducible.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=2020)

Create a new MLflow experiment for the guided search so that its runs are kept separate from the broad search.

In [None]:
# Guided search: for every candidate anomaly weight, run k-fold cross-validation
# on the validation set and log each fold as its own MLflow run.
mlflow.set_experiment("sklearn_creditcard_Guided_search")
logs = []
for weight in anomaly_weights:
    accuracies = []
    auc_scores = []
    # The index arrays are named train_idx/test_idx so that the loop does not
    # overwrite the train() function defined earlier in the notebook.
    for fold, (train_idx, test_idx) in enumerate(kfold.split(x_validate, y_validate), start=1):
        x_fold_train, y_fold_train = x_validate[train_idx], y_validate[train_idx]
        x_fold_test, y_fold_test = x_validate[test_idx], y_validate[test_idx]

        # The context manager ends the run on exit; no explicit end_run() needed.
        with mlflow.start_run():
            mlflow.log_param("anomaly_weight", weight)
            class_weights = {0: 1, 1: weight}
            sk_model = LogisticRegression(
                random_state=None,
                max_iter=400,
                solver='newton-cg',
                class_weight=class_weights,
            ).fit(x_fold_train, y_fold_train)

            print('-' * 40)
            print(f"fold {fold}\nAnomaly Weight: {weight}")

            train_acc = sk_model.score(x_fold_train, y_fold_train)
            mlflow.log_metric("train_acc", train_acc)
            eval_acc = sk_model.score(x_fold_test, y_fold_test)
            preds = sk_model.predict(x_fold_test)
            mlflow.log_metric("eval_acc", eval_acc)

            try:
                auc_score = roc_auc_score(y_fold_test, preds)
            except ValueError:
                # roc_auc_score raises ValueError when the fold's labels
                # contain a single class; -1 marks the AUC as undefined.
                auc_score = -1
            mlflow.log_metric("auc_score", auc_score)
            print(f"AUC: {auc_score}\neval_acc: {eval_acc}")

            accuracies.append(eval_acc)
            auc_scores.append(auc_score)
            # Keep the model and its fold data for later inspection.
            logs.append([sk_model, x_fold_test, y_fold_test, preds])
            mlflow.sklearn.log_model(sk_model, f"anom_weight_{weight}_fold_{fold}")

    # Per-weight summary across all folds.
    print("\nAverages: ")
    print("Accuracy: ", np.mean(accuracies))
    print("AUC: ", np.mean(auc_scores))
    print("Best: ")
    print("Accuracy: ", np.max(accuracies))
    print("AUC: ", np.max(auc_scores))