In [1]:
import numpy as np 
import pandas as pd
import tempfile
import os
import re
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay,classification_report,make_scorer, f1_score, accuracy_score
from sklearn.utils import resample
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn import model_selection
from mldl_classes import ChinIsolationForestCls
from joblib import dump, load
import mlflow.sklearn
import mlflow
from mlflow.models import infer_signature
from mlflow.tracking import MlflowClient

## Download the CSV data
https://www.kaggle.com/datasets/cgrodrigues/credit-card-transactions-synthetic-data-generation/data?select=transactions_df.csv

## Store file at location
- Save the downloaded file as `transactions_df.csv` in the `data` folder; the notebook reads it from `../data/transactions_df.csv`.



In [2]:
# Load the raw transactions; `post_ts` is parsed to datetime while reading so
# it can be compared against timestamps later in the notebook.
transactions_df = pd.read_csv(
    "../data/transactions_df.csv",
    parse_dates=["post_ts"],
)
# The terminal/customer profile tables are deliberately not loaded or joined
# here: only the transactions frame is used in this notebook.


In [3]:
# Preview the loaded frame (the rendered output is truncated to head/tail rows).
transactions_df

Unnamed: 0,transaction_id,post_ts,customer_id,bin,terminal_id,amt,entry_mode,fraud,fraud_scenario
0,OyWUo6ruReKft-P_QtjJbQ,2023-02-01 00:00:30,C00005143,424208,T001014,38.97,Contactless,0,0
1,rrgYMZWnRK6kKtWqlGN6HA,2023-02-01 00:00:54,C00002570,364329,T001023,84.07,Contactless,0,0
2,H4G-WgpGQluYWIT17jdN8Q,2023-02-01 00:01:13,C00005507,455573,T001024,113.18,Contactless,0,0
3,_4WehzsiRCK2WA8LTBkvsA,2023-02-01 00:01:57,C00003688,552755,T001064,28.96,Chip,0,0
4,81or3lX-Q9-2EEOfOgLCEQ,2023-02-01 00:01:57,C00003353,465808,T001097,65.45,Chip,0,0
...,...,...,...,...,...,...,...,...,...
1785303,BHKu4l59Ssim7_zoEqldyQ,2023-07-30 23:59:04,C00001001,557888,T001040,79.79,Contactless,0,0
1785304,lVGniiCzSFWxMCw4_kbOVg,2023-07-30 23:59:12,C00002473,375650,T001044,19.60,Swipe,1,2
1785305,Zqtf1KexRa2SJb74EPvCvg,2023-07-30 23:59:21,C00004828,364321,T001021,23.67,Contactless,1,2
1785306,ScPMd8vhSDCtZ7bBE4rYAw,2023-07-30 23:59:33,C00004280,375562,T001049,6.26,Chip,0,0


## The ML model is built following the reference project linked below
- https://github.com/cgrodrigues/credit_card_synthetic_data?tab=readme-ov-file

### Generate Extra Features based on the transactions dataframe
- use of customer profile is avoided.
- use of terminal profile is avoided.

In [4]:
# Hand the project class its own copy of the transactions rather than
# `transactions_df` itself; the frame is supplied at construction time.
cc_df = transactions_df.copy()
isolation_forest_cls = ChinIsolationForestCls(cc_df)

In [5]:
# perform business pipeline
%time
isolation_forest_cls.perform_pipeline()

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 10 µs


In [6]:
%time
isolation_forest_cls.cc_trans_df.columns

CPU times: user 4 µs, sys: 1 µs, total: 5 µs
Wall time: 32.9 µs


Index(['transaction_id', 'post_ts', 'bin', 'amt', 'entry_mode', 'fraud',
       'fraud_scenario', 'during_weekend', 'during_night',
       'customer_id_nb_tx_1day_window', 'customer_id_avg_amount_1day_window',
       'customer_id_nb_tx_7day_window', 'customer_id_avg_amount_7day_window',
       'customer_id_nb_tx_30day_window', 'customer_id_avg_amount_30day_window',
       'terminal_id_nb_tx_1day_window', 'terminal_id_risk_1day_window',
       'terminal_id_nb_tx_7day_window', 'terminal_id_risk_7day_window',
       'terminal_id_nb_tx_30day_window', 'terminal_id_risk_30day_window'],
      dtype='object')

In [7]:
# Obtain the train/test split from the project pipeline. It also returns
# `ml_metadata`, which later cells log to MLflow as run parameters.
# NOTE(review): the split strategy and feature selection live in mldl_classes
# and are not visible here. The return order is (X_train, y_train, X_test, y_test).
X_train, y_train, X_test, y_test, ml_metadata = isolation_forest_cls.ml_train_pipeline()

In [8]:
# Hyperparameters for the Random Forest baseline, kept in a dict so the same
# values can be reused when the run is logged to MLflow.
params = {'n_estimators': 10, 'random_state': 42, 'max_depth': 30}

# Build the classifier directly from the dict (keys match the keyword names).
clfRandomForest = RandomForestClassifier(**params)

# Fit on the training split; as the last expression, the fitted estimator is displayed.
clfRandomForest.fit(X_train, y_train)



In [9]:
# Score the Random Forest on the held-out split.
clfRandomForest_predictions = clfRandomForest.predict(X_test)

# Accuracy, F1 and the confusion matrix, all against the true test labels.
accuracy = accuracy_score(y_test, clfRandomForest_predictions)
f_score = f1_score(y_test, clfRandomForest_predictions)
conf_matrix = confusion_matrix(y_test, clfRandomForest_predictions)

# Report the metrics (print's default single-space separator reproduces the layout).
print('Accuracy:', accuracy)
print('confusion matrix:')
print('', conf_matrix)
print('f1 score =', f_score)

Accuracy: 0.9720910598871251
confusion matrix:
 [[1171216    3148]
 [  30637    5543]]
f1 score = 0.2470638051302623


In [10]:
# Point MLflow at the local tracking server used for logging runs.
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

experiment_name = 'Chin_ML_Models'

# `set_experiment` activates the experiment with this name and creates it first
# when it does not exist yet, so a separate search/create step is unnecessary.
# As the last expression, the returned Experiment is displayed.
mlflow.set_experiment(experiment_name)


<Experiment: artifact_location='/Users/chinchai/my_workspace/unsw/COMP9900_Project/capstone-project-9900f16agptea/mlflow_tracking_server/2', creation_time=1713032561178, experiment_id='2', last_update_time=1713032561178, lifecycle_stage='active', name='Chin_ML_Models', tags={}>

In [11]:
# Derive a random, unique run name from a temporary file's basename. The
# context manager removes the file again as soon as its name has been read.
with tempfile.NamedTemporaryFile() as tmp_file:
    run_name = os.path.basename(tmp_file.name)

# One held-out row serves as both the input example and the signature input,
# and the signature output is what the model returns for that same row.
rf_example_input = X_test.iloc[[0]]
rf_example_output = clfRandomForest.predict(rf_example_input)

# Start an MLflow run
with mlflow.start_run(run_name=run_name):
    # Metadata returned by the training pipeline.
    mlflow.log_params(ml_metadata)

    # Hyperparameters read from the fitted estimator itself, so they always
    # describe the model being logged. The prefix avoids key clashes with
    # `ml_metadata` (MLflow rejects re-logging a key with a different value).
    mlflow.log_params(
        {f"model_{name}": value for name, value in clfRandomForest.get_params().items()}
    )

    # Evaluation metrics computed in the Random Forest evaluation cell.
    # F1 is logged next to accuracy because accuracy alone hides poor recall
    # on the minority (fraud) class.
    mlflow.log_metrics({"accuracy": accuracy, "f1_score": f_score})

    # The signature describes inference inputs/outputs only. Training
    # hyperparameters are not inference-time params, so they are not passed here.
    signature = infer_signature(rf_example_input, rf_example_output)

    # Log and register the model.
    model_info = mlflow.sklearn.log_model(
        sk_model=clfRandomForest,
        artifact_path="artifact_location",
        input_example=rf_example_input,
        signature=signature,
        registered_model_name="Random Forest",
    )


Successfully registered model 'Random Forest'.
2024/04/14 04:22:46 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Random Forest, version 1
Created version '1' of model 'Random Forest'.


In [12]:
# Hyperparameters for the Isolation Forest. Note that this rebinds `params`,
# replacing the Random Forest settings defined in an earlier cell.
params = {'max_samples': 50000, 'random_state': 42}

# Build the detector directly from the dict (keys match the keyword names).
clf_IsolationForest = IsolationForest(**params)

# Fit on the training features; as the last expression, the estimator is displayed.
# NOTE(review): scikit-learn documents `y` as ignored by IsolationForest.fit, so
# `y_train` presumably has no effect here — confirm.
clf_IsolationForest.fit(X_train, y_train)



In [13]:
# Predict on the held-out split and turn the result into a boolean "flagged"
# array: True wherever the forest returned -1 (its outlier label, per the
# scikit-learn docs), so it can be scored against the 0/1 labels in y_test.
clf_IsolationForest_predictions = clf_IsolationForest.predict(X_test) == -1

# Accuracy, F1 and the confusion matrix (these rebind the metric names used
# for the Random Forest above).
accuracy = accuracy_score(y_test, clf_IsolationForest_predictions)
f_score = f1_score(y_test, clf_IsolationForest_predictions)
conf_matrix = confusion_matrix(y_test, clf_IsolationForest_predictions)

# Report the metrics (print's default single-space separator reproduces the layout).
print('Accuracy:', accuracy)
print('confusion matrix:')
print('', conf_matrix)
print('f1 score =', f_score)

Accuracy: 0.9171364279200095
confusion matrix:
 [[1103657   70707]
 [  29603    6577]]
f1 score = 0.11593104420785447


In [14]:
# Derive a random, unique run name from a temporary file's basename. The
# context manager removes the file again as soon as its name has been read.
with tempfile.NamedTemporaryFile() as tmp_file:
    run_name = os.path.basename(tmp_file.name)

# One held-out row serves as both the input example and the signature input.
# The signature output must come from the model's own predict(), which returns
# integer -1/1 labels — not from the boolean `== -1` array used for scoring,
# which would make the signature declare the wrong output type.
iforest_example_input = X_test.iloc[[0]]
iforest_example_output = clf_IsolationForest.predict(iforest_example_input)

# Start an MLflow run
with mlflow.start_run(run_name=run_name):
    # Metadata returned by the training pipeline.
    mlflow.log_params(ml_metadata)

    # Hyperparameters read from the fitted estimator itself, so they always
    # describe the model being logged. The prefix avoids key clashes with
    # `ml_metadata` (MLflow rejects re-logging a key with a different value).
    mlflow.log_params(
        {f"model_{name}": value for name, value in clf_IsolationForest.get_params().items()}
    )

    # Evaluation metrics computed in the Isolation Forest evaluation cell.
    # F1 is logged next to accuracy because accuracy alone hides poor recall
    # on the minority (fraud) class.
    mlflow.log_metrics({"accuracy": accuracy, "f1_score": f_score})

    # The signature describes inference inputs/outputs only. Training
    # hyperparameters are not inference-time params, so they are not passed here.
    signature = infer_signature(iforest_example_input, iforest_example_output)

    # Log and register the model.
    model_info = mlflow.sklearn.log_model(
        sk_model=clf_IsolationForest,
        artifact_path="artifact_location",
        input_example=iforest_example_input,
        signature=signature,
        registered_model_name="Basic Isolation Forest",
    )


Successfully registered model 'Basic Isolation Forest'.
2024/04/14 04:23:04 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Basic Isolation Forest, version 1
Created version '1' of model 'Basic Isolation Forest'.


In [15]:
# Export sample files from the raw transactions: keep rows whose `post_ts` is
# after 2022-10-01, then write the first 5,000 rows and the next 5,000 rows to
# separate CSV files (without the index column).
after_cutoff = transactions_df['post_ts'] > pd.Timestamp("2022-10-01")
sample_data = transactions_df[after_cutoff]
sample_data.iloc[:5000].to_csv('../data/user_sample_data1.csv', index=False)
sample_data.iloc[5000:10000].to_csv('../data/user_sample_data2.csv', index=False)

In [16]:
# List the columns of the raw transactions frame (the schema of the exported sample files).
transactions_df.columns

Index(['transaction_id', 'post_ts', 'customer_id', 'bin', 'terminal_id', 'amt',
       'entry_mode', 'fraud', 'fraud_scenario'],
      dtype='object')

In [17]:
# isolation_forest_cls.perform_pipeline()
# clf_IsolationForest_predictions = clf_IsolationForest.predict(isolation_forest_cls.cc_trans_df)

# # Evaluate the model
# # Isolation forest needs to deal with non 1
# clf_IsolationForest_predictions = clf_IsolationForest_predictions == 1

# accuracy = accuracy_score(y_test, clf_IsolationForest_predictions)