In [9]:
import numpy as np 
import pandas as pd
import os
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay,classification_report,make_scorer, f1_score
from sklearn.utils import resample
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn import model_selection
from joblib import dump, load
import mlflow.sklearn
import mlflow
from mlflow.models import infer_signature
from mlflow.tracking import MlflowClient
from result_display import show_result,export_anomaly
from reject_anomalies import pred_baseon_threshold,make_use_reject_anomalies
from Feature_engineer import remove_unwanted_col,feature_engineer_steps

In [10]:
# Read the three raw CSV tables from the data folder.
transactions_df = pd.read_csv("data/transactions_df.csv")
terminal_profiles_df = pd.read_csv("data/terminal_profiles_table.csv")
customer_profiles_df = pd.read_csv("data/customer_profiles_table.csv")

# Attach the terminal attributes to each transaction. The inner join keeps only
# transactions whose terminal_id is present in the terminal table.
join_terminal = transactions_df.merge(terminal_profiles_df, how='inner', on='terminal_id')

# Attach the customer attributes the same way, keyed on customer_id.
join_customer = join_terminal.merge(customer_profiles_df, how='inner', on='customer_id')


## Download the CSV data
https://www.kaggle.com/datasets/cgrodrigues/credit-card-transactions-synthetic-data-generation/data?select=transactions_df.csv

## Store file at location
- Place the downloaded CSV files (`transactions_df.csv`, `terminal_profiles_table.csv`, `customer_profiles_table.csv`) in the `data/` folder.



In [11]:
# Show every column of the joined table to check the result of the two merges.
print(list(join_customer.columns))

['transaction_id', 'post_ts', 'customer_id', 'bin_x', 'terminal_id', 'amt', 'entry_mode', 'fraud', 'fraud_scenario', 'lat_terminal', 'log_terminal', 'mcc', 'mean_amount', 'std_amount', 'mean_nb_tx_per_day', 'network_id', 'bin_y', 'lat_customer', 'log_customer', 'available_terminals', 'nb_terminals']


In [12]:
# Create a small sample file for client demo purposes: 100 raw transactions
# drawn with a fixed seed so the exported demo data is reproducible.
samle_file = transactions_df.sample(random_state=42, n=100)
samle_file.to_csv(path_or_buf='data/user_demo_data.csv', index=False)

In [13]:
# Feature engineering step and one-hot encoding for the categorical features.
# The implementation is stored in Feature_engineer.py.
# The helper returns two objects: the feature table (train_X) and the target (train_y).
# NOTE(review): presumably train_y is the 'fraud' label column of the joined
# table — confirm in Feature_engineer.py.
train_X,train_y = feature_engineer_steps(join_customer)

In [17]:
# Pass the feature table through remove_unwanted_col (defined in
# Feature_engineer.py; presumably drops columns that must not reach the model)
# and list the features that remain.
# NOTE(review): this cell overwrites train_X with a value derived from itself, and
# its execution count (17) is later than the split/training cells below (15, 16).
# The result may therefore depend on execution order — confirm the notebook
# reproduces with Restart Kernel -> Run All.
train_X = remove_unwanted_col(train_X)
print(train_X.columns.tolist())

['amt', 'lat_terminal', 'log_terminal', 'mean_amount', 'std_amount', 'mean_nb_tx_per_day', 'lat_customer', 'log_customer', 'nb_terminals', 'location_different', 'per_day_difference_count', 'mean_difference', 'timestamp_numeric', 'entry_mode_Chip', 'entry_mode_Contactless', 'entry_mode_Swipe', 'network_id_A', 'network_id_D', 'network_id_M', 'network_id_V']


In [15]:
# Hold out 5% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train_X,
    train_y,
    test_size=0.05,
    random_state=42,
)

In [16]:
# Train the model with a grid search for the best hyper-parameters; very time consuming.
# Use the two sections below for usual function testing.
contamination_rate = 0.15  # notebook default: assume 15% of the data are anomalies
clf = IsolationForest(contamination=contamination_rate, random_state=42)

param_grid = {
    # 'n_estimators': list(range(100, 300, 100)),  # number of decision trees
    # 'contamination': [0.3, 0.1, 0.05],           # percentage of anomalies
    'max_features': [5, 7, 9, 12],
    'bootstrap': [True],
    # 'max_samples': [10000],
}

# IsolationForest.predict returns -1 for anomalies and +1 for normal points, so
# its raw output cannot be compared with 0/1 labels. Detect once which
# convention y_train uses; only labels made up exclusively of -1/+1 are scored
# against the raw predictions.
_labels_are_signed = bool(np.isin(np.unique(y_train), (-1, 1)).all())


def _anomaly_macro_f1(y_true, y_pred):
    """Macro-averaged F1 between the labels and IsolationForest predictions.

    Parameters:
        y_true: ground-truth labels of the scored fold.
        y_pred: output of ``IsolationForest.predict`` (-1 anomaly, +1 normal).

    Returns:
        The macro F1 score as a float.

    When the labels are not in the -1/+1 convention, predictions are mapped
    to 1 (anomaly) / 0 (normal) first.
    NOTE(review): this assumes label 1 marks the fraud/anomaly class — confirm
    against feature_engineer_steps in Feature_engineer.py.
    """
    if not _labels_are_signed:
        y_pred = np.where(np.asarray(y_pred) == -1, 1, 0)
    return f1_score(y_true, y_pred, average='macro')


f1sc = make_scorer(_anomaly_macro_f1)  # using macro_avg_f1 as score

grid_dt_estimator = model_selection.GridSearchCV(
    clf,
    param_grid,
    scoring=f1sc,  # previously 'accuracy', which left f1sc unused
    refit=True,
    cv=5,
    return_train_score=True,
)
grid_dt_estimator.fit(X_train, y_train)

# Make sure the target folder exists before persisting the refitted best model.
os.makedirs('saved_model', exist_ok=True)
dump(grid_dt_estimator.best_estimator_, 'saved_model/best_model.joblib')
#print(grid_dt_estimator.best_score_)
#print(grid_dt_estimator.best_params_)

['saved_model/best_model.joblib']