In [1]:
import numpy as np 
import pandas as pd
import os
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,ConfusionMatrixDisplay,classification_report,make_scorer, f1_score
from sklearn.utils import resample
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn import model_selection
from joblib import dump, load
import mlflow.sklearn
import mlflow
from mlflow.models import infer_signature
from mlflow.tracking import MlflowClient
from result_display import show_result,export_anomaly
from reject_anomalies import pred_baseon_threshold,make_use_reject_anomalies
from Feature_engineer import remove_unwanted_col,feature_engineer_steps
from sklearn import decomposition
from Data_preprocessing_method import apply_PCA

In [2]:
transactions_df = pd.read_csv("data/transactions_df.csv")
terminal_profiles_df = pd.read_csv("data/terminal_profiles_table.csv")
customer_profiles_df = pd.read_csv("data/customer_profiles_table.csv")
join_terminal = pd.merge(transactions_df, terminal_profiles_df, on='terminal_id', how='inner') #join dataset base on key value
join_customer = pd.merge(join_terminal, customer_profiles_df, on='customer_id', how='inner')


## Download the CSV data
https://www.kaggle.com/datasets/cgrodrigues/credit-card-transactions-synthetic-data-generation/data?select=transactions_df.csv

## Store file at location
- the location to put the file is at data folder



In [3]:
print(join_customer.columns.tolist())

['transaction_id', 'post_ts', 'customer_id', 'bin_x', 'terminal_id', 'amt', 'entry_mode', 'fraud', 'fraud_scenario', 'lat_terminal', 'log_terminal', 'mcc', 'mean_amount', 'std_amount', 'mean_nb_tx_per_day', 'network_id', 'bin_y', 'lat_customer', 'log_customer', 'available_terminals', 'nb_terminals']


In [4]:
#creating sample file for client demo purpose
#samle_file = transactions_df.sample(n=10000, random_state=42)
#samle_file.to_csv('data/user_demo_data.csv', index=False)

In [5]:
# Feature engineer step and one-hot enconding for categorical feature. 
# Stored in Feature_engineer.py
train_X,train_y = feature_engineer_steps(join_customer)

In [6]:
train_X = remove_unwanted_col(train_X)


In [7]:
X_train, X_test, y_train, y_test = train_test_split(train_X, train_y,test_size=0.05, random_state=42)

In [8]:
def custom_scorer(y_true, y_pred):
    # Convert Isolation Forest's anomaly labels to binary labels
    # Inliers are 1, converted to 0 (negative class)
    # Outliers are -1, converted to 1 (positive class)
    y_pred[y_pred == 1] = 0
    y_pred[y_pred == -1] = 1
 
    return f1_score(y_true, y_pred, pos_label=1)
contamination_rate = 0.15 #default contamination rate 15% of data are anomalies
clf = IsolationForest(contamination=contamination_rate,random_state=42)
    

In [9]:
# Train model using grid seach for best model, very time consuming. 


param_grid = {#'n_estimators': list(range(100, 300, 100)), #number of decision trees
              'contamination': [0.05,0.15,0.02], #percentage of anomalies
              'max_features': [5,9,12], 
              'bootstrap': [True],
                #'max_samples':[10000]
                }
#f1sc = make_scorer(f1_score, average='macro') #using macro_avg_f1 as score
scorer = make_scorer(custom_scorer, greater_is_better=True)
grid_dt_estimator = model_selection.GridSearchCV(clf, 
                                                 param_grid,
                                                 scoring=scorer, 
                                                 refit=True,
                                                 cv=5, 
                                                 return_train_score=True)
grid_dt_estimator.fit(X_train, y_train)
dump(grid_dt_estimator.best_estimator_, 'saved_model/best_model.joblib')


['saved_model/best_model.joblib']

In [10]:
print(grid_dt_estimator.best_score_)
print(grid_dt_estimator.best_params_)

0.14963228584024568
{'bootstrap': True, 'contamination': 0.05, 'max_features': 9}


In [11]:
# anomalyclassifier = IsolationForest(random_state=42, n_estimators=200)
# anomalyclassifier.fit(X_train)
# dump(anomalyclassifier, 'saved_model/best_model.joblib')

## Train model with PCA option

In [12]:
X_pca = apply_PCA(train_X,5)
X_train, X_test, y_train, y_test = train_test_split(X_pca, train_y, test_size=0.05, 
                                                    stratify=train_y, 
                                                    random_state=42)
param_grid_PCA = {'n_estimators': [100,200,300], #number of decision trees
              'contamination': [0.05,0.15,0.02], #percentage of anomalies
              #'max_features': [5,9,12], 
              'bootstrap': [True],
                #'max_samples':[10000]
                }
scorer = make_scorer(custom_scorer, greater_is_better=True)
grid_dt_estimator_PCA = model_selection.GridSearchCV(clf, 
                                                 param_grid_PCA,
                                                 scoring=scorer, 
                                                 refit=True,
                                                 cv=5, 
                                                 return_train_score=True)
grid_dt_estimator_PCA.fit(X_train, y_train)
dump(grid_dt_estimator_PCA.best_estimator_, 'saved_model/best_model_PCA.joblib')

['saved_model/best_model_PCA.joblib']

In [13]:
print(grid_dt_estimator_PCA.best_score_)
print(grid_dt_estimator_PCA.best_params_)

0.25547322764346475
{'bootstrap': True, 'contamination': 0.05, 'n_estimators': 200}
