In [1]:
import dataset
import datetime
from datetime import timedelta
from parser import get_parser
import numpy as np 
import pandas as pd 
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm, tqdm_notebook, trange
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from collections import defaultdict
import random
from xgboost import XGBClassifier
import os
import warnings
# Suppress every Python warning for the rest of the session. This is a blanket
# filter, so warnings that point at real problems are hidden as well.
warnings.filterwarnings("ignore")
# IPython setting: switch off the jedi-backed tab completer.
%config Completer.use_jedi = False

In [2]:
# Build the dataset wrapper from the CSV that lives in the sibling project folder.
data = dataset.Ndata(path='../Custom-Semi-Supervised/data/ndata.csv')

# The project parser is driven with an explicit argv list (flag, value pairs)
# so the notebook uses the same option handling as a command-line run.
parser = get_parser()
notebook_argv = [
    "--data", "real-n",
    "--sampling", "xgb",
    "--train_from", "20140101",
    "--test_from", "20170101",
    "--test_length", "365",
    "--valid_length", "90",
    "--initial_inspection_rate", "5",
    "--final_inspection_rate", "10",
]
args = parser.parse_args(args=notebook_argv)

In [3]:
# Unpack the parsed options into notebook-level names.
seed = args.seed
epochs = args.epoch
dim = args.dim
lr = args.lr
weight_decay = args.l2
initial_inspection_rate = args.initial_inspection_rate
inspection_rate_option = args.inspection_plan
train_begin = args.train_from
test_begin = args.test_from
test_length = args.test_length
valid_length = args.valid_length
chosen_data = args.data
numWeeks = args.numweeks
semi_supervised = args.semi_supervised
save = args.save
gpu_id = args.device

# Reproducibility: seed both RNGs this notebook imports. Seeding only NumPy
# leaves anything that draws from the stdlib `random` module unseeded.
np.random.seed(seed)
random.seed(seed)

# Initial dataset split.
# --train_from / --test_from are YYYYMMDD strings. strptime parses them and
# raises ValueError on a malformed value instead of slicing it blindly.
train_start_day = datetime.datetime.strptime(train_begin, "%Y%m%d").date()
test_start_day = datetime.datetime.strptime(test_begin, "%Y%m%d").date()

# The length options are day counts; from here on the names hold timedeltas.
test_length = timedelta(days=test_length)
test_end_day = test_start_day + test_length
valid_length = timedelta(days=valid_length)
# The validation window is the `valid_length` days right before the test start.
valid_start_day = test_start_day - valid_length

# data
data.split(train_start_day, valid_start_day, test_start_day, test_end_day, valid_length, test_length, args)
data.featureEngineering()

Data size:
Train labeled: (54134, 52), Train unlabeled: (1028538, 52), Valid labeled: (70917, 52), Valid unlabeled: (0, 26), Test: (274808, 52)
Checking label distribution
Training: 0.05022795615481618
Validation: 0.035556788645191434
Testing: 0.025360899366070794


In [4]:
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from utils import *
# XGBoost baseline fitted on the labeled training features.
xgb = XGBClassifier(
    n_estimators=100,
    max_depth=4,
    n_jobs=-1,
    eval_metric="error",
    scale_pos_weight=1,
)
xgb.fit(data.dftrainx_lab, data.train_cls_label)

# The threshold is chosen on the labeled validation split and then passed,
# together with the positive-class test probabilities, to the metrics helper.
best_thresh, best_auc = find_best_threshold(
    xgb, data.dfvalidx_lab, data.valid_cls_label
)
xgb_test_pred = xgb.predict_proba(data.dftestx)[:, -1]
overall_f1, auc, pr, re, f, rev = metrics(
    xgb_test_pred,
    data.test_cls_label,
    data.test_reg_label,
    args,
    best_thresh,
)

Checking top 1% suspicious transactions: 2749
Precision: 0.1550, Recall: 0.0627, Revenue: 0.1090
Checking top 2% suspicious transactions: 5497
Precision: 0.0906, Recall: 0.0733, Revenue: 0.1344
Checking top 5% suspicious transactions: 13741
Precision: 0.0463, Recall: 0.0936, Revenue: 0.1792
Checking top 10% suspicious transactions: 27480
Precision: 0.0462, Recall: 0.1867, Revenue: 0.2948


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Leaf-index features: for every row, the leaf each boosted tree routes it to.
# The row count fixes the first axis, so the column count follows the fitted
# model instead of a hard-coded 100 that must be kept in sync with n_estimators.
X_train_leaves, X_valid_leaves, X_test_leaves = (
    xgb.apply(frame).reshape(len(frame), -1)
    for frame in (data.dftrainx_lab, data.dfvalidx_lab, data.dftestx)
)

# One-hot encoding for leaf index
xgbenc = OneHotEncoder(categories="auto")
lr_trainx = xgbenc.fit_transform(X_train_leaves)
lr_validx = xgbenc.transform(X_valid_leaves)
lr_testx = xgbenc.transform(X_test_leaves)

# `lr` already holds the learning rate (args.lr) from the args cell, so the
# classifier gets its own name rather than silently replacing that value.
lr_clf = LogisticRegression(n_jobs=-1)
lr_clf.fit(lr_trainx, data.train_cls_label)

# The threshold is chosen on validation; the test scores are kept under their
# own name so `xgb_test_pred` keeps referring to the raw XGBoost predictions.
best_thresh, best_auc = find_best_threshold(lr_clf, lr_validx, data.valid_cls_label)
lr_test_pred = lr_clf.predict_proba(lr_testx)[:, -1]
overall_f1, auc, pr, re, f, rev = metrics(lr_test_pred, data.test_cls_label, data.test_reg_label, args, best_thresh)

Checking top 1% suspicious transactions: 2749
Precision: 0.1510, Recall: 0.0611, Revenue: 0.0912
Checking top 2% suspicious transactions: 5497
Precision: 0.0941, Recall: 0.0761, Revenue: 0.1321
Checking top 5% suspicious transactions: 13739
Precision: 0.0646, Recall: 0.1305, Revenue: 0.1884
Checking top 10% suspicious transactions: 27481
Precision: 0.1071, Recall: 0.4328, Revenue: 0.3864


## VIME

In [6]:
from vime_self import vime_self
from vime_semi import vime_semi
from vime_utils import perf_metric
from supervised_models import logit, xgb_model, mlp
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.utils import to_categorical

In [7]:
# Experimental parameters
label_no = 1000
model_sets = ['logit', 'xgboost', 'mlp']

# Hyper-parameters (p_m, alpha, K and beta are passed to the VIME calls below)
p_m, alpha, K, beta = 0.3, 2.0, 3, 1.0
label_data_rate = 0.1

# Metric name handed to perf_metric
metric = 'acc'

# Output vector: one slot per entry of model_sets plus two extra slots
# (slot 3 receives the VIME-Self score further down).
results = np.zeros(len(model_sets) + 2)

In [8]:
# MLP settings, passed to the mlp helper as a plain dict
mlp_parameters = {
    'hidden_dim': 100,
    'epochs': 100,
    'activation': 'relu',
    'batch_size': 100,
}

In [9]:
# VIME-Semi settings
# NOTE(review): the VIME-Semi training cell further down rebuilds this dict
# with different hidden_dim / batch_size values, so the numbers here are not
# the ones the training call receives.
vime_semi_parameters = {
    'hidden_dim': 256,
    'batch_size': 128,
    'iterations': 1000,
}

In [12]:
# One scaler shared by all splits. Fitting a separate MinMaxScaler per split
# maps the same raw value to different scaled values in train, unlabeled and
# test, so an encoder trained on one split would see shifted features on the
# others. The min/max are learned from the training data only (labeled, then
# unlabeled, accumulated with partial_fit so the two arrays need not be
# stacked), which also keeps test statistics out of the preprocessing.
# Test values outside the training range therefore fall outside [0, 1].
feature_scaler = MinMaxScaler()
feature_scaler.partial_fit(data.dftrainx_lab.values)
feature_scaler.partial_fit(data.dftrainx_unlab.values)

# train
x_train = feature_scaler.transform(data.dftrainx_lab.values)
y_train = to_categorical(data.train_cls_label)

# unlab
x_unlab = feature_scaler.transform(data.dftrainx_unlab.values)

# test
x_test = feature_scaler.transform(data.dftestx.values)
y_test = to_categorical(data.test_cls_label)

In [None]:
# Train VIME-Self on the unlabeled features
vime_self_parameters = {'batch_size': 16, 'epochs': 50}
vime_self_encoder = vime_self(x_unlab, p_m, alpha, vime_self_parameters)

# Save encoder. exist_ok=True replaces the separate exists() check, so the
# directory is created only when missing without a check-then-create gap.
os.makedirs('save_model', exist_ok=True)

# This path is also handed to vime_semi in a later cell.
file_name = './save_model/vime_Ndata2.h5'

vime_self_encoder.save(file_name)

# Run the labeled-train and test features through the trained encoder; the
# outputs feed the MLP evaluated in the next cell.
x_train_hat = vime_self_encoder.predict(x_train)
x_test_hat = vime_self_encoder.predict(x_test)

Train on 1028538 samples


2022-01-25 19:17:24.287433: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2022-01-25 19:17:24.316925: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-25 19:17:24.318509: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1618] Found device 0 with properties: 
name: NVIDIA GeForce RTX 2070 SUPER major: 7 minor: 5 memoryClockRate(GHz): 1.83
pciBusID: 0000:01:00.0
2022-01-25 19:17:24.318754: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:983] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-25 19:17:24.320219: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1618] Found device 1 with properties: 
name: NVIDIA GeForce GTX 1060 6GB major: 6 minor: 1 memoryClockRate(GHz): 1.746

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Epoch 7/50

In [None]:
# Fit the MLP on the encoder outputs of the labeled training rows and predict
# on the encoder outputs of the test rows.
y_test_hat = mlp(x_train_hat, y_train, x_test_hat, mlp_parameters)

# Score with the configured metric and keep it in slot 3 of `results`.
vime_self_score = perf_metric(metric, y_test, y_test_hat)
results[3] = vime_self_score

print('VIME-Self Performance: ' + str(results[3]))

In [None]:
# Train VIME-Semi with the settings below; `file_name` is the path the
# VIME-Self encoder was saved to.
vime_semi_parameters = {
    'hidden_dim': 512,
    'batch_size': 32,
    'iterations': 1000,
}
y_test_hat = vime_semi(
    x_train,
    y_train,
    x_unlab,
    x_test,
    vime_semi_parameters,
    p_m,
    K,
    beta,
    file_name,
)

In [None]:
# Column 1 of the VIME-Semi output is scored with the same metrics helper used
# for the baselines; no threshold argument is passed in this call.
overall_f1, auc, pr, re, f, rev = metrics(
    y_test_hat[:, 1],
    data.test_cls_label,
    data.test_reg_label,
    args,
)