In [1]:
import dataset
import datetime
from datetime import timedelta
from parser import get_parser
import numpy as np 
import pandas as pd 
import torch
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from torch_geometric.utils import from_networkx, to_undirected
from torch_geometric.data import Data, DataLoader, Dataset
from tqdm import tqdm, tqdm_notebook, trange
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from collections import defaultdict
import random
from xgboost import XGBClassifier
%config Completer.use_jedi = False

In [2]:
data = dataset.Ndata(path='../Custom-Semi-Supervised/data/ndata.csv')
parser = get_parser()
args = parser.parse_args(args=
                         ["--data","real-n", 
                          "--sampling","xgb",
                          "--train_from","20140101",
                          "--test_from","20170101",
                          "--test_length","365",
                          "--valid_length","90",
                          "--initial_inspection_rate", "5",
                          "--final_inspection_rate", "10",
                         ])

In [3]:
# args
seed = args.seed
epochs = args.epoch
dim = args.dim
lr = args.lr
weight_decay = args.l2
initial_inspection_rate = args.initial_inspection_rate
inspection_rate_option = args.inspection_plan
train_begin = args.train_from 
test_begin = args.test_from
test_length = args.test_length
valid_length = args.valid_length
chosen_data = args.data
numWeeks = args.numweeks
semi_supervised = args.semi_supervised
save = args.save
gpu_id = args.device

# Initial dataset split
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)

# Initial dataset split
train_start_day = datetime.date(int(train_begin[:4]), int(train_begin[4:6]), int(train_begin[6:8]))
test_start_day = datetime.date(int(test_begin[:4]), int(test_begin[4:6]), int(test_begin[6:8]))
test_length = timedelta(days=test_length)    
test_end_day = test_start_day + test_length
valid_length = timedelta(days=valid_length)
valid_start_day = test_start_day - valid_length

# data
data.split(train_start_day, valid_start_day, test_start_day, test_end_day, valid_length, test_length, args)
data.featureEngineering()

Data size:
Train labeled: (54134, 52), Train unlabeled: (1028538, 52), Valid labeled: (70917, 52), Valid unlabeled: (0, 26), Test: (274808, 52)
Checking label distribution
Training: 0.05022795615481618
Validation: 0.035556788645191434
Testing: 0.025360899366070794


In [8]:
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from utils import *
xgb = XGBClassifier(n_estimators=100, max_depth=4, n_jobs=-1, eval_metric="error", scale_pos_weight = 50)
xgb.fit(data.dftrainx_lab,data.train_cls_label)

best_thresh, best_auc = find_best_threshold(xgb,data.dfvalidx_lab, data.valid_cls_label)
xgb_test_pred = xgb.predict_proba(data.dftestx)[:,-1]
overall_f1,auc,pr, re, f, rev = metrics(xgb_test_pred, data.test_cls_label,data.test_reg_label,args,best_thresh)

Checking top 1% suspicious transactions: 2749
Precision: 0.1404, Recall: 0.0568, Revenue: 0.0942
Checking top 2% suspicious transactions: 5497
Precision: 0.0871, Recall: 0.0705, Revenue: 0.1269
Checking top 5% suspicious transactions: 13741
Precision: 0.0442, Recall: 0.0893, Revenue: 0.1859
Checking top 10% suspicious transactions: 27481
Precision: 0.0888, Recall: 0.3590, Revenue: 0.3713


In [7]:
xgb_test_pred = xgb.predict_proba(data.dfvalidx_lab)[:,-1]
overall_f1,auc,pr, re, f, rev = metrics(xgb_test_pred, data.valid_cls_label,data.valid_reg_label,args,best_thresh)

Checking top 1% suspicious transactions: 271
Precision: 0.8450, Recall: 0.3491, Revenue: 0.2835
Checking top 2% suspicious transactions: 541
Precision: 0.5508, Recall: 0.4543, Revenue: 0.3817
Checking top 5% suspicious transactions: 1278
Precision: 0.3192, Recall: 0.6220, Revenue: 0.5636
Checking top 10% suspicious transactions: 2698
Precision: 0.2124, Recall: 0.8735, Revenue: 0.8082


In [10]:
xgb = XGBClassifier(n_estimators=100, max_depth=4, n_jobs=-1, eval_metric="error", scale_pos_weight = 2)
xgb.fit(data.dftrainx_lab,data.train_cls_label)

best_thresh, best_auc = find_best_threshold(xgb,data.dfvalidx_lab, data.valid_cls_label)
xgb_test_pred = xgb.predict_proba(data.dftestx)[:,-1]
overall_f1,auc,pr, re, f, rev = metrics(xgb_test_pred, data.test_cls_label,data.test_reg_label,args,best_thresh)

Checking top 1% suspicious transactions: 996
Precision: 0.5452, Recall: 0.4772, Revenue: 0.4519
Checking top 2% suspicious transactions: 1981
Precision: 0.3448, Recall: 0.6002, Revenue: 0.5477
Checking top 5% suspicious transactions: 4985
Precision: 0.2050, Recall: 0.8981, Revenue: 0.8762
Checking top 10% suspicious transactions: 9412
Precision: 0.1139, Recall: 0.9420, Revenue: 0.9362


In [None]:
pd.Series(dict(zip(data.dftrainx_lab.columns,xgb.feature_importances_))).sort_values()

In [None]:
pd.DataFrame(pd.Series(dict(zip(data.dftrainx_lab.columns,xgb.feature_importances_))).sort_values()).T

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler

X_train_leaves = xgb.apply(data.dftrainx_lab).reshape(-1,100)
X_valid_leaves = xgb.apply(data.dfvalidx_lab).reshape(-1,100)
X_test_leaves = xgb.apply(data.dftestx).reshape(-1,100)

# One-hot encoding for leaf index
xgbenc = OneHotEncoder(categories="auto")
lr_trainx = xgbenc.fit_transform(X_train_leaves)
lr_validx = xgbenc.transform(X_valid_leaves)
lr_testx = xgbenc.transform(X_test_leaves)

lr = LogisticRegression(n_jobs=-1)
lr.fit(lr_trainx, data.train_cls_label)

best_thresh, best_auc = find_best_threshold(lr,lr_validx, data.valid_cls_label)
xgb_test_pred = lr.predict_proba(lr_testx)[:,-1]
overall_f1,auc,pr, re, f, rev = metrics(xgb_test_pred, data.test_cls_label,data.test_reg_label,args,best_thresh)

In [None]:
from torchfm.model.dfm import DeepFactorizationMachineModel
from torchfm.model.afm import AttentionalFactorizationMachineModel
from torchfm.model.afn import AdaptiveFactorizationNetwork
from torchfm.model.xdfm import ExtremeDeepFactorizationMachineModel
from torchfm.layer import FeaturesEmbedding, FeaturesLinear, AttentionalFactorizationMachine
import torch
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
from torchtools.optim import RangerLars
from utils import process_leaf_idx

In [None]:
# Get leaf index from xgboost model 
X_train_leaves = xgb.apply(data.dftrainx_lab)
X_trainunlab_leaves = xgb.apply(data.dftrainx_unlab)
X_valid_leaves = xgb.apply(data.dfvalidx_lab)
X_test_leaves = xgb.apply(data.dftestx)

# One-hot encoding for leaf index
X_leaves = np.concatenate((X_train_leaves, X_trainunlab_leaves, X_valid_leaves, X_test_leaves), axis=0)
transformed_leaves, leaf_dim, new_leaf_index = process_leaf_idx(X_leaves)
train_rows = X_train_leaves.shape[0]
trainunlab_rows = X_trainunlab_leaves.shape[0] + train_rows
valid_rows = X_valid_leaves.shape[0] + trainunlab_rows
train_leaves, trainunlab_leaves, valid_leaves, test_leaves = transformed_leaves[:train_rows],\
                                  transformed_leaves[train_rows:trainunlab_rows],\
                                  transformed_leaves[trainunlab_rows:valid_rows],\
                                  transformed_leaves[valid_rows:]

In [None]:
train_leaves, valid_leaves, test_leaves = torch.tensor(train_leaves), torch.tensor(valid_leaves), torch.tensor(test_leaves)

In [None]:
class LabelEncoder(object):
    def __init__(self,min_count = 5):
        self.min_count = min_count
        self.map = dict()
        self.inv_map = dict()
        self.is_fitted = False
        
    def fit(self,x):
        self.is_fitted = True
        if not isinstance(x,pd.Series):
            x = pd.Series(x)
        self.counts = x.value_counts()
        valid_ser = self.counts[self.counts > self.min_count]
        valid_category = list(valid_ser.index)
        for i,v in enumerate(valid_category):
            self.map[v] = i + 1
            self.inv_map[i + 1] = v
        self.nunique = i + 2
            
    def fit_transform(self,x):
        self.fit(x)
        result = [self.map.get(i,0) for i in x]
        return result
    
    def transform(self,x):
        if not self.is_fitted:
            raise NotImplementedError
        result = [self.map.get(i,0) for i in x]
        return result

In [None]:
num_fields = []
train_fields = []
valid_fields = []
test_fields = []
for col in data.profile_candidates:
    df_col = data.train_lab[col]
    val_col = data.valid_lab[col]
    test_col = data.test[col]
    encoder = LabelEncoder()
    relabed = encoder.fit_transform(df_col)
    valid_relabel = encoder.transform(val_col)
    test_relabel = encoder.transform(test_col)
    num_fields.append(encoder.nunique)
    train_fields.append(relabed)
    valid_fields.append(valid_relabel)
    test_fields.append(test_relabel)

In [None]:
class BeatDATE(nn.Module):
    def __init__(self,field_dims,hidden_dim,attn_size, dropouts):
        super().__init__()
        self.leaf_embedding = nn.Embedding(leaf_dim,hidden_dim)
        elf.num_fields = len(field_dims)
        self.embedding = FeaturesEmbedding(field_dims, hidden_dim)
        self.linear = FeaturesLinear(field_dims)
        self.afm = AttentionalFactorizationMachine(hidden_dim, attn_size, dropouts)
        
    def forward(self,x,leaf):
        leaf_emb = self.leaf_embedding(x)
        leaf_emb = torch.sum(leaf_emb,dim=1)

In [None]:
train_fields = torch.tensor(train_fields).T
valid_fields = torch.tensor(valid_fields).T
test_fields = torch.tensor(test_fields).T
tr_y = torch.FloatTensor(data.train_cls_label)
val_y = torch.FloatTensor(data.valid_cls_label)
test_y = torch.FloatTensor(data.test_cls_label)

In [None]:
train_dataset = TensorDataset(train_fields, tr_y)
valid_dataset = TensorDataset(valid_fields, val_y)
test_dataset = TensorDataset(test_fields, test_y)

train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=256, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False)

In [None]:
loss_func = nn.BCELoss()
device = "cuda:3"
model = DeepFactorizationMachineModel(num_fields,32,(32,16),0.2).to(device)
# model = AttentionalFactorizationMachineModel(num_fields, 32,attn_size=4,dropouts=(0.2,0.2)).to(device)
# optimizer = RangerLars(model.parameters(), lr = 0.05, weight_decay= 0.001)
optimizer = torch.optim.SGD(model.parameters(), lr = 0.1, weight_decay= 0.001, momentum=0.9)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer,gamma=0.9)

In [None]:
def train():
    for step, (x,y) in enumerate(train_loader):
        x,y = x.to(device), y.to(device)
        pred = model(x)
        loss = loss_func(pred,y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    scheduler.step()
#         if step % 20 ==0:
#             print("Loss:%.4f" % loss.item())
                
def test(loader):
    preds = []
    with torch.no_grad():
        for step, (x,y) in enumerate(loader):
            x,y = x.to(device), y.to(device)
            pred = model(x)
            preds.append(pred)
    preds = torch.cat(preds).detach().cpu().numpy()
    return preds

In [None]:
for epoch in range(30):
    train()
    print("="*30+"[Epoch %d]"%(epoch+1) + "="*30)
    preds = test(test_loader)
    res = torch_metrics(preds,data.test_cls_label,data.test_reg_label)
#     preds = test(valid_loader)
#     res = torch_metrics(preds,data.valid_cls_label,data.valid_reg_label)

In [None]:
preds = test(test_loader)
res = torch_metrics(preds,data.test_cls_label,data.test_reg_label)

Checking top 1% suspicious transactions: 998
Precision: 0.1784, Recall: 0.1564, Revenue: 0.1629
Checking top 2% suspicious transactions: 1996
Precision: 0.1493, Recall: 0.2619, Revenue: 0.2424
Checking top 5% suspicious transactions: 4990
Precision: 0.1255, Recall: 0.5501, Revenue: 0.4845
Checking top 10% suspicious transactions: 9980
Precision: 0.0828, Recall: 0.7258, Revenue: 0.6765