## Preprocess

In [None]:
# Install (or upgrade, -U) the packages listed in requirements.txt into the
# kernel's own environment; -q keeps pip's output short.
%pip install -qU -r requirements.txt

In [None]:
# Download the ieee-fraud-detection competition archive into ./data/ieee-fraud-detection/.
# --force is passed so the archive is fetched again even if a copy is already there.
# NOTE(review): presumably requires Kaggle API credentials and accepted competition rules — confirm.
!kaggle competitions download -c ieee-fraud-detection -p ./data/ieee-fraud-detection/ --force

In [None]:
!unzip ./data/ieee-fraud-detection/ieee-fraud-detection.zip -d ./data/ieee-fraud-detection/

In [None]:
# pandas is imported in this cell because the Preprocess section runs before the
# notebook's import cells (they only appear under "Train"); without it a
# fresh-kernel "Run All" stops here with a NameError on `pd`.
import pandas as pd

# Raw competition tables, both keyed by TransactionID.
df_identity = pd.read_csv('./data/ieee-fraud-detection/train_identity.csv')
df_transaction = pd.read_csv('./data/ieee-fraud-detection/train_transaction.csv')

# Inner join: only transactions that also have an identity row are kept.
df = pd.merge(df_identity, df_transaction, on='TransactionID', how='inner')

# Order rows by TransactionDT so the head/tail split made later is taken along
# that ordering. Reassigning (instead of inplace=True) keeps the cell idempotent.
df = df.sort_values(by='TransactionDT', ascending=True)

In [None]:
# Row counts for an 80/20 split of the merged frame; the test share is whatever
# is left after truncating the training share to an integer.
n_total = df.shape[0]
n_train = int(0.8 * n_total)
n_test = n_total - n_train

In [None]:
# Report the split sizes computed above.
print(
    "Total transactions: {n_total}, training transactions: {n_train}, testing transaction: {n_test}".format(
        n_total=n_total, n_train=n_train, n_test=n_test
    )
)

In [None]:
# Hold-out split along the current row order: the first n_train rows form the
# training set and the remaining n_test rows form the test set.
df_train = df.iloc[:n_train]
df_test = df.iloc[n_train:]

In [None]:
# Persist both splits as parquet files (row index not written) for the Train and
# Evaluation sections to read back.
split_outputs = (
    ("./data/train.parquet", df_train),
    ("./data/test.parquet", df_test),
)
for parquet_path, split_frame in split_outputs:
    split_frame.to_parquet(parquet_path, index=False)

## Train

In [None]:
import numpy as np
import pandas as pd

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

import pandas as pd
from fgnn.fraud_detector import FraudRGCN

  from .autonotebook import tqdm as notebook_tqdm
  pd.options.mode.use_inf_as_na = True


In [2]:
# Read back the train and test splits written by the Preprocess section.
df_train, df_test = (
    pd.read_parquet(parquet_path)
    for parquet_path in ('./data/train.parquet', './data/test.parquet')
)

In [8]:
# Hyperparameter overrides handed to FraudRGCN.train_fg; keys not listed here
# are left to the detector's own defaults.
params = dict(
    embedding_size=64,
    n_layers=2,
    n_epochs=2,
    n_hidden=16,
    dropout=0.2,
    weight_decay=5e-05,
    lr=0.01,
)

In [9]:
### Show the detector's default model parameters, for comparison with the
### overrides in `params`. `_default_params` is read from a freshly constructed
### FraudRGCN; as the cell's last expression it is rendered as the cell output.
FraudRGCN()._default_params

{'num_gpus': 0,
 'embedding_size': 128,
 'n_layers': 2,
 'n_epochs': 50,
 'n_hidden': 16,
 'dropout': 0.2,
 'weight_decay': 5e-06,
 'lr': 0.01,
 'target_col': 'TransactionID',
 'node_cols': 'card1,card2,card3,card4,card5,card6,ProductCD,addr1,addr2,P_emaildomain,R_emaildomain',
 'label_col': 'isFraud',
 'cat_cols': 'M1,M2,M3,M4,M5,M6,M7,M8,M9,DeviceType,DeviceInfo,id_12,id_13,id_14,id_15,id_16,id_17,id_18,id_19,id_20,id_21,id_22,id_23,id_24,id_25,id_26,id_27,id_28,id_29,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38',
 'num_cols': 'TransactionAmt,dist1,dist2,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,id_10,id_11,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,D12,D13,D14,D15,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30,V31,V32,V33,V34,V35,V36,V37,V38,V39,V40,V41,V42,V43,V44,V45,V46,V47,V48,V49,V50,V51,V52,V53,V54,V55,V56,V57,V58,V59,V60,V61,V62,V63,V64,V65,V66,V67,V68,V

In [5]:
# train model in inductive setting

In [6]:
import warnings
### disable CUDA-related warnings from torch library 
warnings.filterwarnings("ignore", category=UserWarning)

In [10]:
# Train five independent detectors on the training split using the overrides in
# `params`, saving each one under model/inductive_<trial number>.
for ii in (1, 2, 3, 4, 5):
    detector = FraudRGCN()
    detector.train_fg(df_train, params=params)
    detector.save_fg(f"model/inductive_{ii}")

  result = func(self.values, **kwargs)
  result = func(self.values, **kwargs)


Constructed heterograph with the following metagraph structure: Node types ['P_emaildomain', 'ProductCD', 'R_emaildomain', 'addr1', 'addr2', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'target'], Edge types[('P_emaildomain', 'P_emaildomain<>target', 'target'), ('ProductCD', 'ProductCD<>target', 'target'), ('R_emaildomain', 'R_emaildomain<>target', 'target'), ('addr1', 'addr1<>target', 'target'), ('addr2', 'addr2<>target', 'target'), ('card1', 'card1<>target', 'target'), ('card2', 'card2<>target', 'target'), ('card3', 'card3<>target', 'target'), ('card4', 'card4<>target', 'target'), ('card5', 'card5<>target', 'target'), ('card6', 'card6<>target', 'target'), ('target', 'self_relation', 'target'), ('target', 'target<>P_emaildomain', 'P_emaildomain'), ('target', 'target<>ProductCD', 'ProductCD'), ('target', 'target<>R_emaildomain', 'R_emaildomain'), ('target', 'target<>addr1', 'addr1'), ('target', 'target<>addr2', 'addr2'), ('target', 'target<>card1', 'card1'), ('target', 'target

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

from fgnn.fraud_detector import FraudRGCN

import torch as th

import seaborn as sns
import matplotlib.pyplot as plt

## Evaluation

Evaluate trained models on full test set and save results

In [None]:
# Score every saved model on the complete test set.
# Only 'inductive' is listed: the training loop in this notebook saves
# model/inductive_<trial> checkpoints only. Add 'transductive' to the list once
# such models have been saved.
results = []

for mode in ['inductive']:
    for ii in range(1, 6):
        for kk in [1, 2, 3]:
            # A checkpoint is loaded anew for every (trial, k) combination.
            detector = FraudRGCN.load_fg(f"model/{mode}_{ii}")
            scores = detector.predict(df_test, k=kk)
            row = (
                mode,
                ii,
                kk,
                # duration recorded for the most recent predict call
                detector._timings['predict: total'][-1],
                roc_auc_score(df_test.isFraud, scores),
            )
            print(*row)
            results.append(row)

# Collect one row per (mode, trial, k) and write the table to CSV.
df_results = pd.DataFrame(data=results, columns=['mode', 'trial', 'k', 'time', 'AUC'])
df_results.to_csv('full_results.csv', header=True, index=False)

Evaluate inductive models on test set in batches and save results

In [None]:
batch_results = []

# Number of contiguous slices the test set is cut into; for this dataset that
# gives batches of ~1000 transactions.
N_BATCHES = 28


def _iter_row_batches(frame, n_batches):
    """Yield ``n_batches`` contiguous row slices of ``frame``, in order.

    Slice sizes follow the ``np.array_split`` rule: the first
    ``len(frame) % n_batches`` slices hold one extra row. Plain ``.iloc``
    slicing is used instead of ``np.array_split(frame, n_batches)`` because the
    latter goes through ``DataFrame.swapaxes``, which is deprecated
    (FutureWarning since pandas 2.1).
    """
    base, extra = divmod(len(frame), n_batches)
    start = 0
    for batch_idx in range(n_batches):
        stop = start + base + (1 if batch_idx < extra else 0)
        yield frame.iloc[start:stop]
        start = stop


## evaluate inductive models on test set in batches of ~1000 transactions
for mode in ['inductive',]:
    for ii in range(1,6):
        for kk in [1, 2, 3,]:
            # A checkpoint is loaded anew for every (trial, k) combination.
            # NOTE(review): presumably because predict() changes the detector's
            # stored graph — the diagnostics below print its size after every batch.
            fd = FraudRGCN.load_fg(f"model/{mode}_{ii}")
            fraud_proba=[]
            for batch in _iter_row_batches(df_test, N_BATCHES):
                fraud_proba.append(fd.predict(batch, k=kk))

                # Size of the detector's internal storage after this batch:
                # node and edge totals over all types, and total lookup entries.
                n_nodes = th.sum(th.tensor([fd._train_g.number_of_nodes(n_type) for n_type in fd._train_g.ntypes]))
                n_lookup = np.sum([len(lookup) for lookup in fd._nodes_lookup.values()])

                n_edges = th.sum(th.tensor([fd._train_g.number_of_edges(e_type) for e_type in fd._train_g.etypes]))

                print("""----After Inference Internal Storage------'
                            #Nodes: {}
                            #Edges: {}
                            #Lookup keys: {}
                            """.format(n_nodes,n_edges, n_lookup))

            # Batches were taken in row order, so the concatenated scores line up
            # with df_test.isFraud.
            fraud_proba=np.concatenate(fraud_proba)

            auc=roc_auc_score(df_test.isFraud, fraud_proba)
            # Mean of the recorded predict durations (one entry per predict call).
            elaps=np.mean(fd._timings['predict: total'])

            print(mode, ii, kk, elaps, auc)
            batch_results.append((mode, ii, kk, elaps, auc))

## save results to csv
df_batch_results = pd.DataFrame(batch_results, columns=['mode', 'trial', 'k', 'time', 'AUC'])
df_batch_results.to_csv('batch_results.csv', header=True, index=False)

Average the prediction time and AUC over the trials for each mode and k

In [None]:
# Mean time and AUC per (mode, k) for the full-test-set runs; the trial number
# is removed first so it is not averaged.
(
    df_results
    .drop(columns=['trial'])
    .groupby(['mode', 'k'], as_index=False)
    .mean()
)

In [None]:
# Mean time and AUC per (mode, k) for the batched runs; the trial number is
# removed first so it is not averaged.
(
    df_batch_results
    .drop(columns=['trial'])
    .groupby(['mode', 'k'], as_index=False)
    .mean()
)