# Process Dataset

In [1]:
# from dataset.visa_data import VisaTranasactionDataset
# import importlib, sys
# importlib.reload(sys.modules['dataset.visa_data'])
# from dataset.visa_data import VisaTranasactionDataset
from dataset.ibm_time_static import IBMWithTimePosAndStaticSplitDataset
from dataset.ibm_time_pos import IBMWithTimePosDataset
from dataset.ibm_static import IBMWithStaticSplitDataset
from dataset.ibm import IBMDataset

In [2]:
# --- Input location and file names -------------------------------------------
root = "./data/credit_card/"          # passed to every dataset as `root` and as the vocab directories
fname = "card_transaction_train"      # file name handed to the training dataset
val_fname = "card_transaction_val"    # file name handed to the validation dataset
test_fname = "card_transaction_test"  # file name handed to the test dataset

# --- Dataset variant and caching switches ------------------------------------
fextension = "static"                 # decides which dataset class is instantiated
preload_fextension = "preload-test"
user_level_cached = True
vocab_cached = False                  # set to True later, after the training dataset is built
external_vocab_path = ""              # empty: filled in from the training dataset later

# --- Resampling (forwarded to the training and validation datasets) ----------
resample_method = None
resample_ratio = 10
resample_seed = 100

# --- Optional separate validation dataset ------------------------------------
external_val = False

In [3]:
# A falsy extension (None, "") is normalised to the empty string.
fextension = fextension or ""

# With a cached vocabulary, derive its path under `root`: the file is named
# "vocab_ob" for the empty extension and "vocab_ob_<fextension>" otherwise.
if vocab_cached:
    external_vocab_path = root + ("vocab_ob_" + fextension if fextension else "vocab_ob")

# The remaining optional strings are likewise normalised to "" when falsy.
preload_fextension = preload_fextension or ""
external_vocab_path = external_vocab_path or ""

In [4]:
# Stop early if `fextension` is not one of the variants handled by the
# class-selection cell that follows.
assert fextension in (
    "", "test",
    "time-pos", "time-pos-test",
    "static-only", "static-only-test",
    "static", "static-test",
)

`pad_seq_first` is set to `True` when building the training dataset and to `False` when building the validation and test datasets.

In [5]:
# Each row pairs the extensions of one variant with the *name* of the dataset
# class to use. The name stays a string because the construction cells resolve
# it with eval().
_variant_table = (
    (('static', 'static-test'), 'IBMWithTimePosAndStaticSplitDataset'),
    (("static-only", "static-only-test"), 'IBMWithStaticSplitDataset'),
    (("time-pos", "time-pos-test"), 'IBMWithTimePosDataset'),
    (("", "test"), 'IBMDataset'),
)

# The first row containing `fextension` wins; '' when no row matches.
dataset_class = next(
    (class_name for extensions, class_name in _variant_table if fextension in extensions),
    '',
)

In [6]:
# Resolve the class selected earlier from its name, then build the training
# dataset from the configured file and options.
train_dataset_cls = eval(dataset_class)
train_dataset_kwargs = dict(
    cls_task=True,
    user_ids=None,
    seq_len=10,
    root=root,
    fname=fname,
    user_level_cached=user_level_cached,
    vocab_cached=vocab_cached,
    external_vocab_path=external_vocab_path,
    preload_vocab_dir=root,
    save_vocab_dir=root,
    preload_fextension=preload_fextension,
    fextension=fextension,
    nrows=None,
    flatten=False,
    stride=5,
    return_labels=True,
    label_category='last_label',
    pad_seq_first=True,  # True for training data only
    get_rids=True,
    long_and_sort=True,
    resample_method=resample_method,
    resample_ratio=resample_ratio,
    resample_seed=resample_seed,
)
dataset = train_dataset_cls(**train_dataset_kwargs)

100%|██████████| 1561/1561 [06:21<00:00,  4.09it/s]


In [19]:
# Each row: (caption to print, dataset attribute to scan, label value to count).
# The attribute is looked up lazily so the lines are printed one at a time.
for _caption, _labels_attr, _wanted in (
    ('negative samples', 'labels', 0),
    ('positive samples', 'labels', 1),
    ('all negative samples', 'all_labels', 0),
    ('all positive samples', 'all_labels', 1),
):
    # An entry is counted when its last element equals the wanted label value.
    print(_caption, sum(1 for label in getattr(dataset, _labels_attr) if label[-1] == _wanted))

negative samples 42430
positive samples 4243
all negative samples 3431194
all positive samples 4243


In [8]:
# Keep an explicitly configured vocab path; otherwise fall back to the path
# recorded on the training dataset.
external_vocab_path = external_vocab_path or dataset.vocab_path
# Datasets constructed after this cell are therefore built with vocab_cached=True.
vocab_cached = True

In [9]:
# The validation dataset is only built when a separate validation file is requested.
if external_val:
    val_dataset_cls = eval(dataset_class)
    val_dataset_kwargs = dict(
        cls_task=True,
        user_ids=None,
        seq_len=10,
        root=root,
        fname=val_fname,
        user_level_cached=user_level_cached,
        vocab_cached=vocab_cached,
        external_vocab_path=external_vocab_path,
        preload_vocab_dir=root,
        save_vocab_dir=root,
        preload_fextension=preload_fextension,
        fextension=fextension,
        nrows=None,
        flatten=False,
        stride=5,
        return_labels=True,
        label_category='last_label',
        pad_seq_first=False,  # False for validation data
        get_rids=True,
        long_and_sort=True,
        resample_method=resample_method,
        resample_ratio=resample_ratio,
        resample_seed=resample_seed,
    )
    val_dataset = val_dataset_cls(**val_dataset_kwargs)

100%|██████████| 1590/1590 [00:49<00:00, 32.33it/s]


In [20]:
if external_val:
    # Each row: (caption to print, dataset attribute to scan, label value to count).
    for _caption, _labels_attr, _wanted in (
        ('negative samples', 'labels', 0),
        ('positive samples', 'labels', 1),
        ('all negative samples', 'all_labels', 0),
        ('all positive samples', 'all_labels', 1),
    ):
        # An entry is counted when its last element equals the wanted label value.
        print(_caption, sum(1 for label in getattr(val_dataset, _labels_attr) if label[-1] == _wanted))

negative samples 7500
positive samples 750
all negative samples 686306
all positive samples 750


In [11]:
# Build the test dataset. Unlike the training call it uses stride=1,
# pad_seq_first=False and no resampling (resample_ratio / resample_seed are
# not passed at all).
test_dataset_cls = eval(dataset_class)
test_dataset_kwargs = dict(
    cls_task=True,
    user_ids=None,
    seq_len=10,
    root=root,
    fname=test_fname,
    user_level_cached=user_level_cached,
    vocab_cached=vocab_cached,
    external_vocab_path=external_vocab_path,
    preload_vocab_dir=root,
    save_vocab_dir=root,
    preload_fextension=preload_fextension,
    fextension=fextension,
    nrows=None,
    flatten=False,
    stride=1,
    return_labels=True,
    label_category='last_label',
    pad_seq_first=False,  # False for test data
    get_rids=True,
    long_and_sort=True,
    resample_method=None,
)
test_dataset = test_dataset_cls(**test_dataset_kwargs)

100%|██████████| 1964/1964 [02:05<00:00, 15.71it/s]


In [21]:
# Each row: (caption to print, dataset attribute to scan, label value to count).
# The attribute is looked up lazily so the lines are printed one at a time.
for _caption, _labels_attr, _wanted in (
    ('negative samples', 'labels', 0),
    ('positive samples', 'labels', 1),
    ('all negative samples', 'all_labels', 0),
    ('all positive samples', 'all_labels', 1),
):
    # An entry is counted when its last element equals the wanted label value.
    print(_caption, sum(1 for label in getattr(test_dataset, _labels_attr) if label[-1] == _wanted))

negative samples 3773643
positive samples 4578
all negative samples 0
all positive samples 0


In [13]:
from os import path
import pickle

# Compare the column list stored in the user-level cache of the test dataset
# with the field keys of its vocabulary.
#
# NOTE(review): this rebinds `fname`, which the config cell uses for the
# training file name; re-running the training-dataset cell after this one would
# pass the cache path instead. The name is kept in case later cells rely on it.
fname = path.join(test_dataset.root, f"preprocessed/{test_dataset.fname}.user{test_dataset.fextension}.pkl")

# pickle can execute arbitrary code while loading: only open cache files that
# were produced by this project. The context manager closes the file handle,
# which the previous bare open() call never did.
with open(fname, "rb") as cache_file:
    cached_data = pickle.load(cache_file)

print(cached_data["columns"])
print(test_dataset.vocab.field_keys)

['Year', 'Month', 'Day', 'Hour', 'Amount', 'Use Chip', 'Merchant Name', 'Merchant City', 'Merchant State', 'Zip', 'MCC', 'Errors?', 'timeFeature', 'avg_dollar_amt', 'std_dollar_amt', 'top_mcc', 'top_chip']
OrderedDict([('Year', None), ('Month', None), ('Day', None), ('Hour', None), ('Amount', None), ('Use Chip', None), ('Merchant Name', None), ('Merchant City', None), ('Merchant State', None), ('Zip', None), ('MCC', None), ('Errors?', None), ('Is Fraud?', None), ('timeFeature', None), ('avg_dollar_amt', None), ('std_dollar_amt', None), ('top_mcc', None), ('top_chip', None), ('SPECIAL', None)])


In [14]:
# Print the vocabulary sizes. The training dataset additionally reports its
# dynamic / time-feature / static vocabularies when it has them.
try:
    print(len(dataset.vocab), len(dataset.dynamic_vocab), len(dataset.time_feature_vocab), len(dataset.static_vocab))
except (AttributeError, TypeError):
    # Fall back to the overall size when a split vocabulary is missing
    # (AttributeError) or has no length, e.g. is None (TypeError). A bare
    # `except:` would also have swallowed KeyboardInterrupt and real bugs.
    print(len(dataset.vocab))
if external_val:
    print(len(val_dataset.vocab))
print(len(test_dataset.vocab))

126486 126427 18 55
126486
126486


In [15]:
# Datasets to summarise, in print order; the validation dataset takes part only
# when it was built. Each entry carries the caption used for its shape line.
_summaries = [('dataset len', dataset)]
if external_val:
    _summaries.append(('val_dataset len', val_dataset))
_summaries.append(('test_dataset len', test_dataset))

# One line with the column counts, one line with the number of samples.
print(*[ds.ncols for _, ds in _summaries])
print(*[len(ds) for _, ds in _summaries])

# For each dataset: the first sample, then the shape of every element in it.
for _shape_caption, _ds in _summaries:
    print(_ds[0])
    print(_shape_caption, [part.shape for part in _ds[0]])

18 18 18
46673 8250 3778221
(tensor([[    21,     39,     63,     89,    110,    115,    441,  87332,  99816,
         100443, 126294, 126400, 126435, 126440, 126454, 126461, 126483,      1],
        [    21,     39,     53,     89,    110,    115,    990,  87332,  99816,
         100443, 126296, 126400, 126435, 126440, 126454, 126461, 126483,      1],
        [    21,     39,     53,     93,    107,    115,    138,  87332,  99816,
         100443, 126300, 126400, 126435, 126440, 126454, 126461, 126483,      1],
        [    21,     39,     66,     89,    110,    115,    137,  87332,  99816,
         100443, 126294, 126400, 126435, 126440, 126454, 126461, 126483,      1],
        [    21,     39,     66,     86,    108,    115,   4192,  87332,  99816,
         100443, 126299, 126400, 126435, 126440, 126454, 126461, 126483,      1],
        [    21,     39,     61,     89,    107,    115,    141,  87332,  99816,
         100443, 126301, 126401, 126435, 126440, 126454, 126461, 126483,   