In [1]:
import numpy as np
import math
import pandas as pd
import sklearn as sk
import os
import matplotlib
import matplotlib.pyplot as plt
import warnings 
warnings.filterwarnings('ignore')
import seaborn as sns; sns.set_theme()

**Identity Table**

- **DeviceType**
- **DeviceInfo**
- **id_01 - id_38**
- Variables in this table are identity information – network connection information (IP, ISP, Proxy, etc) and digital signature (UA/browser/os/version, etc) associated with transactions.
- They're collected by Vesta’s fraud protection system and digital security partners.
- The field names are masked and pairwise dictionary will not be provided for privacy protection and contract agreement

In [2]:
#Data path
train_transaction_path = "data/train_transaction.csv"
train_identity_path = "data/train_identity.csv"
test_transaction_path = "data/test_transaction.csv"
test_identity_path = "data/test_identity.csv"

train_identity = pd.read_csv(train_identity_path)
test_identity = pd.read_csv(test_identity_path)

In [4]:
train_identity.head()

Unnamed: 0,TransactionID,id_01,id_02,id_03,id_04,id_05,id_06,id_07,id_08,id_09,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987004,0.0,70787.0,,,,,,,,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
1,2987008,-5.0,98945.0,,,0.0,-5.0,,,,...,mobile safari 11.0,32.0,1334x750,match_status:1,T,F,F,T,mobile,iOS Device
2,2987010,-5.0,191631.0,0.0,0.0,0.0,0.0,,,0.0,...,chrome 62.0,,,,F,F,T,T,desktop,Windows
3,2987011,-5.0,221832.0,,,0.0,-6.0,,,,...,chrome 62.0,,,,F,F,T,T,desktop,
4,2987016,0.0,7460.0,0.0,0.0,1.0,0.0,,,0.0,...,chrome 62.0,24.0,1280x800,match_status:2,T,F,T,T,desktop,MacOS


In [None]:
['TransactionDayOfWeek', 'TransactionHour', 'ProductCD', 'card4', 'card6', 'addr1_new', 'addr2_new', 'P_emaildomain_new', 'R_emaildomain_new',
'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9']


In [None]:
['V1-V339', 'C1-C14', 'dis1', 'dist2', 'TransactionAmt_log']

In [5]:
# following columns are the easily one hot encoded categorical
one_hot_lst = ['DeviceType', 'id_12', 'id_15', 'id_16', 'id_23', 'id_27', 'id_28', 'id_29', 'id_32', 'id_34', 
               'id_35', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38']

# following columns are the numerical features


# following are the not sure, but treated as numeric directly this time


In [8]:
from sklearn.preprocessing import OneHotEncoder

In [9]:
# try to apply ont hot encoding on the simple categorical features
to_ohe = train_identity[one_hot_lst]
ohe = OneHotEncoder()
ohe.fit(to_ohe)

OneHotEncoder()

In [11]:
# check for the values in categories
ohe.categories_

[array(['desktop', 'mobile', nan], dtype=object),
 array(['Found', 'NotFound'], dtype=object),
 array(['Found', 'New', 'Unknown', nan], dtype=object),
 array(['Found', 'NotFound', nan], dtype=object),
 array(['IP_PROXY:ANONYMOUS', 'IP_PROXY:HIDDEN', 'IP_PROXY:TRANSPARENT',
        nan], dtype=object),
 array(['Found', 'NotFound', nan], dtype=object),
 array(['Found', 'New', nan], dtype=object),
 array(['Found', 'NotFound', nan], dtype=object),
 array([ 0., 16., 24., 32., nan]),
 array(['match_status:-1', 'match_status:0', 'match_status:1',
        'match_status:2', nan], dtype=object),
 array(['F', 'T', nan], dtype=object),
 array(['match_status:-1', 'match_status:0', 'match_status:1',
        'match_status:2', nan], dtype=object),
 array(['F', 'T', nan], dtype=object),
 array(['F', 'T', nan], dtype=object),
 array(['F', 'T', nan], dtype=object),
 array(['F', 'T', nan], dtype=object)]

In [12]:
# give a feature object
ohe_features = ohe.transform(to_ohe)
ohe_features

<144233x55 sparse matrix of type '<class 'numpy.float64'>'
	with 2307728 stored elements in Compressed Sparse Row format>

In [13]:
# gives the feature in a array
ohe_features.toarray()

array([[0., 1., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 1., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 1., 0., 0.]])

In [15]:
# gives the names of each feature
ohe.get_feature_names()

array(['x0_desktop', 'x0_mobile', 'x0_nan', 'x1_Found', 'x1_NotFound',
       'x2_Found', 'x2_New', 'x2_Unknown', 'x2_nan', 'x3_Found',
       'x3_NotFound', 'x3_nan', 'x4_IP_PROXY:ANONYMOUS',
       'x4_IP_PROXY:HIDDEN', 'x4_IP_PROXY:TRANSPARENT', 'x4_nan',
       'x5_Found', 'x5_NotFound', 'x5_nan', 'x6_Found', 'x6_New',
       'x6_nan', 'x7_Found', 'x7_NotFound', 'x7_nan', 'x8_0.0', 'x8_16.0',
       'x8_24.0', 'x8_32.0', 'x8_nan', 'x9_match_status:-1',
       'x9_match_status:0', 'x9_match_status:1', 'x9_match_status:2',
       'x9_nan', 'x10_F', 'x10_T', 'x10_nan', 'x11_match_status:-1',
       'x11_match_status:0', 'x11_match_status:1', 'x11_match_status:2',
       'x11_nan', 'x12_F', 'x12_T', 'x12_nan', 'x13_F', 'x13_T',
       'x13_nan', 'x14_F', 'x14_T', 'x14_nan', 'x15_F', 'x15_T',
       'x15_nan'], dtype=object)