In [1]:
# Install RAPIDS
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!bash rapidsai-csp-utils/colab/rapids-colab.sh stable

import sys, os

dist_package_index = sys.path.index('/usr/local/lib/python3.7/dist-packages')
sys.path = sys.path[:dist_package_index] + ['/usr/local/lib/python3.7/site-packages'] + sys.path[dist_package_index:]
sys.path
exec(open('rapidsai-csp-utils/colab/update_modules.py').read(), globals())

Cloning into 'rapidsai-csp-utils'...
remote: Enumerating objects: 34, done.[K
remote: Counting objects: 100% (34/34), done.[K
remote: Compressing objects: 100% (34/34), done.[K
remote: Total 205 (delta 16), reused 3 (delta 0), pack-reused 171[K
Receiving objects: 100% (205/205), 62.72 KiB | 10.45 MiB/s, done.
Resolving deltas: 100% (78/78), done.
PLEASE READ
********************************************************************************************************
Changes:
1. IMPORTANT SCRIPT CHANGES: Colab has updated to Python 3.7, and now runs our STABLE and NIGHTLY versions (0.18 and 0.19)!  PLEASE update your older install script code as follows:
	!bash rapidsai-csp-utils/colab/rapids-colab.sh 0.18

	import sys, os

	dist_package_index = sys.path.index('/usr/local/lib/python3.7/dist-packages')
	sys.path = sys.path[:dist_package_index] + ['/usr/local/lib/python3.7/site-packages'] + sys.path[dist_package_index:]
	sys.path
	exec(open('rapidsai-csp-utils/colab/update_modules.py').rea

## Feature engineering with RAPIDS

In this notebook we explore our data and develop a feature engineering pipeline, on GPUs. 

### Loading data


In [2]:
import cudf

In [4]:
# Load the cleaned fraud sample from Parquet into a cuDF DataFrame.
parquet_path = "fraud-cleaned-sample.parquet"
df = cudf.read_parquet(parquet_path)

### Training/testing set split

In [5]:
# Chronological split point: the timestamp 70% of the way from the earliest
# to the latest observation.
timestamps = df['timestamp']
first, last = timestamps.min(), timestamps.max()
span = last - first
cutoff = first + (span * 0.7)

In [6]:
# Training set: every row whose timestamp is at or before the cutoff.
is_train = df['timestamp'] <= cutoff
train = df[is_train]
len(train)

1748016

In [7]:
# Test set: every row whose timestamp is strictly after the cutoff.
is_test = df['timestamp'] > cutoff
test = df[is_test]
len(test)

751984

In [8]:
# Show the column names of the loaded data.
df.columns

Index(['timestamp', 'label', 'user_id', 'amount', 'merchant_id', 'trans_type',
       'foreign', 'interarrival'],
      dtype='object')

## Encoding categorical features

We'll start by encoding the categorical features. Within our data set, the categorical features are: 

- merchant id
- transaction type

We will implement one-hot encoding for transaction types and target encoding for merchant IDs.

In [9]:
## One-Hot Encoding

# Distinct transaction types that occur in the training set.
trans_type_col = train['trans_type']
transaction_types = trans_type_col.unique()
transaction_types

0    chip_and_pin
1     contactless
2          manual
3          online
4           swipe
Name: trans_type, dtype: object

In [10]:
# Count the training rows for each transaction type.
train['trans_type'].groupby(['trans_type']).agg('count')

trans_type
online          616796
chip_and_pin    345144
swipe           258937
manual           96580
contactless     430559
Name: trans_type, dtype: int32

In [11]:
import cuml
from cuml.preprocessing import OneHotEncoder

In [12]:
# One-hot encoder that produces dense (non-sparse) output.
ohe = OneHotEncoder(sparse=False, output_type='input')

In [13]:
# Fit the one-hot encoder on the training transaction types and encode them.
trans_type_train = train[['trans_type']]
encode_type = ohe.fit_transform(trans_type_train)

In [14]:
# Wrap the encoded values in a DataFrame with one named column per transaction
# type, reusing the training set's index.
type_columns = ['chip_and_pin', 'contactless', 'manual', 'online', 'swipe']
encode_df = cudf.DataFrame(encode_type, columns=type_columns, index=train.index)

In [15]:
# Sanity check: sum each one-hot column to get the number of rows per transaction type.
count_columns = ['chip_and_pin', 'contactless', 'manual', 'online', 'swipe']
encode_df[count_columns].sum()


chip_and_pin    345144.0
contactless     430559.0
manual           96580.0
online          616796.0
swipe           258937.0
dtype: float64

We are going to use target encoding to encode the merchant IDs into our feature vectors. We replace each merchant ID with the proportion of that merchant's transactions which are legitimate.

cuML's target encoder first requires us to parse the label strings as ordinal labels:

In [16]:
from cuml.preprocessing.LabelEncoder import LabelEncoder

In [25]:
# The two label strings, held in a cuDF Series.
label_names = ['legitimate', 'fraud']
lab_ser = cudf.Series(label_names)

In [26]:
# Encode the string labels of the training set as integer codes.
# NOTE(review): the displayed codes are almost all 1, and the target encoding that
# follows is described as the proportion of *legitimate* transactions, so
# 'legitimate' appears to map to 1 -- i.e. the order of lab_ser does not seem to
# decide the codes. Confirm what the second argument to fit_transform does.
le=LabelEncoder(output_type='cudf')

encoded = le.fit_transform(train.label, lab_ser)

In [27]:
# Display the integer-encoded training labels.
encoded

44905449    1
17805936    1
8819718     1
5773423     1
19308700    1
           ..
41838350    1
280831      1
23291970    1
18344979    1
23645914    1
Length: 1748016, dtype: uint8

Now we can train a target encoder to replace the merchant ID with the proportion of transactions carried out with that merchant which were legitimate. 

In [28]:
import cuml
from cuml.preprocessing.TargetEncoder import TargetEncoder

In [29]:
# Target encoder configured with 4 folds.
n_target_folds = 4
tar_encoder = TargetEncoder(n_folds=n_target_folds)

In [30]:
# Fit the target encoder on the training merchant IDs against the encoded labels.
merchant_ids_train = train.merchant_id
prop_enc = tar_encoder.fit_transform(merchant_ids_train, encoded)

In [31]:
# Display the target-encoded merchant values for the training set.
prop_enc

array([0.99678198, 0.99784637, 0.99852071, ..., 1.        , 0.99924981,
       0.99848714])

## Encoding Numeric Features

In [32]:
import cuml

from cuml.experimental.preprocessing import SimpleImputer

# Imputer using the "median" strategy.
impute_strategy = "median"
med_imputer = SimpleImputer(strategy=impute_strategy)


In [33]:
## median interarrival time
# Fit the median imputer on the training interarrival column and apply it.
interarrival_train = train[['interarrival']]
imputed = med_imputer.fit_transform(interarrival_train)

In [34]:
from cuml.experimental.preprocessing import RobustScaler

# Fit a RobustScaler on the imputed training interarrival values and scale them.
# robust_scale is kept so the same fitted scaler can be applied to the full data later.
robust_scale = RobustScaler()
robust_arrival=robust_scale.fit_transform(imputed)

In [35]:
# Separate RobustScaler for the transaction amount, fitted on the training set.
amount_train = train[['amount']]
amount_scaler = RobustScaler()
sc_amount = amount_scaler.fit_transform(amount_train)


In [None]:
### Transforming whole df into feature vecs: 

In [40]:
# Apply the already-fitted one-hot encoder to the full data set.
trans_type_all = df[['trans_type']]
encode_type_df = ohe.transform(trans_type_all) ##array

In [41]:
# Apply the already-fitted target encoder to every merchant ID in the full data set.
merchant_ids_all = df.merchant_id
prop_enc_df = tar_encoder.transform(merchant_ids_all) ##array

In [42]:
# Apply the imputer fitted on the training set to the interarrival column of the
# full data set.
imputed_df = med_imputer.transform(df[['interarrival']]) #cudf
# Scale the full-data imputation (imputed_df). `imputed` holds only the training
# rows, so scaling it here would give a result that does not correspond to df.
robust_arrival_df = robust_scale.transform(imputed_df) #cudf

In [45]:
# Check which container type the imputer returned.
type(imputed_df)

cudf.core.dataframe.DataFrame

In [43]:
# Apply the already-fitted amount scaler to the full data set.
amount_all = df[['amount']]
sc_amount_df = amount_scaler.transform(amount_all) #cudf


In [None]:
# Wrap the one-hot encoded transaction types in a cuDF DataFrame for display.
cudf.DataFrame(encode_type_df)

Unnamed: 0,0,1,2,3,4
0,1.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...
2499995,1.0,0.0,0.0,0.0,0.0
2499996,0.0,0.0,0.0,0.0,1.0
2499997,1.0,0.0,0.0,0.0,0.0
2499998,0.0,0.0,0.0,1.0,0.0


In [None]:
# Wrap the target-encoded merchant values in a cuDF DataFrame for display.
cudf.DataFrame(prop_enc_df)

Unnamed: 0,0
0,0.997599
1,0.998369
2,0.998899
3,0.999423
4,0.999447
...,...
2499995,1.000000
2499996,1.000000
2499997,0.999434
2499998,0.998876


In [None]:
# Display the scaled interarrival values.
robust_arrival_df

Unnamed: 0,0
0,-0.212208
1,-0.227280
2,0.056820
3,-0.190957
4,0.251394
...,...
2499995,6.338809
2499996,-0.210550
2499997,5.537754
2499998,-0.020950


In [None]:
# Display the scaled transaction amounts.
sc_amount_df

Unnamed: 0,0
0,0.309371
1,0.512837
2,-0.720154
3,-0.887677
4,0.146341
...,...
2499995,-0.962131
2499996,1.417202
2499997,-0.830552
2499998,-0.752888


In [None]:
## Want to join these data frames into 1 df, then change the indexes to those used in the initial table. 

## Combining the results

Here we combine all of the results above into a single answer.

We want to use pipelines, but we are lacking the `ColumnTransformer` functionality that is needed to build a really sleek pipeline, as we would with scikit-learn. We can get around this by saving the pipeline steps individually.

TODO: There must be a nicer way. 

In [36]:
import sklearn 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [70]:
# ColumnTransformer entry that one-hot encodes the 'trans_type' column.
### TO FIX - pass in transaction types.
onehot_step = OneHotEncoder(sparse=False)
tt_xform = ('onehot', onehot_step, ['trans_type'])

In [71]:
# Build a ColumnTransformer whose only step is the one-hot transaction-type entry.
xform_steps = [tt_xform]
cat_xform = ColumnTransformer(transformers = xform_steps)

In [72]:
# Fit the categorical ColumnTransformer on the training set and show the encoded output.
cat_xform.fit_transform(train)

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       ...,
       [1., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0.]])

In [None]:
 ## works for one hot encoding. Now try for merchant ids:

In [75]:

## this is not going to work, but captures the flavour of what I expect we need here
merchant_steps = [
    ('label_encoder', LabelEncoder()),
    ('target_encoder', TargetEncoder()),
]
encode_and_target = Pipeline(merchant_steps)

lab_ser = cudf.Series(['legitimate', 'fraud']) ## need to pass these in

# ColumnTransformer entry: (name, transformer, columns it is applied to).
et_scaler = ('merchant_target', encode_and_target, ['label', 'merchant_id'])


In [76]:
# Step list for the merchant ColumnTransformer: just the label/target-encoding entry.
mer_steps = [et_scaler]

In [77]:
# ColumnTransformer wrapping the merchant encoding steps.
mer_xform = ColumnTransformer(transformers=mer_steps)

In [78]:
# Try the merchant ColumnTransformer on the training set. The recorded run of this
# cell raised AttributeError, so the error is caught and reported here; that way a
# Restart & Run All can continue past this experiment instead of stopping.
try:
    merchant_features = mer_xform.fit_transform(train)
except AttributeError as err:
    merchant_features = None
    print(f"AttributeError: {err}")
# Shown as the cell output when the call succeeds; nothing is displayed for None.
merchant_features

AttributeError: ignored

In [79]:
### numeric values: Amount and interarrival time. Amount first:

# ColumnTransformer entry that applies a RobustScaler to the 'amount' column.
amount_step = RobustScaler()
am_scaler = ('amount_scaler', amount_step, ['amount'])
scale_steps = [am_scaler]

am_xform = ColumnTransformer(n_jobs=None, transformers=scale_steps)

In [80]:
# Try the amount ColumnTransformer on the training set. The recorded run of this
# cell raised TypeError, so the error is caught and reported here; that way a
# Restart & Run All can continue past this experiment instead of stopping.
try:
    amount_features = am_xform.fit_transform(train)
except TypeError as err:
    amount_features = None
    print(f"TypeError: {err}")
# Shown as the cell output when the call succeeds; nothing is displayed for None.
amount_features

TypeError: ignored

In [81]:

# Chain median imputation and robust scaling into one pipeline, then describe it
# as a ColumnTransformer entry applied to the 'interarrival' column.
interarrival_steps = [
    ('median_imputer', SimpleImputer(strategy="median")),
    ('interarrival_scaler', RobustScaler()),
]
impute_and_scale = Pipeline(interarrival_steps)
ia_scaler = ('interarrival_scaler', impute_and_scale, ['interarrival'])

In [82]:
# ColumnTransformer wrapping the interarrival impute-and-scale pipeline.
ia_xform = ColumnTransformer(transformers=[ia_scaler])

In [84]:
# Try the interarrival ColumnTransformer on the training set. The recorded run of
# this cell raised TypeError, so the error is caught and reported here; that way a
# Restart & Run All can finish instead of stopping on this experiment.
try:
    interarrival_features = ia_xform.fit_transform(train)
except TypeError as err:
    interarrival_features = None
    print(f"TypeError: {err}")
# Shown as the cell output when the call succeeds; nothing is displayed for None.
interarrival_features

TypeError: ignored