# Loading data

In [1]:
import numpy as np
import pandas as pd
df = pd.read_parquet("fraud-cleaned-sample.parquet")

##  Train/test split

We're using time-series data, so we'll split based on time.

In [2]:
first = df['timestamp'].min()
last = df['timestamp'].max()
cutoff = first + ((last - first) * 0.7)

In [3]:
train = df[df['timestamp'] <= cutoff]
len(train)

1748016

In [4]:
test = df[df['timestamp'] > cutoff]
len(test)

751984

In [5]:
len(train) / (len(train) + len(test))

0.6992064

# Encoding categorical features

In [6]:
import sklearn
from sklearn.pipeline import Pipeline
from sklearn import feature_extraction, preprocessing
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer

def mk_hasher(features=16384, values=None):
    return Pipeline([('stringize', FunctionTransformer(repr, accept_sparse=True)), 
                     ('hasher', 
                      sklearn.feature_extraction.FeatureHasher(n_features=features, input_type='string'))])


tt_xform = ('onehot', sklearn.preprocessing.OneHotEncoder(handle_unknown='ignore', categories=[['online','contactless','chip_and_pin','manual','swipe']]), ['trans_type'])
mu_xform = ('m_hashing', mk_hasher(2048), 'merchant_id')

xform_steps = [tt_xform, mu_xform]

cat_xform = ColumnTransformer(transformers=xform_steps)


# Encoding other features


In [7]:
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline


impute_and_scale = Pipeline([('median_imputer', SimpleImputer()), ('interarrival_scaler', RobustScaler())])
ia_scaler = ('interarrival_scaler', impute_and_scale, ['interarrival'])
amount_scaler = ('amount_scaler', RobustScaler(), ['amount'])

scale_steps = [ia_scaler, amount_scaler]
all_xforms = ColumnTransformer(transformers=(scale_steps + xform_steps))

# Keep this next bit



In [8]:

feat_pipeline = Pipeline([
    ('feature_extraction',all_xforms)
])

from mlworkflows import util
util.serialize_to(feat_pipeline, "feature_pipeline.sav")

In [9]:
feat_pipeline

Pipeline(memory=None,
         steps=[('feature_extraction',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('interarrival_scaler',
                                                  Pipeline(memory=None,
                                                           steps=[('median_imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='mean',
                                  