# Loading data

In [None]:
import numpy as np
import pandas as pd
# Read the fraud dataset from Parquet into a DataFrame.
df = pd.read_parquet("fraud.parquet")

## Cleaning data

These data are mostly clean but we need to add a new field for transaction interarrival time.  Unlike the rest of the work in this notebook, we'll do this for *all* our data (i.e., we'll do this before holding out a test set).

In [None]:
# Order each user's transactions chronologically so that the row above any
# transaction is that user's previous transaction (or another user's last one).
# drop=True discards the old index outright; materialising it as a column and
# deleting 'index' afterwards fails when the stored index has a name and would
# delete a genuine data column if one happened to be called 'index'.
df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)

In [None]:
# Look one row back. This relies on df already being sorted by
# (user_id, timestamp), so the previous row is the same user's previous
# transaction whenever the user IDs match.
prev_user_id = df['user_id'].shift(1)
prev_timestamp = df['timestamp'].shift(1)

# The time since the previous transaction is only meaningful within one user;
# the first transaction of every user gets a missing value instead.
# (np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.)
same_user = df['user_id'] == prev_user_id
df['interarrival'] = (df['timestamp'] - prev_timestamp).where(same_user, np.nan)

We'll also convert user and merchant IDs to strings so that we can hash them later:

In [None]:
# Turn each ID column into prefixed strings with pandas' string dtype,
# e.g. a user ID x becomes "user_x" and a merchant ID y becomes "merchant_y".
for column, prefix in (("user_id", "user_"), ("merchant_id", "merchant_")):
    as_text = df[column].astype(str).astype(pd.StringDtype())
    df[column] = prefix + as_text

In [None]:
# Write the cleaned frame (sorted, with interarrival and string IDs) to Parquet.
df.to_parquet("fraud-cleaned.parquet")

In [None]:
# Write a 5% random sample of the cleaned data as a smaller file.
# A fixed random_state makes the sample (and therefore the file) identical on
# every Restart & Run All instead of changing from run to run.
df.sample(frac=0.05, random_state=42).to_parquet("fraud-cleaned-small.parquet")

## Train/test split

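We're using time-series data, so we'll split based on time: transactions in the first 70% of the observed time span go to the training set, and later ones are held out for testing.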

In [None]:
# The split point sits 70% of the way from the earliest to the latest timestamp.
timestamps = df['timestamp']
first = timestamps.min()
last = timestamps.max()
elapsed = last - first
cutoff = first + (elapsed * 0.7)

In [None]:
# Training rows: everything at or before the cutoff. Show how many there are.
is_train = df['timestamp'] <= cutoff
train = df[is_train]
len(train)

In [None]:
# Test rows: everything strictly after the cutoff. Show how many there are.
is_test = df['timestamp'] > cutoff
test = df[is_test]
len(test)

In [None]:
# Fraction of the split rows that landed in the training set. The cutoff is
# 70% of the time range, so this row fraction need not be exactly 0.7.
len(train) / (len(train) + len(test))

# Encoding categorical features

In [None]:
import sklearn
from sklearn.pipeline import Pipeline
from sklearn import feature_extraction, preprocessing
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer

def as_token_lists(column):
    """Wrap every value of a 1-D column in its own single-element list.

    FeatureHasher with input_type='string' treats each sample as an iterable
    of string tokens, so an ID must be passed as ["merchant_42"], not as the
    bare string "merchant_42".
    """
    return [[str(value)] for value in column]


def mk_hasher(features=16384, values=None):
    """Build a transformer that hashes a single string column.

    Parameters
    ----------
    features : int
        Width of the hashed output (passed to FeatureHasher as n_features).
    values : unused
        Accepted for backward compatibility; it has no effect.

    Returns
    -------
    sklearn.pipeline.Pipeline
        A two-step transformer: wrap each value as a one-token list, then
        apply FeatureHasher(input_type='string').

    Notes
    -----
    Feeding the column of bare strings straight into FeatureHasher makes it
    iterate over each string, hashing individual characters rather than whole
    IDs; recent scikit-learn releases reject bare-string samples with an error.

    NOTE(review): as_token_lists is defined in this notebook, so a serialized
    pipeline can only be loaded where that function is available, unless the
    serializer pickles functions by value - confirm how the pipeline is saved.
    """
    return Pipeline([
        ('as_tokens', FunctionTransformer(as_token_lists)),
        ('hasher', sklearn.feature_extraction.FeatureHasher(n_features=features, input_type='string')),
    ])

# One-hot encoder over the fixed list of transaction types; values outside the
# list are ignored rather than raising (handle_unknown='ignore').
trans_type_encoder = sklearn.preprocessing.OneHotEncoder(
    handle_unknown='ignore',
    categories=[['online', 'contactless', 'chip_and_pin', 'manual', 'swipe']],
)

# Each entry is a (name, transformer, columns) triple for ColumnTransformer.
tt_xform = ('onehot', trans_type_encoder, ['trans_type'])
mu_xform = ('m_hashing', mk_hasher(2048), 'merchant_id')
u_xform = ('u_hashing', mk_hasher(2048), 'user_id')

# Only the transaction type and the merchant hash are used here; u_xform is
# defined above but is not part of the step list.
xform_steps = [tt_xform, mu_xform]
cat_xform = ColumnTransformer(transformers=xform_steps)


In [None]:
# Display the categorical ColumnTransformer.
cat_xform

In [None]:
# Work on a small random subset of the training rows while experimenting.
# The sample size is capped at len(train) so a short training set does not
# raise, and random_state fixes the subset so reruns give the same rows.
smol_train = train.sample(n=min(16384, len(train)), random_state=42).copy()


In [None]:
# fit_transform returns one 2-D matrix for the whole sample, which cannot be
# assigned to a single column as-is. Iterating the matrix yields one row
# vector per sample, so each DataFrame row gets its own encoded features.
# NOTE(review): assumes per-row vectors are what this column is meant to hold.
smol_train["features1"] = list(cat_xform.fit_transform(smol_train))

In [None]:
# Display the sampled training frame, including the new features1 column.
smol_train

In [None]:
# Draw a fresh, larger subset of the training rows. The size is capped at
# len(train) so it cannot raise, and random_state makes it reproducible.
smol_train = train.sample(n=min(65536, len(train)), random_state=42).copy()

fh = sklearn.feature_extraction.FeatureHasher(n_features=1024, input_type='string')

# With input_type='string' each sample must be an iterable of tokens, so wrap
# every merchant ID in a one-element list; a bare string would be hashed
# character by character (or rejected by recent scikit-learn releases).
merchant_tokens = [[merchant] for merchant in smol_train["merchant_id"]]

# Store one hashed row vector per DataFrame row rather than trying to assign
# the whole 2-D matrix to a single column.
smol_train["mvecs"] = list(fh.fit_transform(merchant_tokens))

In [None]:
# Display the hashed merchant vectors column.
smol_train.mvecs




In [None]:
# Display the dtype of every column in the sampled frame.
smol_train.dtypes

In [None]:
# Fit the categorical transformer on the sample and keep the encoded matrix.
svecs = cat_xform.fit_transform(smol_train)

# Encoding other features

### FIXME:  

- `RobustScaler` for amounts
- something circular with timestamps for each user
- oversampling (for training)


In [None]:
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline


# Interarrival time is missing for each user's first transaction, so fill the
# gaps before scaling. The step is named 'median_imputer', but SimpleImputer()
# defaults to the mean; ask for the median explicitly so the behaviour matches
# the name.
impute_and_scale = Pipeline([
    ('median_imputer', SimpleImputer(strategy='median')),
    ('interarrival_scaler', RobustScaler()),
])

# (name, transformer, columns) triples for the numeric columns.
ia_scaler = ('interarrival_scaler', impute_and_scale, ['interarrival'])
amount_scaler = ('amount_scaler', RobustScaler(), ['amount'])

# The full feature transformer: numeric scalers followed by the categorical
# encoders defined earlier in xform_steps.
scale_steps = [ia_scaler, amount_scaler]
all_xforms = ColumnTransformer(transformers=(scale_steps + xform_steps))

# Keep this next bit: building and saving the feature pipeline



In [None]:

# Wrap the combined column transformer in a one-step pipeline.
pipeline_steps = [('feature_extraction', all_xforms)]
feat_pipeline = Pipeline(steps=pipeline_steps)

# Save the (unfitted) pipeline object to disk via the project helper.
from mlworkflows import util
util.serialize_to(feat_pipeline, "feature_pipeline.sav")