# Loading data

In [2]:
import numpy as np
import pandas as pd
# Load the cleaned transaction sample from its Parquet file into the working frame.
df = pd.read_parquet(path="fraud-cleaned-sample.parquet")

##  Train/test split

We're using time-series data, so we'll split based on time.

In [3]:
# Place the train/test boundary 70% of the way through the observed time range.
timestamps = df['timestamp']
first, last = timestamps.min(), timestamps.max()
cutoff = first + (last - first) * 0.7

In [4]:
# Training split: every row stamped at or before the cutoff.
is_train = df['timestamp'] <= cutoff
train = df[is_train]
len(train)

1748016

In [5]:
# Test split: every row stamped strictly after the cutoff.
is_test = df['timestamp'] > cutoff
test = df[is_test]
len(test)

751984

In [6]:
# Fraction of all rows that ended up in the training split.
n_train, n_test = len(train), len(test)
n_train / (n_train + n_test)

0.6992064

# Encoding categorical features

In [7]:
import sklearn
from sklearn.pipeline import Pipeline
from sklearn import feature_extraction, preprocessing
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer

def _format_as_text(x):
    """Render a single value with ``%s`` string formatting."""
    return "%s" % x

# Element-wise version of the formatter above: one input, one (object) output.
stringize = np.frompyfunc(_format_as_text, 1, 1)

def mk_stringize(colname):
    """Build a function that wraps each element of an iterable in a one-key dict.

    The returned function maps an iterable ``tab`` to a list containing
    ``{colname: element}`` for every element, in order. Despite the name,
    the elements themselves are not converted to strings.
    """
    def stringize(tab):
        records = []
        for value in tab:
            records.append({colname: value})
        return records
    return stringize

def amap(s):
    """Turn each element of ``s`` into a one-token sample for string feature hashing.

    ``FeatureHasher(input_type='string')`` expects every sample to be an
    *iterable of string tokens*. Returning bare strings (as ``s.map(str)``
    did) makes the hasher iterate over each string's characters, so an ID is
    hashed letter by letter; recent scikit-learn releases reject bare-string
    samples outright. Wrapping each value in a single-element list makes the
    whole ID one token.

    Parameters
    ----------
    s : pandas.Series
        Column of raw values (e.g. merchant IDs).

    Returns
    -------
    pandas.Series
        Same index as ``s``; each element is ``[str(value)]``.
    """
    return s.map(lambda value: [str(value)])

# Conversion function applied to a column before it reaches the hasher in
# mk_hasher. Alternative kept for reference: mk_stringize('merchant_id') would
# emit {'merchant_id': value} dicts instead
# (NOTE(review): presumably that needs a hasher with input_type='dict' -- confirm).
# my_func = mk_stringize('merchant_id')
my_func = amap

def mk_hasher(features=16384, values=None):
    """Build a two-step pipeline that converts a column and feature-hashes it.

    Step ``'stringize'`` applies the module-level ``my_func`` through a
    ``FunctionTransformer``; step ``'hasher'`` is a ``FeatureHasher`` with
    ``input_type='string'``.

    Parameters
    ----------
    features : int
        Number of hash buckets, passed to the hasher as ``n_features``.
    values : unused
        Accepted but never read by this function.

    Returns
    -------
    sklearn.pipeline.Pipeline
    """
    to_strings = FunctionTransformer(my_func, accept_sparse=True)
    hasher = sklearn.feature_extraction.FeatureHasher(n_features=features,
                                                      input_type='string')
    return Pipeline([('stringize', to_strings), ('hasher', hasher)])


# Number of hash buckets (output columns) used for the hashed merchant IDs.
HASH_BUCKETS = 256

# One-hot encode the transaction type over a fixed category list; values
# outside the list are ignored rather than raising.
trans_type_categories = [['online', 'contactless', 'chip_and_pin', 'manual', 'swipe']]
trans_type_encoder = sklearn.preprocessing.OneHotEncoder(handle_unknown='ignore',
                                                         categories=trans_type_categories)
tt_xform = ('onehot', trans_type_encoder, ['trans_type'])

# Feature-hash the merchant ID column into HASH_BUCKETS columns.
merchant_hasher = mk_hasher(HASH_BUCKETS)
mu_xform = ('m_hashing', merchant_hasher, 'merchant_id')

# Order matters to later cells: the merchant hash columns come last.
xform_steps = [tt_xform, mu_xform]

cat_xform = ColumnTransformer(transformers=xform_steps, n_jobs=None)


# Visualizing categorical features

The general approach we'll use is to [_reduce the dimensionality_](https://en.wikipedia.org/wiki/Dimensionality_reduction) of our encoded categorical features so we can plot them as points on a plane.  This means going from hundreds of dimensions (in the case of hashed merchant IDs) or five dimensions (in the case of one-hot encoded transaction types) to two dimensions.

We'll use two different techniques:  a linear technique called [principal component analysis](https://en.wikipedia.org/wiki/Principal_component_analysis) and a nonlinear technique called [t-distributed stochastic neighbor embedding](https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding).  The details of these techniques are out of scope for this workshop, but they're both good places to start if you want to visualize some high-dimensional data.  Dimensionality reduction can be expensive, so we'll start by sampling only a small amount of our data.  

In [8]:
# Fixed seed so that re-running the notebook draws the same rows and the
# plots below are reproducible.
VIS_SAMPLE_SEED = 42

# Balanced visualization sample: 2500 rows from each class of the training split.
vis_sample = pd.concat([
    train[train["label"] == label].sample(2500, random_state=VIS_SAMPLE_SEED)
    for label in ["legitimate", "fraud"]
])

# Fit the categorical encoders on the sample and encode it in one step.
categorical_matrix = cat_xform.fit_transform(vis_sample)

In [9]:
# Keep the encoded matrix's dimensions: row count first, column count second.
matrix_shape = categorical_matrix.shape
crows, ccols = matrix_shape

## Does the merchant ID correlate with fraud?

In [39]:
import sklearn.decomposition

# Number of dimensions to project down to for plotting.
DIMENSIONS = 2

# The merchant hasher is the last transformer in xform_steps, so its output is
# taken from the trailing HASH_BUCKETS columns of the encoded matrix.
merchants = categorical_matrix[:, -HASH_BUCKETS:]

# SparsePCA is fitted on a dense copy of the merchant columns.
mpca2 = sklearn.decomposition.SparsePCA(n_components=DIMENSIONS)
merchants_dense = merchants.toarray()
mpca2_a = mpca2.fit_transform(merchants_dense)

In [40]:
# Pair each sampled row's label with its two projected coordinates.
# `np.object` was removed from NumPy (1.24); the builtin `object` is the
# equivalent dtype. reset_index(drop=True) discards the original row index
# directly instead of materializing an "index" column and deleting it afterwards.
merchants_df = pd.DataFrame({"label": vis_sample["label"].astype(object),
                             "x": mpca2_a.T[0],
                             "y": mpca2_a.T[1]}).reset_index(drop=True).dropna()

In [45]:
import altair as alt

# Scatter the projected merchant features, coloured by label; the low opacity
# keeps overlapping points distinguishable.
merchant_points = alt.Chart(merchants_df).mark_point(opacity=0.1)
merchant_points.encode(x="x:Q", y="y:Q", color="label")


### What if we look at more than two principal components?

In [55]:
# Project the hashed merchant features onto four SVD components; transpose so
# that each row of mpca4_a holds one component's values across all samples.
mpca4 = sklearn.decomposition.TruncatedSVD(4)
mpca4_a = mpca4.fit_transform(merchants).T

# Column names "0".."3", shared by the frame and the chart's row/column repeat.
component_names = ["%d" % i for i in range(4)]

# `np.object` was removed from NumPy (1.24); use the builtin `object` dtype.
# reset_index(drop=True) keeps the stray "index" column out of the chart data,
# matching how the other plotting frames in this notebook are built.
merchant4_df = pd.DataFrame({
    "label": vis_sample["label"].astype(object),
    **dict(zip(component_names, mpca4_a)),
}).reset_index(drop=True).dropna()

# Scatter-plot matrix: every component against every other, coloured by label.
alt.Chart(merchant4_df).mark_point(opacity=0.1).encode(
    alt.X(alt.repeat("column"), type='quantitative'),
    alt.Y(alt.repeat("row"), type='quantitative'),
    color="label"
).properties(
    height=150,
    width=150
).repeat(
    row=component_names,
    column=component_names
)



In [29]:
import sklearn.manifold

# Both TruncatedSVD and t-SNE are randomized; fix the seed so the embedding
# (and therefore the plot) is the same on every run.
TSNE_SEED = 42

# use SVD to reduce the dimensionality before fitting t-SNE
svd = sklearn.decomposition.TruncatedSVD(16, random_state=TSNE_SEED)
svd_a = svd.fit_transform(merchants)

tsne = sklearn.manifold.TSNE(random_state=TSNE_SEED)
tsne_a = tsne.fit_transform(svd_a)

# Build a separate frame for the t-SNE coordinates rather than overwriting the
# x/y columns of merchants_df, so the PCA plot's data stays intact and this
# cell can be re-run on its own.
merchants_tsne_df = pd.DataFrame({"label": vis_sample["label"].astype(object),
                                  "x": tsne_a.T[0],
                                  "y": tsne_a.T[1]}).reset_index(drop=True).dropna()

alt.Chart(merchants_tsne_df).mark_point().encode(x="x:Q", y="y:Q", color="label")



## Does the combination of the transaction type and the merchant ID correlate with fraud?

In [30]:
# Project the full encoded matrix (transaction type + merchant hash columns)
# down to DIMENSIONS components; SparsePCA is fitted on a dense copy.
tpca2 = sklearn.decomposition.SparsePCA(DIMENSIONS)

tpca_a = tpca2.fit_transform(categorical_matrix.toarray())

# `np.object` was removed from NumPy (1.24); the builtin `object` is the
# equivalent dtype. reset_index(drop=True) discards the original row index
# directly instead of creating an "index" column and deleting it afterwards.
transactions_df = pd.DataFrame({"label": vis_sample["label"].astype(object),
                             "x": tpca_a.T[0],
                             "y": tpca_a.T[1]}).reset_index(drop=True).dropna()

import altair as alt
alt.Chart(transactions_df).mark_point().encode(x="x:Q", y="y:Q", color="label")

# Encoding other features


In [None]:
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline


# Interarrival times: fill missing values, then scale.
# The step is named 'median_imputer', but SimpleImputer() defaults to
# strategy='mean'; request the median explicitly so the behaviour matches the name.
impute_and_scale = Pipeline([('median_imputer', SimpleImputer(strategy='median')),
                             ('interarrival_scaler', RobustScaler())])
ia_scaler = ('interarrival_scaler', impute_and_scale, ['interarrival'])

# Transaction amounts are scaled directly, with no imputation step.
amount_scaler = ('amount_scaler', RobustScaler(), ['amount'])

# Numeric scalers first, followed by the categorical encoders defined earlier.
scale_steps = [ia_scaler, amount_scaler]
all_xforms = ColumnTransformer(transformers=(scale_steps + xform_steps))

# Fit and save the feature extraction pipeline

In [None]:
# Wrap the combined column transformer in a single-step pipeline and fit it
# on the training split only.
feature_steps = [('feature_extraction', all_xforms)]
feat_pipeline = Pipeline(steps=feature_steps)

feat_pipeline.fit(train)

# Persist the fitted pipeline to disk.
from mlworkflows import util
util.serialize_to(feat_pipeline, "feature_pipeline.sav")