# Loading data

In [1]:
import numpy as np
import pandas as pd
df = pd.read_parquet("fraud.parquet")

## Cleaning data

These data are mostly clean but we need to add a new field for transaction interarrival time.  Unlike the rest of the work in this notebook, we'll do this for *all* our data (i.e., we'll do this before holding out a test set).

In [2]:
df = df.sort_values(["user_id", "timestamp"]).reset_index()
del df['index']

In [3]:
shifted = df.shift(1)[['user_id', 'timestamp']]

df['prev_user_id'] = shifted['user_id']
df['prev_timestamp'] = shifted['timestamp']
df['interarrival'] = (df['timestamp'] - df['prev_timestamp']).where(df['user_id'] == df['prev_user_id'], np.NaN)

del df['prev_user_id']
del df['prev_timestamp']

We'll also convert user and merchant IDs to strings so that we can hash them later:

In [4]:
df["user_id"] = "user_" + df["user_id"].astype(str).astype(pd.StringDtype())
df["merchant_id"] = "merchant_" + df["merchant_id"].astype(str).astype(pd.StringDtype())

##  Train/test split

We're using time-series data, so we'll split based on time.

In [5]:
first = df['timestamp'].min()
last = df['timestamp'].max()
cutoff = first + ((last - first) * 0.7)

In [6]:
train = df[df['timestamp'] <= cutoff]
len(train)

34993345

In [7]:
test = df[df['timestamp'] > cutoff]
len(test)

15006655

In [8]:
len(train) / (len(train) + len(test))

0.6998669

# Encoding categorical features

In [43]:
import sklearn
from sklearn.pipeline import Pipeline
from sklearn import feature_extraction, preprocessing
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer

def mk_hasher(features=16384, values=None):
    return sklearn.feature_extraction.FeatureHasher(n_features=features, input_type='string')

tt_xform = ('onehot', sklearn.preprocessing.OneHotEncoder(handle_unknown='ignore', categories=[['online','contactless','chip_and_pin','manual','swipe']]), ['trans_type'])
mu_xform = ('m_hashing', mk_hasher(2048), 'merchant_id')
u_xform = ('u_hashing', mk_hasher(2048), 'user_id')


xform_steps = [tt_xform, mu_xform]

cat_xform = ColumnTransformer(transformers=xform_steps)


In [38]:
cat_xform

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('onehot',
                                 OneHotEncoder(categories=[['online',
                                                            'contactless',
                                                            'chip_and_pin',
                                                            'manual',
                                                            'swipe']],
                                               drop=None,
                                               dtype=<class 'numpy.float64'>,
                                               handle_unknown='ignore',
                                               sparse=True),
                                 ['trans_type']),
                                ('m_hashing',
                                 FeatureHasher(alternate_sign=True,
                                               

In [11]:
smol_train = train.sample(16384).copy()


In [12]:
smol_train["features1"] = cat_xform.fit_transform(smol_train)

In [13]:
smol_train

Unnamed: 0,timestamp,label,user_id,amount,merchant_id,trans_type,foreign,interarrival,features1
2909330,1586630837,legitimate,user_578,19.36,merchant_16627,swipe,False,7907.0,"(0, 92)\t1.0\n (0, 179)\t-1.0\n (0, 417)\t..."
31614200,1594857210,legitimate,user_6325,75.31,merchant_15416,chip_and_pin,False,5342.0,"(0, 92)\t1.0\n (0, 179)\t-1.0\n (0, 417)\t..."
19882432,1608791187,legitimate,user_3975,7.96,merchant_12175,swipe,False,18828.0,"(0, 92)\t1.0\n (0, 179)\t-1.0\n (0, 417)\t..."
21985707,1629031045,legitimate,user_4393,4.13,merchant_12410,swipe,False,6345.0,"(0, 92)\t1.0\n (0, 179)\t-1.0\n (0, 417)\t..."
36816892,1630235399,legitimate,user_7351,11.32,merchant_421,chip_and_pin,False,6159.0,"(0, 92)\t1.0\n (0, 179)\t-1.0\n (0, 417)\t..."
...,...,...,...,...,...,...,...,...,...
45496927,1630024202,legitimate,user_9088,6.11,merchant_11402,swipe,False,5839.0,"(0, 92)\t1.0\n (0, 179)\t-1.0\n (0, 417)\t..."
36181450,1591541600,legitimate,user_7230,10.78,merchant_4384,swipe,False,5275.0,"(0, 92)\t1.0\n (0, 179)\t-1.0\n (0, 417)\t..."
1061558,1625938321,legitimate,user_209,24.53,merchant_5857,online,False,41744.0,"(0, 92)\t1.0\n (0, 179)\t-1.0\n (0, 417)\t..."
43396518,1615392221,legitimate,user_8668,8.18,merchant_18619,swipe,False,5938.0,"(0, 92)\t1.0\n (0, 179)\t-1.0\n (0, 417)\t..."


In [55]:
smol_train = train.sample(65536).copy()


fh = sklearn.feature_extraction.FeatureHasher(n_features=1024, input_type='string')
smol_train["mvecs"] = fh.fit_transform(smol_train["merchant_id"].values)

In [15]:
smol_train.mvecs




25617143      (0, 92)\t1.0\n  (0, 179)\t-1.0\n  (0, 192)\t...
11956845      (0, 92)\t1.0\n  (0, 179)\t-1.0\n  (0, 192)\t...
25237130      (0, 92)\t1.0\n  (0, 179)\t-1.0\n  (0, 192)\t...
29589058      (0, 92)\t1.0\n  (0, 179)\t-1.0\n  (0, 192)\t...
1177990       (0, 92)\t1.0\n  (0, 179)\t-1.0\n  (0, 192)\t...
                                  ...                        
16228005      (0, 92)\t1.0\n  (0, 179)\t-1.0\n  (0, 192)\t...
36890252      (0, 92)\t1.0\n  (0, 179)\t-1.0\n  (0, 192)\t...
4163968       (0, 92)\t1.0\n  (0, 179)\t-1.0\n  (0, 192)\t...
5097287       (0, 92)\t1.0\n  (0, 179)\t-1.0\n  (0, 192)\t...
3204713       (0, 92)\t1.0\n  (0, 179)\t-1.0\n  (0, 192)\t...
Name: mvecs, Length: 16384, dtype: object

In [16]:
smol_train.dtypes

timestamp         int64
label            object
user_id          string
amount          float64
merchant_id      string
trans_type       object
foreign            bool
interarrival    float64
mvecs            object
dtype: object

In [18]:
svecs = cat_xform.fit_transform(smol_train)

# Encoding other features

### FIXME:  

- `RobustScaler` for amounts
- something circular with timestamps for each user
- oversampling (for training)


In [56]:
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer

imputer = ('median_imputer', SimpleImputer(), ['interarrival'])
ia_scaler = ('interarrival_scaler', RobustScaler(), ['interarrival'])
amount_scaler = ('amount_scaler', RobustScaler(), ['amount'])

scale_steps = [imputer, ia_scaler, amount_scaler]
all_xforms = ColumnTransformer(transformers=(scale_steps + xform_steps))

In [70]:
1 << 20

1048576

In [90]:
smol_train = train.dropna().sample(1 << 21).copy()
# frauds = smol_train[smol_train["label"] == "fraud"].sample(n=len(smol_train), replace=True)

In [72]:
# smol_train = pd.concat([smol_train, frauds])

In [91]:
smol_train["weights"] = 3

In [92]:
smol_train.loc[smol_train["label"] == "fraud", "weights"] = 97

In [93]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(max_iter=500)

svecs = all_xforms.fit_transform(smol_train)
lr.fit(svecs, smol_train["label"], sample_weight=smol_train["weights"])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [94]:
smol_test = test.dropna().sample(1000000)
predictions = lr.predict(all_xforms.fit_transform(smol_test))

In [95]:
from sklearn.metrics import classification_report
print(classification_report(smol_test.label.values, predictions))

              precision    recall  f1-score   support

       fraud       0.39      0.91      0.55     19016
  legitimate       1.00      0.97      0.99    980984

    accuracy                           0.97   1000000
   macro avg       0.69      0.94      0.77   1000000
weighted avg       0.99      0.97      0.98   1000000



In [96]:
lr.score(all_xforms.fit_transform(smol_test), smol_test.label.values)

0.971434

In [99]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection

rfc = RandomForestClassifier(n_estimators=8, random_state=404, class_weight="balanced_subsample")
rfc.fit(svecs, smol_train["label"])

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                       class_weight='balanced_subsample', criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=8, n_jobs=None, oob_score=False,
                       random_state=404, verbose=0, warm_start=False)

In [100]:
smol_test = test.dropna().sample(1000000)
rf_predictions = rfc.predict(all_xforms.fit_transform(smol_test))

In [101]:
print(classification_report(smol_test.label.values, rf_predictions))

              precision    recall  f1-score   support

       fraud       0.96      0.92      0.94     19031
  legitimate       1.00      1.00      1.00    980969

    accuracy                           1.00   1000000
   macro avg       0.98      0.96      0.97   1000000
weighted avg       1.00      1.00      1.00   1000000



In [102]:
rfc.score(all_xforms.fit_transform(smol_test), smol_test.label.values)

0.997791

In [108]:
rfc2 = RandomForestClassifier(n_estimators=4, max_depth=3, random_state=404, class_weight="balanced_subsample")
rfc2.fit(svecs, smol_train["label"])

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                       class_weight='balanced_subsample', criterion='gini',
                       max_depth=3, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=4, n_jobs=None, oob_score=False,
                       random_state=404, verbose=0, warm_start=False)

In [109]:
rf_predictions2 = rfc2.predict(all_xforms.fit_transform(smol_test))
print(classification_report(smol_test.label.values, rf_predictions2))

              precision    recall  f1-score   support

       fraud       0.92      0.93      0.93     19031
  legitimate       1.00      1.00      1.00    980969

    accuracy                           1.00   1000000
   macro avg       0.96      0.96      0.96   1000000
weighted avg       1.00      1.00      1.00   1000000



In [110]:
rftrain = rfc2.predict(all_xforms.fit_transform(smol_train))
print(classification_report(smol_train.label.values, rftrain))

              precision    recall  f1-score   support

       fraud       0.92      0.93      0.93     38189
  legitimate       1.00      1.00      1.00   2058963

    accuracy                           1.00   2097152
   macro avg       0.96      0.96      0.96   2097152
weighted avg       1.00      1.00      1.00   2097152



In [107]:
rftrain = rfc.predict(all_xforms.fit_transform(smol_train))
print(classification_report(smol_train.label.values, rftrain))

              precision    recall  f1-score   support

       fraud       1.00      0.99      0.99     38189
  legitimate       1.00      1.00      1.00   2058963

    accuracy                           1.00   2097152
   macro avg       1.00      1.00      1.00   2097152
weighted avg       1.00      1.00      1.00   2097152



In [111]:
rfc

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                       class_weight='balanced_subsample', criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=8, n_jobs=None, oob_score=False,
                       random_state=404, verbose=0, warm_start=False)

# Keep this next bit



In [112]:
from sklearn.pipeline import Pipeline

feat_pipeline = Pipeline([
    ('feature_extraction',all_xforms)
])

from mlworkflows import util
util.serialize_to(feat_pipeline, "feature_pipeline.sav")

ModuleNotFoundError: No module named 'cloudpickle'