# Loading data

In [None]:
import numpy as np
import pandas as pd
# Load the raw fraud dataset; the path is resolved relative to the working directory.
DATA_PATH = "fraud.csv"
df = pd.read_csv(DATA_PATH)

## Cleaning data

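These data are mostly clean, but we need to add a new field for transaction interarrival time: the time elapsed since the same user's previous transaction (missing for a user's first transaction).  Unlike the rest of the work in this notebook, we'll do this for *all* our data (i.e., we'll do this before holding out a test set).  That is safe here because each value depends only on a transaction and the one before it for the same user, never on anything that happens later.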

In [None]:
# Order each user's transactions chronologically, so that the row directly
# above any row is that user's previous transaction (when there is one).
# drop=True discards the old index instead of materialising it as an 'index'
# column that then has to be deleted again (and that would collide with a
# pre-existing column of that name).
df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)

In [None]:
# Interarrival time = gap between a transaction and the same user's previous
# one.  This relies on df being sorted by (user_id, timestamp): the row above
# is then the previous transaction whenever it belongs to the same user.
prev_user_id = df['user_id'].shift(1)
prev_timestamp = df['timestamp'].shift(1)
same_user = df['user_id'] == prev_user_id

# Series.where() fills rows that fail the condition with a missing value by
# default, so the first transaction of every user gets no interarrival.
# (The explicit np.NaN fill is gone: that alias was removed in NumPy 2.0.)
df['interarrival'] = (df['timestamp'] - prev_timestamp).where(same_user)

In [None]:
# Preview a bounded number of rows rather than rendering the whole frame.
df.head(10)

##  Train/test split

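We're using time-series data, so we'll split based on time rather than at random: transactions up to a cutoff 70% of the way through the observed time span form the training set, and everything after the cutoff is held out as the test set.  Because the cutoff is a point in time, not a row count, the share of rows that lands in the training set need not be exactly 70%; we check it below.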

In [None]:
# The cutoff sits TRAIN_FRACTION of the way from the earliest to the latest
# timestamp, i.e. it is a fraction of the time span, not of the row count.
TRAIN_FRACTION = 0.7
first, last = df['timestamp'].min(), df['timestamp'].max()
cutoff = first + (last - first) * TRAIN_FRACTION

In [None]:
# Training set: every transaction stamped at or before the cutoff.
is_train = df['timestamp'].le(cutoff)
train = df.loc[is_train]
len(train)

In [None]:
# Held-out test set: every transaction stamped strictly after the cutoff.
is_test = df['timestamp'].gt(cutoff)
test = df.loc[is_test]
len(test)

In [None]:
# Share of the split rows that ended up in the training set.
n_train, n_test = len(train), len(test)
n_train / (n_train + n_test)

# Encoding categorical features

In [None]:
import sklearn
from sklearn.pipeline import Pipeline
from sklearn import feature_extraction, preprocessing
from sklearn.compose import ColumnTransformer


def _as_token_rows(X):
    """Convert a 2-D column selection into a list of per-row string lists.

    ``FeatureHasher(input_type='string')`` expects every sample to be an
    iterable of strings, so each value is stringified and each row becomes its
    own list of tokens (one token per selected column).
    """
    return np.asarray(X).astype(str).tolist()


def _hashing_xform(n_features):
    """Build a stringify-then-hash pipeline with ``n_features`` output columns."""
    return Pipeline([
        ('to_tokens', preprocessing.FunctionTransformer(_as_token_rows)),
        ('hasher', feature_extraction.FeatureHasher(n_features=n_features, input_type='string')),
    ])


# One encoder per categorical column.  handle_unknown='ignore' keeps transform()
# from raising if the held-out data contains a transaction type that never
# occurred in the training window.
tt_xform = ('onehot', preprocessing.OneHotEncoder(handle_unknown='ignore'))
merchant_xform = ('m_hashing', _hashing_xform(128))
user_xform = ('u_hashing', _hashing_xform(512))

# xform_columns[i] is the column that xform_steps[i] encodes.
xform_columns = ['trans_type', 'merchant_id', 'user_id']
xform_steps = [tt_xform, merchant_xform, user_xform]

# Each encoder is applied to its own column and the outputs are stacked side by
# side.  (Wrapping the three encoders in a single Pipeline would instead feed
# the one-hot output of all three columns into the hashers, one after another.)
# Columns are passed as one-element lists so every encoder receives 2-D input.
cat_xform = ColumnTransformer(
    transformers=[
        (name, xform, [column])
        for (name, xform), column in zip(xform_steps, xform_columns)
    ]
)



In [None]:
# Fit the categorical encoders on the training rows only and show the encoded result.
train_cat = cat_xform.fit_transform(train)
train_cat



# Encoding other features

### FIXME

- Scale transaction amounts with `RobustScaler`.
- Add some circular (cyclical) encoding of the timestamps for each user.
