In [1]:
import pandas as pd
import numpy as np
import sys

In [2]:
import transaction_demo

In [3]:
seed = 42

### Load transactions

In [4]:
df = pd.read_csv('../data/transactions_sample_10k.csv')
df.head(10)

Unnamed: 0.1,Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account,mean_income
0,0,695247,2378,1993-01-01,1,2.0,700.0,700.0,,,,[ 11271.53754941]
1,1,695250,2378,1993-01-22,1,2.0,7268.0,7968.0,,,,[ 11271.53754941]
2,2,695266,2378,1993-01-23,1,2.0,14440.0,22408.0,,,,[ 11271.53754941]
3,3,695262,2378,1993-01-24,1,2.0,49752.0,72160.0,,,,[ 11271.53754941]
4,4,695265,2378,1993-01-26,1,2.0,21899.0,94059.0,,,,[ 11271.53754941]
5,5,695560,2378,1993-01-31,0,3.0,34700.0,59453.7,,,,[ 11271.53754941]
6,6,3608592,2378,1993-01-31,1,,94.7,94153.7,2.0,,,[ 11271.53754941]
7,7,695272,2378,1993-02-05,1,2.0,30192.0,89645.7,,,,[ 11271.53754941]
8,8,695340,2378,1993-02-07,0,0.0,9612.0,80033.7,4.0,EF,1222903.0,[ 11271.53754941]
9,9,695270,2378,1993-02-11,1,2.0,31038.0,111071.7,,,,[ 11271.53754941]


### Clean the dataset

* Leave target columns only (date, operation and amount)
* filter out NaNs
* convert operations to int

In [5]:
df = transaction_demo.clean_dataset(df)
df.head(10)

Unnamed: 0,account_id,date,operation,amount
0,2378,1993-01-01,2,700.0
1,2378,1993-01-22,2,7268.0
2,2378,1993-01-23,2,14440.0
3,2378,1993-01-24,2,49752.0
4,2378,1993-01-26,2,21899.0
5,2378,1993-01-31,3,34700.0
7,2378,1993-02-05,2,30192.0
8,2378,1993-02-07,0,9612.0
9,2378,1993-02-11,2,31038.0
10,2378,1993-02-11,3,25200.0


### Segment all transaction by months

We consider each month of an account as a separate segment. All segments are considered the same. It means our generator will generate a monthly snapshot of transactions for some user.

In [6]:
df = transaction_demo.segment_by_month(df)
df.head(10)

Unnamed: 0,operation,amount,day,segment_id
0,2,700.0,1,2378:1993-01
1,2,7268.0,22,2378:1993-01
2,2,14440.0,23,2378:1993-01
3,2,49752.0,24,2378:1993-01
4,2,21899.0,26,2378:1993-01
5,3,34700.0,31,2378:1993-01
7,2,30192.0,5,2378:1993-02
8,0,9612.0,7,2378:1993-02
9,2,31038.0,11,2378:1993-02
10,3,25200.0,11,2378:1993-02


### Prepare training data

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
transform_pipeline = Pipeline([
    ("featurize", transaction_demo.TransactionTransformer(
        group_column='segment_id',
        value_column='amount',
        dim1_column='operation',
        dim2_column='day',
        dim1_size=5,
        dim2_size=32)),
    ("scale", StandardScaler())
])

In [8]:
df_transformed = transform_pipeline.fit_transform(df)

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(df_transformed, test_size=0.3, random_state=seed)

### Train the model

In [10]:
from synthesized.modules.synth import AlphaSynth
from sklearn.metrics import mean_squared_error

# define the estimator. We will use L2 penalty to augment the generalizability of the encoder
alpha = AlphaSynth(n_epochs=100, n_hidden=250, learning_rate=0.01, batch_size=106, 
                 display_step=10, activation_function='relu', verbose=2, min_change = 1e-6, 
                 random_state=seed, clip=True, l2_penalty=1e-5, 
                 early_stopping=True)

# fit
alpha.fit(X_train)

# transform and reconstruct the test images
reconstructed = alpha.feed_forward(X_test)

# get the error:
mse = ((X_test - reconstructed) ** 2).sum(axis=1).sum() / X_test.shape[0]
print("\nTest MSE: %.4f" % mse)

Epoch: 1, cost=0.433520, time=0.0584 (sec)
Epoch: 11, cost=0.115628, time=0.0426 (sec)
Epoch: 21, cost=0.072102, time=0.0319 (sec)
Epoch: 31, cost=0.065192, time=0.0340 (sec)
Epoch: 41, cost=0.055605, time=0.0410 (sec)
Epoch: 51, cost=0.067917, time=0.0363 (sec)
Epoch: 61, cost=0.064859, time=0.0342 (sec)
Epoch: 71, cost=0.064395, time=0.0329 (sec)
Epoch: 81, cost=0.056316, time=0.0320 (sec)
Epoch: 91, cost=0.066579, time=0.0323 (sec)
Optimization complete after 100 epoch(s). Average epoch time: 0.0442 seconds

Test MSE: 8.3141


### Convert generated data to transactions

In [11]:
transform_pipeline.inverse_transform(reconstructed).head(10)

Unnamed: 0,segment_id,day,operation,amount
0,5580556212893143401,1,0,35.012077
1,5580556212893143401,1,1,968.084534
2,5580556212893143401,1,2,756.671326
3,5580556212893143401,1,3,765.69043
4,5580556212893143401,1,4,818.86969
5,5580556212893143401,2,0,478.43866
6,5580556212893143401,2,1,587.410034
7,5580556212893143401,2,2,970.840454
8,5580556212893143401,2,3,391.835968
9,5580556212893143401,2,4,206.794022


In [12]:
transform_pipeline.inverse_transform(X_test).head(10)

Unnamed: 0,segment_id,day,operation,amount
0,2655626363671805029,1,2,-1.136868e-13
1,2655626363671805029,2,3,5.684342e-14
2,2655626363671805029,3,2,3100.0
3,2655626363671805029,3,4,4470.0
4,2655626363671805029,4,0,5.684342e-14
5,2655626363671805029,6,3,5.684342e-14
6,2655626363671805029,7,3,5.684342e-14
7,2655626363671805029,9,1,14.6
8,4361901126958995587,1,1,1591.0
9,4361901126958995587,1,2,3572.0
