In [1]:
import pandas as pd
import numpy as np
import sys

In [2]:
import transaction_demo

In [3]:
# Single random seed for the notebook; passed as `random_state` to both the
# train/test split and the AlphaSynth model further down.
seed = 42

### Load transactions

In [4]:
# Load the transaction sample CSV (path is relative to the notebook's working
# directory) and preview its first ten rows.
# NOTE(review): the preview contains an 'Unnamed: 0' column, which usually means
# the CSV was written together with its index. Consider `index_col=0` here --
# confirm first that transaction_demo.clean_dataset does not rely on that column.
df = pd.read_csv('../data/transactions_sample_10k.csv')
df.head(10)

Unnamed: 0.1,Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account,mean_income
0,0,695247,2378,1993-01-01,1,2.0,700.0,700.0,,,,[ 11271.53754941]
1,1,695250,2378,1993-01-22,1,2.0,7268.0,7968.0,,,,[ 11271.53754941]
2,2,695266,2378,1993-01-23,1,2.0,14440.0,22408.0,,,,[ 11271.53754941]
3,3,695262,2378,1993-01-24,1,2.0,49752.0,72160.0,,,,[ 11271.53754941]
4,4,695265,2378,1993-01-26,1,2.0,21899.0,94059.0,,,,[ 11271.53754941]
5,5,695560,2378,1993-01-31,0,3.0,34700.0,59453.7,,,,[ 11271.53754941]
6,6,3608592,2378,1993-01-31,1,,94.7,94153.7,2.0,,,[ 11271.53754941]
7,7,695272,2378,1993-02-05,1,2.0,30192.0,89645.7,,,,[ 11271.53754941]
8,8,695340,2378,1993-02-07,0,0.0,9612.0,80033.7,4.0,EF,1222903.0,[ 11271.53754941]
9,9,695270,2378,1993-02-11,1,2.0,31038.0,111071.7,,,,[ 11271.53754941]


### Clean the dataset

* Keep only the target columns (account_id, date, operation and amount)
* Filter out NaNs
* Convert operations to int

In [5]:
# Replace the raw frame with the cleaned one returned by
# transaction_demo.clean_dataset, then preview the first ten rows.
# NOTE(review): this overwrites `df`, so re-running the cell passes the already
# cleaned frame back into clean_dataset. Confirm the helper tolerates that, or
# re-run the load cell first.
df = transaction_demo.clean_dataset(df)
df.head(10)

Unnamed: 0,account_id,date,operation,amount
0,2378,1993-01-01,2,700.0
1,2378,1993-01-22,2,7268.0
2,2378,1993-01-23,2,14440.0
3,2378,1993-01-24,2,49752.0
4,2378,1993-01-26,2,21899.0
5,2378,1993-01-31,3,34700.0
7,2378,1993-02-05,2,30192.0
8,2378,1993-02-07,0,9612.0
9,2378,1993-02-11,2,31038.0
10,2378,1993-02-11,3,25200.0


### Prepare training data

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Two-stage preprocessing: the project's TransactionTransformer runs first
# ("featurize"), and its output is then standardised by StandardScaler ("scale").
transform_pipeline = Pipeline(
    steps=[
        ("featurize", transaction_demo.TransactionTransformer()),
        ("scale", StandardScaler()),
    ],
)

In [7]:
# Fit both pipeline stages on the cleaned transactions and transform them in one pass.
# NOTE(review): the pipeline (including StandardScaler) is fitted on all rows
# before the train/test split below, so the held-out rows contribute to the
# scaling statistics. Acceptable for a demo; worth revisiting for a real evaluation.
df_transformed = transform_pipeline.fit_transform(df)

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the transformed rows for evaluation; the shared seed keeps
# the split repeatable between runs.
X_train, X_test = train_test_split(
    df_transformed,
    test_size=0.3,
    random_state=seed,
)

### Train the model

In [9]:
from synthesized.modules.synth import AlphaSynth
from sklearn.metrics import mean_squared_error

# Estimator configuration. A small L2 penalty is applied with the aim of
# improving how well the encoder generalises.
alpha = AlphaSynth(
    n_epochs=100,
    n_hidden=250,
    learning_rate=0.01,
    batch_size=106,
    display_step=10,
    activation_function='relu',
    verbose=2,
    min_change=1e-6,
    random_state=seed,
    clip=True,
    l2_penalty=1e-5,
    early_stopping=True,
)

# Train on the training split only.
alpha.fit(X_train)

# Pass the held-out test rows through the trained model to get their reconstructions.
reconstructed = alpha.feed_forward(X_test)

# Reconstruction error: squared differences summed over every element, then
# divided by the number of test rows.
# NOTE(review): despite the "MSE" label this is a per-row *sum* of squared
# errors, not a per-element mean, so it grows with the number of feature
# columns. `mean_squared_error` (imported above, currently unused) would give
# the per-element figure -- confirm which scale is intended before comparing
# this number with the training cost printed by fit().
mse = (
    ((X_test - reconstructed) ** 2).sum(axis=1).sum()
    / X_test.shape[0]
)
print("\nTest MSE: %.4f" % mse)

Epoch: 1, cost=0.357953, time=0.1319 (sec)
Epoch: 11, cost=0.357504, time=0.1103 (sec)
Epoch: 21, cost=0.357790, time=0.1056 (sec)
Epoch: 31, cost=0.356763, time=0.1580 (sec)
Epoch: 41, cost=0.355559, time=0.1370 (sec)
Epoch: 51, cost=0.353660, time=0.1057 (sec)
Convergence reached at epoch 55, stopping early
Optimization complete after 56 epoch(s). Average epoch time: 0.1397 seconds

Test MSE: 3783.2736


### Convert generated data to transactions

In [14]:
# Map the reconstructed feature matrix back through the fitted pipeline
# (inverse of "scale", then "featurize") and preview the first ten rows.
transform_pipeline.inverse_transform(reconstructed).head(10)

Unnamed: 0,account_id,date,operation,amount
0,4308903652354333064,1993-01-01,1,0.228005
1,4308903652354333064,1993-01-01,2,172.1772
2,4308903652354333064,1993-01-01,4,0.231233
3,4308903652354333064,1993-01-02,0,0.007424
4,4308903652354333064,1993-01-02,1,0.304757
5,4308903652354333064,1993-01-02,2,100.973091
6,4308903652354333064,1993-01-02,4,0.019262
7,4308903652354333064,1993-01-03,0,0.106576
8,4308903652354333064,1993-01-03,2,191.607819
9,4308903652354333064,1993-01-03,4,0.223623
