In [1]:
import pandas as pd
import numpy as np
import sys

In [2]:
import transaction_demo

In [3]:
seed = 42

### Load transactions

In [4]:
df = pd.read_csv('../data/transactions_sample_10k.csv')
df.head(10)

Unnamed: 0.1,Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account,mean_income
0,0,695247,2378,1993-01-01,1,2.0,700.0,700.0,,,,[ 11271.53754941]
1,1,695250,2378,1993-01-22,1,2.0,7268.0,7968.0,,,,[ 11271.53754941]
2,2,695266,2378,1993-01-23,1,2.0,14440.0,22408.0,,,,[ 11271.53754941]
3,3,695262,2378,1993-01-24,1,2.0,49752.0,72160.0,,,,[ 11271.53754941]
4,4,695265,2378,1993-01-26,1,2.0,21899.0,94059.0,,,,[ 11271.53754941]
5,5,695560,2378,1993-01-31,0,3.0,34700.0,59453.7,,,,[ 11271.53754941]
6,6,3608592,2378,1993-01-31,1,,94.7,94153.7,2.0,,,[ 11271.53754941]
7,7,695272,2378,1993-02-05,1,2.0,30192.0,89645.7,,,,[ 11271.53754941]
8,8,695340,2378,1993-02-07,0,0.0,9612.0,80033.7,4.0,EF,1222903.0,[ 11271.53754941]
9,9,695270,2378,1993-02-11,1,2.0,31038.0,111071.7,,,,[ 11271.53754941]


### Clean the dataset

* Leave target columns only (date, operation and amount)
* filter out NaNs
* convert operations to int

In [5]:
df = transaction_demo.clean_dataset(df)
df.head(10)

Unnamed: 0,account_id,date,operation,amount
0,2378,1993-01-01,2,700.0
1,2378,1993-01-22,2,7268.0
2,2378,1993-01-23,2,14440.0
3,2378,1993-01-24,2,49752.0
4,2378,1993-01-26,2,21899.0
5,2378,1993-01-31,3,34700.0
7,2378,1993-02-05,2,30192.0
8,2378,1993-02-07,0,9612.0
9,2378,1993-02-11,2,31038.0
10,2378,1993-02-11,3,25200.0


### Prepare training data

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
transform_pipeline = Pipeline([
    ("featurize", transaction_demo.TransactionTransformer()),
    ("scale", StandardScaler())
])

In [7]:
df_transformed = transform_pipeline.fit_transform(df)

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(df_transformed, test_size=0.3, random_state=seed)

### Train the model

In [9]:
from synthesized.modules.synth import AlphaSynth
from sklearn.metrics import mean_squared_error

# define the estimator. We will use L2 penalty to augment the generalizability of the encoder
alpha = AlphaSynth(n_epochs=100, n_hidden=250, learning_rate=0.01, batch_size=106, 
                 display_step=10, activation_function='relu', verbose=2, min_change = 1e-6, 
                 random_state=seed, clip=True, l2_penalty=1e-5, 
                 early_stopping=True)

# fit
alpha.fit(X_train)

# transform and reconstruct the test images
reconstructed = alpha.feed_forward(X_test)

# get the error:
mse = ((X_test - reconstructed) ** 2).sum(axis=1).sum() / X_test.shape[0]
print("\nTest MSE: %.4f" % mse)

Epoch: 1, cost=0.357953, time=0.1319 (sec)
Epoch: 11, cost=0.357504, time=0.1103 (sec)
Epoch: 21, cost=0.357790, time=0.1056 (sec)
Epoch: 31, cost=0.356763, time=0.1580 (sec)
Epoch: 41, cost=0.355559, time=0.1370 (sec)
Epoch: 51, cost=0.353660, time=0.1057 (sec)
Convergence reached at epoch 55, stopping early
Optimization complete after 56 epoch(s). Average epoch time: 0.1397 seconds

Test MSE: 3783.2736


### Convert generated data to transactions

In [14]:
transform_pipeline.inverse_transform(reconstructed).head(10)

Unnamed: 0,account_id,date,operation,amount
0,4308903652354333064,1993-01-01,1,0.228005
1,4308903652354333064,1993-01-01,2,172.1772
2,4308903652354333064,1993-01-01,4,0.231233
3,4308903652354333064,1993-01-02,0,0.007424
4,4308903652354333064,1993-01-02,1,0.304757
5,4308903652354333064,1993-01-02,2,100.973091
6,4308903652354333064,1993-01-02,4,0.019262
7,4308903652354333064,1993-01-03,0,0.106576
8,4308903652354333064,1993-01-03,2,191.607819
9,4308903652354333064,1993-01-03,4,0.223623


In [16]:
pd.DataFrame(reconstructed)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8335,8336,8337,8338,8339,8340,8341,8342,8343,8344
0,0.0,0.228005,0.084609,0.0,0.231233,0.007424,0.304757,0.136602,0.0,0.019262,...,0.0,0.074334,0.290587,0.226846,0.118924,0.0,0.056528,0.0,0.0,0.023623
1,0.052604,0.0,0.062351,0.047353,0.056338,0.260372,0.062295,0.264298,0.193823,0.0,...,0.0,0.0,0.123393,0.056335,0.044556,0.0,0.022573,0.0,0.085833,0.0
2,0.0,0.196265,0.396412,0.0,0.132074,0.543741,0.331732,0.117836,0.0,0.041576,...,0.001926,0.295358,0.123586,0.0,0.0,0.0,0.024718,0.0,0.527254,0.16847
3,0.0,0.109905,0.100976,0.0,0.023018,0.333306,0.130173,0.0,0.325727,0.057503,...,0.0,0.141784,0.06332,0.254198,0.196685,0.0,0.0,0.0,0.01231,0.084078
4,0.0,0.156276,0.0,0.0,0.0,0.0,0.183113,0.24234,0.303162,0.0,...,0.006668,0.0,0.224051,0.117356,0.255385,0.0,0.03421,0.066712,0.104963,0.0
5,0.0,0.0,0.339549,0.0,0.0,0.017951,0.0,0.280094,0.0,0.0,...,0.078364,0.0,0.020806,0.0,0.0,0.064924,0.173459,0.0,0.335436,0.099823
6,0.0,0.0,0.0,0.0,0.129341,0.0,0.3377,0.154602,0.0,0.0,...,0.038268,0.027605,0.279116,0.137329,0.013272,0.20659,0.261538,0.0,0.111269,0.012492


In [18]:
pd.DataFrame(X_test)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8335,8336,8337,8338,8339,8340,8341,8342,8343,8344
0,0.0,0.0,1.755805,0.0,0.0,0.0,0.0,-0.284032,0.0,0.0,...,0.0,0.0,-0.218218,-0.304034,0.0,0.0,0.0,0.0,-0.218218,0.0
1,0.0,0.0,-0.460539,0.0,0.0,0.0,0.0,-0.284032,0.0,0.0,...,0.0,0.0,-0.218218,2.128241,0.0,0.0,0.0,0.0,-0.218218,0.0
2,0.0,0.0,-0.460539,0.0,0.0,0.0,0.0,-0.284032,0.0,0.0,...,0.0,0.0,-0.218218,-0.304034,0.0,0.0,0.0,0.0,-0.218218,0.0
3,0.0,0.0,2.389046,0.0,0.0,0.0,0.0,-0.284032,0.0,0.0,...,0.0,0.0,-0.218218,3.952448,0.0,0.0,0.0,0.0,-0.218218,0.0
4,0.0,0.0,-0.460539,0.0,0.0,0.0,0.0,-0.284032,0.0,0.0,...,0.0,0.0,-0.218218,-0.304034,0.0,0.0,0.0,0.0,-0.218218,0.0
5,0.0,0.0,-0.460539,0.0,0.0,0.0,0.0,4.298348,0.0,0.0,...,0.0,0.0,-0.218218,-0.304034,0.0,0.0,0.0,0.0,-0.218218,0.0
6,0.0,0.0,-0.460539,0.0,0.0,0.0,0.0,-0.284032,0.0,0.0,...,0.0,0.0,-0.218218,-0.304034,0.0,0.0,0.0,0.0,-0.218218,0.0


### Summary

Seems that quality of generated data is poor (Test MSE: 3783.2736). It might be explained by fact that transactions are not aligned: we are collecting all dates from all transactions and sort them and find an index in this dictionary. This index is used to calculate dimension. This approach doesn't reflect season patterns. If we round all dates by, for example, a month we can make transactions aligned, so December 2017 and December 2018 encoded as the same index in output vector (for the same category).

However, this kind-off knowledge (like the alignment of dates) should be acquired automatically by a generator. |