In [1]:
import pandas as pd
import numpy as np
import sys

In [2]:
import transaction_demo

In [3]:
seed = 42

### Load transactions

In [4]:
df = pd.read_csv('../data/transactions.csv')
df.head(10)

Unnamed: 0.1,Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account,mean_income
0,0,695247,2378,1993-01-01,1,2.0,700.0,700.0,,,,[ 11271.53754941]
1,1,695250,2378,1993-01-22,1,2.0,7268.0,7968.0,,,,[ 11271.53754941]
2,2,695266,2378,1993-01-23,1,2.0,14440.0,22408.0,,,,[ 11271.53754941]
3,3,695262,2378,1993-01-24,1,2.0,49752.0,72160.0,,,,[ 11271.53754941]
4,4,695265,2378,1993-01-26,1,2.0,21899.0,94059.0,,,,[ 11271.53754941]
5,5,695560,2378,1993-01-31,0,3.0,34700.0,59453.7,,,,[ 11271.53754941]
6,6,3608592,2378,1993-01-31,1,,94.7,94153.7,2.0,,,[ 11271.53754941]
7,7,695272,2378,1993-02-05,1,2.0,30192.0,89645.7,,,,[ 11271.53754941]
8,8,695340,2378,1993-02-07,0,0.0,9612.0,80033.7,4.0,EF,1222903.0,[ 11271.53754941]
9,9,695270,2378,1993-02-11,1,2.0,31038.0,111071.7,,,,[ 11271.53754941]


### Clean the dataset

* Leave target columns only (date, operation and amount)
* filter out NaNs
* convert operations to int

In [5]:
df = transaction_demo.clean_dataset(df)
df.head(10)

Unnamed: 0,account_id,date,operation,amount
0,2378,1993-01-01,2,700.0
1,2378,1993-01-22,2,7268.0
2,2378,1993-01-23,2,14440.0
3,2378,1993-01-24,2,49752.0
4,2378,1993-01-26,2,21899.0
5,2378,1993-01-31,3,34700.0
7,2378,1993-02-05,2,30192.0
8,2378,1993-02-07,0,9612.0
9,2378,1993-02-11,2,31038.0
10,2378,1993-02-11,3,25200.0


Leave transaction only for 1993 year:

In [6]:
df = df[pd.to_datetime(df['date']) < pd.to_datetime('1994-01-01')]

### Segment all transaction by months

We consider each month of an account as a separate segment. All segments are considered the same. It means our generator will generate a monthly snapshot of transactions for some user.

In [7]:
df = transaction_demo.segment_by_month(df)
df.head(10)

Unnamed: 0,operation,amount,day,segment_id
0,2,700.0,1,2378:1993-01
1,2,7268.0,22,2378:1993-01
2,2,14440.0,23,2378:1993-01
3,2,49752.0,24,2378:1993-01
4,2,21899.0,26,2378:1993-01
5,3,34700.0,31,2378:1993-01
7,2,30192.0,5,2378:1993-02
8,0,9612.0,7,2378:1993-02
9,2,31038.0,11,2378:1993-02
10,3,25200.0,11,2378:1993-02


### Prepare training data

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
transform_pipeline = Pipeline([
    ("vectorizer", transaction_demo.TransactionVectorizer(
        group_column='segment_id',
        value_column='amount',
        dim1_column='operation',
        dim2_column='day',
        dim1_size=5,
        dim2_size=32)),
    ("scale", StandardScaler())
])

In [9]:
df_transformed = transform_pipeline.fit_transform(df)

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(df_transformed, test_size=0.3, random_state=seed)

### Train the model

In [11]:
from synthesized.modules.synth import AlphaSynth
from sklearn.metrics import mean_squared_error

# define the estimator. We will use L2 penalty to augment the generalizability of the encoder
alpha = AlphaSynth(n_epochs=100, n_hidden=250, learning_rate=0.01, batch_size=106, 
                 display_step=10, activation_function='relu', verbose=2, min_change = 1e-6, 
                 random_state=seed, clip=True, l2_penalty=1e-5, 
                 early_stopping=True)

# fit
alpha.fit(X_train)

# transform and reconstruct the test images
reconstructed = alpha.feed_forward(X_test)

# get the error:
mse = ((X_test - reconstructed) ** 2).sum(axis=1).sum() / X_test.shape[0]
print("\nTest MSE: %.4f" % mse)

Epoch: 1, cost=0.279351, time=0.2024 (sec)
Epoch: 11, cost=0.035733, time=0.1519 (sec)
Epoch: 21, cost=0.035093, time=0.1681 (sec)
Epoch: 31, cost=0.036756, time=0.1623 (sec)
Epoch: 41, cost=0.027045, time=0.1456 (sec)
Epoch: 51, cost=0.035427, time=0.1758 (sec)
Epoch: 61, cost=0.050448, time=0.1468 (sec)
Epoch: 71, cost=0.031682, time=0.3251 (sec)
Epoch: 81, cost=0.030665, time=0.1614 (sec)
Epoch: 91, cost=0.034699, time=0.1676 (sec)
Optimization complete after 100 epoch(s). Average epoch time: 0.1733 seconds

Test MSE: 8.0422


### Convert generated data to transactions

In [12]:
transform_pipeline.inverse_transform(reconstructed).head(10)

Unnamed: 0,segment_id,day,operation,amount
0,7781631943097904576,0,1,139.897339
1,7781631943097904576,1,1,119.069122
2,7781631943097904576,2,1,116.029465
3,7781631943097904576,3,1,117.953697
4,7781631943097904576,4,1,139.312607
5,7781631943097904576,0,2,583.938599
6,7781631943097904576,1,2,29051.90625
7,7781631943097904576,2,2,783.110291
8,7781631943097904576,3,2,765.452454
9,7781631943097904576,4,2,747.99469


In [13]:
transform_pipeline.inverse_transform(X_test).head(10)

Unnamed: 0,segment_id,day,operation,amount
0,5477438150466095981,1,2,26258.0
1,5477438150466095981,3,3,3200.0
2,1013255213884708072,1,1,6550.0
3,1013255213884708072,3,4,16646.0
4,1013255213884708072,3,6,11900.0
5,1013255213884708072,1,9,14.6
6,6567845265428200430,0,2,3016.0
7,7152550698521310708,2,2,8720.0
8,7152550698521310708,4,4,18467.0
9,7152550698521310708,2,7,9100.0


In [14]:
orig = transform_pipeline.inverse_transform(X_test)

In [15]:
synth = transform_pipeline.inverse_transform(reconstructed)

In [16]:
orig = orig.drop(columns=['segment_id'])

In [17]:
synth = synth.drop(columns=['segment_id'])

In [18]:
orig.head(10)

Unnamed: 0,day,operation,amount
0,1,2,26258.0
1,3,3,3200.0
2,1,1,6550.0
3,3,4,16646.0
4,3,6,11900.0
5,1,9,14.6
6,0,2,3016.0
7,2,2,8720.0
8,4,4,18467.0
9,2,7,9100.0


In [19]:
synth.head(10)

Unnamed: 0,day,operation,amount
0,0,1,139.897339
1,1,1,119.069122
2,2,1,116.029465
3,3,1,117.953697
4,4,1,139.312607
5,0,2,583.938599
6,1,2,29051.90625
7,2,2,783.110291
8,3,2,765.452454
9,4,2,747.99469


In [20]:
from synthesized.testing.linkage_attack import linkage_attack

In [21]:
len(synth)

92820

In [22]:
len(orig)

6282

In [82]:
attrs = linkage_attack(orig, synth, categ_columns={'operation', 'day'}, t_closeness=0.2, k_distance=0.5)

In [71]:
len(attrs)

789

In [72]:
from synthesized.testing.linkage_attack import t_closeness_check, find_neighbour_distances, find_eq_class, find_eq_class_fuzzy

In [67]:
candidate_attrs = t_closeness_check(orig)

In [52]:
len(candidate_attrs)

8569

In [73]:
attrs

[{'operation': 2},
 {'operation': 3},
 {'operation': 1},
 {'operation': 4},
 {'operation': 6},
 {'operation': 9},
 {'operation': 7},
 {'operation': 8},
 {'operation': 5},
 {'day': 1},
 {'day': 3},
 {'day': 0},
 {'day': 2},
 {'day': 4},
 {'amount': 26258.0},
 {'amount': 8834.0},
 {'amount': 3900.0},
 {'amount': 2700.0},
 {'amount': 13900.0},
 {'amount': 1300.0},
 {'amount': 3456.0},
 {'amount': 8270.0},
 {'amount': 31550.0},
 {'amount': 2000.0},
 {'amount': 7900.0},
 {'amount': 4641.0},
 {'amount': 5800.0},
 {'amount': 7428.999999999999},
 {'amount': 10000.0},
 {'amount': 17980.0},
 {'amount': 8412.0},
 {'amount': 1700.0},
 {'amount': 1040.0},
 {'amount': 20611.0},
 {'amount': 2025.0},
 {'amount': 4736.0},
 {'amount': 6193.0},
 {'amount': 4345.0},
 {'amount': 2100.0},
 {'amount': 8600.0},
 {'amount': 4639.0},
 {'amount': 1200.0},
 {'amount': 3737.0},
 {'amount': 6474.0},
 {'amount': 5700.0},
 {'amount': 4400.0},
 {'amount': 46920.0},
 {'amount': 69144.0},
 {'amount': 538.0},
 {'amount':

In [36]:
from pyemd import emd_samples

In [74]:
amount = 31550.0

In [84]:
c1 = find_eq_class(orig, {'operation': 1})
c1

Unnamed: 0,day,operation,amount
2,1,1,6550.0
15,1,1,1437.0
16,3,1,511.0
27,4,1,502.0
37,3,1,40.0
86,2,1,6741.0
93,4,1,1757.0
138,4,1,4639.0
148,3,1,60.0
177,1,1,5464.0


In [85]:
down, up = find_neighbour_distances(orig, {'operation': 1}, {'operation', 'day'})
c2 = find_eq_class_fuzzy(synth, {'operation': 1}, down, up, {'operation', 'day'})
c2

Unnamed: 0,day,operation,amount
0,0,1,139.897339
1,1,1,119.069122
2,2,1,116.029465
3,3,1,117.953697
4,4,1,139.312607
42,0,1,139.897339
43,1,1,4981.971191
44,2,1,116.029465
45,3,1,117.953697
46,4,1,139.312607


In [98]:
%matplotlib inline

In [86]:
emd_samples(c1['operation'], c2['operation'])

0.0

In [91]:
emd_samples(c1['day'], c2['day'])

0.08715042071234783

In [88]:
emd_samples(c1['amount'], c2['amount'])

2833.5288092474375

In [41]:
def clean_df(df, attr_list):
    for attrs in attr_list:
        if len(attrs) > 1:
            continue
        f = pd.Series([True] * len(df), index=df.index)
        for attr, val in attrs.items():
            f = f & (df[attr] == val)
        df = df[~f]
    return df

In [79]:
synth_cleaned = clean_df(synth, attrs)

In [80]:
len(synth_cleaned)

0

In [81]:
len(synth)

92820