# The Credit Card Fraud Dataset - Synthesizing the Minority Class

This notebook presents a practical exercise that showcases how to use the YData Synthetic library together with
GANs to synthesize tabular data.
For this exercise we use the credit card fraud dataset from Kaggle, which can be found here:
https://www.kaggle.com/mlg-ulb/creditcardfraud

In [1]:
# Note: You can select between running the Notebook on "CPU" or "GPU"
# Click "Runtime > Change runtime type" and set the hardware accelerator to "GPU"

In [2]:
# Install ydata-synthetic lib
# ! pip install ydata-synthetic

In [3]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.cluster as cluster

from ydata_synthetic.synthesizers.regular import VanilllaGAN
from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
# Star import kept because it provides transformations(); pd/np/plt are now
# imported explicitly above instead of relying on this module re-exporting them.
from ydata_synthetic.preprocessing.regular.credit_fraud import *

# GAN class used by the training cell below (instantiated there as model(gan_args)).
# The triple-l spelling matches the name imported from ydata_synthetic above.
model = VanilllaGAN

In [4]:
# Load the raw credit card fraud CSV. The first CSV column becomes the DataFrame
# index (index_col=[0]), so it does not appear in data.columns.
# No preprocessing happens here; transformations() is applied in a later cell.
data = pd.read_csv('../../data/creditcard.csv', index_col=[0])

In [5]:
# Feature columns: every column of the dataset except the 'Class' target
data_cols = [col for col in data.columns if col != 'Class']

print(f'Dataset columns: {data_cols}')

# Copy of the dataset with the columns in a fixed, hand-picked order ('Class' last).
# NOTE(review): the rationale for this ordering is not visible in this notebook, and
# processed_data is not referenced by the later cells shown here.
sorted_cols = ['V14', 'V4', 'V10', 'V17', 'V12', 'V26', 'Amount', 'V21', 'V8', 'V11', 'V7', 'V28', 'V19', 'V3', 'V22', 'V6', 'V20', 'V27', 'V16', 'V13', 'V25', 'V24', 'V18', 'V2', 'V1', 'V5', 'V15', 'V9', 'V23', 'Class']
processed_data = data.loc[:, sorted_cols].copy()

Dataset columns: ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']


In [6]:
# Apply the required data transformations before training the GAN.
# transformations() applies a power transformation that makes the feature
# distributions more Gaussian-like; only its second return value is kept.
# NOTE: `data` is rebound to the transformed frame, so re-running this cell passes
# already-transformed data back into transformations().
_, data, _ = transformations(data)

# Only the minority class is synthesized in this example: keep the rows whose
# 'Class' value is 1 (the fraud cases, 492 rows).
is_fraud = data['Class'] == 1
train_data = data.loc[is_fraud].copy()

n_records, n_variables = train_data.shape
print("Dataset info: Number of records - {} Number of variables - {}".format(n_records, n_variables))

# Split the fraud records into 2 groups with K-means (fixed random_state for
# reproducibility). `labels` holds one cluster id (0 or 1) per row of train_data.
algorithm = cluster.KMeans
args, kwds = (), {'n_clusters':2, 'random_state':0}
kmeans = algorithm(*args, **kwds)
labels = kmeans.fit_predict(train_data[ data_cols ])

# Show how many records fell into each cluster
cluster_ids, cluster_sizes = np.unique(labels, return_counts=True)
print( pd.DataFrame( {'count': cluster_sizes}, index=cluster_ids ) )

# Separate copy of the fraud records in which 'Class' is replaced by the K-means cluster id
fraud_w_classes = train_data.assign(Class=labels)

Dataset info: Number of records - 492 Number of variables - 30
   count
0    384
1    108


# GAN training

Below you can try to train your own generators using the available GAN architectures. You can train them either with labels (created using KMeans) or with no labels at all.

Remember that for this exercise in particular we've decided to synthesize only the minority class from the Credit Fraud dataset.

In [7]:
# Define the GAN and training parameters
noise_dim = 32        # size of the random noise vector fed to the generator
dim = 128             # base layer width passed to ModelParameters(layers_dim=...)
batch_size = 128

log_step = 100        # passed to TrainParameters(sample_interval=...)
epochs = 200+1        # +1 so that step 200 itself is reached (steps run 0..200)
learning_rate = 5e-4
beta_1 = 0.5
beta_2 = 0.9
models_dir = './cache'

# Work on a fresh, 0..n-1 indexed copy so this cell can be re-run safely
train_sample = fraud_w_classes.copy().reset_index(drop=True)
print("train_sample.columns:")
print(train_sample.columns)

# 'Class' holds the K-means cluster id (0 or 1). One-hot encode it and drop the
# first level, which leaves a single indicator column 'Class_1' telling whether a
# sample belongs to cluster 1 or not.
# dtype=float is pinned explicitly: newer pandas versions emit bool dummies by
# default, and a frame mixing float and bool columns converts to an object array,
# which cannot be fed to the networks as a numeric batch.
train_sample = pd.get_dummies(train_sample, columns=['Class'], prefix='Class', drop_first=True, dtype=float)

# Label column(s): here just 'Class_1'
label_cols = [ i for i in train_sample.columns if 'Class' in i ]

# All remaining (feature) columns
data_cols = [ i for i in train_sample.columns if i not in label_cols ]

# Scale the features down by 10 so they are on the same order of magnitude as the
# random noise input - one less thing for the GAN to learn
train_sample[ data_cols ] = train_sample[ data_cols ] / 10
# Features only, without the label column
train_no_label = train_sample[ data_cols ]

train_sample.columns:
Index(['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
       'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21',
       'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Class'],
      dtype='object')


In [8]:
# Model hyper-parameters for the GAN. n_cols is the width of the training frame
# (feature columns plus the 'Class_1' label column).
gan_args = ModelParameters(
    batch_size=batch_size,
    lr=learning_rate,
    betas=(beta_1, beta_2),
    noise_dim=noise_dim,
    layers_dim=dim,
    n_cols=len(train_sample.columns),
)

# Training-loop settings: number of steps and the sampling/logging interval
train_args = TrainParameters(
    epochs=epochs,
    sample_interval=log_step,
)

In [9]:
# Instantiate the chosen GAN class (`model`, set in the imports cell; other
# architectures such as CGAN or DCGAN could be plugged in there) and train it.
# The whole frame, including the 'Class_1' column, is passed in, so the label is
# learned as just another output column.
# The plotting cell further down loads generator weights from 'cache/' for steps
# 0, 100 and 200 - presumably written by train() every `sample_interval` steps;
# verify against the library.
synthesizer = model(gan_args)
synthesizer.train(train_sample, train_args)

  1%|▏         | 3/201 [00:00<00:39,  5.04it/s]

0 [D loss: 0.667799, acc.: 50.00%] [G loss: 0.627460]
generated_data
1 [D loss: 0.711574, acc.: 50.00%] [G loss: 0.605750]
2 [D loss: 0.685413, acc.: 55.86%] [G loss: 0.811609]
3 [D loss: 0.664166, acc.: 64.84%] [G loss: 0.925518]


  3%|▎         | 7/201 [00:00<00:19,  9.93it/s]

4 [D loss: 0.609520, acc.: 75.78%] [G loss: 0.942143]
5 [D loss: 0.614591, acc.: 75.00%] [G loss: 0.833043]
6 [D loss: 0.615558, acc.: 63.28%] [G loss: 0.792707]
7 [D loss: 0.626250, acc.: 57.81%] [G loss: 0.754947]


  5%|▌         | 11/201 [00:01<00:15, 11.98it/s]

8 [D loss: 0.786874, acc.: 49.61%] [G loss: 0.651523]
9 [D loss: 0.479988, acc.: 86.33%] [G loss: 1.305638]
10 [D loss: 0.408540, acc.: 86.33%] [G loss: 1.362704]


  7%|▋         | 15/201 [00:01<00:13, 13.71it/s]

11 [D loss: 0.533762, acc.: 64.06%] [G loss: 0.940758]
12 [D loss: 0.598850, acc.: 60.94%] [G loss: 0.964313]
13 [D loss: 0.778332, acc.: 32.42%] [G loss: 0.729217]
14 [D loss: 0.859972, acc.: 13.67%] [G loss: 0.655480]


  8%|▊         | 17/201 [00:01<00:13, 13.68it/s]

15 [D loss: 0.742009, acc.: 33.98%] [G loss: 0.852757]
16 [D loss: 0.682731, acc.: 59.38%] [G loss: 0.908791]
17 [D loss: 0.639570, acc.: 68.36%] [G loss: 1.022757]


 10%|█         | 21/201 [00:01<00:12, 14.24it/s]

18 [D loss: 0.612888, acc.: 70.31%] [G loss: 0.988172]
19 [D loss: 0.640458, acc.: 58.20%] [G loss: 0.809449]
20 [D loss: 0.648786, acc.: 66.02%] [G loss: 0.838685]
21 [D loss: 0.507747, acc.: 81.25%] [G loss: 1.097573]


 12%|█▏        | 25/201 [00:02<00:11, 14.83it/s]

22 [D loss: 0.460041, acc.: 84.77%] [G loss: 1.104118]
23 [D loss: 0.483974, acc.: 76.95%] [G loss: 0.992063]
24 [D loss: 0.596746, acc.: 53.52%] [G loss: 0.749414]
25 [D loss: 0.523119, acc.: 76.95%] [G loss: 1.000256]


 14%|█▍        | 29/201 [00:02<00:12, 14.04it/s]

26 [D loss: 0.588859, acc.: 72.66%] [G loss: 0.951767]
27 [D loss: 0.741217, acc.: 50.78%] [G loss: 0.818179]
28 [D loss: 0.743806, acc.: 47.27%] [G loss: 1.011570]


 15%|█▌        | 31/201 [00:02<00:11, 14.22it/s]

29 [D loss: 0.695435, acc.: 63.28%] [G loss: 1.125682]
30 [D loss: 0.604529, acc.: 71.48%] [G loss: 1.190506]
31 [D loss: 0.646084, acc.: 65.23%] [G loss: 1.132169]


 17%|█▋        | 35/201 [00:02<00:11, 13.92it/s]

32 [D loss: 0.734277, acc.: 59.77%] [G loss: 0.853960]
33 [D loss: 0.746823, acc.: 53.12%] [G loss: 0.762385]
34 [D loss: 0.670159, acc.: 60.16%] [G loss: 0.916341]


 18%|█▊        | 37/201 [00:03<00:11, 14.12it/s]

35 [D loss: 0.671954, acc.: 61.33%] [G loss: 1.064799]
36 [D loss: 0.660299, acc.: 64.45%] [G loss: 1.115992]
37 [D loss: 0.659350, acc.: 64.45%] [G loss: 1.045850]


 20%|██        | 41/201 [00:03<00:11, 14.06it/s]

38 [D loss: 0.695063, acc.: 50.78%] [G loss: 0.896535]
39 [D loss: 0.702763, acc.: 60.16%] [G loss: 1.007340]
40 [D loss: 0.659711, acc.: 63.67%] [G loss: 1.125074]


 22%|██▏       | 45/201 [00:03<00:10, 14.56it/s]

41 [D loss: 0.666940, acc.: 60.16%] [G loss: 1.107759]
42 [D loss: 0.695480, acc.: 52.73%] [G loss: 0.997851]
43 [D loss: 0.650630, acc.: 60.94%] [G loss: 1.052085]
44 [D loss: 0.582053, acc.: 80.86%] [G loss: 1.031008]


 24%|██▍       | 49/201 [00:03<00:10, 14.95it/s]

45 [D loss: 0.555156, acc.: 78.12%] [G loss: 1.036592]
46 [D loss: 0.537293, acc.: 77.34%] [G loss: 0.950673]
47 [D loss: 0.584877, acc.: 65.23%] [G loss: 1.061974]
48 [D loss: 0.627898, acc.: 58.20%] [G loss: 0.996153]


 26%|██▋       | 53/201 [00:04<00:09, 15.15it/s]

49 [D loss: 0.759212, acc.: 39.06%] [G loss: 0.941198]
50 [D loss: 0.723677, acc.: 54.30%] [G loss: 1.107731]
51 [D loss: 0.684543, acc.: 55.86%] [G loss: 1.043659]
52 [D loss: 0.691835, acc.: 54.30%] [G loss: 0.992927]


 28%|██▊       | 57/201 [00:04<00:09, 15.18it/s]

53 [D loss: 0.648824, acc.: 63.67%] [G loss: 0.979212]
54 [D loss: 0.664794, acc.: 60.16%] [G loss: 1.011013]
55 [D loss: 0.729410, acc.: 54.69%] [G loss: 0.854251]
56 [D loss: 0.723036, acc.: 53.12%] [G loss: 0.914398]


 29%|██▉       | 59/201 [00:04<00:09, 15.19it/s]

57 [D loss: 0.568848, acc.: 80.47%] [G loss: 1.097183]
58 [D loss: 0.622805, acc.: 64.06%] [G loss: 0.952932]
59 [D loss: 0.667270, acc.: 56.64%] [G loss: 0.992395]


 31%|███▏      | 63/201 [00:04<00:09, 14.80it/s]

60 [D loss: 0.593453, acc.: 67.19%] [G loss: 1.231334]
61 [D loss: 0.618000, acc.: 62.89%] [G loss: 1.083225]
62 [D loss: 0.697268, acc.: 54.30%] [G loss: 0.889366]
63 [D loss: 0.679349, acc.: 59.38%] [G loss: 0.926913]


 33%|███▎      | 67/201 [00:05<00:08, 15.02it/s]

64 [D loss: 0.613594, acc.: 70.70%] [G loss: 0.965451]
65 [D loss: 0.587141, acc.: 73.83%] [G loss: 1.164349]
66 [D loss: 0.655603, acc.: 66.02%] [G loss: 1.009748]


 35%|███▌      | 71/201 [00:05<00:08, 14.98it/s]

67 [D loss: 0.693001, acc.: 53.91%] [G loss: 0.928315]
68 [D loss: 0.722607, acc.: 45.70%] [G loss: 0.874341]
69 [D loss: 0.614516, acc.: 71.48%] [G loss: 1.043282]
70 [D loss: 0.500814, acc.: 80.47%] [G loss: 1.167846]


 36%|███▋      | 73/201 [00:05<00:08, 14.88it/s]

71 [D loss: 0.615568, acc.: 66.80%] [G loss: 0.883015]
72 [D loss: 0.662185, acc.: 57.81%] [G loss: 0.866582]
73 [D loss: 0.662623, acc.: 58.59%] [G loss: 0.957053]


 38%|███▊      | 77/201 [00:05<00:08, 14.72it/s]

74 [D loss: 0.642028, acc.: 64.06%] [G loss: 0.980985]
75 [D loss: 0.604001, acc.: 69.14%] [G loss: 1.040126]
76 [D loss: 0.603764, acc.: 71.48%] [G loss: 1.055285]
77 [D loss: 0.657053, acc.: 56.25%] [G loss: 1.020827]


 40%|████      | 81/201 [00:06<00:08, 14.95it/s]

78 [D loss: 0.631437, acc.: 62.50%] [G loss: 1.064563]
79 [D loss: 0.627216, acc.: 62.11%] [G loss: 1.056897]
80 [D loss: 0.639494, acc.: 58.59%] [G loss: 1.099240]
81 [D loss: 0.687297, acc.: 54.69%] [G loss: 0.978818]


 42%|████▏     | 85/201 [00:06<00:07, 15.12it/s]

82 [D loss: 0.675774, acc.: 54.30%] [G loss: 1.035994]
83 [D loss: 0.552702, acc.: 80.47%] [G loss: 1.179609]
84 [D loss: 0.531251, acc.: 80.08%] [G loss: 1.142820]
85 [D loss: 0.633887, acc.: 64.06%] [G loss: 0.885979]


 44%|████▍     | 89/201 [00:06<00:07, 14.73it/s]

86 [D loss: 0.705252, acc.: 50.78%] [G loss: 0.919509]
87 [D loss: 0.625692, acc.: 70.31%] [G loss: 1.203430]
88 [D loss: 0.596135, acc.: 75.00%] [G loss: 1.170023]


 46%|████▋     | 93/201 [00:06<00:07, 14.89it/s]

89 [D loss: 0.665914, acc.: 56.25%] [G loss: 0.873145]
90 [D loss: 0.633155, acc.: 62.11%] [G loss: 1.051797]
91 [D loss: 0.604939, acc.: 68.36%] [G loss: 1.016445]
92 [D loss: 0.653639, acc.: 59.77%] [G loss: 0.954702]


 47%|████▋     | 95/201 [00:06<00:07, 15.04it/s]

93 [D loss: 0.620737, acc.: 64.06%] [G loss: 1.073353]
94 [D loss: 0.628342, acc.: 64.06%] [G loss: 1.129244]
95 [D loss: 0.619721, acc.: 64.84%] [G loss: 1.050286]


 49%|████▉     | 99/201 [00:07<00:07, 14.50it/s]

96 [D loss: 0.677251, acc.: 60.16%] [G loss: 0.945121]
97 [D loss: 0.670997, acc.: 61.33%] [G loss: 0.920590]
98 [D loss: 0.660021, acc.: 59.38%] [G loss: 0.968030]


 50%|█████     | 101/201 [00:07<00:07, 14.09it/s]

99 [D loss: 0.607855, acc.: 67.58%] [G loss: 1.064863]
100 [D loss: 0.584812, acc.: 75.00%] [G loss: 1.067599]
generated_data
101 [D loss: 0.647032, acc.: 63.28%] [G loss: 1.009043]


 52%|█████▏    | 105/201 [00:07<00:06, 14.37it/s]

102 [D loss: 0.702883, acc.: 46.48%] [G loss: 0.785143]
103 [D loss: 0.653963, acc.: 59.38%] [G loss: 0.921296]
104 [D loss: 0.660003, acc.: 59.77%] [G loss: 0.992312]


 53%|█████▎    | 107/201 [00:07<00:06, 14.14it/s]

105 [D loss: 0.674144, acc.: 54.30%] [G loss: 0.842441]
106 [D loss: 0.643941, acc.: 62.11%] [G loss: 0.905057]
107 [D loss: 0.653854, acc.: 60.16%] [G loss: 0.997020]


 55%|█████▌    | 111/201 [00:08<00:06, 14.20it/s]

108 [D loss: 0.645455, acc.: 62.11%] [G loss: 0.932954]
109 [D loss: 0.665808, acc.: 53.52%] [G loss: 0.888468]
110 [D loss: 0.639209, acc.: 61.72%] [G loss: 0.893442]


 56%|█████▌    | 113/201 [00:08<00:06, 14.13it/s]

111 [D loss: 0.636178, acc.: 63.67%] [G loss: 0.925546]
112 [D loss: 0.607769, acc.: 71.48%] [G loss: 0.993356]
113 [D loss: 0.652839, acc.: 61.72%] [G loss: 0.949343]


 58%|█████▊    | 117/201 [00:08<00:05, 14.43it/s]

114 [D loss: 0.629230, acc.: 67.19%] [G loss: 0.991919]
115 [D loss: 0.596526, acc.: 70.31%] [G loss: 1.092277]
116 [D loss: 0.613246, acc.: 71.09%] [G loss: 1.062938]


 59%|█████▉    | 119/201 [00:08<00:05, 13.75it/s]

117 [D loss: 0.624708, acc.: 66.41%] [G loss: 0.957176]
118 [D loss: 0.628994, acc.: 61.72%] [G loss: 1.109560]
119 [D loss: 0.615997, acc.: 65.62%] [G loss: 1.018562]


 61%|██████    | 123/201 [00:08<00:05, 14.15it/s]

120 [D loss: 0.638407, acc.: 60.16%] [G loss: 0.980298]
121 [D loss: 0.592755, acc.: 67.97%] [G loss: 1.163128]
122 [D loss: 0.597432, acc.: 68.36%] [G loss: 1.115566]


 62%|██████▏   | 125/201 [00:09<00:05, 14.29it/s]

123 [D loss: 0.618195, acc.: 65.62%] [G loss: 1.072845]
124 [D loss: 0.620309, acc.: 62.11%] [G loss: 1.011995]
125 [D loss: 0.604478, acc.: 67.97%] [G loss: 0.992189]


 64%|██████▍   | 129/201 [00:09<00:05, 14.37it/s]

126 [D loss: 0.615076, acc.: 66.41%] [G loss: 0.956289]
127 [D loss: 0.594594, acc.: 69.14%] [G loss: 1.006189]
128 [D loss: 0.597467, acc.: 67.58%] [G loss: 1.124163]


 65%|██████▌   | 131/201 [00:09<00:04, 14.43it/s]

129 [D loss: 0.640436, acc.: 59.77%] [G loss: 1.114807]
130 [D loss: 0.589628, acc.: 66.02%] [G loss: 1.092964]
131 [D loss: 0.643631, acc.: 59.38%] [G loss: 0.959220]


 67%|██████▋   | 135/201 [00:09<00:04, 14.65it/s]

132 [D loss: 0.624654, acc.: 65.23%] [G loss: 1.012808]
133 [D loss: 0.537211, acc.: 74.22%] [G loss: 1.223650]
134 [D loss: 0.548765, acc.: 76.56%] [G loss: 1.047919]


 68%|██████▊   | 137/201 [00:09<00:04, 14.63it/s]

135 [D loss: 0.593324, acc.: 67.58%] [G loss: 1.031100]
136 [D loss: 0.628770, acc.: 63.28%] [G loss: 1.017282]
137 [D loss: 0.638802, acc.: 62.11%] [G loss: 1.023109]


 70%|███████   | 141/201 [00:10<00:04, 14.36it/s]

138 [D loss: 0.544216, acc.: 74.22%] [G loss: 1.145653]
139 [D loss: 0.526260, acc.: 75.00%] [G loss: 1.257279]
140 [D loss: 0.554439, acc.: 72.27%] [G loss: 1.173286]


 72%|███████▏  | 145/201 [00:10<00:03, 14.57it/s]

141 [D loss: 0.629538, acc.: 61.33%] [G loss: 1.136336]
142 [D loss: 0.633804, acc.: 63.67%] [G loss: 0.980453]
143 [D loss: 0.631608, acc.: 64.06%] [G loss: 0.987468]
144 [D loss: 0.563645, acc.: 69.92%] [G loss: 1.279492]


 74%|███████▍  | 149/201 [00:10<00:03, 14.84it/s]

145 [D loss: 0.495721, acc.: 76.17%] [G loss: 1.502890]
146 [D loss: 0.545707, acc.: 73.83%] [G loss: 1.231275]
147 [D loss: 0.615372, acc.: 58.20%] [G loss: 0.950182]
148 [D loss: 0.659109, acc.: 60.94%] [G loss: 0.950729]


 75%|███████▌  | 151/201 [00:10<00:03, 14.92it/s]

149 [D loss: 0.612304, acc.: 63.28%] [G loss: 1.109508]
150 [D loss: 0.552484, acc.: 71.09%] [G loss: 1.209966]
151 [D loss: 0.545809, acc.: 71.09%] [G loss: 1.135190]


 77%|███████▋  | 155/201 [00:11<00:03, 14.67it/s]

152 [D loss: 0.578675, acc.: 66.02%] [G loss: 1.168916]
153 [D loss: 0.584283, acc.: 65.23%] [G loss: 1.196073]
154 [D loss: 0.598975, acc.: 67.58%] [G loss: 1.135116]


 79%|███████▉  | 159/201 [00:11<00:02, 14.84it/s]

155 [D loss: 0.606514, acc.: 66.41%] [G loss: 1.032472]
156 [D loss: 0.594855, acc.: 67.19%] [G loss: 1.082688]
157 [D loss: 0.560641, acc.: 72.27%] [G loss: 1.080591]
158 [D loss: 0.540712, acc.: 74.22%] [G loss: 1.185920]


 81%|████████  | 163/201 [00:11<00:02, 14.91it/s]

159 [D loss: 0.590826, acc.: 65.23%] [G loss: 1.120154]
160 [D loss: 0.619872, acc.: 65.62%] [G loss: 1.025882]
161 [D loss: 0.612490, acc.: 64.45%] [G loss: 1.083111]
162 [D loss: 0.577126, acc.: 68.75%] [G loss: 1.244798]


 83%|████████▎ | 167/201 [00:11<00:02, 14.97it/s]

163 [D loss: 0.526530, acc.: 75.00%] [G loss: 1.235106]
164 [D loss: 0.555750, acc.: 72.27%] [G loss: 1.104536]
165 [D loss: 0.580906, acc.: 69.92%] [G loss: 1.243667]
166 [D loss: 0.527248, acc.: 69.53%] [G loss: 1.300426]


 85%|████████▌ | 171/201 [00:12<00:02, 14.86it/s]

167 [D loss: 0.564445, acc.: 70.31%] [G loss: 1.134656]
168 [D loss: 0.566677, acc.: 75.00%] [G loss: 1.079484]
169 [D loss: 0.600965, acc.: 67.58%] [G loss: 1.197279]
170 [D loss: 0.605864, acc.: 63.67%] [G loss: 1.257107]


 87%|████████▋ | 175/201 [00:12<00:01, 14.93it/s]

171 [D loss: 0.581070, acc.: 67.19%] [G loss: 1.214595]
172 [D loss: 0.558318, acc.: 69.53%] [G loss: 1.273430]
173 [D loss: 0.529268, acc.: 78.12%] [G loss: 1.231471]
174 [D loss: 0.545604, acc.: 74.22%] [G loss: 1.202493]


 88%|████████▊ | 177/201 [00:12<00:01, 14.87it/s]

175 [D loss: 0.561914, acc.: 73.44%] [G loss: 1.217553]
176 [D loss: 0.549528, acc.: 73.05%] [G loss: 1.282071]
177 [D loss: 0.549463, acc.: 75.78%] [G loss: 1.319983]


 90%|█████████ | 181/201 [00:12<00:01, 14.88it/s]

178 [D loss: 0.556112, acc.: 73.05%] [G loss: 1.283996]
179 [D loss: 0.577835, acc.: 66.41%] [G loss: 1.278464]
180 [D loss: 0.580962, acc.: 67.97%] [G loss: 1.145324]
181 [D loss: 0.631294, acc.: 63.28%] [G loss: 1.088824]


 92%|█████████▏| 185/201 [00:13<00:01, 14.84it/s]

182 [D loss: 0.609078, acc.: 67.58%] [G loss: 1.082600]
183 [D loss: 0.568129, acc.: 73.05%] [G loss: 1.153463]
184 [D loss: 0.573630, acc.: 70.31%] [G loss: 1.072984]


 93%|█████████▎| 187/201 [00:13<00:00, 14.65it/s]

185 [D loss: 0.545454, acc.: 73.44%] [G loss: 1.159975]
186 [D loss: 0.604143, acc.: 67.58%] [G loss: 1.104958]
187 [D loss: 0.569865, acc.: 69.92%] [G loss: 1.136199]


 95%|█████████▌| 191/201 [00:13<00:00, 14.43it/s]

188 [D loss: 0.526441, acc.: 73.83%] [G loss: 1.258840]
189 [D loss: 0.585926, acc.: 68.36%] [G loss: 1.214675]
190 [D loss: 0.584743, acc.: 67.19%] [G loss: 1.237197]


 96%|█████████▌| 193/201 [00:13<00:00, 14.41it/s]

191 [D loss: 0.572285, acc.: 67.58%] [G loss: 1.207233]
192 [D loss: 0.537477, acc.: 71.48%] [G loss: 1.200769]
193 [D loss: 0.550338, acc.: 72.66%] [G loss: 1.235749]


 98%|█████████▊| 197/201 [00:13<00:00, 14.37it/s]

194 [D loss: 0.598589, acc.: 66.80%] [G loss: 1.144557]
195 [D loss: 0.607460, acc.: 67.58%] [G loss: 1.045876]
196 [D loss: 0.559414, acc.: 72.66%] [G loss: 1.158894]


 99%|█████████▉| 199/201 [00:14<00:00, 14.37it/s]

197 [D loss: 0.543792, acc.: 71.88%] [G loss: 1.285328]
198 [D loss: 0.563303, acc.: 66.41%] [G loss: 1.143555]
199 [D loss: 0.611836, acc.: 62.11%] [G loss: 1.033467]


100%|██████████| 201/201 [00:14<00:00, 14.10it/s]

200 [D loss: 0.597143, acc.: 64.84%] [G loss: 1.126840]
generated_data





In [10]:
# Print the layer-by-layer summary of the trained generator network
# (noise vector in, one synthetic row of n_cols values out)
synthesizer.generator.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(128, 32)]               0         
_________________________________________________________________
dense (Dense)                (128, 128)                4224      
_________________________________________________________________
dense_1 (Dense)              (128, 256)                33024     
_________________________________________________________________
dense_2 (Dense)              (128, 512)                131584    
_________________________________________________________________
dense_3 (Dense)              (128, 30)                 15390     
Total params: 184,222
Trainable params: 184,222
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Print the layer-by-layer summary of the trained discriminator network
# (one row of n_cols values in, a single real/fake score out)
synthesizer.discriminator.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(128, 30)]               0         
_________________________________________________________________
dense_4 (Dense)              (128, 512)                15872     
_________________________________________________________________
dropout (Dropout)            (128, 512)                0         
_________________________________________________________________
dense_5 (Dense)              (128, 256)                131328    
_________________________________________________________________
dropout_1 (Dropout)          (128, 256)                0         
_________________________________________________________________
dense_6 (Dense)              (128, 128)                32896     
_________________________________________________________________
dense_7 (Dense)              (128, 1)                  129 

In [12]:
# You can easily save the trained synthesizer and load it again afterwards.
# exist_ok=True makes the directory creation idempotent, so the cell can be
# re-run without a separate existence check.
os.makedirs("./saved/gan", exist_ok=True)
synthesizer.save(path="./saved/gan/generator_fraud.pkl")

In [13]:
# Registry of trained generators to compare in the plotting cell below.
# Each value is [plot title, with_class flag, generator model]; with_class=False
# means the generator is called with the noise vector only (no label input).
models = {'GAN': ['GAN', False, synthesizer.generator]}

In [14]:
# Visualization settings
seed = 17
# One synthetic sample per real fraud record (492 here), derived from the training
# frame instead of being hard-coded.
test_size = len(train_sample)

# noise_dim is deliberately NOT redefined here: the value from the parameter cell
# is reused so that z always matches the input size the generator was built with.
np.random.seed(seed)
z = np.random.normal(size=(test_size, noise_dim))
real = synthesizer.get_data_batch(train=train_sample, batch_size=test_size, seed=seed)
real_samples = pd.DataFrame(real, columns=data_cols+label_cols)
# Cluster labels, only consumed by generators that take a class input (with_class=True)
labels = fraud_w_classes['Class']

model_names = ['GAN']
colors = ['deepskyblue','blue']
markers = ['o','^']
class_labels = ['Class 1','Class 2']

# The two features shown on the x and y axes of every scatter plot
col1, col2 = 'V17', 'V10'

# Directory prefix of the generator weight files loaded below
base_dir = 'cache/'

# Training steps whose saved generator weights are compared, one figure row per step
model_steps = [ 0, 100, 200]
rows = len(model_steps)
columns = 1 + len(models)


def _scatter_by_class(ax, samples):
    """Scatter col1 vs col2 of `samples` on `ax`, one marker style per 'Class_1' group."""
    for (_, grp), color, marker, label in zip(samples.groupby('Class_1'), colors, markers, class_labels):
        ax.scatter(grp[col1], grp[col2],
                   label=label, marker=marker, edgecolors=color, facecolors='none')


# Explicit axes grid: the left column shows the real data, the other columns one model each
fig, axes = plt.subplots(rows, columns, figsize=(14, rows*3), squeeze=False)
axarr = [axes_row[0] for axes_row in axes]

# Go through each of the model_step values -> 0, 100, 200
for model_step_ix, model_step in enumerate(model_steps):
    real_ax = axarr[model_step_ix]

    # Left plot: the actual fraud samples, split by K-means cluster
    _scatter_by_class(real_ax, real_samples)
    real_ax.set_title('Actual Fraud Data')
    real_ax.set_ylabel(col2) # Only add y label to left plot
    real_ax.set_xlabel(col1)
    # Remember the limits of the real-data plot so the generated plots share them
    xlims, ylims = real_ax.get_xlim(), real_ax.get_ylim()

    if model_step_ix == 0:
        legend = real_ax.legend()
        legend.get_frame().set_facecolor('white')

    # Go through all the GAN models listed in 'model_names' and defined in 'models'
    for i, model_key in enumerate(model_names):
        model_name, with_class, generator_model = models[model_key]

        # Restore the generator weights saved at this training step
        generator_model.load_weights( base_dir + '_generator_model_weights_step_'+str(model_step)+'.h5')

        ax = axes[model_step_ix][i + 1]

        if with_class:
            g_z = generator_model.predict([z, labels])
            gen_samples = pd.DataFrame(g_z, columns=data_cols+label_cols)
            _scatter_by_class(ax, gen_samples)
        else:
            g_z = generator_model.predict(z)
            gen_samples = pd.DataFrame(g_z, columns=data_cols+['label'])
            # Rewritten on every iteration, so the file ends up holding the samples
            # of the last training step processed
            gen_samples.to_csv('../../data/Generated_sample.csv')
            ax.scatter(gen_samples[col1], gen_samples[col2],
                       label=class_labels[0], marker=markers[0], edgecolors=colors[0], facecolors='none')
        ax.set_title(model_name)
        ax.set_xlabel(col1)
        ax.set_xlim(xlims)
        ax.set_ylim(ylims)

fig.suptitle('Comparison of GAN outputs', size=16, fontweight='bold')
# Leave room on the left for the step labels and on top for the suptitle
fig.tight_layout(rect=[0.075,0,1,0.95])

# Adding text labels for training steps, vertically placed relative to each row.
# The public get_position() accessor is used rather than the private _position attribute.
vpositions = np.array([ left_ax.get_position().bounds[1] for left_ax in axarr ])
vpositions += ((vpositions[0] - vpositions[1]) * 0.35 )
for model_step_ix, model_step in enumerate( model_steps ):
    fig.text( 0.05, vpositions[model_step_ix], 'training\nstep\n'+str(model_step), ha='center', va='center', size=12)

os.makedirs("./img", exist_ok=True)
fig.savefig('img/Comparison_of_GAN_outputs.png', dpi=100)