# TimeGAN Tutorial

## Time-series Generative Adversarial Networks

- Paper: Jinsung Yoon, Daniel Jarrett, Mihaela van der Schaar, "Time-series Generative Adversarial Networks," Neural Information Processing Systems (NeurIPS), 2019.

- Paper link: https://papers.nips.cc/paper/8789-time-series-generative-adversarial-networks

- Last updated Date: April 24th 2020

- Code author: Jinsung Yoon (jsyoon0823@gmail.com)

This notebook is a user guide for generating synthetic time-series data with the TimeGAN framework. We use the Stock, Energy, and Sine datasets as examples.

### Prerequisite
Clone https://github.com/jsyoon0823/timeGAN.git to the current directory.

## Necessary packages and functions call

- timegan: Synthetic time-series data generation module
- data_loading: loading and preprocessing of 2 real datasets and 1 synthetic dataset
- metrics: 
    - discriminative_metrics: classify real data from synthetic data
    - predictive_metrics: train on synthetic, test on real
    - visualization: PCA and tSNE analyses

In [1]:
## Necessary packages
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import MinMaxScaler

# 1. TimeGAN model
from timegan import timegan
# 2. Data loading
from data_loading import real_data_loading, sine_data_generation
# 3. Metrics
from metrics.discriminative_metrics import discriminative_score_metrics
from metrics.predictive_metrics import predictive_score_metrics
from metrics.visualization_metrics import visualization

In [2]:
# Load the raw dataset. The location can be overridden with the ITU_DATASET_CSV
# environment variable so the notebook is not tied to a single machine; the default
# is the path that was previously hard-coded here.
dataset_csv = os.environ.get(
    'ITU_DATASET_CSV',
    '/Users/xiafei/code/itu-ml-challenge/csv/dataset.csv',
)
df_ori = pd.read_csv(dataset_csv)

# Keep only the columns that contain at least one non-zero value.
df_ori = df_ori.loc[:, (df_ori != 0).any(axis=0)]

# Drop the 'Unnamed: 0' column when it is present (presumably a saved index column --
# TODO confirm). Only its absence is tolerated; any other failure now surfaces
# instead of being hidden by a bare `except`.
if 'Unnamed: 0' in df_ori.columns:
    df_ori = df_ori.drop(columns=['Unnamed: 0'])
else:
    print('drop error')

In [3]:
# Print the (rows, columns) shape, then display the first rows of the loaded frame.
print(df_ori.shape)
df_ori.head()

(9670, 662)


Unnamed: 0,p_/computes0/service/id,p_/computes0/vcpus_used,p_/computes0/vcpus,p_/computes0/memory_mb_used,p_/computes0/memory_mb,p_/computes0/cpu_info/topology/cores,p_/computes0/cpu_info/topology/cells,p_/computes0/cpu_info/topology/threads,p_/computes0/cpu_info/topology/sockets,p_/computes0/running_vms,...,v_/ports#link-tr-tr-a-1-y/metrics/network-incoming-bytes-rate,v_/ports#link-tr-tr-a-1-y/metrics/network-incoming-packets,v_/ports#link-tr-tr-a-1-y/metrics/network-incoming-packets-rate,v_/ports#link-tr-tr-a-1-y/metrics/network-outgoing-bytes,v_/ports#link-tr-tr-a-1-y/metrics/network-outgoing-bytes-rate,v_/ports#link-tr-tr-a-1-y/metrics/network-outgoing-packets,v_/ports#link-tr-tr-a-1-y/metrics/network-outgoing-packets-rate,v_/time,v_type,v_type_code
0,16,20,48,41472,257790,12,2,2,1,5,...,13.236069,1401532.0,0.100425,174853276.0,11.431817,1263588.0,0.099783,1593395580,ixnetwork-traffic-start,0
1,16,20,48,41472,257790,12,2,2,1,5,...,11.438221,1401538.0,0.099506,174854074.0,13.313681,1263595.0,0.116705,1593395640,ixnetwork-traffic-start,0
2,16,20,48,41472,257790,12,2,2,1,5,...,11.438221,1401545.0,0.099506,174854758.0,13.313681,1263601.0,0.100463,1593395700,ixnetwork-traffic-start,0
3,16,20,48,41472,257790,12,2,2,1,5,...,11.429972,1401551.0,0.099826,174855556.0,13.277844,1263608.0,0.116223,1593395760,ixnetwork-traffic-start,0
4,16,20,48,41472,257790,12,2,2,1,5,...,11.429972,1401558.0,0.116494,174856338.0,13.277844,1263615.0,0.116614,1593395820,ixnetwork-traffic-start,0


# Select type n as the original data

In [4]:
# v_type_code value used to select which rows of df_ori are kept as the original data.
curr_type = 5

In [5]:
# Keep only the rows whose v_type_code equals the selected type.
type_mask = df_ori['v_type_code'] == curr_type
df_ori_typeN = df_ori.loc[type_mask]

In [6]:
# Preview the first rows of the selected type.
df_ori_typeN.head()

Unnamed: 0,p_/computes0/service/id,p_/computes0/vcpus_used,p_/computes0/vcpus,p_/computes0/memory_mb_used,p_/computes0/memory_mb,p_/computes0/cpu_info/topology/cores,p_/computes0/cpu_info/topology/cells,p_/computes0/cpu_info/topology/threads,p_/computes0/cpu_info/topology/sockets,p_/computes0/running_vms,...,v_/ports#link-tr-tr-a-1-y/metrics/network-incoming-bytes-rate,v_/ports#link-tr-tr-a-1-y/metrics/network-incoming-packets,v_/ports#link-tr-tr-a-1-y/metrics/network-incoming-packets-rate,v_/ports#link-tr-tr-a-1-y/metrics/network-outgoing-bytes,v_/ports#link-tr-tr-a-1-y/metrics/network-outgoing-bytes-rate,v_/ports#link-tr-tr-a-1-y/metrics/network-outgoing-packets,v_/ports#link-tr-tr-a-1-y/metrics/network-outgoing-packets-rate,v_/time,v_type,v_type_code
282,16,20,48,41472,257790,12,2,2,1,5,...,13.322413,1418712.0,0.117148,211089956.0,11.396969,1295740.0,0.116821,1593412500,tap-loss-start,5
283,16,20,48,41472,257790,12,2,2,1,5,...,14.234634,1418719.0,0.116205,211090754.0,14.297662,1295747.0,0.116983,1593412560,tap-loss-start,5
284,16,20,48,41472,257790,12,2,2,1,5,...,12.589926,1418726.0,0.116794,211091438.0,13.240228,1295753.0,0.09926,1593412620,tap-loss-start,5
285,16,20,48,41472,257790,12,2,2,1,5,...,13.28843,1418732.0,0.116794,211092122.0,11.420076,1295759.0,0.09996,1593412680,tap-loss-start,5
286,16,20,48,41472,257790,12,2,2,1,5,...,11.379676,1418738.0,0.100191,211092920.0,13.354185,1295766.0,0.116854,1593412740,tap-loss-start,5


In [7]:
# (rows, columns) of the selected type.
df_ori_typeN.shape

(1707, 662)

# Split ori_data

In [8]:
# Split the feature columns of df_ori_typeN into CSV files holding at most
# `cut_num` columns each, written to ./data/cut<cut_num>/.
columns = df_ori_typeN.columns
cut_num = 30

# The last two columns (v_type and v_type_code in the frame displayed above) are
# labels rather than features and are left out. These are the same two columns the
# previous index-based loop never reached.
feature_columns = list(columns[:-2])

cut_dir = './data/cut' + str(cut_num) + '/'
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(cut_dir, exist_ok=True)

for start in range(0, len(feature_columns), cut_num):
    curr_column = feature_columns[start:start + cut_num]
    # Each file is named after the running count of columns written so far
    # (30.csv, 60.csv, ...). A trailing chunk shorter than cut_num is written too,
    # instead of being silently dropped when the column count is not a multiple
    # of cut_num.
    curr_idx = start + len(curr_column)
    df_ori_typeN[curr_column].to_csv(cut_dir + str(curr_idx) + '.csv', index=False)

# Read dataset list

In [9]:
# Collect the chunk files written by the split step, ordered by their numeric name.
# Only '<number>.csv' entries are kept, so hidden files such as .DS_Store (or any
# other stray file) are ignored instead of crashing the int() sort key.
# NOTE(review): files left over from an earlier run with the same cut_num are picked
# up as well -- clear the directory first if that is not intended.
cut_dir = './data/cut' + str(cut_num)

dataset_list = sorted(
    (
        file_name
        for file_name in os.listdir(cut_dir)
        if file_name.endswith('.csv') and file_name[:-4].isdigit()
    ),
    key=lambda file_name: int(file_name[:-4]),
)
print(dataset_list)

['30.csv', '60.csv', '90.csv', '120.csv', '150.csv', '180.csv', '210.csv', '240.csv', '270.csv', '300.csv', '330.csv', '360.csv', '390.csv', '420.csv', '450.csv', '480.csv', '510.csv', '540.csv', '570.csv', '600.csv', '630.csv', '660.csv']


# Batch generated data

In [10]:
## Dataset parameters
seq_len = 5  # sequence length passed to real_data_loading

## Network parameters
parameters = {
    'module': 'lstm',
    'hidden_dim': 24,
    'num_layer': 3,
    'iterations': 4000,
    'batch_size': 128,
}

# Number of times each metric is re-evaluated before the scores are averaged
metric_iteration = 10


In [None]:
# Train TimeGAN on every column chunk in dataset_list, optionally score the result,
# map the generated values back to the original scale and stitch the chunks together
# column-wise into df_final.
discriminative_score_list = []
predictive_score_list = []
df_generated_list = []
cal_score = False # it will take a long time

cut_dir = './data/cut'+str(cut_num)+'/'

for dataset_name in dataset_list:
    # Strip the '.csv' extension.
    data_name = dataset_name[:-4]

    # 1. Load data
    ori_data, scaler = real_data_loading(data_name, seq_len, mix=False, data_dir=cut_dir)
    print(data_name + ' dataset is ready.')

    # 2. Run TimeGAN
    generated_data = timegan(ori_data, parameters)
    print('Finish Synthetic Data Generation')

    if cal_score:
        # 3. Discriminative score, averaged over metric_iteration evaluations
        discriminative_score = [
            discriminative_score_metrics(ori_data, generated_data)
            for _ in range(metric_iteration)
        ]
        mean_disc = np.round(np.mean(discriminative_score), 4)
        discriminative_score_list.append(mean_disc)
        print('Discriminative score: ' + str(mean_disc))

        # 4. Predictive score, averaged over metric_iteration evaluations
        predictive_score = [
            predictive_score_metrics(ori_data, generated_data)
            for _ in range(metric_iteration)
        ]
        mean_pred = np.round(np.mean(predictive_score), 4)
        predictive_score_list.append(mean_pred)
        print('Predictive score: ' + str(mean_pred))

    # 5. Recover data
    # Collapse the first two axes so every row holds one value per column of the chunk,
    # then undo the scaling returned by real_data_loading.
    g_data = np.reshape(generated_data, (generated_data.shape[0] * generated_data.shape[1], generated_data.shape[2]))
    g_data = scaler.inverse_transform(g_data)
    df_generated = pd.DataFrame(g_data)

    # Only the header of the chunk file is needed to restore the column names.
    # A local name is used so the notebook-level df_ori is no longer overwritten,
    # which previously broke re-running the earlier cells.
    chunk_columns = pd.read_csv(cut_dir+data_name+'.csv', nrows=0).columns
    df_generated.columns = chunk_columns
    df_generated_list.append(df_generated)

if not df_generated_list:
    # pd.concat raises a ValueError on an empty list; raise the same type with a
    # message that points at the actual cause.
    raise ValueError('No chunk files found in ' + cut_dir + '; run the split step first.')

df_final = pd.concat(df_generated_list, axis=1)
print(df_final.shape)

30 dataset is ready.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Start Embedding Network Training
step: 0/4000, e_loss: 0.4092
step: 1000/4000, e_loss: 0.1756
step: 2000/4000, e_loss: 0.1798
step: 3000/4000, e_loss: 0.1708
Finish Embedding Network Training
Start Training with Su

step: 3000/4000, d_loss: 1.7394, g_loss_u: 1.0896, g_loss_s: 0.0569, g_loss_v: 0.0187, e_loss_t0: 0.0336
Finish Joint Training
Finish Synthetic Data Generation
240 dataset is ready.
Start Embedding Network Training
step: 0/4000, e_loss: 0.4564
step: 1000/4000, e_loss: 0.2733
step: 2000/4000, e_loss: 0.2751
step: 3000/4000, e_loss: 0.2775
Finish Embedding Network Training
Start Training with Supervised Loss Only
step: 0/4000, s_loss: 0.3835
step: 1000/4000, s_loss: 0.3549
step: 2000/4000, s_loss: 0.3573
step: 3000/4000, s_loss: 0.3576
Finish Training with Supervised Loss Only
Start Joint Training
step: 0/4000, d_loss: 2.0653, g_loss_u: 0.7055, g_loss_s: 0.3577, g_loss_v: 0.2561, e_loss_t0: 0.1041
step: 1000/4000, d_loss: 1.4455, g_loss_u: 1.1738, g_loss_s: 0.0893, g_loss_v: 0.0341, e_loss_t0: 0.0509
step: 2000/4000, d_loss: 1.512, g_loss_u: 1.1971, g_loss_s: 0.0799, g_loss_v: 0.0318, e_loss_t0: 0.0484
step: 3000/4000, d_loss: 1.4699, g_loss_u: 1.3831, g_loss_s: 0.0789, g_loss_v: 0.0355,

step: 3000/4000, d_loss: 1.6233, g_loss_u: 1.2033, g_loss_s: 0.0211, g_loss_v: 0.0721, e_loss_t0: 0.0674
Finish Joint Training
Finish Synthetic Data Generation
510 dataset is ready.
Start Embedding Network Training
step: 0/4000, e_loss: 0.399
step: 1000/4000, e_loss: 0.2406
step: 2000/4000, e_loss: 0.243
step: 3000/4000, e_loss: 0.256
Finish Embedding Network Training
Start Training with Supervised Loss Only
step: 0/4000, s_loss: 0.4879
step: 1000/4000, s_loss: 0.0117
step: 2000/4000, s_loss: 0.006
step: 3000/4000, s_loss: 0.0042
Finish Training with Supervised Loss Only
Start Joint Training
step: 0/4000, d_loss: 2.0802, g_loss_u: 0.6916, g_loss_s: 0.0029, g_loss_v: 0.2174, e_loss_t0: 0.2515
step: 1000/4000, d_loss: 1.9112, g_loss_u: 1.0956, g_loss_s: 0.0, g_loss_v: 0.2368, e_loss_t0: 0.253
step: 2000/4000, d_loss: 1.4114, g_loss_u: 1.1552, g_loss_s: 0.0412, g_loss_v: 0.3866, e_loss_t0: 0.1034
step: 3000/4000, d_loss: 1.5845, g_loss_u: 0.8823, g_loss_s: 0.0249, g_loss_v: 0.0867, e_loss

In [None]:
# Mean scores per chunk; both lists stay empty unless cal_score was set to True.
print(discriminative_score_list)
print(predictive_score_list)

In [14]:
# Label the generated rows with the type that was selected through curr_type.
# The name and code are derived from curr_type instead of being typed in by hand,
# so they cannot go stale when curr_type is changed.
type_names = {
    3: 'interface-down',
    5: 'tap-loss-start',
    9: 'ixnetwork-bgp-injection-start',
    11: 'ixnetwork-bgp-hijacking-start',
}
if curr_type not in type_names:
    raise KeyError('no v_type name registered for v_type_code ' + str(curr_type))

df_final['v_type'] = type_names[curr_type]
df_final['v_type_code'] = curr_type

In [15]:
# Preview the first rows of the labelled synthetic data.
df_final.head()

Unnamed: 0,p_/computes0/service/id,p_/computes0/vcpus_used,p_/computes0/vcpus,p_/computes0/memory_mb_used,p_/computes0/memory_mb,p_/computes0/cpu_info/topology/cores,p_/computes0/cpu_info/topology/cells,p_/computes0/cpu_info/topology/threads,p_/computes0/cpu_info/topology/sockets,p_/computes0/running_vms,...,v_/ports#link-tr-tr-a-1-y/metrics/network-incoming-bytes-rate,v_/ports#link-tr-tr-a-1-y/metrics/network-incoming-packets,v_/ports#link-tr-tr-a-1-y/metrics/network-incoming-packets-rate,v_/ports#link-tr-tr-a-1-y/metrics/network-outgoing-bytes,v_/ports#link-tr-tr-a-1-y/metrics/network-outgoing-bytes-rate,v_/ports#link-tr-tr-a-1-y/metrics/network-outgoing-packets,v_/ports#link-tr-tr-a-1-y/metrics/network-outgoing-packets-rate,v_/time,v_type,v_type_code
0,16.0,20.0,48.0,41472.0,257790.0,12.0,2.0,2.0,1.0,5.0,...,16.205495,1504367.0,0.136839,238863900.0,15.944977,1371555.0,0.134739,1593915000.0,ixnetwork-bgp-hijacking-start,11
1,16.0,20.0,48.0,41472.0,257790.0,12.0,2.0,2.0,1.0,5.0,...,11.681464,1511663.0,0.103877,242658100.0,12.512365,1379879.0,0.107542,1593966000.0,ixnetwork-bgp-hijacking-start,11
2,16.0,20.0,48.0,41472.0,257790.0,12.0,2.0,2.0,1.0,5.0,...,11.38801,1512675.0,0.099919,243046800.0,11.383964,1380929.0,0.100194,1593973000.0,ixnetwork-bgp-hijacking-start,11
3,16.0,20.0,48.0,41472.0,257790.0,12.0,2.0,2.0,1.0,5.0,...,14.402513,1513013.0,0.12649,243194300.0,11.917914,1381299.0,0.115128,1593975000.0,ixnetwork-bgp-hijacking-start,11
4,16.0,20.0,48.0,41472.0,257790.0,12.0,2.0,2.0,1.0,5.0,...,16.153096,1512618.0,0.136868,242989700.0,15.417678,1380833.0,0.130926,1593973000.0,ixnetwork-bgp-hijacking-start,11


In [16]:
# (rows, columns) of the labelled synthetic data.
df_final.shape

(1115, 662)

In [17]:
# Write the synthetic data to disk without the row index.
df_final.to_csv('./data/generated_all_data.csv', index=False)