# Sequential data synthesis - Time dependent financial transactions

### Import the needed packages

In [14]:
# Imports the packages that are needed
#YData package specific packages
import os

from ydata.connectors import GCSConnector, LocalConnector
from ydata.connectors.filetype import FileType
from ydata.utils.formats import read_json

from ydata.metadata import Metadata
from ydata.utils.data_types import DataType

import json
import pickle as pkl
import pandas as pd

# Create the folder that receives this step's artifacts. A re-run of the
# notebook finds it already present, which is fine -- but only when the
# existing path really is a directory.
try:
    os.mkdir('outputs')
except FileExistsError:
    if not os.path.isdir('outputs'):
        # A regular file named 'outputs' would make every later write fail;
        # surface that here instead of reporting a directory that is not there.
        raise
    print('Directory already exists')

Directory already exists


In [22]:
# Path of the source CSV, taken from the environment so the notebook carries
# no hard-coded location. It is passed to the connector's read_file call below.
try:
    dataset_path = os.environ['DATASET_PATH']
except KeyError:
    # Same exception type as the plain lookup, but with an actionable message.
    raise KeyError(
        "DATASET_PATH is not set: export it with the path of the CSV to read "
        "before running this notebook"
    ) from None

KeyError: 'DATASET_PATH'

## Reading the data from the source

#### Using the connectors - Google Cloud Storage & Local filesystem

To read data from a given data source (cloud storage, filesystem, etc.), you can use YData's scalable connectors. These connectors can read and write data from multiple different sources, but they are only usable in the context of the lab where they were created.

In [15]:
# Build the Google Cloud Storage connector from the credentials JSON file
token = read_json('credentials/gcs_credentials.json')
read_connector = GCSConnector('ydatasynthetic', keyfile_dict=token)

# Read the CSV through the connector and, in the same expression, keep only
# the columns required for the use case
data = (
    read_connector
    .read_file(dataset_path, file_type=FileType.CSV)
    .select_columns(columns=[
        'account_id', 'date', 'type', 'amount', 'k_symbol', 'balance',
    ])
)


+-------------+-----------+-----------+---------+
| Package     | client    | scheduler | workers |
+-------------+-----------+-----------+---------+
| cloudpickle | 2.0.0     | 1.6.0     | None    |
| distributed | 2021.10.0 | 2022.01.1 | None    |
| msgpack     | 1.0.2     | 1.0.3     | None    |
+-------------+-----------+-----------+---------+
Notes: 
-  msgpack: Variation is ok, as long as everything is above 0.6


INFO: 2022-02-13 22:22:33,193 [CONNECTOR] - Init data types inference.
INFO: 2022-02-13 22:22:41,011 [CONNECTOR] - Data types infered.


In [16]:
# Initial metadata calculation: create a Metadata object and call it on the
# dataset, without passing any time-series attributes.
metadata = Metadata()
metadata(data)

# Text summary of the resulting metadata
print(metadata)



In [17]:
# Time-series attributes passed to the metadata calculation: the keys name
# the sort column, the entity identifier column(s) and the autoregressive
# columns.
dataset_attrs = dict(
    sortbykey="date",
    entity_id_cols=["account_id"],
    autoregressive_cols=["k_symbol", "balance"],
)

# Second Metadata object, this time called with the attributes above
metadata_ts = Metadata()
metadata_ts(data, dataset_attrs=dataset_attrs)

# Override the data type recorded for the 'date' column
metadata_ts.columns['date'].datatype = DataType.NUMERICAL

# Keep a reference to the column metadata; it is pickled further down
data_columns = metadata_ts.columns

In [18]:
# Text summary of the metadata computed with the time-series attributes
print(metadata_ts)



## Preparing the outputs for the next pipeline step
To share the required elements from one pipeline step to the next, we have to output everything that will be needed downstream. In this particular example, we persist both the dataset and the calculated metadata, to avoid duplicated calculations and repeated queries to external sources.

In [19]:
# Convert the dataset to a pandas DataFrame and save it as CSV, leaving out
# both the header row and the index column
output_data = data.to_pandas()
output_data.to_csv(path_or_buf='outputs/real_data.csv', index=False, header=False)

Note that for bigger datasets it is recommended to write the intermediate results to remote storage (ObjectStorage, FileStorage, RDBMS, etc.).
The platform has a limited amount of storage space, and remote storage also helps with traceability and monitoring of the intermediate results.

In [20]:
# Time-series attributes, serialised as JSON
with open('outputs/dataset_attrs.json', 'w') as attrs_file:
    attrs_file.write(json.dumps(dataset_attrs))

# Dataset shape, requested with lazy_eval=False, serialised as JSON
with open('outputs/shape.json', 'w') as shape_file:
    shape_file.write(json.dumps(data.shape(lazy_eval=False)))

# Column metadata, pickled with the highest protocol available
with open('outputs/data_columns.pkl', 'wb') as columns_file:
    pkl.dump(data_columns, columns_file, protocol=pkl.HIGHEST_PROTOCOL)

In [21]:
# Column names, in the order the time-series metadata stores them
[column_name for column_name in metadata_ts.columns.keys()]

['account_id', 'date', 'type', 'amount', 'balance', 'k_symbol']

In [9]:
# Here we create the visualization of the table.
#
# The CSV was written without a header row, so the column names are supplied
# separately. They are taken from the DataFrame that produced the file: the
# metadata key order recorded in this notebook lists 'balance' before
# 'k_symbol', while the columns were selected as 'k_symbol' then 'balance',
# so using the metadata keys would mislabel the last two columns.
#
# NOTE: this rebinds `metadata`, which earlier held the Metadata object.
# NOTE(review): with 'storage': 'inline' the Kubeflow viewer appears to expect
# the CSV content itself in 'source' rather than a path -- confirm how this
# platform resolves it.
metadata = {
    'outputs': [{
        'type': 'table',
        'storage': 'inline',
        'format': 'csv',
        'header': list(output_data.columns),
        'source': 'outputs/real_data.csv'
    }]
}

with open("mlpipeline-ui-metadata.json", 'w') as metadata_file:
    json.dump(metadata, metadata_file)