From b3d6df91ab34f3126fcab7112281bf3a38046325 Mon Sep 17 00:00:00 2001 From: fabclmnt Date: Sun, 24 Jan 2021 15:37:22 +0000 Subject: [PATCH] feat(timeseries): Add time-series stock example. --- examples/{ => regular}/adult_wgangp.py | 0 examples/{ => regular}/cgan_example.py | 0 examples/{ => regular}/gan_example.ipynb | 0 examples/{ => regular}/wgan_example.py | 0 src/ydata_synthetic/evaluation/__init__.py | 0 .../preprocessing/regular/__init__.py | 0 .../preprocessing/{ => regular}/adult.py | 0 .../{ => regular}/breast_cancer_wisconsin.py | 0 .../{ => regular}/cardiovascular.py | 0 .../{ => regular}/credit_fraud.py | 0 .../preprocessing/timeseries/__init__.py | 0 .../preprocessing/timeseries/stock.py | 28 +++++++++++++++ .../preprocessing/timeseries/utils.py | 35 +++++++++++++++++++ 13 files changed, 63 insertions(+) rename examples/{ => regular}/adult_wgangp.py (100%) rename examples/{ => regular}/cgan_example.py (100%) rename examples/{ => regular}/gan_example.ipynb (100%) rename examples/{ => regular}/wgan_example.py (100%) create mode 100644 src/ydata_synthetic/evaluation/__init__.py create mode 100644 src/ydata_synthetic/preprocessing/regular/__init__.py rename src/ydata_synthetic/preprocessing/{ => regular}/adult.py (100%) rename src/ydata_synthetic/preprocessing/{ => regular}/breast_cancer_wisconsin.py (100%) rename src/ydata_synthetic/preprocessing/{ => regular}/cardiovascular.py (100%) rename src/ydata_synthetic/preprocessing/{ => regular}/credit_fraud.py (100%) create mode 100644 src/ydata_synthetic/preprocessing/timeseries/__init__.py create mode 100644 src/ydata_synthetic/preprocessing/timeseries/stock.py create mode 100644 src/ydata_synthetic/preprocessing/timeseries/utils.py diff --git a/examples/adult_wgangp.py b/examples/regular/adult_wgangp.py similarity index 100% rename from examples/adult_wgangp.py rename to examples/regular/adult_wgangp.py diff --git a/examples/cgan_example.py b/examples/regular/cgan_example.py similarity index 100% rename 
from examples/cgan_example.py rename to examples/regular/cgan_example.py diff --git a/examples/gan_example.ipynb b/examples/regular/gan_example.ipynb similarity index 100% rename from examples/gan_example.ipynb rename to examples/regular/gan_example.ipynb diff --git a/examples/wgan_example.py b/examples/regular/wgan_example.py similarity index 100% rename from examples/wgan_example.py rename to examples/regular/wgan_example.py diff --git a/src/ydata_synthetic/evaluation/__init__.py b/src/ydata_synthetic/evaluation/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/ydata_synthetic/preprocessing/regular/__init__.py b/src/ydata_synthetic/preprocessing/regular/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/ydata_synthetic/preprocessing/adult.py b/src/ydata_synthetic/preprocessing/regular/adult.py similarity index 100% rename from src/ydata_synthetic/preprocessing/adult.py rename to src/ydata_synthetic/preprocessing/regular/adult.py diff --git a/src/ydata_synthetic/preprocessing/breast_cancer_wisconsin.py b/src/ydata_synthetic/preprocessing/regular/breast_cancer_wisconsin.py similarity index 100% rename from src/ydata_synthetic/preprocessing/breast_cancer_wisconsin.py rename to src/ydata_synthetic/preprocessing/regular/breast_cancer_wisconsin.py diff --git a/src/ydata_synthetic/preprocessing/cardiovascular.py b/src/ydata_synthetic/preprocessing/regular/cardiovascular.py similarity index 100% rename from src/ydata_synthetic/preprocessing/cardiovascular.py rename to src/ydata_synthetic/preprocessing/regular/cardiovascular.py diff --git a/src/ydata_synthetic/preprocessing/credit_fraud.py b/src/ydata_synthetic/preprocessing/regular/credit_fraud.py similarity index 100% rename from src/ydata_synthetic/preprocessing/credit_fraud.py rename to src/ydata_synthetic/preprocessing/regular/credit_fraud.py diff --git a/src/ydata_synthetic/preprocessing/timeseries/__init__.py b/src/ydata_synthetic/preprocessing/timeseries/__init__.py 
new file mode 100644
index 00000000..e69de29b
diff --git a/src/ydata_synthetic/preprocessing/timeseries/stock.py b/src/ydata_synthetic/preprocessing/timeseries/stock.py
new file mode 100644
index 00000000..47c9ccdd
--- /dev/null
+++ b/src/ydata_synthetic/preprocessing/timeseries/stock.py
@@ -0,0 +1,58 @@
+"""
+    Get the stock data from Yahoo finance data
+    Data from the period 01 January 2017 - 24 January 2021
+"""
+import os
+import requests as req
+import pandas as pd
+
+from ydata_synthetic.preprocessing.timeseries.utils import real_data_loading
+
+STOCK_URL = 'https://query1.finance.yahoo.com/v7/finance/download/GOOG?period1=1483228800&period2=1611446400&interval=1d&events=history&includeAdjustedClose=true'
+
+
+def _stock_csv_path() -> str:
+    """Return the path of the stock CSV, downloading it when no copy exists.
+
+    A copy at ../data/stock.csv (relative to the working directory) is used
+    when present; otherwise the file is kept in the data directory located
+    next to the timeseries package.
+    """
+    local_copy = '../data/stock.csv'
+    if os.path.isfile(local_copy):
+        return local_copy
+
+    parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+    data_dir = os.path.join(parent_dir, 'data')
+    csv_path = os.path.join(data_dir, 'stock.csv')
+    if not os.path.isfile(csv_path):
+        response = req.get(STOCK_URL, timeout=30)
+        # Fail loudly instead of caching an HTTP error page as if it were the CSV
+        response.raise_for_status()
+        os.makedirs(data_dir, exist_ok=True)
+        # The context manager flushes and closes the file before it is read back
+        with open(csv_path, 'wb') as stock_csv:
+            stock_csv.write(response.content)
+    return csv_path
+
+
+def transformations(seq_len: int):
+    """Load the stock data and prepare it for the synthesizer model.
+
+    Args:
+      - seq_len: sequence length
+
+    Returns:
+      - processed_data: output of real_data_loading for the stock values.
+    """
+    # Reading the stock data
+    stock_df = pd.read_csv(_stock_csv_path())
+    if 'Date' in stock_df.columns:
+        # Dates cannot be min-max scaled, so they become the index. Rows are
+        # sorted newest first because real_data_loading flips its input.
+        stock_df = stock_df.set_index('Date').sort_index(ascending=False)
+
+    # Data transformations to be applied prior to be used with the synthesizer model
+    processed_data = real_data_loading(stock_df.values, seq_len=seq_len)
+
+    return processed_data
diff --git a/src/ydata_synthetic/preprocessing/timeseries/utils.py b/src/ydata_synthetic/preprocessing/timeseries/utils.py
new file mode 100644
index 00000000..e5e315ba
--- /dev/null
+++ b/src/ydata_synthetic/preprocessing/timeseries/utils.py
@@ -0,0 +1,28 @@
+"""
+    Utility functions to be shared by the time-series preprocessing required to feed the data into the synthesizers
+"""
+import numpy as np
+from sklearn.preprocessing import MinMaxScaler
+
+# Method implemented here: https://github.com/jsyoon0823/TimeGAN/blob/master/data_loading.py
+# Originally used in TimeGAN research
+def real_data_loading(data: np.array, seq_len):
+    """Load and preprocess real-world datasets.
+    Args:
+      - data: Numpy array with the values from a Dataset, one row per time step
+      - seq_len: sequence length
+
+    Returns:
+      - data: list of scaled windows of seq_len consecutive rows, in random order.
+    """
+    # Flip the data to make chronological data
+    ori_data = np.asarray(data)[::-1]
+    # Normalize the data; the scaler must be fitted, its constructor does not scale
+    ori_data = MinMaxScaler().fit_transform(ori_data)
+
+    # Cut data by sequence length
+    temp_data = [ori_data[i:i + seq_len] for i in range(0, len(ori_data) - seq_len)]
+
+    # Mix the datasets (to make it similar to i.i.d)
+    idx = np.random.permutation(len(temp_data))
+    return [temp_data[i] for i in idx]