From b3d6df91ab34f3126fcab7112281bf3a38046325 Mon Sep 17 00:00:00 2001
From: fabclmnt <fabiana.clemente@ydata.ai>
Date: Sun, 24 Jan 2021 15:37:22 +0000
Subject: [PATCH] feat(timeseries): Add time-series stock example.

---
 examples/{ => regular}/adult_wgangp.py        |  0
 examples/{ => regular}/cgan_example.py        |  0
 examples/{ => regular}/gan_example.ipynb      |  0
 examples/{ => regular}/wgan_example.py        |  0
 src/ydata_synthetic/evaluation/__init__.py    |  0
 .../preprocessing/regular/__init__.py         |  0
 .../preprocessing/{ => regular}/adult.py      |  0
 .../{ => regular}/breast_cancer_wisconsin.py  |  0
 .../{ => regular}/cardiovascular.py           |  0
 .../{ => regular}/credit_fraud.py             |  0
 .../preprocessing/timeseries/__init__.py      |  0
 .../preprocessing/timeseries/stock.py         | 28 +++++++++++++++
 .../preprocessing/timeseries/utils.py         | 35 +++++++++++++++++++
 13 files changed, 63 insertions(+)
 rename examples/{ => regular}/adult_wgangp.py (100%)
 rename examples/{ => regular}/cgan_example.py (100%)
 rename examples/{ => regular}/gan_example.ipynb (100%)
 rename examples/{ => regular}/wgan_example.py (100%)
 create mode 100644 src/ydata_synthetic/evaluation/__init__.py
 create mode 100644 src/ydata_synthetic/preprocessing/regular/__init__.py
 rename src/ydata_synthetic/preprocessing/{ => regular}/adult.py (100%)
 rename src/ydata_synthetic/preprocessing/{ => regular}/breast_cancer_wisconsin.py (100%)
 rename src/ydata_synthetic/preprocessing/{ => regular}/cardiovascular.py (100%)
 rename src/ydata_synthetic/preprocessing/{ => regular}/credit_fraud.py (100%)
 create mode 100644 src/ydata_synthetic/preprocessing/timeseries/__init__.py
 create mode 100644 src/ydata_synthetic/preprocessing/timeseries/stock.py
 create mode 100644 src/ydata_synthetic/preprocessing/timeseries/utils.py

diff --git a/examples/adult_wgangp.py b/examples/regular/adult_wgangp.py
similarity index 100%
rename from examples/adult_wgangp.py
rename to examples/regular/adult_wgangp.py
diff --git a/examples/cgan_example.py b/examples/regular/cgan_example.py
similarity index 100%
rename from examples/cgan_example.py
rename to examples/regular/cgan_example.py
diff --git a/examples/gan_example.ipynb b/examples/regular/gan_example.ipynb
similarity index 100%
rename from examples/gan_example.ipynb
rename to examples/regular/gan_example.ipynb
diff --git a/examples/wgan_example.py b/examples/regular/wgan_example.py
similarity index 100%
rename from examples/wgan_example.py
rename to examples/regular/wgan_example.py
diff --git a/src/ydata_synthetic/evaluation/__init__.py b/src/ydata_synthetic/evaluation/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/ydata_synthetic/preprocessing/regular/__init__.py b/src/ydata_synthetic/preprocessing/regular/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/ydata_synthetic/preprocessing/adult.py b/src/ydata_synthetic/preprocessing/regular/adult.py
similarity index 100%
rename from src/ydata_synthetic/preprocessing/adult.py
rename to src/ydata_synthetic/preprocessing/regular/adult.py
diff --git a/src/ydata_synthetic/preprocessing/breast_cancer_wisconsin.py b/src/ydata_synthetic/preprocessing/regular/breast_cancer_wisconsin.py
similarity index 100%
rename from src/ydata_synthetic/preprocessing/breast_cancer_wisconsin.py
rename to src/ydata_synthetic/preprocessing/regular/breast_cancer_wisconsin.py
diff --git a/src/ydata_synthetic/preprocessing/cardiovascular.py b/src/ydata_synthetic/preprocessing/regular/cardiovascular.py
similarity index 100%
rename from src/ydata_synthetic/preprocessing/cardiovascular.py
rename to src/ydata_synthetic/preprocessing/regular/cardiovascular.py
diff --git a/src/ydata_synthetic/preprocessing/credit_fraud.py b/src/ydata_synthetic/preprocessing/regular/credit_fraud.py
similarity index 100%
rename from src/ydata_synthetic/preprocessing/credit_fraud.py
rename to src/ydata_synthetic/preprocessing/regular/credit_fraud.py
diff --git a/src/ydata_synthetic/preprocessing/timeseries/__init__.py b/src/ydata_synthetic/preprocessing/timeseries/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/src/ydata_synthetic/preprocessing/timeseries/stock.py b/src/ydata_synthetic/preprocessing/timeseries/stock.py
new file mode 100644
index 00000000..47c9ccdd
--- /dev/null
+++ b/src/ydata_synthetic/preprocessing/timeseries/stock.py
@@ -0,0 +1,28 @@
+"""
+    Get the GOOG daily stock data from Yahoo Finance
+    Data from the period 01 January 2017 - 24 January 2021
+"""
+import os
+import requests as req
+import pandas as pd
+
+from ydata_synthetic.preprocessing.timeseries.utils import real_data_loading
+
+def transformations(seq_len: int):
+    """Load GOOG daily prices (cached or downloaded) and cut them into windows of `seq_len` rows."""
+    # One path serves both the cache lookup and the download target (resolved against the CWD).
+    stock_path = '../data/stock.csv'
+    try:
+        stock_df = pd.read_csv(stock_path)
+    except FileNotFoundError:
+        stock_url = 'https://query1.finance.yahoo.com/v7/finance/download/GOOG?period1=1483228800&period2=1611446400&interval=1d&events=history&includeAdjustedClose=true'
+        response = req.get(stock_url)
+        response.raise_for_status()  # never cache an HTTP error page as stock.csv
+        os.makedirs(os.path.dirname(stock_path), exist_ok=True)
+        with open(stock_path, 'wb') as stock_csv:  # closed (and flushed) before re-reading
+            stock_csv.write(response.content)
+        # Reading the stock data
+        stock_df = pd.read_csv(stock_path)
+    # Drop the textual 'Date' column, if present, so only numeric values get normalized.
+    stock_values = stock_df.drop(columns=['Date'], errors='ignore').values
+    return real_data_loading(stock_values, seq_len=seq_len)
diff --git a/src/ydata_synthetic/preprocessing/timeseries/utils.py b/src/ydata_synthetic/preprocessing/timeseries/utils.py
new file mode 100644
index 00000000..e5e315ba
--- /dev/null
+++ b/src/ydata_synthetic/preprocessing/timeseries/utils.py
@@ -0,0 +1,35 @@
+"""
+    Utility functions shared by the time-series preprocessing that prepares data for the synthesizers
+"""
+import numpy as np
+from sklearn.preprocessing import MinMaxScaler
+
+# Method implemented here: https://github.com/jsyoon0823/TimeGAN/blob/master/data_loading.py
+# Originally used in TimeGAN research
+def real_data_loading(data: np.ndarray, seq_len):
+    """Scale a dataset to [0, 1] and cut it into shuffled windows of `seq_len` rows.
+
+    Args:
+      - data: 2-D numeric array-like, one row per time step and one column per feature
+      - seq_len: sequence length (number of consecutive rows in each window)
+
+    Returns:
+      - data: list of (seq_len, n_features) arrays in random order; empty unless
+        the input has more than seq_len rows.
+    """
+    # Reverse the row order (input is assumed newest-first, as in the TimeGAN reference)
+    ori_data = np.asarray(data)[::-1]
+    # Normalize the data. The scaler must be fitted: MinMaxScaler(x) would only
+    # configure a scaler (x becomes its feature_range) and return no scaled values.
+    ori_data = MinMaxScaler().fit_transform(ori_data)
+
+    # Preprocess the dataset
+    # Cut data by sequence length
+    temp_data = [
+        ori_data[i:i + seq_len] for i in range(len(ori_data) - seq_len)
+    ]
+
+    # Mix the datasets (to make it similar to i.i.d)
+    idx = np.random.permutation(len(temp_data))
+    data = [temp_data[i] for i in idx]
+    return data