feat(timeseries): Add time-series stock example.

ydataai · Jan 24, 2021 · b3d6df9 · b3d6df9
1 parent 343eeea
commit b3d6df9
Show file tree

Hide file tree

Showing 13 changed files with 63 additions and 0 deletions.
diff --git a/examples/adult_wgangp.py → examples/regular/adult_wgangp.py b/examples/adult_wgangp.py → examples/regular/adult_wgangp.py
diff --git a/examples/cgan_example.py → examples/regular/cgan_example.py b/examples/cgan_example.py → examples/regular/cgan_example.py
diff --git a/examples/gan_example.ipynb → examples/regular/gan_example.ipynb b/examples/gan_example.ipynb → examples/regular/gan_example.ipynb
diff --git a/examples/wgan_example.py → examples/regular/wgan_example.py b/examples/wgan_example.py → examples/regular/wgan_example.py
diff --git a/src/ydata_synthetic/evaluation/__init__.py b/src/ydata_synthetic/evaluation/__init__.py
diff --git a/src/ydata_synthetic/preprocessing/regular/__init__.py b/src/ydata_synthetic/preprocessing/regular/__init__.py
diff --git a/src/ydata_synthetic/preprocessing/adult.py → ..._synthetic/preprocessing/regular/adult.py b/src/ydata_synthetic/preprocessing/adult.py → ..._synthetic/preprocessing/regular/adult.py
diff --git a/.../preprocessing/breast_cancer_wisconsin.py → ...essing/regular/breast_cancer_wisconsin.py b/.../preprocessing/breast_cancer_wisconsin.py → ...essing/regular/breast_cancer_wisconsin.py
diff --git a/...synthetic/preprocessing/cardiovascular.py → ...c/preprocessing/regular/cardiovascular.py b/...synthetic/preprocessing/cardiovascular.py → ...c/preprocessing/regular/cardiovascular.py
diff --git a/...a_synthetic/preprocessing/credit_fraud.py → ...tic/preprocessing/regular/credit_fraud.py b/...a_synthetic/preprocessing/credit_fraud.py → ...tic/preprocessing/regular/credit_fraud.py
diff --git a/src/ydata_synthetic/preprocessing/timeseries/__init__.py b/src/ydata_synthetic/preprocessing/timeseries/__init__.py
diff --git a/src/ydata_synthetic/preprocessing/timeseries/stock.py b/src/ydata_synthetic/preprocessing/timeseries/stock.py
@@ -0,0 +1,28 @@
+"""
+    Get the Google (GOOG) daily stock data from Yahoo Finance.
+    Data from the period 01 January 2017 - 24 January 2021
+"""
+import os
+import requests as req
+import pandas as pd
+
+from ydata_synthetic.preprocessing.timeseries.utils import real_data_loading
+
+def transformations(seq_len: int):
+    """Load the GOOG stock CSV, downloading it when missing, and preprocess it.
+
+    The CSV is looked up first at ``../data/stock.csv`` (relative to the
+    current working directory) and then in the download cache directory
+    derived from this module's location. When neither can be read, the file
+    is downloaded from Yahoo Finance into the cache directory and read back
+    from that same path.
+
+    Args:
+        seq_len: Sequence length forwarded to ``real_data_loading``.
+
+    Returns:
+        The value returned by ``real_data_loading`` for the loaded DataFrame.
+
+    Raises:
+        requests.RequestException: If the download fails, times out or
+            answers with an HTTP error status.
+    """
+    local_csv = '../data/stock.csv'
+    cache_dir = os.path.join(os.path.dirname(os.path.join('..', os.path.dirname(__file__))), 'data')
+    cache_csv = os.path.join(cache_dir, 'stock.csv')
+
+    stock_df = None
+    for csv_path in (local_csv, cache_csv):
+        try:
+            stock_df = pd.read_csv(csv_path)
+        except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
+            # Missing or unusable file: try the next location, then download
+            continue
+        else:
+            break
+
+    if stock_df is None:
+        stock_url = 'https://query1.finance.yahoo.com/v7/finance/download/GOOG?period1=1483228800&period2=1611446400&interval=1d&events=history&includeAdjustedClose=true'
+        response = req.get(stock_url, timeout=30)
+        # Fail loudly instead of saving an HTTP error page as the CSV
+        response.raise_for_status()
+
+        os.makedirs(cache_dir, exist_ok=True)
+        # The context manager flushes and closes the file before it is re-read
+        with open(cache_csv, 'wb') as stock_csv:
+            stock_csv.write(response.content)
+        # Reading the stock data from the exact file that was just written
+        stock_df = pd.read_csv(cache_csv)
+
+    # Data transformations to be applied prior to being used with the synthesizer model
+    processed_data = real_data_loading(stock_df, seq_len=seq_len)
+
+    return processed_data
diff --git a/src/ydata_synthetic/preprocessing/timeseries/utils.py b/src/ydata_synthetic/preprocessing/timeseries/utils.py
@@ -0,0 +1,35 @@
+"""
+    Utility functions to be shared by the time-series preprocessing required to feed the data into the synthesizers
+"""
+import numpy as np
+from sklearn.preprocessing import MinMaxScaler
+
+# Adapted from the TimeGAN reference implementation:
+# https://github.com/jsyoon0823/TimeGAN/blob/master/data_loading.py
+def real_data_loading(data: np.array, seq_len):
+    """Scale a time series to [0, 1] and cut it into shuffled windows.
+
+    Args:
+      - data: 2-D array-like of shape (time steps, features). A pandas
+        DataFrame is accepted as well; only its numeric columns are used.
+        A 1-D input is treated as a single feature.
+      - seq_len: sequence length (rows per window), at least 1.
+
+    Returns:
+      - data: list of arrays of shape (seq_len, features) in random order.
+        Empty when the input has no more than seq_len rows.
+
+    Raises:
+      - ValueError: if seq_len is smaller than 1.
+    """
+    if seq_len < 1:
+        raise ValueError(f"seq_len must be a positive integer, got {seq_len}")
+
+    # DataFrames may carry non-numeric columns (e.g. dates) that cannot be scaled
+    if hasattr(data, 'select_dtypes'):
+        data = data.select_dtypes(include=[np.number])
+    values = np.asarray(data, dtype=float)
+    if values.ndim == 1:
+        values = values.reshape(-1, 1)
+
+    # Flip the data to make it chronological
+    # NOTE(review): assumes the input is stored newest-first - confirm for the caller's data
+    ori_data = values[::-1]
+
+    # Same window count as the loop below would give; also avoids min/max on empty input
+    if len(ori_data) <= seq_len:
+        return []
+
+    # Per-feature min-max normalisation to [0, 1]; constant features map to 0
+    # (matches scikit-learn's MinMaxScaler with its default feature_range)
+    col_min = ori_data.min(axis=0)
+    col_range = ori_data.max(axis=0) - col_min
+    col_range[col_range == 0] = 1.0
+    ori_data = (ori_data - col_min) / col_range
+
+    # Cut data by sequence length (window count kept as in the reference implementation)
+    windows = [ori_data[start:start + seq_len]
+               for start in range(len(ori_data) - seq_len)]
+
+    # Mix the windows (to make them similar to i.i.d. samples)
+    order = np.random.permutation(len(windows))
+    return [windows[i] for i in order]