ydataai · jfsantos-ds · Nov 2, 2021 · Nov 4, 2021 · Nov 4, 2021 · Nov 5, 2021
diff --git a/.gitignore b/.gitignore
@@ -373,4 +373,4 @@ DerivedData/
 
 # User created
 VERSION
-version.py
+version.py
diff --git a/examples/timeseries/tscwgan_example.py b/examples/timeseries/tscwgan_example.py
@@ -0,0 +1,59 @@
+from numpy import reshape
+
+from ydata_synthetic.preprocessing.timeseries import processed_stock
+from ydata_synthetic.synthesizers.timeseries import TSCWGAN
+from ydata_synthetic.synthesizers import ModelParameters, TrainParameters
+from ydata_synthetic.postprocessing.regular.inverse_preprocesser import inverse_transform
+
+model = TSCWGAN
+
+# Define the GAN and training parameters
+noise_dim = 32
+dim = 128
+seq_len = 48
+cond_dim = 24
+batch_size = 128
+
+log_step = 100
+epochs = 300+1
+learning_rate = 5e-4
+beta_1 = 0.5
+beta_2 = 0.9
+models_dir = './cache'
+critic_iter = 5
+
+# Get transformed data stock - Univariate (only the 'Open' column is requested)
+data, processed_data, scaler = processed_stock(path='./data/stock_data.csv', seq_len=seq_len, cols = ['Open'])
+data_sample = processed_data[0]  # NOTE(review): appears unused; rebound by the sampling call below
+
+model_parameters = ModelParameters(batch_size=batch_size,
+                           lr=learning_rate,
+                           betas=(beta_1, beta_2),
+                           noise_dim=noise_dim,
+                           n_cols=seq_len,
+                           layers_dim=dim,
+                           condition = cond_dim)
+
+train_args = TrainParameters(epochs=epochs,
+                             sample_interval=log_step,
+                             critic_iter=critic_iter)
+
+# Training the TSCWGAN model
+synthesizer = model(model_parameters, gradient_penalty_weight=10)
+synthesizer.train(processed_data, train_args)
+
+# Saving the synthesizer to later generate new events
+synthesizer.save(path='./tscwgan_stock.pkl')
+
+# Loading the synthesizer
+synth = model.load(path='./tscwgan_stock.pkl')
+
+# Sampling the data
+# Note that the data returned by the sampling call is not inverse processed.
+cond_index = 100  # Arbitrary sequence for conditioning
+cond_array = reshape(processed_data[cond_index][:cond_dim], (1,-1))  # first cond_dim entries of that sequence, as a single row
+
+data_sample = synth.sample(cond_array, 1000, 100)  # NOTE(review): meaning of 1000 and 100 is not visible here; confirm against TSCWGAN.sample
+
+# Inverting the scaling of the synthetic samples
+inv_data_sample = inverse_transform(data_sample, scaler)
diff --git a/src/ydata_synthetic/postprocessing/regular/inverse_preprocesser.py b/src/ydata_synthetic/postprocessing/regular/inverse_preprocesser.py
@@ -1,45 +1,46 @@
 # Inverts all preprocessing pipelines provided in the preprocessing examples
 from typing import Union
 
-import pandas as pd
+from pandas import DataFrame, concat
 
 from sklearn.pipeline import Pipeline
 from sklearn.compose import ColumnTransformer
-from sklearn.preprocessing import PowerTransformer, OneHotEncoder, StandardScaler
+from sklearn.preprocessing import PowerTransformer, OneHotEncoder, StandardScaler, MinMaxScaler
 
 
-def inverse_transform(data: pd.DataFrame, processor: Union[Pipeline, ColumnTransformer, PowerTransformer, OneHotEncoder, StandardScaler]) -> pd.DataFrame:
+def inverse_transform(data: DataFrame, processor: Union[Pipeline, ColumnTransformer, PowerTransformer,
+                                                           OneHotEncoder, StandardScaler, MinMaxScaler]) -> DataFrame:
     """Inverts data transformations taking place in a standard sklearn processor.
     Supported processes are sklearn pipelines, column transformers or base estimators like standard scalers.
 
     Args:
-        data (pd.DataFrame): The data object that needs inversion of preprocessing
+        data (DataFrame): The data object that needs inversion of preprocessing
         processor (Union[Pipeline, ColumnTransformer, BaseEstimator]): The processor applied on the original data
 
     Returns:
-        inv_data (pd.DataFrame): The data object after inverting preprocessing"""
+        inv_data (DataFrame): The data object after inverting preprocessing (None for unsupported processors)"""
     inv_data = data.copy()
-    if isinstance(processor, (PowerTransformer, OneHotEncoder, StandardScaler, Pipeline)):
-        inv_data = pd.DataFrame(processor.inverse_transform(data), columns=processor.feature_names_in_)
+    if isinstance(processor, (PowerTransformer, OneHotEncoder, StandardScaler, MinMaxScaler, Pipeline)):
+        inv_data = DataFrame(processor.inverse_transform(data), columns=getattr(processor, "feature_names_in_", None))
     elif isinstance(processor, ColumnTransformer):
         output_indices = processor.output_indices_
-        assert isinstance(data, pd.DataFrame), "The data to be inverted from a ColumnTransformer has to be a Pandas DataFrame."
+        assert isinstance(data, DataFrame), "The data to be inverted from a ColumnTransformer has to be a Pandas DataFrame."
         for t_name, t, t_cols in processor.transformers_[::-1]:
             slice_ = output_indices[t_name]
             t_indices = list(range(slice_.start, slice_.stop, 1 if slice_.step is None else slice_.step))
             if t == 'drop':
                 continue
             elif t == 'passthrough':
-                inv_cols = pd.DataFrame(data.iloc[:,t_indices].values, columns = t_cols, index = data.index)
+                inv_cols = DataFrame(data.iloc[:,t_indices].values, columns = t_cols, index = data.index)
                 inv_col_names = inv_cols.columns
             else:
-                inv_cols = pd.DataFrame(t.inverse_transform(data.iloc[:,t_indices].values), columns = t_cols, index = data.index)
+                inv_cols = DataFrame(t.inverse_transform(data.iloc[:,t_indices].values), columns = t_cols, index = data.index)
                 inv_col_names = inv_cols.columns
             if set(inv_col_names).issubset(set(inv_data.columns)):
                 inv_data[inv_col_names] = inv_cols[inv_col_names]
             else:
-                inv_data = pd.concat([inv_data, inv_cols], axis=1)
+                inv_data = concat([inv_data, inv_cols], axis=1)
     else:
         print('The provided data processor is not supported and cannot be inverted with this method.')
         return None
-    return inv_data[processor.feature_names_in_]
+    return inv_data[processor.feature_names_in_] if hasattr(processor, "feature_names_in_") else inv_data  # names exist only when fit on named columns
diff --git a/src/ydata_synthetic/preprocessing/timeseries/stock.py b/src/ydata_synthetic/preprocessing/timeseries/stock.py
@@ -2,17 +2,30 @@
     Get the stock data from Yahoo finance data
     Data from the period 01 January 2017 - 24 January 2021
 """
+from typing import Optional, List
+
 import pandas as pd
+from typeguard import typechecked
 
 from ydata_synthetic.preprocessing.timeseries.utils import real_data_loading
 
-def transformations(path, seq_len: int):
-    stock_df = pd.read_csv(path)
+@typechecked
+def transformations(path, seq_len: int, cols: Optional[List] = None):
+    """Apply min max scaling and roll windows of a temporal dataset.
+
+    Args:
+        path(str): path to a csv temporal dataframe
+        seq_len(int): length of the rolled sequences
+        cols (Optional[List]): list of columns to be used; every column is kept when None"""
+    stock_df = pd.read_csv(path)
+    if cols is not None:
+        # Order rows by 'Date' first: the selection may drop it, which would make the sort below a silent no-op
+        stock_df = (stock_df.sort_values('Date') if 'Date' in stock_df else stock_df)[cols]
     try:
         stock_df = stock_df.set_index('Date').sort_index()
     except:
         stock_df=stock_df
     #Data transformations to be applied prior to be used with the synthesizer model
-    processed_data = real_data_loading(stock_df.values, seq_len=seq_len)
+    data, processed_data, scaler = real_data_loading(stock_df.values, seq_len=seq_len)
 
-    return processed_data
+    return data, processed_data, scaler
diff --git a/src/ydata_synthetic/preprocessing/timeseries/utils.py b/src/ydata_synthetic/preprocessing/timeseries/utils.py
@@ -4,7 +4,7 @@
 import numpy as np
 from sklearn.preprocessing import MinMaxScaler
 
-# Method implemented here: https://github.com/jsyoon0823/TimeGAN/blob/master/data_loading.py
+# Method adapted from here: https://github.com/jsyoon0823/TimeGAN/blob/master/data_loading.py
 # Originally used in TimeGAN research
 def real_data_loading(data: np.array, seq_len):
     """Load and preprocess real-world datasets.
@@ -30,7 +30,7 @@ def real_data_loading(data: np.array, seq_len):
 
     # Mix the datasets (to make it similar to i.i.d)
     idx = np.random.permutation(len(temp_data))
-    data = []
+    processed_data = []
     for i in range(len(temp_data)):
-        data.append(temp_data[idx[i]])
-    return data
+        processed_data.append(temp_data[idx[i]])
+    return data, processed_data, scaler
diff --git a/src/ydata_synthetic/synthesizers/gan.py b/src/ydata_synthetic/synthesizers/gan.py
@@ -21,10 +21,10 @@
 _model_parameters_df = [128, 1e-4, (None, None), 128, 264,
                         None, None, None, 1, None]
 
-_train_parameters = ['cache_prefix', 'label_dim', 'epochs', 'sample_interval', 'labels']
+_train_parameters = ['cache_prefix', 'label_dim', 'epochs', 'sample_interval', 'labels', 'critic_iter']
 
 ModelParameters = namedtuple('ModelParameters', _model_parameters, defaults=_model_parameters_df)
-TrainParameters = namedtuple('TrainParameters', _train_parameters, defaults=('', None, 300, 50, None))
+TrainParameters = namedtuple('TrainParameters', _train_parameters, defaults=('', None, 300, 50, None, None))
 
 
 # pylint: disable=R0902

diff --git a/src/ydata_synthetic/synthesizers/timeseries/__init__.py b/src/ydata_synthetic/synthesizers/timeseries/__init__.py
@@ -1,5 +1,7 @@
 from ydata_synthetic.synthesizers.timeseries.timegan.model import TimeGAN
+from ydata_synthetic.synthesizers.timeseries.tscwgan.model import TSCWGAN
 
 __all__ = [
     'TimeGAN',
+    'TSCWGAN',
 ]
diff --git a/src/ydata_synthetic/synthesizers/timeseries/tscwgan/__init__.py b/src/ydata_synthetic/synthesizers/timeseries/tscwgan/__init__.py