feat: Add inverse transformations to supported datasets (#104)
fabclmnt committed Oct 28, 2021
1 parent a9de1ab commit c618d90
Showing 6 changed files with 75 additions and 25 deletions.
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,6 +1,6 @@
pandas==1.2.*
numpy==1.19.*
-scikit-learn==0.22.*
+scikit-learn==1.0.*
matplotlib==3.3.2
seaborn==0.11.*
tensorflow==2.4.*
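(This pin bump matters for the new helper below: inverse_preprocesser.py relies on the feature_names_in_ and output_indices_ attributes, which scikit-learn only introduced in 1.0.)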
45 changes: 45 additions & 0 deletions src/ydata_synthetic/postprocessing/regular/inverse_preprocesser.py
@@ -0,0 +1,45 @@
# Inverts all preprocessing pipelines provided in the preprocessing examples
from typing import Union

import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import PowerTransformer, OneHotEncoder, StandardScaler


def inverse_transform(data: pd.DataFrame, processor: Union[Pipeline, ColumnTransformer, PowerTransformer, OneHotEncoder, StandardScaler]) -> pd.DataFrame:
"""Inverts data transformations taking place in a standard sklearn processor.
Supported processes are sklearn pipelines, column transformers or base estimators like standard scalers.
Args:
data (pd.DataFrame): The data object that needs inversion of preprocessing
processor (Union[Pipeline, ColumnTransformer, BaseEstimator]): The processor applied on the original data
Returns:
inv_data (pd.DataFrame): The data object after inverting preprocessing"""
    inv_data = data.copy()
    if isinstance(processor, (PowerTransformer, OneHotEncoder, StandardScaler, Pipeline)):
        inv_data = pd.DataFrame(processor.inverse_transform(data), columns=processor.feature_names_in_)
    elif isinstance(processor, ColumnTransformer):
        output_indices = processor.output_indices_
        assert isinstance(data, pd.DataFrame), "The data to be inverted from a ColumnTransformer has to be a Pandas DataFrame."
        # Walk the fitted transformers in reverse, inverting each one's output slice
        for t_name, t, t_cols in processor.transformers_[::-1]:
            slice_ = output_indices[t_name]
            t_indices = list(range(slice_.start, slice_.stop, 1 if slice_.step is None else slice_.step))
            if t == 'drop':
                continue
            elif t == 'passthrough':
                inv_cols = pd.DataFrame(data.iloc[:, t_indices].values, columns=t_cols, index=data.index)
                inv_col_names = inv_cols.columns
            else:
                inv_cols = pd.DataFrame(t.inverse_transform(data.iloc[:, t_indices].values), columns=t_cols, index=data.index)
                inv_col_names = inv_cols.columns
            # Overwrite columns in place when they already exist, otherwise append them
            if set(inv_col_names).issubset(set(inv_data.columns)):
                inv_data[inv_col_names] = inv_cols[inv_col_names]
            else:
                inv_data = pd.concat([inv_data, inv_cols], axis=1)
    else:
        print('The provided data processor is not supported and cannot be inverted with this method.')
        return None
    # Restore the original column order as seen at fit time
    return inv_data[processor.feature_names_in_]
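For orientation, a minimal round-trip sketch of how the new helper is intended to be used. The toy DataFrame and column names below are illustrative only, not part of the commit:

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from ydata_synthetic.postprocessing.regular.inverse_preprocesser import inverse_transform

# Toy data, purely for illustration
df = pd.DataFrame({'age': [25, 40, 31], 'sex': ['F', 'M', 'F']})

preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), ['age']),
    ('cat', OneHotEncoder(sparse=False), ['sex'])])

processed = pd.DataFrame(preprocessor.fit_transform(df))

# Recovers a DataFrame with the original 'age' and 'sex' columns, in fit order
recovered = inverse_transform(processed, preprocessor)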
12 changes: 7 additions & 5 deletions src/ydata_synthetic/preprocessing/regular/adult.py
@@ -9,26 +9,28 @@
def transformations():
    data = fetch_data('adult')

-    numerical_features = ['age', 'fnlwgt',
+    numerical_features = ['age', 'fnlwgt',
                          'capital-gain', 'capital-loss',
                          'hours-per-week']
    numerical_transformer = Pipeline(steps=[
-        ('onehot', StandardScaler())])
+        ('scaler', StandardScaler())])

-    categorical_features = ['workclass','education', 'marital-status',
+    categorical_features = ['workclass','education', 'marital-status',
                            'occupation', 'relationship',
                            'race', 'sex']
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

+    remaining_features = ['education-num', 'native-country','target']
+    remaining_transformer = 'passthrough'
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
-            ('cat', categorical_transformer, categorical_features)])
+            ('cat', categorical_transformer, categorical_features),
+            ('remaining', remaining_transformer, remaining_features)])

    processed_data = pd.DataFrame.sparse.from_spmatrix(preprocessor.fit_transform(data))

    return data, processed_data, preprocessor
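With the passthrough features now registered on the ColumnTransformer, every original column is recoverable. A hedged usage sketch, assuming this commit's module layout and pmlb installed:

from ydata_synthetic.preprocessing.regular.adult import transformations
from ydata_synthetic.postprocessing.regular.inverse_preprocesser import inverse_transform

data, processed_data, preprocessor = transformations()

# The processed frame is sparse-backed; densify before inverting
recovered = inverse_transform(processed_data.sparse.to_dense(), preprocessor)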



@@ -6,22 +6,18 @@

from pmlb import fetch_data

-def transformations(auto=True):
-    if auto:
-        data = fetch_data('breast_cancer_wisconsin')
-    else:
-        data = fetch_data('breast_cancer_wisconsin')

+def transformations():
+    data = fetch_data('breast_cancer_wisconsin')

    scaler = StandardScaler()
    processed_data = scaler.fit_transform(data)
    processed_data = pd.DataFrame(processed_data)

    return data, processed_data, scaler


if __name__ == '__main__':
-    data = transformations(auto=True)
+    data = transformations()

    print(data)
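The file header for this hunk is missing from the page, but the fetch_data call identifies it as the breast_cancer_wisconsin preprocessing module. Because the returned processor is a bare StandardScaler, inversion goes through the helper's base-estimator branch; a minimal sketch (helper import as in the adult example):

data, processed_data, scaler = transformations()

# scaler was fit on a DataFrame, so feature_names_in_ lets the helper
# relabel the inverted columns
recovered = inverse_transform(processed_data, scaler)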

4 changes: 2 additions & 2 deletions src/ydata_synthetic/preprocessing/regular/cardiovascular.py
@@ -19,7 +19,7 @@ def transformations(data):
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)])

-    processed_data = preprocessor.fit_transform(data)
+    processed_data = pd.DataFrame.sparse.from_spmatrix(preprocessor.fit_transform(processed_data))
-    return processed_data, preprocessor
+    return data, processed_data, preprocessor
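Same pattern for the cardiovascular set, except transformations takes the raw frame as an argument (the processed_data initialization sits above the shown hunk, so the copy step is assumed). A sketch under the same import assumptions, with raw_df standing in for a caller-loaded cardiovascular DataFrame:

data, processed_data, preprocessor = transformations(raw_df)
recovered = inverse_transform(processed_data.sparse.to_dense(), preprocessor)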
19 changes: 13 additions & 6 deletions src/ydata_synthetic/preprocessing/regular/credit_fraud.py
@@ -1,16 +1,23 @@
-#Data transformations to be aplied
+#Data transformations to be applied
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

from sklearn.preprocessing import PowerTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

def transformations(data):
    #Log transformation to Amount variable
+    processed_data = data.copy()
    data_cols = list(data.columns[data.columns != 'Class'])

-    #data[data_cols] = StandardScaler().fit_transform(data[data_cols])
-    data[data_cols] = PowerTransformer(method='yeo-johnson', standardize=True, copy=True).fit_transform(data[data_cols])
-    return data

+    data_transformer = Pipeline(steps=[
+        ('PowerTransformer', PowerTransformer(method='yeo-johnson', standardize=True, copy=True))])
+    preprocessor = ColumnTransformer(
+        transformers = [('power', data_transformer, data_cols)])
+    processed_data[data_cols] = preprocessor.fit_transform(data[data_cols])

+    return data, processed_data, preprocessor
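The point of the refactor: wrapping the Yeo-Johnson transform in a ColumnTransformer makes it invertible through the new helper. A minimal sketch, with fraud_df standing in for a caller-supplied DataFrame that has a 'Class' label column:

data, processed_data, preprocessor = transformations(fraud_df)

# Only the non-'Class' columns were transformed; select them in fit order before inverting
recovered = inverse_transform(processed_data[list(preprocessor.feature_names_in_)], preprocessor)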
