Skip to content

Commit

Permalink
feat(datasets): Add cardiovascular data
Browse files Browse the repository at this point in the history
  • Loading branch information
fabclmnt committed Nov 2, 2020
1 parent cf2230b commit cc3dc0f
Show file tree
Hide file tree
Showing 4 changed files with 83 additions and 1 deletion.
5 changes: 4 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,7 @@ numpy==1.17.4
scikit-learn==0.22.2
matplotlib==3.3.2
tensorflow==2.1.2
easydict
tensorflow-privacy==0.5.1
kaggle==1.5.9
easydict
pmlb
35 changes: 35 additions & 0 deletions src/ydata_synthetic/preprocessing/adult.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from pmlb import fetch_data

def transformations(auto=True):
    """Fetch the PMLB ``adult`` dataset and build its preprocessed version.

    The listed numerical columns are standardised with ``StandardScaler`` and
    the listed categorical columns are one-hot encoded. Columns that appear in
    neither list are not passed to any transformer (``ColumnTransformer``
    drops the remainder by default).

    Args:
        auto: Accepted for backward compatibility. Both branches of the
            original conditional fetched the same dataset, so the flag has
            no effect on the result.

    Returns:
        A ``(data, processed_data, preprocessor)`` tuple: the raw frame as
        returned by ``fetch_data``, the transformed data as a
        ``pandas.DataFrame`` and the fitted ``ColumnTransformer``.
    """
    # Function-scope import so this block does not depend on a new
    # module-level import being present.
    from scipy import sparse

    data = fetch_data('adult')

    numerical_features = ['age', 'fnlwgt',
                          'capital-gain', 'capital-loss',
                          'hours-per-week']
    # NOTE(review): the step is labelled 'onehot' although it wraps a
    # StandardScaler. The key is kept unchanged because callers may look the
    # step up by name on the returned preprocessor.
    numerical_transformer = Pipeline(steps=[
        ('onehot', StandardScaler())])

    categorical_features = ['workclass', 'education', 'marital-status',
                            'occupation', 'relationship',
                            'race', 'sex']
    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)])

    # Fit exactly once, on the raw frame. Re-fitting on the transformed
    # matrix (as the previous code did) cannot work: the output has no named
    # columns for the string-based column selectors to resolve.
    transformed = preprocessor.fit_transform(data)

    # ColumnTransformer returns either a scipy sparse matrix or a dense
    # ndarray depending on the density of the stacked result; handle both.
    if sparse.issparse(transformed):
        processed_data = pd.DataFrame.sparse.from_spmatrix(transformed)
    else:
        processed_data = pd.DataFrame(transformed)
    return data, processed_data, preprocessor

19 changes: 19 additions & 0 deletions src/ydata_synthetic/preprocessing/breast_cancer_wisconsin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from pmlb import fetch_data

def transformations(auto=True):
    """Fetch the PMLB ``breast_cancer_wisconsin`` dataset and standardise it.

    Every column of the frame returned by ``fetch_data`` is passed through a
    single ``StandardScaler``.

    Args:
        auto: Accepted for backward compatibility. Both branches of the
            original conditional fetched the same dataset, so the flag has
            no effect on the result.

    Returns:
        A ``(data, processed_data, scaler)`` tuple: the raw frame as returned
        by ``fetch_data``, the scaled values wrapped in a new
        ``pandas.DataFrame`` (default integer column labels), and the fitted
        ``StandardScaler``.
    """
    data = fetch_data('breast_cancer_wisconsin')

    scaler = StandardScaler()
    # fit_transform yields a plain array; wrap it so callers get a DataFrame.
    processed_data = pd.DataFrame(scaler.fit_transform(data))
    return data, processed_data, scaler


25 changes: 25 additions & 0 deletions src/ydata_synthetic/preprocessing/cardiovascular.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import pandas as pd

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

def transformations(data):
    """Preprocess a cardiovascular dataset frame.

    The listed numerical columns are standardised with ``StandardScaler`` and
    the listed categorical columns are one-hot encoded. Columns that appear in
    neither list are not passed to any transformer (``ColumnTransformer``
    drops the remainder by default).

    Args:
        data: Tabular data containing the columns named in
            ``categorical_features`` and ``numerical_features`` below
            (presumably a ``pandas.DataFrame``, since columns are selected
            by name).

    Returns:
        A ``(processed_data, preprocessor)`` tuple: the transformed data as a
        ``pandas.DataFrame`` and the fitted ``ColumnTransformer``.
    """
    # Function-scope import so this block does not depend on a new
    # module-level import being present.
    from scipy import sparse

    categorical_features = ['gender', 'cardio', 'active', 'alco', 'smoke', 'gluc',
                            'cholesterol']
    numerical_features = ['height', 'weight', 'ap_hi', 'ap_lo']

    # NOTE(review): the step is labelled 'onehot' although it wraps a
    # StandardScaler. The key is kept unchanged because callers may look the
    # step up by name on the returned preprocessor.
    numerical_transformer = Pipeline(steps=[
        ('onehot', StandardScaler())])

    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', categorical_transformer, categorical_features)])

    # Fit exactly once, on the raw input. Re-fitting on the transformed
    # matrix (as the previous code did) cannot work: the output has no named
    # columns for the string-based column selectors to resolve.
    transformed = preprocessor.fit_transform(data)

    # ColumnTransformer returns either a scipy sparse matrix or a dense
    # ndarray depending on the density of the stacked result; handle both
    # instead of assuming the sparse case.
    if sparse.issparse(transformed):
        processed_data = pd.DataFrame.sparse.from_spmatrix(transformed)
    else:
        processed_data = pd.DataFrame(transformed)
    return processed_data, preprocessor

0 comments on commit cc3dc0f

Please sign in to comment.