In [1]:
import dataprofiler as dp
from redesign.generator_builder import Generator
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split



In [2]:
# Load the iris dataset as a single DataFrame (feature columns plus "target")
iris_bunch = datasets.load_iris(as_frame=True)
iris = iris_bunch.frame
iris

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [3]:
# Profiler configuration: data labeler off, correlation on, multiprocessing off
option_overrides = {
    "data_labeler.is_enabled": False,
    "correlation.is_enabled": True,
    "multiprocess.is_enabled": False,
}
profile_options = dp.ProfilerOptions()
profile_options.set(option_overrides)

# Profile the original frame, then draw 150 synthetic rows with a fixed seed
profile = dp.Profiler(iris, profiler_type="structured", options=profile_options)
generator = Generator(profile=profile, seed=10)
synthetic_data = generator.synthesize(150)
synthetic_data

INFO:DataProfiler.profilers.profile_builder: Finding the Null values in the columns... 


100%|██████████| 5/5 [00:00<00:00, 384.18it/s]

INFO:DataProfiler.profilers.profile_builder: Calculating the statistics... 



100%|██████████| 5/5 [00:00<00:00, 20.73it/s]


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.0,3.5,2.4,0.9,0.0
1,6.5,2.8,4.7,1.7,2.0
2,5.4,3.0,3.5,1.3,1.0
3,6.2,3.7,3.4,1.3,0.0
4,7.7,3.6,6.1,2.1,2.0
...,...,...,...,...,...
145,6.2,3.2,3.2,1.0,0.0
146,6.4,3.0,4.0,1.6,1.0
147,7.2,3.4,5.3,1.9,2.0
148,5.5,3.5,3.2,1.2,0.0


In [4]:
# Summary statistics of petal length in the original data
iris.loc[:, "petal length (cm)"].describe()

count    150.000000
mean       3.758000
std        1.765298
min        1.000000
25%        1.600000
50%        4.350000
75%        5.100000
max        6.900000
Name: petal length (cm), dtype: float64

In [5]:
# Summary statistics of petal length in the synthetic data
synthetic_data.loc[:, "petal length (cm)"].describe()

count    150.000000
mean       3.888667
std        1.090320
min        1.000000
25%        3.300000
50%        3.900000
75%        4.475000
max        6.900000
Name: petal length (cm), dtype: float64

In [6]:
# Train-on-real / test-on-synthetic check, scored with one-vs-rest ROC AUC.
# Columns are selected by name rather than by position so the features fed to
# the model always line up with the columns it was fitted on, even if the
# synthetic frame orders its columns differently from the original.
TARGET_COLUMN = "target"
feature_columns = [col for col in iris.columns if col != TARGET_COLUMN]

# ROC AUC on a held-out 20% split of the original data
X_train, X_test, y_train, y_test = train_test_split(
    iris[feature_columns], iris[TARGET_COLUMN], test_size=0.2, random_state=1
)
model = RandomForestClassifier(random_state=1).fit(X_train, y_train)
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test), multi_class="ovr")
print(f"ROC AUC of original dataset:  {roc_auc}")

# ROC AUC of the same fitted model on the synthetic data
X_synthetic = synthetic_data[feature_columns]
y_synthetic = synthetic_data[TARGET_COLUMN]
synthetic_roc_auc = roc_auc_score(
    y_synthetic, model.predict_proba(X_synthetic), multi_class="ovr"
)
print(f"ROC AUC of synthetic dataset: {synthetic_roc_auc}")

ROC AUC of original dataset:  0.992353779118485
ROC AUC of synthetic dataset: 0.8959611675234832
