In [1]:
import pandas as pd
import numpy as np
from ydata.synthesizers.regular.model import RegularSynthesizer
from ydata.metadata import Metadata
from ydata.dataset import Dataset
from ydata.labs import DataSources
from ydata.report import SyntheticDataProfile
from ydata.report.reports.report_type import ReportType
from ydata.synthesizers.privacy import PrivacyLevel

## **Auxiliary Functions**

In [2]:
# Auxiliary function used to train and sample from a regular synthesizer.
def train_and_sample_from_synthesizer(data: Dataset, metadata: Metadata, privacy_level: PrivacyLevel):
    """Fit a ``RegularSynthesizer`` and draw one synthetic sample per holdout row.

    Args:
        data: Dataset the synthesizer is fitted on.
        metadata: Metadata describing ``data``; forwarded to ``fit``.
        privacy_level: Privacy level forwarded to ``fit``.

    Returns:
        A ``(holdout_dataset, train_dataset, synthetic_dataset)`` tuple, where
        the synthetic dataset is sampled with ``n_samples`` equal to the length
        of the holdout dataset.
    """
    model = RegularSynthesizer()
    model.fit(data, metadata=metadata, privacy_level=privacy_level)

    # NOTE(review): the split is read from private attributes of the fitted
    # synthesizer (`_holdout._data` / `_holdout._train_data`); this may break if
    # the library internals change - confirm whether a public accessor exists.
    split = model._holdout
    holdout_dataset = Dataset(split._data.compute())
    train_dataset = Dataset(split._train_data.compute())

    # Sample as many synthetic rows as there are holdout rows.
    synthetic_dataset = model.sample(n_samples=len(holdout_dataset))
    return holdout_dataset, train_dataset, synthetic_dataset

In [3]:
# Auxiliary function used to obtain the fidelity, utility, and privacy scores.
def get_report_metrics(holdout_data: Dataset, train_data: Dataset, synth_data: Dataset, target: str,
                       metadata: "Metadata | None" = None):
    """Build a tabular ``SyntheticDataProfile`` report and return its summary.

    Args:
        holdout_data: Real data passed to the report as ``real``.
        train_data: Data passed to the report as ``training_data``.
        synth_data: Synthetic data passed to the report as ``synth``.
        target: Name of the target column.
        metadata: Metadata whose columns provide the data types for the report.
            When omitted, the notebook-level ``metadata`` variable is used, so
            existing calls that do not pass it keep working.

    Returns:
        Whatever ``SyntheticDataProfile.get_summary()`` returns for the
        generated report.

    Raises:
        NameError: If ``metadata`` is not passed and no global ``metadata``
            variable has been defined yet.
    """
    if metadata is None:
        # Backward-compatible fallback: earlier versions of this helper read the
        # notebook-global `metadata` implicitly. Prefer passing it explicitly so
        # the function does not depend on hidden kernel state.
        metadata = globals().get("metadata")
        if metadata is None:
            raise NameError(
                "name 'metadata' is not defined: pass `metadata=` explicitly or "
                "define a global `metadata` before calling get_report_metrics"
            )

    sdf = SyntheticDataProfile(report_type=ReportType.TABULAR)
    # Map each column name to the datatype recorded in the metadata.
    data_types = {k: v.datatype for k, v in metadata.columns.items()}

    sdf.generate_report(real=holdout_data,
                        synth=synth_data,
                        target=target,
                        data_types=data_types,
                        training_data=train_data,
                        metadata=metadata,
                        pdf=False)

    return sdf.get_summary()

# **YData Privacy Layer**

YData synthesizers now offer a privacy layer that can provide differential privacy to the end-user by selecting one of three levels:
- High fidelity - the default behavior, which leads to synthetic data with higher fidelity/utility and less privacy.
- High privacy - enables the generation of synthetic data with higher privacy, accepting a loss of fidelity and utility.
- Balanced fidelity/privacy - tries to find a balance between high fidelity, utility, and privacy, aiming to reach good enough results in all three settings.

This notebook describes how to use the privacy layer with the regular synthesizer. The same logic explained here applies equally to the time series synthesizer.

We will use the Breast Cancer Wisconsin dataset to demonstrate how to take advantage of the privacy layer. This dataset contains features computed from digitized images of fine needle aspirates (FNA) of breast masses. Each row has the diagnosis (M for malignant and B for benign) and 30 real-valued features computed for the cell nuclei (32 columns in total, counting the `id` and `diagnosis` columns). The diagnosis is the dataset target.

In [4]:
# Name of the column used as the report target.
_target = "diagnosis"

# Replace the uid placeholder with the UID of your own data source.
datasource = DataSources.get(uid='{dasource-uid}')

# Read the data and drop the "id" column before building the metadata.
data = datasource.read().drop_columns(columns=["id"])
metadata = Metadata(data)

[########################################] | 100% Completed | 101.66 ms
[########################################] | 100% Completed | 101.82 ms
[########################################] | 100% Completed | 103.16 ms
[########################################] | 100% Completed | 101.47 ms
[########################################] | 100% Completed | 661.24 ms


In [5]:
# Preview the first rows of the dataset (after the "id" column was dropped).
data.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


## **High Fidelity**

The synthesizer has a parameter named `privacy_level`, which is optional and defaults to the high-fidelity setting. Nevertheless, we can also explicitly specify this level. We just have to import the `PrivacyLevel` enumeration and choose the `HIGH_FIDELITY` option.

In [6]:
# Fit with the HIGH_FIDELITY level and sample one synthetic row per holdout row.
holdout_dataset, train_dataset, synthetic_dataset = train_and_sample_from_synthesizer(
    data=data,
    metadata=metadata,
    privacy_level=PrivacyLevel.HIGH_FIDELITY,
)

INFO: 2023-03-16 12:45:57,677 [SYNTHESIZER] - Number columns considered for synth: 31
INFO: 2023-03-16 12:45:58,011 [SYNTHESIZER] - Starting the synthetic data modeling process over 1x1 blocks.
INFO: 2023-03-16 12:45:58,012 [SYNTHESIZER] - Preprocess segment
INFO: 2023-03-16 12:45:58,016 [SYNTHESIZER] - Synthesizer init.
INFO: 2023-03-16 12:45:58,016 [SYNTHESIZER] - Processing the data prior fitting the synthesizer.
INFO: 2023-03-16 12:45:58,713 [SYNTHESIZER] - Start generating model samples.


In [7]:
%%capture
# Compute the report summary for the HIGH_FIDELITY run.
summary_metrics = get_report_metrics(
    holdout_data=holdout_dataset,
    train_data=train_dataset,
    synth_data=synthetic_dataset,
    target=_target,
)

INFO: 2023-03-16 12:45:59,120 [PROFILEREPORT] - Starting metrics calculation.
INFO: 2023-03-16 12:45:59,174 [PROFILEREPORT] - Synthetic data quality report selected target variable: diagnosis
INFO: 2023-03-16 12:45:59,175 [PROFILEREPORT] - preparing data format.
INFO: 2023-03-16 12:45:59,255 [PROFILEREPORT] - Preparing the data for metrics calculation
INFO: 2023-03-16 12:45:59,310 [PROFILEREPORT] - Calculating privacy metrics.
INFO: 2023-03-16 12:45:59,313 [PROFILEREPORT] - Calculating metric [Exact Matches].
INFO: 2023-03-16 12:45:59,317 [PROFILEREPORT] - Metric [Exact Matches] took 0.00s.
INFO: 2023-03-16 12:45:59,319 [PROFILEREPORT] - Calculating metric [Membership Inference Score].
INFO: 2023-03-16 12:45:59,320 [PROFILEREPORT] - Membership Disclosure Score sample size was reduce to match the dataset with size 114.
INFO: 2023-03-16 12:45:59,329 [PROFILEREPORT] - Metric [Membership Inference Score] took 0.01s.
INFO: 2023-03-16 12:45:59,331 [PROFILEREPORT] - Calculating metric [Neighb

In [8]:
# Print the privacy level name followed by the three summary scores in bold,
# each formatted with two decimals.
print(f"\033[1m{PrivacyLevel.HIGH_FIDELITY.name}")
for label in ("Fidelity", "Utility", "Privacy"):
    print(f"\033[1m{label}: {summary_metrics[label.lower()]:.2f}")

[1mHIGH_FIDELITY
[1mFidelity: 0.89
[1mUtility: 0.70
[1mPrivacy: 0.67


## **High Privacy**

To achieve high privacy, the `privacy_level` parameter must be defined with the `HIGH_PRIVACY` option of the `PrivacyLevel` enumeration.

In [9]:
# Fit with the HIGH_PRIVACY level and sample one synthetic row per holdout row.
holdout_dataset, train_dataset, synthetic_dataset = train_and_sample_from_synthesizer(
    data=data,
    metadata=metadata,
    privacy_level=PrivacyLevel.HIGH_PRIVACY,
)

INFO: 2023-03-16 12:46:05,413 [SYNTHESIZER] - Number columns considered for synth: 31
INFO: 2023-03-16 12:46:19,376 [SYNTHESIZER] - Starting the synthetic data modeling process over 1x1 blocks.
INFO: 2023-03-16 12:46:19,378 [SYNTHESIZER] - Preprocess segment
INFO: 2023-03-16 12:46:19,382 [SYNTHESIZER] - Synthesizer init.
INFO: 2023-03-16 12:46:19,383 [SYNTHESIZER] - Processing the data prior fitting the synthesizer.
INFO: 2023-03-16 12:46:20,005 [SYNTHESIZER] - Start generating model samples.


In [10]:
%%capture
# Compute the report summary for the HIGH_PRIVACY run.
summary_metrics = get_report_metrics(
    holdout_data=holdout_dataset,
    train_data=train_dataset,
    synth_data=synthetic_dataset,
    target=_target,
)

INFO: 2023-03-16 12:46:20,387 [PROFILEREPORT] - Starting metrics calculation.
INFO: 2023-03-16 12:46:20,434 [PROFILEREPORT] - Synthetic data quality report selected target variable: diagnosis
INFO: 2023-03-16 12:46:20,434 [PROFILEREPORT] - preparing data format.
INFO: 2023-03-16 12:46:20,513 [PROFILEREPORT] - Preparing the data for metrics calculation
INFO: 2023-03-16 12:46:20,566 [PROFILEREPORT] - Calculating privacy metrics.
INFO: 2023-03-16 12:46:20,568 [PROFILEREPORT] - Calculating metric [Exact Matches].
INFO: 2023-03-16 12:46:20,570 [PROFILEREPORT] - Metric [Exact Matches] took 0.00s.
INFO: 2023-03-16 12:46:20,571 [PROFILEREPORT] - Calculating metric [Membership Inference Score].
INFO: 2023-03-16 12:46:20,572 [PROFILEREPORT] - Membership Disclosure Score sample size was reduce to match the dataset with size 114.
INFO: 2023-03-16 12:46:20,577 [PROFILEREPORT] - Metric [Membership Inference Score] took 0.00s.
INFO: 2023-03-16 12:46:20,578 [PROFILEREPORT] - Calculating metric [Neighb

In [11]:
# Print the privacy level name followed by the three summary scores in bold,
# each formatted with two decimals.
print(f"\033[1m{PrivacyLevel.HIGH_PRIVACY.name}")
for label in ("Fidelity", "Utility", "Privacy"):
    print(f"\033[1m{label}: {summary_metrics[label.lower()]:.2f}")

[1mHIGH_PRIVACY
[1mFidelity: 0.70
[1mUtility: 0.46
[1mPrivacy: 0.99


## **Balanced Fidelity/Privacy**

To achieve the balanced setting between fidelity, utility, and privacy, the `privacy_level` parameter must be defined with the `BALANCED_PRIVACY_FIDELITY` option of the `PrivacyLevel` enumeration.

In [12]:
# Fit with the BALANCED_PRIVACY_FIDELITY level and sample one synthetic row per holdout row.
holdout_dataset, train_dataset, synthetic_dataset = train_and_sample_from_synthesizer(
    data=data,
    metadata=metadata,
    privacy_level=PrivacyLevel.BALANCED_PRIVACY_FIDELITY,
)

INFO: 2023-03-16 12:46:26,617 [SYNTHESIZER] - Number columns considered for synth: 31
INFO: 2023-03-16 12:46:40,463 [SYNTHESIZER] - Starting the synthetic data modeling process over 1x1 blocks.
INFO: 2023-03-16 12:46:40,465 [SYNTHESIZER] - Preprocess segment
INFO: 2023-03-16 12:46:40,469 [SYNTHESIZER] - Synthesizer init.
INFO: 2023-03-16 12:46:40,470 [SYNTHESIZER] - Processing the data prior fitting the synthesizer.
INFO: 2023-03-16 12:46:41,157 [SYNTHESIZER] - Start generating model samples.


In [13]:
%%capture
# Compute the report summary for the BALANCED_PRIVACY_FIDELITY run.
summary_metrics = get_report_metrics(
    holdout_data=holdout_dataset,
    train_data=train_dataset,
    synth_data=synthetic_dataset,
    target=_target,
)

INFO: 2023-03-16 12:46:41,633 [PROFILEREPORT] - Starting metrics calculation.
INFO: 2023-03-16 12:46:41,679 [PROFILEREPORT] - Synthetic data quality report selected target variable: diagnosis
INFO: 2023-03-16 12:46:41,679 [PROFILEREPORT] - preparing data format.
INFO: 2023-03-16 12:46:41,756 [PROFILEREPORT] - Preparing the data for metrics calculation
INFO: 2023-03-16 12:46:41,808 [PROFILEREPORT] - Calculating privacy metrics.
INFO: 2023-03-16 12:46:41,810 [PROFILEREPORT] - Calculating metric [Exact Matches].
INFO: 2023-03-16 12:46:41,813 [PROFILEREPORT] - Metric [Exact Matches] took 0.00s.
INFO: 2023-03-16 12:46:41,814 [PROFILEREPORT] - Calculating metric [Membership Inference Score].
INFO: 2023-03-16 12:46:41,815 [PROFILEREPORT] - Membership Disclosure Score sample size was reduce to match the dataset with size 114.
INFO: 2023-03-16 12:46:41,820 [PROFILEREPORT] - Metric [Membership Inference Score] took 0.00s.
INFO: 2023-03-16 12:46:41,822 [PROFILEREPORT] - Calculating metric [Neighb

In [14]:
# Print the privacy level name followed by the three summary scores in bold,
# each formatted with two decimals.
print(f"\033[1m{PrivacyLevel.BALANCED_PRIVACY_FIDELITY.name}")
for label in ("Fidelity", "Utility", "Privacy"):
    print(f"\033[1m{label}: {summary_metrics[label.lower()]:.2f}")

[1mBALANCED_PRIVACY_FIDELITY
[1mFidelity: 0.79
[1mUtility: 0.63
[1mPrivacy: 0.96
