### Import Packages

In [1]:
import multiprocessing
import subprocess
import time
import pandas as pd
from sklearn.model_selection import train_test_split

from Data_Generation.data_synthesizer import ds_generate_data
from Data_Generation.synthetic_data_vault import sdv_generate_data
from Data_Evaluation.sd_metrics import sd_metrics
from Data_Evaluation.resemblance import pairwise_correlation_diff, jsd, wd 
from Data_Evaluation.utility import run_utility_eval
from Data_Evaluation.privacy import dcr, nndr, mia

Set up the data path and the number of records to generate, then split the data into training and test sets (70/30 ratio).

In [2]:
# Force the 'fork' start method for the multiprocessing module to avoid an error
multiprocessing.set_start_method('fork', force=True)

# How many synthetic records each generator is asked to produce
n = 5000

# Load the full dataset and hold out 30% of the rows as a test set
# (fixed random_state so the split is repeatable)
data_path = "Original_Data/Dataset_1.csv"
original_data = pd.read_csv(data_path)
splits = train_test_split(original_data, test_size=0.3, random_state=42)
train_data, test_data = splits

# Write both splits to CSV without the index column
for split_frame, split_file in (
    (train_data, "Original_Data/train_data.csv"),
    (test_data, "Original_Data/test_data.csv"),
):
    split_frame.to_csv(split_file, index=False)


Generate synthetic data with each of the models and save the results as CSV files:

In [10]:
# Use train_data.csv to fit SDG models and generate synthetic data
data_path = "Original_Data/train_data.csv"
arguments = [data_path, str(n)]

print("Sampling synthpop...")
result = subprocess.run(['Rscript', 'Data_Generation/synthpop.R',   *arguments], capture_output=True, text=True)
# stdout/stderr are captured rather than shown, so a failing R script would
# otherwise go unnoticed and later cells could read a missing or stale
# synthpop sample file. Surface the failure here together with the R error text.
if result.returncode != 0:
    raise RuntimeError(
        f"synthpop.R failed with exit code {result.returncode}:\n{result.stderr}"
    )

print("Sampling DataSynthesizer...")
ds_generate_data(data_path=data_path, num_samples=n)

# NOTE(review): judging by the recorded cell output, this call appears to print
# its own "Sampling SDV..." progress line and to fit/sample several SDV models
# in turn -- confirm in Data_Generation.synthetic_data_vault.
sdv_generate_data(data_path=data_path, num_samples=n)


Sampling synthpop...
Sampling DataSynthesizer...
Adding ROOT Discussion
Adding attribute Relation
Adding attribute AnnouncementsView
Adding attribute raisedhands
Adding attribute Semester
Adding attribute SectionID
Adding attribute VisITedResources
Adding attribute GradeID
Adding attribute StudentAbsenceDays
Adding attribute Class
Adding attribute PlaceofBirth
Adding attribute ParentAnsweringSurvey
Adding attribute StageID
Adding attribute Topic
Adding attribute ParentschoolSatisfaction
Adding attribute gender
Adding attribute NationalITy
Sampling SDV...
Fitting copula_gan...

Time taken to fit copula_gan: 8.88 seconds

Sampling from copula_gan...

Time taken to sample from copula_gan: 0.11 seconds

Fitting ctgan...

Time taken to fit ctgan: 8.35 seconds

Sampling from ctgan...

Time taken to sample from ctgan: 0.10 seconds

Fitting tvae...

Time taken to fit tvae: 2.24 seconds

Sampling from tvae...

Time taken to sample from tvae: 0.07 seconds

Fitting gaussian_copula...

Time taken 

One-hot encode both the synthetic and the original data to prepare them for the utility evaluation.

In [9]:
def load_encoded(csv_path):
    """Read a CSV file and encode it with ``pd.get_dummies``.

    Args:
        csv_path: Path of the CSV file to load.

    Returns:
        A DataFrame in which ``pd.get_dummies`` has replaced the columns it
        treats as categorical with int64 indicator columns.
    """
    return pd.get_dummies(pd.read_csv(csv_path), dtype='int64')


# Real data:
# Read the training split by its explicit path instead of the mutable global
# `data_path`, which points at the full dataset until the generation cell has
# been run. That made the content of `original_train` depend on cell order.
original_train = load_encoded("Original_Data/train_data.csv")

# Synthetic data:
# NOTE(review): each frame is encoded independently, so a category missing
# from one sample file yields a different column set -- confirm that the
# evaluation helpers align columns before comparing.
synthpop = load_encoded("Synthetic_Data/synthpop_samples.csv")

ds = load_encoded("Synthetic_Data/ds_samples.csv")

tvae = load_encoded("Synthetic_Data/tvae_samples.csv")

gaussian_copula = load_encoded("Synthetic_Data/gaussian_samples.csv")

copula_gan = load_encoded("Synthetic_Data/copula_gan_samples.csv")

ctgan = load_encoded("Synthetic_Data/ctgan_samples.csv")
ctgan.head()


Unnamed: 0,raisedhands,VisITedResources,AnnouncementsView,Discussion,gender_F,gender_M,NationalITy_Egypt,NationalITy_Iran,NationalITy_Iraq,NationalITy_Jordan,...,Relation_Mum,ParentAnsweringSurvey_No,ParentAnsweringSurvey_Yes,ParentschoolSatisfaction_Bad,ParentschoolSatisfaction_Good,StudentAbsenceDays_Above-7,StudentAbsenceDays_Under-7,Class_H,Class_L,Class_M
0,11,18,17,93,0,1,0,0,0,0,...,1,1,0,1,0,0,1,1,0,0
1,99,13,76,22,0,1,0,0,0,1,...,1,0,1,1,0,1,0,0,0,1
2,25,57,57,36,1,0,0,0,0,1,...,1,1,0,0,1,1,0,1,0,0
3,42,76,42,37,1,0,0,0,1,0,...,0,1,0,0,1,1,0,1,0,0
4,0,17,22,90,1,0,0,0,0,0,...,1,1,0,0,1,0,1,1,0,0


#### Utility Evaluation

Use ML models for prediction to evaluate the utility

In [None]:
# Run the utility evaluation on the encoded training data and the encoded
# synthpop samples. The call yields two values; presumably these are accuracy
# and F1 differences -- confirm in Data_Evaluation.utility.
utility_scores = run_utility_eval(original_train, synthpop)
acc_diff, f1_diff = utility_scores