In [None]:
# Mount Google Drive at /content/drive so the /content/drive/MyDrive/... paths
# used by the later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pip install sdv

Collecting sdv
  Downloading sdv-1.13.1-py3-none-any.whl (144 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.1/144.1 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting boto3>=1.28 (from sdv)
  Downloading boto3-1.34.117-py3-none-any.whl (139 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.3/139.3 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting botocore>=1.31 (from sdv)
  Downloading botocore-1.34.117-py3-none-any.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m74.7 MB/s[0m eta [36m0:00:00[0m
Collecting copulas>=0.11.0 (from sdv)
  Downloading copulas-0.11.0-py3-none-any.whl (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.9/51.9 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ctgan>=0.10.0 (from sdv)
  Downloading ctgan-0.10.1-py3-none-any.whl (24 kB)
Collecting deepecho>=0.6.0 (from sdv)
  Downloading deepecho-0.6.

In [None]:
import pandas as pd
import numpy as np

import sklearn.datasets as datasets
from sklearn.model_selection import train_test_split

from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer
from sdv.evaluation.single_table import run_diagnostic
from sdv.evaluation.single_table import evaluate_quality
from sdv.evaluation.single_table import get_column_plot

# NOTE(review): `seed` is not referenced by any cell visible in this notebook,
# so the CTGAN fits below are not seeded by it — confirm whether it should be
# wired into the synthesizers.
seed = 42

## Generation (no need to rerun — the synthetic data is saved to Drive and reloaded in the Evaluation section)

In [None]:
# Load the real training table and discard its first column.
data = pd.read_csv(
    '/content/drive/MyDrive/SC263/data/sig_feats/train_sigfeats.csv'
)
data = data.drop(columns=data.columns[0])

# Build the single-table metadata by auto-detecting it from the loaded frame.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data)

In [None]:
# Preview the first five rows of the real table.
data.head(n=5)

Unnamed: 0,SEQN_new,SDDSRVYR,diabetes,age,female,race_ethnicity,education,us_born,pir,smoke,...,LBDHDDSI,LBDSTRSI,VNTOTHDRATIO,LBXMCVSI,LBXSGTSI,LBXGH,LBXSOSSI,LBXSCLSI,LBXBPB,LBXSNASI
0,C-21017,3,0,37,1,3,2,0,1,0,...,1.99,0.948,2.38961,95.9,17,5.1,271,105,7.0,137
1,C-21091,3,0,25,1,1,5,1,2,0,...,1.94,0.644,2.36,89.9,7,4.8,268,103,1.0,136
2,C-21142,3,0,31,1,1,2,1,3,0,...,1.86,0.734,2.263889,84.8,17,5.2,276,105,0.6,139
3,C-21205,3,0,40,1,2,4,1,1,1,...,1.58,0.948,2.967213,95.7,32,5.3,272,103,3.1,138
4,C-21223,3,0,34,1,1,3,1,1,1,...,1.6,0.497,3.096774,98.6,8,5.5,278,106,0.8,141


In [None]:
# Read the reliable-negative IDs: one list entry per line of the text file.
id_path = '/content/drive/MyDrive/SC263/data/reliable_negatives_id.txt'
with open(id_path) as file:
    negative_ids = file.read().splitlines()

In [None]:
# Reliable negatives: rows whose SEQN_new appears in the reliable-negative ID list.
# `isin` does the membership test in one vectorized pass instead of running a
# Python-level linear search of the list for every row.
negatives = data[data['SEQN_new'].isin(negative_ids)].copy()

# Positives: rows labelled diabetes == 1.
positives = data[data['diabetes'] == 1].copy()

In [None]:
# Number of reliable-negative rows with race_ethnicity == 3.
negatives.loc[negatives['race_ethnicity'].eq(3)].shape[0]

352

In [None]:
# Split negatives and positives into one independent copy per race_ethnicity code.

# race_ethnicity == 1 (white)
race_1_neg = negatives.loc[negatives['race_ethnicity'].eq(1)].copy()
race_1_pos = positives.loc[positives['race_ethnicity'].eq(1)].copy()

# race_ethnicity == 2 (black)
race_2_neg = negatives.loc[negatives['race_ethnicity'].eq(2)].copy()
race_2_pos = positives.loc[positives['race_ethnicity'].eq(2)].copy()

# race_ethnicity == 3 (HL)
race_3_neg = negatives.loc[negatives['race_ethnicity'].eq(3)].copy()
race_3_pos = positives.loc[positives['race_ethnicity'].eq(3)].copy()

In [None]:
# Number of synthetic rows sampled from each subgroup synthesizer
# (six subgroups, so the concatenated synthetic table has 6 * num_of_samples rows).
num_of_samples = 1000

In [None]:
# generate using CTGAN
def synthesize_subgroup(real_subset, num_rows):
    """Fit a fresh CTGAN synthesizer on one subgroup and sample synthetic rows.

    Parameters
    ----------
    real_subset : pandas.DataFrame
        Real rows of a single (race_ethnicity, diabetes status) subgroup.
    num_rows : int
        Number of synthetic rows to sample from the fitted synthesizer.

    Returns
    -------
    The result of ``synthesizer.sample(num_rows=num_rows)`` for that subgroup.
    """
    # A new synthesizer per call, so no fitted state is shared between
    # subgroups; all of them are built from the same table-level `metadata`.
    synthesizer = CTGANSynthesizer(metadata)
    synthesizer.fit(real_subset)
    return synthesizer.sample(num_rows=num_rows)


# One fit + sample per subgroup, in the same order as the original six cells.
race_1_neg_synthetic = synthesize_subgroup(race_1_neg, num_of_samples)
race_1_pos_synthetic = synthesize_subgroup(race_1_pos, num_of_samples)

race_2_neg_synthetic = synthesize_subgroup(race_2_neg, num_of_samples)
race_2_pos_synthetic = synthesize_subgroup(race_2_pos, num_of_samples)

race_3_neg_synthetic = synthesize_subgroup(race_3_neg, num_of_samples)
race_3_pos_synthetic = synthesize_subgroup(race_3_pos, num_of_samples)

In [None]:
# Stack the six synthetic subgroups row-wise into one table with a fresh 0..N-1 index.
CTGAN_synthetic_data = pd.concat(
    [
        race_1_neg_synthetic,
        race_1_pos_synthetic,
        race_2_neg_synthetic,
        race_2_pos_synthetic,
        race_3_neg_synthetic,
        race_3_pos_synthetic,
    ],
    ignore_index=True,
)

In [None]:
# Row count of the combined synthetic table.
CTGAN_synthetic_data.shape[0]

6000

In [None]:
# Save the combined synthetic table to Drive as CSV, without writing the index
# (index=False). The Evaluation section reloads the data from this same path.
CTGAN_synthetic_data.to_csv('/content/drive/MyDrive/SC263/data/sig_feats/CTGAN_synthetic_train_sigfeats.csv', index=False)

## Evaluation (run from here)

In [None]:
# Reload the real training table and discard its first column, so the
# evaluation section can be run without the generation section.
data = pd.read_csv(
    '/content/drive/MyDrive/SC263/data/sig_feats/train_sigfeats.csv'
)
data = data.drop(columns=data.columns[0])

# Build the single-table metadata by auto-detecting it from the loaded frame.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data)

In [None]:
# Read the reliable-negative IDs: one list entry per line of the text file.
id_path = '/content/drive/MyDrive/SC263/data/reliable_negatives_id.txt'
with open(id_path, 'r') as file:
    negative_ids = file.read().splitlines()

# Reliable negatives are selected by ID. `isin` does the membership test in one
# vectorized pass instead of a Python-level linear search of the list per row.
negatives = data[data['SEQN_new'].isin(negative_ids)].copy()
# Positives are the rows labelled diabetes == 1.
positives = data[data['diabetes'] == 1].copy()

# One independent copy per (race_ethnicity code, negative/positive) subgroup.
race_1_neg = negatives[negatives['race_ethnicity'] == 1].copy()
race_1_pos = positives[positives['race_ethnicity'] == 1].copy()

race_2_neg = negatives[negatives['race_ethnicity'] == 2].copy()
race_2_pos = positives[positives['race_ethnicity'] == 2].copy()

race_3_neg = negatives[negatives['race_ethnicity'] == 3].copy()
race_3_pos = positives[positives['race_ethnicity'] == 3].copy()

In [None]:
# Load the combined synthetic table written by the Generation section
# (all six subgroups stacked in one CSV).
synthetic_data_stacked = pd.read_csv('/content/drive/MyDrive/SC263/data/sig_feats/CTGAN_synthetic_train_sigfeats.csv')

In [None]:
# Split the stacked synthetic table back into subgroups using the sampled
# `diabetes` and `race_ethnicity` values.
# NOTE(review): the saved CSV carries no explicit subgroup label, so this split
# presumably relies on each synthesizer reproducing its training subgroup's
# diabetes/race values — confirm the six groups have the expected sizes.
positives_synthetic = synthetic_data_stacked.loc[synthetic_data_stacked['diabetes'].eq(1)].copy()
negatives_synthetic = synthetic_data_stacked.loc[synthetic_data_stacked['diabetes'].eq(0)].copy()

# race_ethnicity == 1
race_1_neg_synthetic = negatives_synthetic.loc[negatives_synthetic['race_ethnicity'].eq(1)].copy()
race_1_pos_synthetic = positives_synthetic.loc[positives_synthetic['race_ethnicity'].eq(1)].copy()

# race_ethnicity == 2
race_2_neg_synthetic = negatives_synthetic.loc[negatives_synthetic['race_ethnicity'].eq(2)].copy()
race_2_pos_synthetic = positives_synthetic.loc[positives_synthetic['race_ethnicity'].eq(2)].copy()

# race_ethnicity == 3
race_3_neg_synthetic = negatives_synthetic.loc[negatives_synthetic['race_ethnicity'].eq(3)].copy()
race_3_pos_synthetic = positives_synthetic.loc[positives_synthetic['race_ethnicity'].eq(3)].copy()

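Set `data` and `data_synthetic` in the next cell to the real/synthetic subgroup pair you want to evaluate, then run the diagnostic and quality cells below.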

In [None]:
# Choose the real/synthetic subgroup pair evaluated by the cells below.
# NOTE(review): this rebinds `data`, which the earlier cells use for the full
# real table. Re-running the subgroup-splitting cell after this one would filter
# the subset instead of the full table — re-run the data-loading cell first.
data = race_3_pos
data_synthetic = race_3_pos_synthetic

In [None]:
# diagnosis: run the SDV diagnostic report on the selected real/synthetic pair
# (arguments in order: real data, synthetic data, metadata)
diagnostic = run_diagnostic(data, data_synthetic, metadata)

Generating report ...

(1/2) Evaluating Data Validity: |██████████| 30/30 [00:00<00:00, 1004.29it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 119.52it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%



In [None]:
# Per-column breakdown of the 'Data Validity' property.
diagnostic.get_details('Data Validity')

Unnamed: 0,Column,Metric,Score
0,SEQN_new,KeyUniqueness,1.0
1,SDDSRVYR,CategoryAdherence,1.0
2,diabetes,CategoryAdherence,1.0
3,age,BoundaryAdherence,1.0
4,female,CategoryAdherence,1.0
5,race_ethnicity,CategoryAdherence,1.0
6,education,CategoryAdherence,1.0
7,us_born,CategoryAdherence,1.0
8,pir,CategoryAdherence,1.0
9,smoke,CategoryAdherence,1.0


In [None]:
# Breakdown of the 'Data Structure' property.
diagnostic.get_details('Data Structure')

Unnamed: 0,Metric,Score
0,TableStructure,1.0


In [None]:
# Quality report for the selected real/synthetic pair.
quality_report = evaluate_quality(
    real_data=data,
    synthetic_data=data_synthetic,
    metadata=metadata,
)

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 30/30 [00:00<00:00, 1233.64it/s]|
Column Shapes Score: 80.26%

(2/2) Evaluating Column Pair Trends: |██████████| 435/435 [00:08<00:00, 53.12it/s]|
Column Pair Trends Score: 87.51%

Overall Score (Average): 83.89%



In [None]:
# Per-column scores behind the 'Column Shapes' property.
quality_report.get_details('Column Shapes')

Unnamed: 0,Column,Metric,Score
0,SDDSRVYR,TVComplement,0.905302
1,diabetes,TVComplement,1.0
2,age,KSComplement,0.779365
3,female,TVComplement,0.973397
4,race_ethnicity,TVComplement,1.0
5,education,TVComplement,0.930571
6,us_born,TVComplement,0.924175
7,pir,TVComplement,0.939349
8,smoke,TVComplement,0.996016
9,alcohol_consumption,TVComplement,0.987667


In [None]:
# Per-column-pair scores behind the 'Column Pair Trends' property.
quality_report.get_details('Column Pair Trends')

Unnamed: 0,Column 1,Column 2,Metric,Score,Real Correlation,Synthetic Correlation
0,SDDSRVYR,diabetes,ContingencySimilarity,0.905302,,
1,SDDSRVYR,age,ContingencySimilarity,0.833825,,
2,SDDSRVYR,female,ContingencySimilarity,0.813587,,
3,SDDSRVYR,race_ethnicity,ContingencySimilarity,0.905302,,
4,SDDSRVYR,education,ContingencySimilarity,0.722587,,
...,...,...,...,...,...,...
401,LBXSOSSI,LBXBPB,CorrelationSimilarity,0.952711,0.063665,-0.030912
402,LBXSOSSI,LBXSNASI,CorrelationSimilarity,0.680731,0.604401,-0.034138
403,LBXSCLSI,LBXBPB,CorrelationSimilarity,0.994414,0.026921,0.038094
404,LBXSCLSI,LBXSNASI,CorrelationSimilarity,0.735862,0.615434,0.087157


In [None]:
# Plot the real vs. synthetic values of a single column for the selected pair.
fig = get_column_plot(
    column_name='LBXSOSSI',
    metadata=metadata,
    real_data=data,
    synthetic_data=data_synthetic,
)
fig.show()