In [6]:
import pandas as pd
import numpy as np
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import Metadata
from sdv.evaluation.single_table import evaluate_quality
from typing import List, Dict, Optional, Tuple

In [7]:
# Read the source table from a CSV file; the path is relative to the current working directory.
df = pd.read_csv(filepath_or_buffer="../Data/Cardio_blood.csv")

In [8]:
# Columns of interest.
# NOTE(review): feature_cols is defined here but is not applied to df in the
# code visible in this section, so the metadata detection below sees every
# column of df -- confirm whether a df[feature_cols] selection was intended.
feature_cols = [
    'PTID',
    'FL_MMSE', 'CDRSUM', 'CDRGLOB', 'HVLT_DR',
    'LASSI_A_CR2', 'LASSI_B_CR1', 'LASSI_B_CR2',
    'COMBINED_NE4S', 'AMYLPET',
    'PTAU_217_CONCNTRTN',
    'FL_UDSD',
]

# Keep only the rows whose FL_UDSD value is not the literal string "Unknown".
has_known_udsd = df['FL_UDSD'].ne("Unknown")
df = df[has_known_udsd]

# Infer column types from the filtered frame, then override PTID so that it is
# described as a categorical column.
metadata = Metadata.detect_from_dataframe(df)
metadata.update_column(column_name='PTID', sdtype='categorical')
#metadata.save_to_json("metadata.json")

In [9]:
# Hyperparameters shared by every experiment. Each named configuration below
# starts from these values and overrides only the settings it varies.
_BASELINE_CONFIG = {
    "epochs": 500,
    "batch_size": 100,
    "generator_dim": (128, 128),
    "discriminator_dim": (128, 128),
    "embedding_dim": 128,
    "generator_lr": 2e-4,
    "discriminator_lr": 2e-4,
}

# Experiment name -> hyperparameter dictionary. Every entry is an independent
# dict carrying the same seven keys in the same order.
CONFIGS = {
    "baseline": dict(_BASELINE_CONFIG),
    # Fewer epochs, smaller batches and narrower networks.
    "small_model": {
        **_BASELINE_CONFIG,
        "epochs": 300,
        "batch_size": 50,
        "generator_dim": (64, 64),
        "discriminator_dim": (64, 64),
        "embedding_dim": 64,
    },
    # Wider networks and a larger embedding.
    "large_model": {
        **_BASELINE_CONFIG,
        "generator_dim": (256, 256),
        "discriminator_dim": (256, 256),
        "embedding_dim": 256,
    },
    # One extra hidden layer in both networks.
    "deep_model": {
        **_BASELINE_CONFIG,
        "generator_dim": (128, 128, 128),
        "discriminator_dim": (128, 128, 128),
    },
    # Higher learning rate for both networks.
    "high_lr": {
        **_BASELINE_CONFIG,
        "generator_lr": 5e-4,
        "discriminator_lr": 5e-4,
    },
    # Doubled batch size.
    "large_batch": {
        **_BASELINE_CONFIG,
        "batch_size": 200,
    },
}

In [11]:
def train_synthesizer(df: pd.DataFrame, metadata: Metadata, config: Dict) -> CTGANSynthesizer:
    """
    Train a CTGAN synthesizer on ``df`` using the hyperparameters in ``config``.

    Args:
        df: The input dataframe to train the synthesizer on.
        metadata: The metadata object describing the structure of the data.
        config: Hyperparameters for the synthesizer. Must contain the keys
            ``embedding_dim``, ``generator_dim``, ``discriminator_dim``,
            ``generator_lr``, ``discriminator_lr``, ``epochs`` and
            ``batch_size``.

    Returns:
        The fitted synthesizer.

    Raises:
        KeyError: If ``config`` is missing one of the required keys.
    """
    # SDV synthesizers receive the metadata and every training hyperparameter
    # (including epochs and batch_size) through the constructor; fit() accepts
    # only the training data.
    synthesizer = CTGANSynthesizer(
        metadata,
        embedding_dim=config['embedding_dim'],
        generator_dim=config['generator_dim'],
        discriminator_dim=config['discriminator_dim'],
        generator_lr=config['generator_lr'],
        discriminator_lr=config['discriminator_lr'],
        epochs=config['epochs'],
        batch_size=config['batch_size'],
    )
    synthesizer.fit(df)
    return synthesizer

In [None]:
# Train one synthesizer per entry in CONFIGS, score a synthetic sample against
# the real data, and persist both the fitted model and its overall score.

for name in CONFIGS:
    config = CONFIGS[name]
    synth = train_synthesizer(df, metadata, config)

    # Draw as many synthetic rows as there are real rows before scoring.
    synthetic_data = synth.sample(num_rows=len(df))
    quality_report = evaluate_quality(
        real_data=df,
        synthetic_data=synthetic_data,
        metadata=metadata,
    )

    print(f"Configuration: {name}")
    print(f"Quality Report: {quality_report}")

    # Store the fitted synthesizer under a per-configuration filename.
    synth.save(f"ctgan_model_{name}.pkl")

    # Append one "<name>,<score>" row. The file is opened in append mode, so
    # rows written by earlier runs are kept and no header row is written here.
    with open("quality_reports.csv", mode="a") as report_file:
        report_file.write(f"{name},{quality_report.get_score()}\n")
        
    
    

baseline configuration: {'epochs': 500, 'batch_size': 100, 'generator_dim': (128, 128), 'discriminator_dim': (128, 128), 'embedding_dim': 128, 'generator_lr': 0.0002, 'discriminator_lr': 0.0002}
small_model configuration: {'epochs': 300, 'batch_size': 50, 'generator_dim': (64, 64), 'discriminator_dim': (64, 64), 'embedding_dim': 64, 'generator_lr': 0.0002, 'discriminator_lr': 0.0002}
large_model configuration: {'epochs': 500, 'batch_size': 100, 'generator_dim': (256, 256), 'discriminator_dim': (256, 256), 'embedding_dim': 256, 'generator_lr': 0.0002, 'discriminator_lr': 0.0002}
deep_model configuration: {'epochs': 500, 'batch_size': 100, 'generator_dim': (128, 128, 128), 'discriminator_dim': (128, 128, 128), 'embedding_dim': 128, 'generator_lr': 0.0002, 'discriminator_lr': 0.0002}
high_lr configuration: {'epochs': 500, 'batch_size': 100, 'generator_dim': (128, 128), 'discriminator_dim': (128, 128), 'embedding_dim': 128, 'generator_lr': 0.0005, 'discriminator_lr': 0.0005}
large_batch co