In [None]:
import pandas as pd
import itertools
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata
import numpy as np
import pickle

# Read the real table that every synthesizer will be trained on
original_data = pd.read_csv('Boston.csv')

# Quick sanity check: preview the first rows
print("Original Data:", original_data.head(), sep="\n")

# Build the table metadata by auto-detecting it from the dataframe, then show it
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(original_data)
print(metadata)

# Hyperparameter grid: each (batch_size, epochs) pair is tried once
batch_sizes = [200, 300, 500]
epochs_list = [1000, 2000, 3000, 5000]
batch_epoch_combinations = [*itertools.product(batch_sizes, epochs_list)]

# Function to evaluate the synthetic data quality
def evaluate_synthetic_data(original_data, synthetic_data):
    """Score how closely the synthetic numeric columns match the real ones.

    For every numeric (non-boolean) column of ``original_data`` the sorted
    real values are compared with the sorted synthetic values and the mean
    absolute difference is taken. Sorting matters: synthetic rows are
    independent samples with no row-to-row correspondence to the real rows,
    so a positional comparison would mostly measure row order. The per-column
    errors are then averaged into one score.

    Parameters
    ----------
    original_data : pandas.DataFrame
        The real table.
    synthetic_data : pandas.DataFrame
        The generated table. It must contain every numeric column of
        ``original_data``; a missing column raises ``KeyError``.

    Returns
    -------
    float
        Mean per-column distribution error (lower is better), or
        ``float('inf')`` when no numeric column could be compared.
    """
    errors = []
    for col in original_data.columns:
        real_col = original_data[col]
        # Accept every numeric dtype (int32, float32, nullable ints, ...), but
        # keep booleans out of an absolute-error metric.
        if not pd.api.types.is_numeric_dtype(real_col) or pd.api.types.is_bool_dtype(real_col):
            continue

        # Work on plain sorted arrays so the pandas index cannot influence the result.
        real = np.sort(real_col.dropna().to_numpy(dtype=float))
        synth = np.sort(synthetic_data[col].dropna().to_numpy(dtype=float))
        if real.size == 0 or synth.size == 0:
            continue

        if real.size != synth.size:
            # Different sample sizes: compare both columns on a common quantile grid.
            grid = np.linspace(0.0, 1.0, min(real.size, synth.size))
            real = np.quantile(real, grid)
            synth = np.quantile(synth, grid)

        errors.append(float(np.mean(np.abs(real - synth))))

    if not errors:
        # Nothing comparable: report the worst possible score instead of NaN.
        return float('inf')
    return float(np.mean(errors))

# Best run seen so far; a lower evaluate_synthetic_data score is better.
best_model = None
best_score = float('inf')
best_params = None
best_synthetic_data = None  # the exact sample that produced best_score

# Loop over each combination of batch size and epochs
for batch_size, epochs in batch_epoch_combinations:
    try:
        print(f"Training model with batch_size={batch_size}, epochs={epochs}")
        
        synthesizer = CTGANSynthesizer(
            metadata, 
            enforce_min_max_values=True,
            enforce_rounding=True,
            batch_size=batch_size, 
            epochs=epochs, 
            verbose=True,
            cuda=True  # Set to False if CUDA is not available
        )
        
        # Fit the synthesizer to the original data and generate synthetic data
        synthesizer.fit(original_data)
        synthetic_data = synthesizer.sample(len(original_data))

        # Evaluate the synthetic data
        score = evaluate_synthetic_data(original_data, synthetic_data)
        print(f"Score for batch_size={batch_size}, epochs={epochs}: {score}")
        
        # Check if this model is the best so far. A NaN score never compares
        # as smaller, so such a run is never selected.
        if score < best_score:
            best_score = score
            best_model = synthesizer
            best_params = (batch_size, epochs)
            # Keep the sample that was scored so the saved CSV matches best_score.
            best_synthetic_data = synthetic_data
    except Exception as e:
        # Best-effort search: report the failure and continue with the next combination.
        print(f"An error occurred for batch_size={batch_size}, epochs={epochs}: {e}")

# Save the best model
if best_model is not None:
    best_model_filename = f"best_model_bs{best_params[0]}_ep{best_params[1]}.pkl"
    with open(best_model_filename, 'wb') as f:
        pickle.dump(best_model, f)
    print(f"Best model saved as {best_model_filename}")

    # Save the synthetic data that was actually evaluated for the best model.
    # Drawing a new sample here would write data that differs from the data
    # behind the reported score.
    synthetic_data = best_synthetic_data
    synthetic_data.to_csv('synthetic_data.csv', index=False)
    print("Best synthetic data generated and saved to 'synthetic_data.csv'")
else:
    print("No valid model was trained.")


In [None]:
#### Evaluation
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata
from sdv.evaluation.single_table import run_diagnostic
from sdv.evaluation.single_table import evaluate_quality
from sdv.evaluation.single_table import get_column_pair_plot
import pandas as pd

# Evaluate how well the synthetic data matches the original data.
# Reload the real table and the synthetic sample written to 'synthetic_data.csv'.
original_data = pd.read_csv('Boston.csv')
synthetic_data = pd.read_csv('synthetic_data.csv')

# Column labels of the real table (not displayed: this is not the cell's last expression)
original_data.keys()

# Rebuild the metadata by detecting it from the real data
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(original_data)

# Check the column data types and overall data structure of the synthetic data.
diagnostic = run_diagnostic(
    metadata=metadata,
    synthetic_data=synthetic_data,
    real_data=original_data,
)

In [None]:
# Measure how close the synthetic data is to the original data.
# A score close to 1 is good.
quality_report = evaluate_quality(
    real_data=original_data,
    synthetic_data=synthetic_data,
    metadata=metadata,
)

# Per-column breakdown of the 'Column Shapes' property
quality_report.get_details(property_name='Column Shapes')

In [None]:
# Privacy check: see whether any synthetic row reproduces an original row.

# An inner merge with no explicit keys joins on all shared columns, so only
# rows that match in every shared column survive.
overlapping_data = original_data.merge(synthetic_data, how='inner')

# Show the rows found in both tables
print("Overlapping data:", overlapping_data, sep="\n")

In [30]:
# Plot the selected pair of columns for both the original and the synthetic data

fig = get_column_pair_plot(
    column_names=['rm', 'dis'],
    metadata=metadata,
    synthetic_data=synthetic_data,
    real_data=original_data,
)

fig.show()