In [1]:
from pathlib import Path

import pandas as pd
from sdv.metadata import Metadata
from sdv.single_table import GaussianCopulaSynthesizer

# Read the cleaned air-quality CSV and discard its 'Date' and 'Time' columns
# in a single chained expression.
data = (
    pd.read_csv('data/air_quality_cleaned.csv')
    .drop(columns=['Date', 'Time'])
)

# Columns described to SDV; each one is declared with the "numerical" sdtype.
NUMERICAL_COLUMNS = [
    "CO(GT)",
    "PT08.S1(CO)",
    "C6H6(GT)",
    "PT08.S2(NMHC)",
    "NOx(GT)",
    "PT08.S3(NOx)",
    "NO2(GT)",
    "PT08.S4(NO2)",
    "PT08.S5(O3)",
    "T",
    "RH",
    "AH",
]

# Plain-dictionary form of the metadata: one {"sdtype": "numerical"} entry
# per column, in the order listed above.
metadata_dict = {
    "columns": {name: {"sdtype": "numerical"} for name in NUMERICAL_COLUMNS}
}

# Turn the dictionary into the SDV Metadata object used by the cells below.
metadata = Metadata.load_from_dict(metadata_dict)

# Build a Gaussian copula synthesizer from the declared metadata and fit it
# on the real table.
synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.fit(data=data)

# How many synthetic datasets to write, and how many rows each one gets.
NUM_DATASETS = 30
NUM_ROWS = 6941  # NOTE(review): presumably the row count of the real data — confirm

# `DataFrame.to_csv` does not create missing parent directories, so make sure
# the output folder exists before the first write (no-op if already present).
output_dir = Path("data/air_quality")
output_dir.mkdir(parents=True, exist_ok=True)

for i in range(1, NUM_DATASETS + 1):
    # Sample one synthetic dataset and save it under a 1-based file index.
    # After the loop, `synthetic_data` still holds the last sample drawn.
    synthetic_data = synthesizer.sample(NUM_ROWS)
    synthetic_data.to_csv(output_dir / f"sdv-copula_air_quality_{i}.csv", index=False)



In [4]:
from sdv.evaluation.single_table import evaluate_quality

# Score the synthetic table against the real one. In the visible cells,
# `synthetic_data` is the frame produced by the final iteration of the
# sampling loop, so only that single dataset is evaluated here.
quality_report = evaluate_quality(
    real_data=data,
    synthetic_data=synthetic_data,
    metadata=metadata,
)

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 12/12 [00:00<00:00, 207.57it/s]|
Column Shapes Score: 97.88%

(2/2) Evaluating Column Pair Trends: |██████████| 66/66 [00:00<00:00, 293.39it/s]|
Column Pair Trends Score: 99.07%

Overall Score (Average): 98.47%

