# 合成資料生成模組 - python3.9 版本

## 環境建置
Python 3.9.23

1. 建立 virtualenv
```
python -m venv venv
source venv/bin/activate
```
2. 安裝 core 套件（預先裝 wheel）
```
pip install --upgrade pip setuptools wheel
# pip install -r requirements.txt
pip install sdv==1.23.0
pip install sdmetrics==0.21.0
pip install pandas==2.3.0
pip install numpy==2.0.2
pip install scikit-learn==1.6.1
pip install scipy==1.13.1
pip install matplotlib==3.9.4
```

In [28]:
import json
import logging
import os
import time

import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.sampling import Condition
from sdv.single_table import GaussianCopulaSynthesizer, CTGANSynthesizer

In [29]:
def get_data_from_model(model_path, num_rows=1000, condition_dict=None):
    """Load a saved synthesizer and sample synthetic rows from it.

    The synthesizer class is picked from the path text: it must contain
    either ``"GaussianCopula"`` or ``"CTGAN"``.

    Args:
        model_path: Path of the saved synthesizer file.
        num_rows: Number of synthetic rows to request.
        condition_dict: Optional mapping of column name -> fixed value. When
            it is non-empty, all requested rows are sampled under it.

    Returns:
        The object returned by the synthesizer's sampling call.

    Raises:
        ValueError: If neither supported model name occurs in ``model_path``.
    """
    logging.info(f"Loading model from: {model_path}")

    # NOTE(review): SDV's load() unpickles the file -- only load model files
    # that come from a trusted source.
    if "GaussianCopula" in model_path:
        model = GaussianCopulaSynthesizer.load(model_path)
    elif "CTGAN" in model_path:
        model = CTGANSynthesizer.load(model_path)
    else:
        raise ValueError(f"Cannot determine model type from file name: {model_path}")

    if not condition_dict:
        return model.sample(num_rows=num_rows)

    logging.info(f"Sampling with conditions: {condition_dict}")
    # SDV 1.x synthesizers expose `sample_from_conditions`, which takes a list
    # of Condition objects; the row count is carried by the Condition itself.
    condition = Condition(num_rows=num_rows, column_values=condition_dict)
    return model.sample_from_conditions(conditions=[condition])

In [30]:
def data_sythesizer(args, input_df=None):
    """Fit a synthesizer on ``input_df`` and return sampled synthetic rows.

    Args:
        args: Namespace-like object. Always read: ``primary_key``,
            ``synth_model``, ``num_rows``, ``save_model``, ``save_output``
            and ``output_fpath``. Read for CTGAN: ``custom_setting`` and,
            when it is set, ``epochs``, ``batch_size``, ``gen_dim`` and
            ``dis_dim``. ``output_dir`` is read only when the model is saved
            and ``output_fpath`` is empty.
        input_df: Training data. ``None`` means an empty DataFrame. The
            caller's frame is never modified.

    Returns:
        The frame returned by ``model.sample(num_rows=args.num_rows)``.

    Raises:
        ValueError: If ``args.synth_model`` is neither ``"GaussianCopula"``
            nor ``"CTGAN"``.
    """
    # Fail fast: reject an unsupported model before any detection/fitting work.
    if args.synth_model not in ("GaussianCopula", "CTGAN"):
        raise ValueError(f"Unsupported synth_model: {args.synth_model}")

    # A DataFrame default in the signature would be one shared, mutable object.
    if input_df is None:
        input_df = pd.DataFrame()

    # === Step 1: cast the primary key to str to avoid regex-check errors ===
    pri_key = args.primary_key
    if pri_key and pri_key in input_df.columns:
        # Work on a copy so the caller's DataFrame keeps its original dtype.
        input_df = input_df.copy()
        input_df[pri_key] = input_df[pri_key].astype(str)

    # === Step 2: build metadata ===
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(data=input_df)

    if pri_key:
        metadata.update_column(column_name=pri_key, sdtype="id")
        metadata.set_primary_key(pri_key)
        logging.info(f"Primary key '{pri_key}' set as sdtype='id'.")

    # === Step 3: initialize model ===
    if args.synth_model == "GaussianCopula":
        print("Synthetic model arch: GaussianCopula")
        model = GaussianCopulaSynthesizer(metadata)
    elif args.custom_setting:
        print("Synthetic model arch: CTGAN (custom)")
        model = CTGANSynthesizer(
            metadata=metadata,
            epochs=args.epochs,
            batch_size=args.batch_size,
            generator_dim=tuple(args.gen_dim),
            discriminator_dim=tuple(args.dis_dim),
            verbose=True,
        )
    else:
        # Printed (not logged) so every architecture is announced the same way.
        print("Synthetic model arch: CTGAN (default)")
        model = CTGANSynthesizer(metadata, verbose=True)

    # === Step 4: fit model ===
    print("Fitting synthetic model ...")
    start_time = time.time()
    model.fit(input_df)
    print(f"Training completed in {time.time() - start_time:.2f} seconds")

    # === Step 5: sample data ===
    output_df = model.sample(num_rows=args.num_rows)

    # === Step 6: save model if requested ===
    if args.save_model:
        print("=== save Syn. Model file ===")
        suffix = "-c" if args.custom_setting else ""
        model_name = f"syn_model_{args.synth_model}{suffix}.pkl"

        if args.output_fpath:
            output_dir = os.path.dirname(args.output_fpath)
        else:
            output_dir = args.output_dir
        # dirname() is "" for a bare file name; os.makedirs("") would raise.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        model_path = os.path.join(output_dir, model_name)

        print(f"Saving model to {model_path}")
        model.save(model_path)

    # === Step 7: save output CSV if requested ===
    if args.save_output and args.output_fpath:
        csv_dir = os.path.dirname(args.output_fpath)
        if csv_dir:
            os.makedirs(csv_dir, exist_ok=True)
        print(f"Saving synthetic output to {args.output_fpath}")
        output_df.to_csv(args.output_fpath, index=False)

    return output_df

In [31]:
def set_args(args_list=None):
    """Declare the command-line options and parse ``args_list``.

    Args:
        args_list: Sequence of argument strings to parse. It is passed
            straight to ``ArgumentParser.parse_args``; with ``None``,
            argparse reads the process command line instead.

    Returns:
        The ``argparse.Namespace`` holding every option below.
    """
    import argparse

    # (flag, add_argument keyword arguments), kept in declaration order so the
    # generated --help output lists them in the same sequence.
    option_specs = (
        # Training input/output and model selection.
        ("--input_path", dict(type=str, default="data/train.csv")),
        ("--output_dir", dict(type=str, default="data/output/")),
        ("--synth_model", dict(type=str, default="GaussianCopula", help="sythetic model type")),
        ("--primary_key", dict(type=str, default="", help="primary key in your tabular data")),
        ("--num_rows", dict(type=int, default=200, help="num rows of the output sythetic dataframe")),
        ("--save_model", dict(action="store_true", help="set for save model pkl file")),
        ("--save_output", dict(action="store_true", help="set for save output csv file")),
        ("--save_report", dict(action="store_true", help="set for save report csv and image files")),
        # Custom training hyper-parameters.
        ("--custom_setting", dict(action="store_true", help="set for custom setting in CTGAN and TVAE Model")),
        ("--epochs", dict(type=int, default=300, help="set epochs for training CTGAN and TVAE Model")),
        ("--batch_size", dict(type=int, default=500, help="set batch size for training CTGAN and TVAE Model")),
        ("--gen_dim", dict(type=int, nargs="+", default=[256, 256], help="set gen dimension")),
        ("--dis_dim", dict(type=int, nargs="+", default=[256, 256], help="set dis dimension")),
        # Sampling from an already saved synthesizer.
        ("--input_syn_model", dict(type=str, default=None, help="path to your syn_data model file")),
        ("--sample_condition", dict(type=str, default=None, help="path to your syn_data sample condition json file")),
        ("--output_fpath", dict(type=str, default=None, help="set full file path for your syn_data output csv")),
    )

    cli = argparse.ArgumentParser()
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    return cli.parse_args(args_list)

In [24]:
def main(args):
    """Run the training workflow or the generate-only workflow.

    * ``args.input_syn_model`` empty: read ``args.input_path``, train and
      sample through ``data_sythesizer`` and, with ``args.save_output``,
      write ``<input stem>_<synth_model>_output.csv`` into
      ``args.output_dir``.
    * ``args.input_syn_model`` set: load that saved synthesizer, sample
      ``args.num_rows`` rows (optionally under the conditions stored in the
      JSON file ``args.sample_condition``) and, with ``args.save_output``,
      write them to ``args.output_fpath``.

    Args:
        args: Namespace-like object, e.g. the result of ``set_args``.

    Raises:
        FileNotFoundError: If the input CSV, the output folder, the model
            file or the condition file does not exist.
        ValueError: If the model type is unsupported, the primary key is not
            a column of the input data, the condition file is not ``.json``,
            or ``save_output`` is set without ``output_fpath`` in
            generate-only mode.
    """
    # Validation raises explicit exceptions instead of using `assert`, because
    # assertions are stripped when Python runs with -O.
    if not args.input_syn_model:
        print("=== Train Synthetic Model and Generate Sample SynData csv ===")
        input_path = args.input_path
        output_dir = args.output_dir

        if not os.path.exists(input_path):
            raise FileNotFoundError(f"Can't find the input file at {input_path}.")
        if not os.path.exists(output_dir):
            raise FileNotFoundError(f"Can't find the output folder at {output_dir}.")
        if args.synth_model not in ("GaussianCopula", "CTGAN"):
            raise ValueError(f"Unsupported synth_model: {args.synth_model}")

        print(f"input file path: {input_path}")
        print(f"output directory: {output_dir}")

        input_df = pd.read_csv(input_path)
        if args.primary_key and args.primary_key not in input_df.columns:
            raise ValueError(
                f"Primary key '{args.primary_key}' is not a column of {input_path}."
            )

        output_df = data_sythesizer(args=args, input_df=input_df)

        logging.info("output dataframe shape")
        logging.info(output_df.shape)
        logging.info("output dataframe head(5)")
        logging.info(output_df.head())

        if args.save_output:
            print("=== save output csv file ===")
            stem = os.path.splitext(os.path.basename(input_path))[0]
            output_fname = stem + "_" + args.synth_model + "_output.csv"
            output_csv = os.path.join(output_dir, output_fname)
            output_df.to_csv(output_csv, index=False)
            print(f"saved to {output_csv}")
        return

    print("=== Generate Synthetic Data from Syn. Model ===")
    model_path = args.input_syn_model
    num_rows = args.num_rows
    condition_fpath = args.sample_condition
    output_fpath = args.output_fpath

    print(f"The Syn. Model Path: {model_path}")
    print(f"Generate {num_rows} rows to {output_fpath}...")

    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Can't find the model_path pkl file: {model_path}.")
    # to_csv(None) only returns the CSV text, so nothing would be written;
    # reject that combination before spending time on sampling.
    if args.save_output and not output_fpath:
        raise ValueError("--output_fpath is required when --save_output is set.")

    condition_dict = None
    if condition_fpath:
        if not os.path.exists(condition_fpath):
            raise FileNotFoundError(
                f"Can't find the sample_condition json file: {condition_fpath}."
            )
        if not condition_fpath.endswith(".json"):
            raise ValueError(f"{condition_fpath} must be a json file!")
        with open(condition_fpath, "r", encoding="utf-8") as f:
            condition_dict = json.load(f)

    output_df = get_data_from_model(
        model_path, num_rows=num_rows, condition_dict=condition_dict
    )

    print("output dataframe shape")
    print(output_df.shape)
    print("output dataframe head(5)")
    print(output_df.head())

    if args.save_output:
        print(f"=== Save csv to {output_fpath} ===")
        # Create the target folder so a fresh checkout does not fail on write.
        parent_dir = os.path.dirname(output_fpath)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        output_df.to_csv(output_fpath, index=False)

# 使用模組生成合成資料（載入已訓練的合成資料模型）

In [25]:
# Generate-only run: load a saved synthesizer and write the sampled rows to CSV.
generation_options = {
    "--input_syn_model": "output/syn_model_GaussianCopula.pkl",  # saved synthesizer to load
    "--output_fpath": "output/syn_data.csv",  # where the synthetic CSV is written
    "--num_rows": "6000",  # number of rows to generate
}
# Flatten to [flag, value, flag, value, ...] and add the value-less switch.
cli_tokens = [token for pair in generation_options.items() for token in pair]
cli_tokens.append("--save_output")

args = set_args(cli_tokens)
main(args)

=== Generate Synthetic Data from Syn. Model ===
The Syn. Model Path: output/syn_model_GaussianCopula.pkl
Generate 6000 rows to output/syn_data.csv...


'Loading model from: output/syn_model_GaussianCopula.pkl'

"{'EVENT': 'Load', 'TIMESTAMP': datetime.datetime(2025, 6, 18, 6, 15, 18, 452595), 'SYNTHESIZER CLASS NAME': 'GaussianCopulaSynthesizer', 'SYNTHESIZER ID': 'GaussianCopulaSynthesizer_1.23.0_e64b93aad4dc4a8e871d896a958cf61a'}"

"The real data in '3SsnPorch' was stored as 'int64' but the synthetic data could not be cast back to this type. If this is a problem, please check your input data and metadata settings."

"{'EVENT': 'Sample', 'TIMESTAMP': datetime.datetime(2025, 6, 18, 6, 15, 18, 453529), 'SYNTHESIZER CLASS NAME': 'GaussianCopulaSynthesizer', 'SYNTHESIZER ID': 'GaussianCopulaSynthesizer_1.23.0_e64b93aad4dc4a8e871d896a958cf61a', 'TOTAL NUMBER OF TABLES': 1, 'TOTAL NUMBER OF ROWS': 6000, 'TOTAL NUMBER OF COLUMNS': 81}"

output dataframe shape
(6000, 81)
output dataframe head(5)
              Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley  \
0  sdv-id-GZCyfr          48       RL         82.0    10489   Pave   NaN   
1  sdv-id-PcjWgv          86       RL         50.0     5545   Pave   NaN   
2  sdv-id-SfwDTL          32       RL         77.0    14440   Pave   NaN   
3  sdv-id-dVJhsp         138       RL         73.0    11653   Pave   NaN   
4  sdv-id-KzRwOB         128       RL         68.0     8953   Pave   NaN   

  LotShape LandContour Utilities  ... PoolArea PoolQC  Fence MiscFeature  \
0      Reg         Lvl    AllPub  ...        0    NaN  MnPrv         NaN   
1      IR1         Lvl    AllPub  ...        0    NaN    NaN         NaN   
2      Reg         Lvl    AllPub  ...        0    NaN    NaN         NaN   
3      Reg         Lvl    AllPub  ...        0    NaN  MnPrv         NaN   
4      Reg         Lvl    AllPub  ...        0    NaN  MnPrv         NaN   

  MiscVal MoSold YrSold  Sa

## 檢視真實資料與合成資料表單

In [14]:
import pandas as pd

# Load the real table and the synthetic table for a side-by-side look.
real_data_path = "input/data.csv"  # real data
# Matches the --output_fpath used in the generation run. When main() names the
# file itself (training run with --save_output), the CSV is instead called
# <input file stem>_<synth_model>_output.csv inside --output_dir.
syn_data_path = "output/syn_data.csv"

real_data_df = pd.read_csv(real_data_path)
syn_data_df = pd.read_csv(syn_data_path)

FileNotFoundError: [Errno 2] No such file or directory: 'output/syn_data.csv'

In [None]:
# Preview the first 10 rows of the real data.
real_data_df.head(10)

In [13]:
# Preview the first 10 rows of the synthetic data.
syn_data_df.head(10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,0,38,RM,83.117324,10146,Pave,,Reg,Lvl,AllPub,...,-4,,,,16,1,2008,WD,Normal,172605
1,1,26,RL,78.984574,8350,Pave,,IR3,Low,AllPub,...,-5,,,,65,7,2006,WD,Normal,122255
2,2,65,RL,70.19725,9486,Pave,,Reg,Low,AllPub,...,7,,,,-250,5,2008,WD,Normal,195464
3,3,17,RL,,10334,Pave,,IR1,Lvl,AllPub,...,8,,,,-32,10,2008,WD,Abnorml,151816
4,4,72,RL,78.922532,10055,Pave,,Reg,Lvl,AllPub,...,5,,,,147,7,2010,WD,Normal,189342
5,5,23,RM,77.128277,7223,Pave,,Reg,Lvl,AllPub,...,9,,,,-148,9,2010,New,Normal,161592
6,6,73,RL,,10240,Pave,,IR1,Lvl,AllPub,...,-4,,,,-56,4,2009,WD,Normal,166694
7,7,45,RL,99.136819,13482,Pave,,Reg,Lvl,AllPub,...,-3,,,,-57,6,2008,New,Partial,333492
8,8,19,RL,,16833,Pave,,IR1,Lvl,AllPub,...,-7,,,,153,6,2007,WD,Abnorml,142353
9,9,31,RL,46.789488,10843,Pave,,Reg,Lvl,AllPub,...,8,,,,-131,8,2007,WD,Normal,253741
