# 合成資料模型訓練模組 - python3.6 版本

## 環境建置
1. 安裝 pyenv 依賴
```
sudo apt update
sudo apt install -y make build-essential libssl-dev zlib1g-dev \
  libbz2-dev libreadline-dev libsqlite3-dev wget curl llvm \
  libncursesw5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev libffi-dev liblzma-dev
```
2. 安裝 Python 3.6.15（需先安裝 pyenv，例如執行 `curl https://pyenv.run | bash` 並依提示設定 shell）
```
pyenv install 3.6.15
pyenv local 3.6.15
```
3. 建立 virtualenv
```
python -m venv venv
source venv/bin/activate
```
4. 安裝 core 套件（預先裝 wheel）
```
pip install --upgrade pip setuptools wheel
pip install numpy==1.19.5 cython==0.29.36 pandas==0.24.2
pip install sdv==0.3.6
pip install jupyterlab==3.2.9
pip install scipy==1.2.3 sdmetrics==0.0.2.dev0
```

In [18]:
import argparse
import json
import logging
import os
import time

import numpy as np
import pandas as pd
from sdv.tabular import GaussianCopula
from sdv.tabular import CTGAN

from sdv import SDV

In [19]:
from IPython.display import display
import logging

class JupyterHandler(logging.Handler):
    """Logging handler that shows each record through IPython's ``display``."""

    def emit(self, record):
        """Format ``record`` and display the resulting message.

        Args:
            record (logging.LogRecord): the record to render.

        A failure while formatting or displaying is handed to
        ``handleError`` instead of propagating, so a broken log call cannot
        abort the code that issued it.
        """
        try:
            msg = self.format(record)
            display(msg)
        except Exception:
            # Standard logging.Handler convention for errors inside emit().
            self.handleError(record)

# Route INFO-and-above records of the root logger to the notebook output.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# Re-running this cell would otherwise stack one more handler on the root
# logger and every message would be displayed several times. Stale handlers
# are matched by class name because re-executing the class definition creates
# a new class object, so isinstance() would miss the older instances.
for _stale_handler in [
    h for h in logger.handlers if type(h).__name__ == JupyterHandler.__name__
]:
    logger.removeHandler(_stale_handler)
logger.addHandler(JupyterHandler())

In [20]:
def get_data_from_model(model_path, num_rows=1000, condition_dict=None):
    """Load a saved synthesizer and sample synthetic rows from it.

    Args:
        model_path (str): path of the saved model file, passed to ``SDV.load``.
        num_rows (int): number of rows to sample. Defaults to 1000.
        condition_dict (dict): values passed to ``Condition`` for conditional
            sampling, e.g. ``{'gender': 'M'}``. When it is None or empty the
            rows are sampled without any condition.

    Returns:
        The object returned by the model's ``sample`` /
        ``sample_conditions`` call.
    """
    logging.info("Generate Synthetic data from Model")
    # NOTE(review): SDV.load presumably unpickles the file - only load model
    # files that come from a trusted source.
    model = SDV.load(model_path)

    if not condition_dict:
        return model.sample(num_rows=num_rows)

    # ``Condition`` was never imported at file level, so this branch used to
    # fail with NameError. Imported here so that unconditional sampling keeps
    # working on SDV releases that do not ship ``sdv.sampling``.
    from sdv.sampling import Condition

    logging.info("Sample data by condition")
    condition = Condition(condition_dict, num_rows=num_rows)
    return model.sample_conditions(conditions=[condition])

In [21]:
def data_sythesizer(args, input_df=pd.DataFrame()):
    """Fit a synthetic-data model on ``input_df`` and sample new rows from it.

    Args:
        args (argparse.Namespace): configuration. Reads ``synth_model``,
            ``primary_key``, ``custom_setting``, ``num_rows``, ``save_model``
            and, depending on those, ``epochs``, ``batch_size``, ``gen_dim``,
            ``dis_dim`` and ``output_dir``.
        input_df (DataFrame): training data. Defaults to pd.DataFrame().

    Returns:
        output_df (DataFrame): synthesized data with ``args.num_rows`` rows
        requested from the model.

    Raises:
        ValueError: if ``args.synth_model`` is neither "GaussianCopula" nor
            "CTGAN".
    """
    pri_key = args.primary_key
    # Only forward the primary key when one was given, so that an empty
    # string never reaches a model constructor (the custom CTGAN branch used
    # to pass it through unconditionally).
    key_kwargs = {"primary_key": pri_key} if pri_key else {}

    if args.synth_model == "GaussianCopula":
        logging.info("sythetic model arch: GaussianCopula")
        model = GaussianCopula(**key_kwargs)
    elif args.synth_model == "CTGAN":
        if args.custom_setting:
            logging.info("sythetic model arch: CTGAN-c")
            model = CTGAN(
                epochs=args.epochs,
                batch_size=args.batch_size,
                generator_dim=tuple(args.gen_dim),
                discriminator_dim=tuple(args.dis_dim),
                verbose=True,
                **key_kwargs,
            )
        else:
            logging.info("sythetic model arch: CTGAN")
            model = CTGAN(verbose=True, **key_kwargs)
    else:
        # Previously this only logged and then crashed with UnboundLocalError
        # on ``model.fit``; fail with an explicit error instead.
        logging.error("the sythetic model is not supported!")
        raise ValueError(
            f"Unsupported synth_model {args.synth_model!r}; "
            "expected 'GaussianCopula' or 'CTGAN'."
        )

    logging.info("Synthetic model fitting data start ... ")
    start_time = time.time()
    model.fit(input_df)
    logging.info(f"Training time cost: {time.time()-start_time}")
    output_df = model.sample(num_rows=args.num_rows)

    if args.save_model:
        # Models trained with --custom_setting get a "-c" suffix in the name.
        suffix = "-c" if args.custom_setting else ""
        output_model_path = os.path.join(
            args.output_dir, f"syn_model_{args.synth_model}{suffix}.pkl"
        )

        logging.info(f"Save model to {output_model_path}")
        model.save(output_model_path)

    return output_df

In [22]:
def set_args(args_list=None):
    """Build the option parser and parse ``args_list``.

    Args:
        args_list (list): option strings to parse. When None, argparse falls
            back to the process command line.

    Returns:
        argparse.Namespace: the parsed options.
    """
    import argparse

    # (flag, add_argument keyword arguments), in the order they are registered.
    option_specs = (
        # Training inputs / outputs.
        ("--input_path", dict(type=str, default="data/train.csv")),
        ("--output_dir", dict(type=str, default="data/output/")),
        ("--synth_model", dict(type=str, default="GaussianCopula", help="sythetic model type")),
        ("--primary_key", dict(type=str, default="", help="primary key in your tabular data")),
        ("--num_rows", dict(type=int, default=200, help="num rows of the output sythetic dataframe")),
        ("--save_model", dict(action="store_true", help="set for save model pkl file")),
        ("--save_output", dict(action="store_true", help="set for save output csv file")),
        ("--save_report", dict(action="store_true", help="set for save report csv and image files")),
        # Custom training settings.
        ("--custom_setting", dict(action="store_true", help="set for custom setting in CTGAN and TVAE Model")),
        ("--epochs", dict(type=int, default=300, help="set epochs for training CTGAN and TVAE Model")),
        ("--batch_size", dict(type=int, default=500, help="set batch size for training CTGAN and TVAE Model")),
        ("--gen_dim", dict(type=int, nargs="+", default=[256, 256], help="set gen dimension")),
        ("--dis_dim", dict(type=int, nargs="+", default=[256, 256], help="set dis dimension")),
        # Sampling from an already saved model.
        ("--input_syn_model", dict(type=str, default=None, help="path to your syn_data model file")),
        ("--sample_condition", dict(type=str, default=None, help="path to your syn_data sample condition json file")),
        ("--output_fpath", dict(type=str, default=None, help="set full file path for your syn_data output csv")),
    )

    cli = argparse.ArgumentParser()
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    return cli.parse_args(args_list)

In [23]:
def _log_output_preview(output_df):
    """Log the shape and the first rows of the generated dataframe."""
    logging.info("output dataframe shape")
    logging.info(output_df.shape)
    logging.info("output dataframe head(5)")
    logging.info(output_df.head())


def _train_and_sample(args):
    """Validate the training options, fit a model and return (df, csv path)."""
    if not os.path.exists(args.input_path):
        raise FileNotFoundError(f"Can't find the input file at {args.input_path}.")
    if not os.path.exists(args.output_dir):
        raise FileNotFoundError(f"Can't find the output folder at {args.output_dir}.")
    if args.synth_model not in ("GaussianCopula", "CTGAN"):
        raise ValueError(
            f"Unsupported synth_model {args.synth_model!r}; "
            "expected 'GaussianCopula' or 'CTGAN'."
        )

    input_df = pd.read_csv(args.input_path)
    if args.primary_key and args.primary_key not in input_df.columns:
        raise ValueError(
            f"primary_key {args.primary_key!r} is not a column of {args.input_path}."
        )

    output_df = data_sythesizer(args=args, input_df=input_df)

    # Default output name: <input file stem>_<model>_output.csv in output_dir.
    stem = os.path.splitext(os.path.basename(args.input_path))[0]
    output_fname = stem + "_" + args.synth_model + "_output.csv"
    return output_df, os.path.join(args.output_dir, output_fname)


def _sample_from_saved_model(args):
    """Validate the sampling options, sample from a saved model and return (df, csv path)."""
    # DataFrame.to_csv(None) returns the CSV text instead of writing a file,
    # so a missing --output_fpath used to drop the output silently.
    if args.save_output and not args.output_fpath:
        raise ValueError(
            "--output_fpath is required when --save_output is used with --input_syn_model."
        )

    model_path = args.input_syn_model
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Can't find the model_path pkl file: {model_path}.")

    condition_dict = None
    condition_fpath = args.sample_condition
    if condition_fpath:
        if not os.path.exists(condition_fpath):
            raise FileNotFoundError(
                f"Can't find the sample_condition json file: {condition_fpath}."
            )
        if not condition_fpath.endswith(".json"):
            raise ValueError(f"{condition_fpath} must be a json file!")

        with open(condition_fpath, "r") as f:
            condition_dict = json.load(f)

    output_df = get_data_from_model(
        model_path, num_rows=args.num_rows, condition_dict=condition_dict
    )
    return output_df, args.output_fpath


def main(args):
    """Run the pipeline described by ``args``.

    When ``args.input_syn_model`` is empty, a model is trained on
    ``args.input_path`` and synthetic rows are sampled from it; otherwise rows
    are sampled from the saved model, optionally under the conditions stored
    in the ``args.sample_condition`` json file. The result is logged and,
    when ``args.save_output`` is set, written as csv.

    Args:
        args (argparse.Namespace): options as produced by ``set_args``.

    Raises:
        FileNotFoundError: if the input csv, output folder, model file or
            condition file does not exist.
        ValueError: if the model type is unsupported, the primary key is not
            a column of the input, the condition file is not ``.json``, or
            ``--save_output`` is used with a saved model but without
            ``--output_fpath``.
    """
    # Explicit exceptions replace the former ``assert`` checks, which are
    # stripped when Python runs with -O.
    if args.input_syn_model:
        output_df, output_path = _sample_from_saved_model(args)
    else:
        output_df, output_path = _train_and_sample(args)

    _log_output_preview(output_df)

    if args.save_output:
        output_df.to_csv(output_path, index=False)


# 使用模組訓練合成資料模型 + 生成合成資料

In [17]:
# Training run configuration, written as the equivalent command-line options.
cli_options = [
    "--input_path", "input/data.csv",    # real input data table, CSV format
    "--output_dir", "output/",           # output folder for the synthetic data
    "--synth_model", "GaussianCopula",   # synthetic-data model algorithm
    "--primary_key", "Id",               # name of the key column
    "--num_rows", "100",                 # number of rows to generate
    "--save_model",                      # save the fitted model
    "--save_output",                     # save the generated output
]
args = set_args(cli_options)

main(args)

INFO:root:sythetic model arch: GaussianCopula


'sythetic model arch: GaussianCopula'

'sythetic model arch: GaussianCopula'

INFO:root:Synthetic model fitting data start ... 


'Synthetic model fitting data start ... '

'Synthetic model fitting data start ... '

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field MSSubClass


'Loading transformer NumericalTransformer for field MSSubClass'

'Loading transformer NumericalTransformer for field MSSubClass'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field MSZoning


'Loading transformer OneHotEncodingTransformer for field MSZoning'

'Loading transformer OneHotEncodingTransformer for field MSZoning'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field LotFrontage


'Loading transformer NumericalTransformer for field LotFrontage'

'Loading transformer NumericalTransformer for field LotFrontage'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field LotArea


'Loading transformer NumericalTransformer for field LotArea'

'Loading transformer NumericalTransformer for field LotArea'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field Street


'Loading transformer OneHotEncodingTransformer for field Street'

'Loading transformer OneHotEncodingTransformer for field Street'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field Alley


'Loading transformer OneHotEncodingTransformer for field Alley'

'Loading transformer OneHotEncodingTransformer for field Alley'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field LotShape


'Loading transformer OneHotEncodingTransformer for field LotShape'

'Loading transformer OneHotEncodingTransformer for field LotShape'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field LandContour


'Loading transformer OneHotEncodingTransformer for field LandContour'

'Loading transformer OneHotEncodingTransformer for field LandContour'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field Utilities


'Loading transformer OneHotEncodingTransformer for field Utilities'

'Loading transformer OneHotEncodingTransformer for field Utilities'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field LotConfig


'Loading transformer OneHotEncodingTransformer for field LotConfig'

'Loading transformer OneHotEncodingTransformer for field LotConfig'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field LandSlope


'Loading transformer OneHotEncodingTransformer for field LandSlope'

'Loading transformer OneHotEncodingTransformer for field LandSlope'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field Neighborhood


'Loading transformer OneHotEncodingTransformer for field Neighborhood'

'Loading transformer OneHotEncodingTransformer for field Neighborhood'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field Condition1


'Loading transformer OneHotEncodingTransformer for field Condition1'

'Loading transformer OneHotEncodingTransformer for field Condition1'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field Condition2


'Loading transformer OneHotEncodingTransformer for field Condition2'

'Loading transformer OneHotEncodingTransformer for field Condition2'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field BldgType


'Loading transformer OneHotEncodingTransformer for field BldgType'

'Loading transformer OneHotEncodingTransformer for field BldgType'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field HouseStyle


'Loading transformer OneHotEncodingTransformer for field HouseStyle'

'Loading transformer OneHotEncodingTransformer for field HouseStyle'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field OverallQual


'Loading transformer NumericalTransformer for field OverallQual'

'Loading transformer NumericalTransformer for field OverallQual'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field OverallCond


'Loading transformer NumericalTransformer for field OverallCond'

'Loading transformer NumericalTransformer for field OverallCond'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field YearBuilt


'Loading transformer NumericalTransformer for field YearBuilt'

'Loading transformer NumericalTransformer for field YearBuilt'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field YearRemodAdd


'Loading transformer NumericalTransformer for field YearRemodAdd'

'Loading transformer NumericalTransformer for field YearRemodAdd'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field RoofStyle


'Loading transformer OneHotEncodingTransformer for field RoofStyle'

'Loading transformer OneHotEncodingTransformer for field RoofStyle'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field RoofMatl


'Loading transformer OneHotEncodingTransformer for field RoofMatl'

'Loading transformer OneHotEncodingTransformer for field RoofMatl'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field Exterior1st


'Loading transformer OneHotEncodingTransformer for field Exterior1st'

'Loading transformer OneHotEncodingTransformer for field Exterior1st'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field Exterior2nd


'Loading transformer OneHotEncodingTransformer for field Exterior2nd'

'Loading transformer OneHotEncodingTransformer for field Exterior2nd'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field MasVnrType


'Loading transformer OneHotEncodingTransformer for field MasVnrType'

'Loading transformer OneHotEncodingTransformer for field MasVnrType'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field MasVnrArea


'Loading transformer NumericalTransformer for field MasVnrArea'

'Loading transformer NumericalTransformer for field MasVnrArea'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field ExterQual


'Loading transformer OneHotEncodingTransformer for field ExterQual'

'Loading transformer OneHotEncodingTransformer for field ExterQual'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field ExterCond


'Loading transformer OneHotEncodingTransformer for field ExterCond'

'Loading transformer OneHotEncodingTransformer for field ExterCond'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field Foundation


'Loading transformer OneHotEncodingTransformer for field Foundation'

'Loading transformer OneHotEncodingTransformer for field Foundation'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field BsmtQual


'Loading transformer OneHotEncodingTransformer for field BsmtQual'

'Loading transformer OneHotEncodingTransformer for field BsmtQual'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field BsmtCond


'Loading transformer OneHotEncodingTransformer for field BsmtCond'

'Loading transformer OneHotEncodingTransformer for field BsmtCond'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field BsmtExposure


'Loading transformer OneHotEncodingTransformer for field BsmtExposure'

'Loading transformer OneHotEncodingTransformer for field BsmtExposure'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field BsmtFinType1


'Loading transformer OneHotEncodingTransformer for field BsmtFinType1'

'Loading transformer OneHotEncodingTransformer for field BsmtFinType1'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field BsmtFinSF1


'Loading transformer NumericalTransformer for field BsmtFinSF1'

'Loading transformer NumericalTransformer for field BsmtFinSF1'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field BsmtFinType2


'Loading transformer OneHotEncodingTransformer for field BsmtFinType2'

'Loading transformer OneHotEncodingTransformer for field BsmtFinType2'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field BsmtFinSF2


'Loading transformer NumericalTransformer for field BsmtFinSF2'

'Loading transformer NumericalTransformer for field BsmtFinSF2'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field BsmtUnfSF


'Loading transformer NumericalTransformer for field BsmtUnfSF'

'Loading transformer NumericalTransformer for field BsmtUnfSF'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field TotalBsmtSF


'Loading transformer NumericalTransformer for field TotalBsmtSF'

'Loading transformer NumericalTransformer for field TotalBsmtSF'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field Heating


'Loading transformer OneHotEncodingTransformer for field Heating'

'Loading transformer OneHotEncodingTransformer for field Heating'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field HeatingQC


'Loading transformer OneHotEncodingTransformer for field HeatingQC'

'Loading transformer OneHotEncodingTransformer for field HeatingQC'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field CentralAir


'Loading transformer OneHotEncodingTransformer for field CentralAir'

'Loading transformer OneHotEncodingTransformer for field CentralAir'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field Electrical


'Loading transformer OneHotEncodingTransformer for field Electrical'

'Loading transformer OneHotEncodingTransformer for field Electrical'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field 1stFlrSF


'Loading transformer NumericalTransformer for field 1stFlrSF'

'Loading transformer NumericalTransformer for field 1stFlrSF'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field 2ndFlrSF


'Loading transformer NumericalTransformer for field 2ndFlrSF'

'Loading transformer NumericalTransformer for field 2ndFlrSF'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field LowQualFinSF


'Loading transformer NumericalTransformer for field LowQualFinSF'

'Loading transformer NumericalTransformer for field LowQualFinSF'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field GrLivArea


'Loading transformer NumericalTransformer for field GrLivArea'

'Loading transformer NumericalTransformer for field GrLivArea'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field BsmtFullBath


'Loading transformer NumericalTransformer for field BsmtFullBath'

'Loading transformer NumericalTransformer for field BsmtFullBath'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field BsmtHalfBath


'Loading transformer NumericalTransformer for field BsmtHalfBath'

'Loading transformer NumericalTransformer for field BsmtHalfBath'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field FullBath


'Loading transformer NumericalTransformer for field FullBath'

'Loading transformer NumericalTransformer for field FullBath'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field HalfBath


'Loading transformer NumericalTransformer for field HalfBath'

'Loading transformer NumericalTransformer for field HalfBath'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field BedroomAbvGr


'Loading transformer NumericalTransformer for field BedroomAbvGr'

'Loading transformer NumericalTransformer for field BedroomAbvGr'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field KitchenAbvGr


'Loading transformer NumericalTransformer for field KitchenAbvGr'

'Loading transformer NumericalTransformer for field KitchenAbvGr'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field KitchenQual


'Loading transformer OneHotEncodingTransformer for field KitchenQual'

'Loading transformer OneHotEncodingTransformer for field KitchenQual'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field TotRmsAbvGrd


'Loading transformer NumericalTransformer for field TotRmsAbvGrd'

'Loading transformer NumericalTransformer for field TotRmsAbvGrd'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field Functional


'Loading transformer OneHotEncodingTransformer for field Functional'

'Loading transformer OneHotEncodingTransformer for field Functional'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field Fireplaces


'Loading transformer NumericalTransformer for field Fireplaces'

'Loading transformer NumericalTransformer for field Fireplaces'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field FireplaceQu


'Loading transformer OneHotEncodingTransformer for field FireplaceQu'

'Loading transformer OneHotEncodingTransformer for field FireplaceQu'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field GarageType


'Loading transformer OneHotEncodingTransformer for field GarageType'

'Loading transformer OneHotEncodingTransformer for field GarageType'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field GarageYrBlt


'Loading transformer NumericalTransformer for field GarageYrBlt'

'Loading transformer NumericalTransformer for field GarageYrBlt'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field GarageFinish


'Loading transformer OneHotEncodingTransformer for field GarageFinish'

'Loading transformer OneHotEncodingTransformer for field GarageFinish'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field GarageCars


'Loading transformer NumericalTransformer for field GarageCars'

'Loading transformer NumericalTransformer for field GarageCars'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field GarageArea


'Loading transformer NumericalTransformer for field GarageArea'

'Loading transformer NumericalTransformer for field GarageArea'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field GarageQual


'Loading transformer OneHotEncodingTransformer for field GarageQual'

'Loading transformer OneHotEncodingTransformer for field GarageQual'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field GarageCond


'Loading transformer OneHotEncodingTransformer for field GarageCond'

'Loading transformer OneHotEncodingTransformer for field GarageCond'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field PavedDrive


'Loading transformer OneHotEncodingTransformer for field PavedDrive'

'Loading transformer OneHotEncodingTransformer for field PavedDrive'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field WoodDeckSF


'Loading transformer NumericalTransformer for field WoodDeckSF'

'Loading transformer NumericalTransformer for field WoodDeckSF'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field OpenPorchSF


'Loading transformer NumericalTransformer for field OpenPorchSF'

'Loading transformer NumericalTransformer for field OpenPorchSF'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field EnclosedPorch


'Loading transformer NumericalTransformer for field EnclosedPorch'

'Loading transformer NumericalTransformer for field EnclosedPorch'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field 3SsnPorch


'Loading transformer NumericalTransformer for field 3SsnPorch'

'Loading transformer NumericalTransformer for field 3SsnPorch'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field ScreenPorch


'Loading transformer NumericalTransformer for field ScreenPorch'

'Loading transformer NumericalTransformer for field ScreenPorch'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field PoolArea


'Loading transformer NumericalTransformer for field PoolArea'

'Loading transformer NumericalTransformer for field PoolArea'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field PoolQC


'Loading transformer OneHotEncodingTransformer for field PoolQC'

'Loading transformer OneHotEncodingTransformer for field PoolQC'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field Fence


'Loading transformer OneHotEncodingTransformer for field Fence'

'Loading transformer OneHotEncodingTransformer for field Fence'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field MiscFeature


'Loading transformer OneHotEncodingTransformer for field MiscFeature'

'Loading transformer OneHotEncodingTransformer for field MiscFeature'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field MiscVal


'Loading transformer NumericalTransformer for field MiscVal'

'Loading transformer NumericalTransformer for field MiscVal'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field MoSold


'Loading transformer NumericalTransformer for field MoSold'

'Loading transformer NumericalTransformer for field MoSold'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field YrSold


'Loading transformer NumericalTransformer for field YrSold'

'Loading transformer NumericalTransformer for field YrSold'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field SaleType


'Loading transformer OneHotEncodingTransformer for field SaleType'

'Loading transformer OneHotEncodingTransformer for field SaleType'

INFO:sdv.metadata.table:Loading transformer OneHotEncodingTransformer for field SaleCondition


'Loading transformer OneHotEncodingTransformer for field SaleCondition'

'Loading transformer OneHotEncodingTransformer for field SaleCondition'

INFO:sdv.metadata.table:Loading transformer NumericalTransformer for field SalePrice


'Loading transformer NumericalTransformer for field SalePrice'

'Loading transformer NumericalTransformer for field SalePrice'

INFO:copulas.multivariate.gaussian:Fitting GaussianMultivariate()


'Fitting GaussianMultivariate()'

'Fitting GaussianMultivariate()'

INFO:root:Training time cost: 101.99387669563293


'Training time cost: 101.99387669563293'

'Training time cost: 101.99387669563293'

INFO:root:Save model to output/syn_model_GaussianCopula.pkl


'Save model to output/syn_model_GaussianCopula.pkl'

'Save model to output/syn_model_GaussianCopula.pkl'

INFO:root:output dataframe shape


'output dataframe shape'

'output dataframe shape'

INFO:root:(100, 81)


'(100, 81)'

'(100, 81)'

INFO:root:output dataframe head(5)


'output dataframe head(5)'

'output dataframe head(5)'

INFO:root:   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   0         147       RL    77.675637    13479   Pave  Grvl      IR1   
1   1          72       RL    55.645749     9367   Pave   NaN      IR1   
2   2          14       RL    50.562488    -1949   Pave  Grvl      Reg   
3   3          70       RL    72.056896    11284   Pave   NaN      IR1   
4   4          65       RL          NaN     7667   Pave   NaN      Reg   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...       -6    NaN   NaN         NaN     -11      6   
1         Lvl    AllPub  ...        4    NaN   NaN         NaN      26     11   
2         Bnk    AllPub  ...      -22    NaN   NaN         NaN      75      5   
3         Low    AllPub  ...       13    NaN   NaN         NaN      87      5   
4         Lvl    AllPub  ...      -10    NaN   NaN         NaN       7     11   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2010   

'   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \\\n0   0         147       RL    77.675637    13479   Pave  Grvl      IR1   \n1   1          72       RL    55.645749     9367   Pave   NaN      IR1   \n2   2          14       RL    50.562488    -1949   Pave  Grvl      Reg   \n3   3          70       RL    72.056896    11284   Pave   NaN      IR1   \n4   4          65       RL          NaN     7667   Pave   NaN      Reg   \n\n  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \\\n0         Lvl    AllPub  ...       -6    NaN   NaN         NaN     -11      6   \n1         Lvl    AllPub  ...        4    NaN   NaN         NaN      26     11   \n2         Bnk    AllPub  ...      -22    NaN   NaN         NaN      75      5   \n3         Low    AllPub  ...       13    NaN   NaN         NaN      87      5   \n4         Lvl    AllPub  ...      -10    NaN   NaN         NaN       7     11   \n\n  YrSold  SaleType  SaleCondition  SalePrice  \n0  

'   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \\\n0   0         147       RL    77.675637    13479   Pave  Grvl      IR1   \n1   1          72       RL    55.645749     9367   Pave   NaN      IR1   \n2   2          14       RL    50.562488    -1949   Pave  Grvl      Reg   \n3   3          70       RL    72.056896    11284   Pave   NaN      IR1   \n4   4          65       RL          NaN     7667   Pave   NaN      Reg   \n\n  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \\\n0         Lvl    AllPub  ...       -6    NaN   NaN         NaN     -11      6   \n1         Lvl    AllPub  ...        4    NaN   NaN         NaN      26     11   \n2         Bnk    AllPub  ...      -22    NaN   NaN         NaN      75      5   \n3         Low    AllPub  ...       13    NaN   NaN         NaN      87      5   \n4         Lvl    AllPub  ...      -10    NaN   NaN         NaN       7     11   \n\n  YrSold  SaleType  SaleCondition  SalePrice  \n0  

## 檢視真實資料與合成資料表單

In [8]:
import pandas as pd

real_data_df = pd.read_csv("input/data.csv")  # path of the real data
syn_data_df = pd.read_csv("output/data_GaussianCopula_output.csv")  # synthetic data; the default file name is the real data file name + "_GaussianCopula_output"

In [9]:
# Preview the first 10 rows of the real data.
real_data_df.head(10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1118,20,RL,57.0,9764,Pave,,IR1,Lvl,AllPub,...,0,,,,0,5,2008,WD,Normal,130000
1,903,60,RL,63.0,7875,Pave,,Reg,Lvl,AllPub,...,0,,,,0,7,2006,WD,Normal,180000
2,658,70,RL,60.0,7200,Pave,,Reg,HLS,AllPub,...,0,,MnPrv,,0,2,2008,WD,Normal,149000
3,339,20,RL,91.0,14145,Pave,,Reg,Lvl,AllPub,...,0,,,Shed,400,5,2006,WD,Normal,202500
4,341,60,RL,85.0,14191,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,202900
5,553,20,RL,87.0,11146,Pave,,IR1,Lvl,AllPub,...,0,,,,0,7,2009,WD,Normal,255500
6,1122,20,RL,84.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,7,2006,New,Partial,212900
7,1116,20,RL,93.0,12085,Pave,,Reg,Lvl,AllPub,...,0,,,,0,11,2007,New,Partial,318000
8,1433,30,RL,60.0,10800,Pave,Grvl,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,64500
9,489,190,RL,60.0,10800,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2006,ConLD,Normal,160000


In [10]:
# Preview the first 10 rows of the synthetic data.
syn_data_df.head(10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,0,72,RM,40.641026,8038,Pave,,IR1,Lvl,AllPub,...,-8,,,,58,9,2008,WD,Normal,114858
1,1,48,RL,81.03292,10414,Grvl,,IR1,Bnk,AllPub,...,8,,,Shed,577,6,2006,WD,Normal,197172
2,2,20,RL,36.472885,-1829,Pave,,Reg,Lvl,AllPub,...,5,,,,-119,7,2006,WD,Normal,127683
3,3,177,RM,52.747695,10412,Pave,,Reg,Lvl,AllPub,...,8,,,,165,6,2009,WD,Normal,153709
4,4,11,RL,68.450861,11129,Pave,,Reg,Lvl,AllPub,...,-1,,,,441,1,2007,WD,Normal,164198
5,5,74,RL,68.291836,9277,Pave,,IR1,Lvl,AllPub,...,9,,,,-143,7,2006,WD,Normal,121669
6,6,17,RL,90.976485,8596,Pave,Grvl,Reg,Lvl,AllPub,...,0,,,,177,3,2010,WD,Normal,394032
7,7,75,RM,72.052803,8596,Pave,,Reg,Lvl,AllPub,...,9,,,,-49,2,2009,WD,Normal,127043
8,8,32,RL,67.761313,9481,Pave,,Reg,Lvl,AllPub,...,7,,,,45,3,2007,WD,Normal,175815
9,9,144,RM,46.634261,3031,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,,-130,4,2006,WD,Normal,97889
