# 合成資料模型訓練模組 - python3.6 版本

## 環境建置
1. 安裝 pyenv 依賴
```
sudo apt update
sudo apt install -y make build-essential libssl-dev zlib1g-dev \
  libbz2-dev libreadline-dev libsqlite3-dev wget curl llvm \
  libncursesw5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev libffi-dev liblzma-dev
```
2. 安裝 pyenv（若尚未安裝，可執行 `curl https://pyenv.run | bash`，並依提示完成 shell 設定後重新開啟終端機），再安裝 Python 3.6.15
```
pyenv install 3.6.15
pyenv local 3.6.15
```
3. 建立 virtualenv
```
python -m venv venv
source venv/bin/activate
```
4. 安裝 core 套件（預先裝 wheel）
```
pip install --upgrade pip setuptools wheel
pip install numpy==1.19.5 cython==0.29.36 pandas==0.24.2
pip install sdv==0.3.6
pip install jupyterlab==3.2.9
pip install scipy==1.2.3 sdmetrics==0.0.2.dev0
```

In [13]:
import argparse
import json
import logging
import os
import time

import numpy as np
import pandas as pd
from sdv.tabular import GaussianCopula
from sdv.tabular import CTGAN

from sdv import SDV

In [14]:
def get_data_from_model(model_path, num_rows=1000, condition_dict=None):
    """Load a saved synthetic-data model and sample rows from it.

    Args:
        model_path (str): Path of the saved model file, passed to ``SDV.load``.
        num_rows (int): Number of rows to sample. Defaults to 1000.
        condition_dict (dict, optional): Column-to-value mapping used to build
            a sampling ``Condition``. When ``None`` or empty, rows are sampled
            unconditionally.

    Returns:
        The sampled data exactly as returned by the model's ``sample`` or
        ``sample_conditions`` method (presumably a pandas DataFrame, since the
        caller in this file uses ``.shape``/``.head``/``.to_csv`` on it).

    Raises:
        ImportError: If conditional sampling is requested but the installed
            sdv release does not provide ``sdv.sampling.Condition``.
    """
    logging.info("Generate Synthetic data from Model")
    # NOTE(review): models are written as ``.pkl`` files elsewhere in this
    # file; if loading unpickles the file, only load models you trust.
    model = SDV.load(model_path)

    if not condition_dict:
        return model.sample(num_rows=num_rows)

    # ``Condition`` was referenced without ever being imported, so this branch
    # used to fail with NameError. Import it lazily so unconditional sampling
    # keeps working on sdv releases that do not ship ``sdv.sampling``.
    try:
        from sdv.sampling import Condition
    except ImportError as err:
        raise ImportError(
            "Conditional sampling needs sdv.sampling.Condition, which the "
            "installed sdv version does not provide."
        ) from err

    logging.info("Sample data by condition")
    condition = Condition(condition_dict, num_rows=num_rows)
    return model.sample_conditions(conditions=[condition])

In [15]:
def data_sythesizer(args, input_df=None):
    """Fit a synthetic-data model on ``input_df`` and sample new rows from it.

    Args:
        args (argparse.Namespace): Configuration. The attributes read here are
            ``primary_key``, ``synth_model``, ``custom_setting``, ``num_rows``,
            ``save_model`` and ``output_dir``; the custom CTGAN setup also
            reads ``epochs``, ``batch_size``, ``gen_dim`` and ``dis_dim``.
        input_df (DataFrame, optional): Training data. Defaults to an empty
            DataFrame when omitted.

    Returns:
        output_df: ``args.num_rows`` rows sampled from the fitted model.

    Raises:
        ValueError: If ``args.synth_model`` is neither ``"GaussianCopula"``
            nor ``"CTGAN"``.
    """
    if input_df is None:
        # Build the default per call instead of sharing one object created at
        # function-definition time.
        input_df = pd.DataFrame()

    pri_key = args.primary_key
    # Only forward ``primary_key`` when one was actually given, so every
    # branch treats an empty key the same way (the custom CTGAN branch used
    # to pass the empty string through).
    key_kwargs = {"primary_key": pri_key} if pri_key else {}

    if args.synth_model == "GaussianCopula":
        logging.info("sythetic model arch: GaussianCopula")
        model = GaussianCopula(**key_kwargs)
    elif args.synth_model == "CTGAN":
        if args.custom_setting:
            logging.info("sythetic model arch: CTGAN-c")
            model = CTGAN(
                epochs=args.epochs,
                batch_size=args.batch_size,
                generator_dim=tuple(args.gen_dim),
                discriminator_dim=tuple(args.dis_dim),
                verbose=True,
                **key_kwargs
            )
        else:
            logging.info("sythetic model arch: CTGAN")
            model = CTGAN(verbose=True, **key_kwargs)
    else:
        # Fail fast: previously this only logged and then crashed with an
        # UnboundLocalError on ``model.fit`` below.
        logging.error("the sythetic model is not supported!")
        raise ValueError(
            f"Unsupported synth_model {args.synth_model!r}; "
            "expected 'GaussianCopula' or 'CTGAN'."
        )

    logging.info("Synthetic model fitting data start ... ")
    start_time = time.time()
    model.fit(input_df)
    logging.info(f"Training time cost: {time.time()-start_time}")
    output_df = model.sample(num_rows=args.num_rows)

    if args.save_model:
        # The "-c" suffix marks a model saved while --custom_setting was set.
        suffix = "-c" if args.custom_setting else ""
        output_model_path = os.path.join(
            args.output_dir, f"syn_model_{args.synth_model}{suffix}.pkl"
        )
        logging.info(f"Save model to {output_model_path}")
        model.save(output_model_path)

    return output_df

In [16]:
def set_args(args_list=None):
    """Build the command-line parser and parse ``args_list`` with it.

    Args:
        args_list (list of str, optional): Argument strings handed straight
            to ``ArgumentParser.parse_args``.

    Returns:
        argparse.Namespace: The parsed options.
    """
    import argparse

    # One (flag, add_argument keyword arguments) entry per option, kept in the
    # order the options are registered.
    option_table = (
        # Training / output options.
        ("--input_path", dict(type=str, default="data/train.csv")),
        ("--output_dir", dict(type=str, default="data/output/")),
        ("--synth_model", dict(type=str, default="GaussianCopula", help="sythetic model type")),
        ("--primary_key", dict(type=str, default="", help="primary key in your tabular data")),
        ("--num_rows", dict(type=int, default=200, help="num rows of the output sythetic dataframe")),
        ("--save_model", dict(action="store_true", help="set for save model pkl file")),
        ("--save_output", dict(action="store_true", help="set for save output csv file")),
        ("--save_report", dict(action="store_true", help="set for save report csv and image files")),
        # Custom network settings.
        ("--custom_setting", dict(action="store_true", help="set for custom setting in CTGAN and TVAE Model")),
        ("--epochs", dict(type=int, default=300, help="set epochs for training CTGAN and TVAE Model")),
        ("--batch_size", dict(type=int, default=500, help="set batch size for training CTGAN and TVAE Model")),
        ("--gen_dim", dict(type=int, nargs="+", default=[256, 256], help="set gen dimension")),
        ("--dis_dim", dict(type=int, nargs="+", default=[256, 256], help="set dis dimension")),
        # Sampling from an already saved model.
        ("--input_syn_model", dict(type=str, default=None, help="path to your syn_data model file")),
        ("--sample_condition", dict(type=str, default=None, help="path to your syn_data sample condition json file")),
        ("--output_fpath", dict(type=str, default=None, help="set full file path for your syn_data output csv")),
    )

    cli = argparse.ArgumentParser()
    for flag, options in option_table:
        cli.add_argument(flag, **options)

    return cli.parse_args(args_list)

In [17]:
def _log_output_preview(output_df):
    """Log the shape and the first rows of a generated dataframe."""
    logging.info("output dataframe shape")
    logging.info(output_df.shape)
    logging.info("output dataframe head(5)")
    logging.info(output_df.head())


def main(args):
    """Train a model and sample from it, or sample from a saved model.

    When ``args.input_syn_model`` is empty, the CSV at ``args.input_path`` is
    read, a model is fitted through ``data_sythesizer`` and, if
    ``args.save_output`` is set, the sampled rows are written to
    ``<input stem>_<synth_model>_output.csv`` inside ``args.output_dir``.

    Otherwise the saved model at ``args.input_syn_model`` is sampled through
    ``get_data_from_model`` (optionally with the conditions read from the JSON
    file ``args.sample_condition``) and, if ``args.save_output`` is set, the
    rows are written to ``args.output_fpath``.

    Args:
        args (argparse.Namespace): Parsed options, e.g. from ``set_args``.

    Raises:
        FileNotFoundError: If the input CSV, output folder, model file or
            condition file does not exist.
        ValueError: If the model type is unsupported, the primary key is not
            a column of the input data, the condition file is not a ``.json``
            file, or ``--save_output`` is used in load-model mode without
            ``--output_fpath``.
    """
    # Validation raises explicit exceptions rather than using ``assert``,
    # because assertions are skipped when Python runs with ``-O``.
    if not args.input_syn_model:
        input_path = args.input_path
        output_dir = args.output_dir

        if not os.path.exists(input_path):
            raise FileNotFoundError(f"Can't find the input file at {input_path}.")
        if not os.path.exists(output_dir):
            raise FileNotFoundError(f"Can't find the output folder at {output_dir}.")
        if args.synth_model not in ("GaussianCopula", "CTGAN"):
            raise ValueError(
                f"Unsupported synth_model {args.synth_model!r}; "
                "expected 'GaussianCopula' or 'CTGAN'."
            )

        input_df = pd.read_csv(input_path)
        if args.primary_key and args.primary_key not in input_df.columns:
            raise ValueError(
                f"primary_key {args.primary_key!r} is not a column of {input_path}."
            )

        output_df = data_sythesizer(args=args, input_df=input_df)
        _log_output_preview(output_df)

        if args.save_output:
            stem = os.path.splitext(os.path.basename(input_path))[0]
            output_fname = stem + "_" + args.synth_model + "_output.csv"
            output_df.to_csv(os.path.join(output_dir, output_fname), index=False)
        return

    model_path = args.input_syn_model
    condition_fpath = args.sample_condition
    output_fpath = args.output_fpath

    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Can't find the model_path pkl file: {model_path}.")
    if args.save_output and not output_fpath:
        # ``DataFrame.to_csv(None)`` returns the CSV text instead of writing a
        # file, so the output used to be dropped silently in this case.
        raise ValueError(
            "--save_output needs --output_fpath when sampling from a saved model."
        )

    condition_dict = None
    if condition_fpath:
        if not os.path.exists(condition_fpath):
            raise FileNotFoundError(
                f"Can't find the sample_condition json file: {condition_fpath}."
            )
        if not condition_fpath.endswith(".json"):
            raise ValueError(f"{condition_fpath} must be a json file!")
        with open(condition_fpath, "r", encoding="utf-8") as f:
            condition_dict = json.load(f)

    output_df = get_data_from_model(
        model_path, num_rows=args.num_rows, condition_dict=condition_dict
    )
    _log_output_preview(output_df)

    if args.save_output:
        output_df.to_csv(output_fpath, index=False)


# 使用模組訓練合成資料模型 + 生成合成資料

In [19]:
# Fit a GaussianCopula model on the real data, then sample 100 rows from it.
args = set_args(
    ["--input_path", "input/data.csv"]  # real input data, as a CSV table
    + ["--output_dir", "output/"]  # folder the synthetic outputs are written to
    + ["--synth_model", "GaussianCopula"]  # synthesis algorithm
    + ["--primary_key", "Id"]  # name of the key column
    + ["--num_rows", "100"]  # number of rows to generate
    + ["--save_model"]  # save the fitted model
    + ["--save_output"]  # save the generated rows as CSV
)

main(args)

  return c**2 / (c**2 - n**2)
  Lhat = muhat - Shat*mu
  sk = 2*(b-a)*np.sqrt(a + b + 1) / (a + b + 2) / np.sqrt(a*b)
  improvement from the last ten iterations.
  a = (self.min - loc) / scale
  b = (self.max - loc) / scale
  a = (self.min - loc) / scale
  b = (self.max - loc) / scale
  improvement from the last five Jacobian evaluations.
  numpy.max(numpy.abs(fsim[0] - fsim[1:])) <= fatol):


## 檢視真實資料與合成資料表單

In [20]:
import pandas as pd

# Real data: the CSV that was used as training input.
real_data_df = pd.read_csv(filepath_or_buffer="input/data.csv")
# Synthetic data: the default file name is the real file's name followed by
# "_GaussianCopula_output".
syn_data_df = pd.read_csv(
    filepath_or_buffer="output/data_GaussianCopula_output.csv"
)

In [21]:
# Show the first 10 rows of the real data.
real_data_df.head(10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1118,20,RL,57.0,9764,Pave,,IR1,Lvl,AllPub,...,0,,,,0,5,2008,WD,Normal,130000
1,903,60,RL,63.0,7875,Pave,,Reg,Lvl,AllPub,...,0,,,,0,7,2006,WD,Normal,180000
2,658,70,RL,60.0,7200,Pave,,Reg,HLS,AllPub,...,0,,MnPrv,,0,2,2008,WD,Normal,149000
3,339,20,RL,91.0,14145,Pave,,Reg,Lvl,AllPub,...,0,,,Shed,400,5,2006,WD,Normal,202500
4,341,60,RL,85.0,14191,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,202900
5,553,20,RL,87.0,11146,Pave,,IR1,Lvl,AllPub,...,0,,,,0,7,2009,WD,Normal,255500
6,1122,20,RL,84.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,7,2006,New,Partial,212900
7,1116,20,RL,93.0,12085,Pave,,Reg,Lvl,AllPub,...,0,,,,0,11,2007,New,Partial,318000
8,1433,30,RL,60.0,10800,Pave,Grvl,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,64500
9,489,190,RL,60.0,10800,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2006,ConLD,Normal,160000


In [22]:
# Show the first 10 rows of the synthetic data.
syn_data_df.head(10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,0,75,RL,,7387,Pave,,IR1,Lvl,AllPub,...,-6,,,,130,3,2007,WD,Normal,182596
1,1,40,RL,63.126273,10907,Grvl,,Reg,Lvl,AllPub,...,5,,,,-85,11,2007,CWD,Normal,206618
2,2,23,RL,64.989874,7803,Pave,,Reg,Lvl,AllPub,...,-3,,,,62,3,2009,New,Partial,226065
3,3,12,RL,92.554917,16655,Pave,,Reg,Lvl,AllPub,...,-1,,,Shed,483,2,2007,WD,Normal,121224
4,4,84,RL,88.221283,11714,Pave,,IR1,Lvl,AllPub,...,10,,,,128,5,2010,WD,Normal,286501
5,5,123,RL,56.299399,12077,Pave,,Reg,Lvl,AllPub,...,0,,,,-58,6,2009,WD,Normal,265605
6,6,33,RL,61.841508,5113,Pave,Pave,Reg,Lvl,AllPub,...,0,,GdPrv,,17,12,2006,WD,Normal,115775
7,7,18,RM,62.203574,5106,Pave,,Reg,Lvl,AllPub,...,-6,,,,73,5,2010,WD,Abnorml,60124
8,8,62,RL,,14543,Pave,,IR1,Lvl,AllPub,...,-12,,MnPrv,,51,6,2010,WD,Normal,184092
9,9,51,RL,80.563656,10142,Pave,,Reg,Lvl,AllPub,...,4,,,,-90,5,2009,WD,Normal,199925
