In [1]:
# tutorial_syn_seq.ipynb

##############################################################################
# 0. Library imports
##############################################################################
import pandas as pd
import numpy as np

# If needed, import from the synthetic-data library or from a local module instead,
# e.g. from myproject.plugins.core.syn_seq_dataloader import Syn_SeqDataLoader
#      from myproject.plugins.core.models.syn_seq.syn_seq_encoder import Syn_SeqEncoder
# The imports below are assumed for this example.
from synthcity.plugins.core.dataloader import Syn_SeqDataLoader
from synthcity.plugins.core.models.syn_seq.syn_seq_encoder import Syn_SeqEncoder


##############################################################################
# 1. Prepare sample data
##############################################################################
# A small, randomly generated frame laid out like a diabetes-style dataset.
# Seeding the legacy global generator makes every run draw the same values.
np.random.seed(0)

rows = 10

# The columns are drawn one at a time, in the same order they appear in the
# frame, because they all consume the same global random stream.
sample_columns = {}
sample_columns["age"] = np.random.randint(20, 70, size=rows)
sample_columns["sex"] = np.random.choice(["M", "F"], size=rows)
sample_columns["bmi"] = np.random.normal(25, 4, size=rows)
sample_columns["bp"] = np.random.uniform(-0.04, 0.08, size=rows)
sample_columns["s1"] = np.random.randint(100, 200, size=rows)
sample_columns["s2"] = np.random.randint(0, 5, size=rows)
# One calendar day per row; this column does not touch the random stream.
sample_columns["date_col"] = pd.date_range("2020-01-01", periods=rows, freq="D")
sample_columns["category_label"] = np.random.choice(["X", "Y", "Z"], size=rows)
sample_columns["target"] = np.random.randint(0, 2, size=rows)

df = pd.DataFrame(sample_columns)

print("Original df:\n", df.head())




  from .autonotebook import tqdm as notebook_tqdm


    The default C++ compiler could not be found on your system.
    You need to either define the CXX environment variable or a symlink to the g++ command.
    For example if g++-8 is the command you can do
      import os
      os.environ['CXX'] = 'g++-8'
    
Original df:
    age sex        bmi        bp   s1  s2   date_col category_label  target
0   64   F  15.296182  0.041466  100   2 2020-01-01              X       1
1   67   M  23.187777  0.046476  100   0 2020-01-02              Z       1
2   20   M  23.116916  0.029842  136   1 2020-01-03              Y       0
3   23   M  28.892064  0.024485  153   1 2020-01-04              X       1
4   23   M  19.887404  0.051034  105   1 2020-01-05              Y       0


In [2]:
##############################################################################
# 2. Build the Syn_SeqDataLoader
##############################################################################
# Per-column type overrides. Columns not listed here are typed automatically
# based on max_categories.
custom_col_type = dict(
    age="numeric",     # could be auto-detected; overridden here as an example
    sex="category",
    date_col="date",   # declared explicitly
)

# Values that should be given special treatment, keyed by column name.
special_value_map = dict(
    bp=[-0.04],    # e.g. treat the value -0.04 in bp as special
    age=[23],
    target=[0],    # e.g. target is mostly 0, so treat it as special
)

# Column order handed to the loader as syn_order.
syn_order = [
    "date_col",
    "sex",
    "age",
    "bmi",
    "bp",
    "s1",
    "s2",
    "category_label",
    "target",
]

loader = Syn_SeqDataLoader(
    data=df,
    syn_order=syn_order,
    special_value=special_value_map,
    col_type=custom_col_type,
    max_categories=4,
)

# Creating the loader also fits its internal Syn_SeqEncoder.
# Column splits, date offsets, etc. are not applied until transform is run.
print("\n[INFO] Loader created. loader.shape =", loader.shape)
print("loader columns =", loader.columns)




[INFO] Syn_SeqDataLoader init complete:
  - syn_order: ['date_col', 'sex', 'age', 'bmi', 'bp', 's1', 's2', 'category_label', 'target']
  - special_value: {'bp': [-0.04], 'age': [23], 'target': [0]}
  - col_type: {'age': 'numeric', 'sex': 'category', 'date_col': 'date'}
  - data shape: (10, 9)

[DEBUG] After encoder.fit(), ready for preprocessing with detected info:
  - special_value: {'bp': [-0.04], 'age': [23], 'target': [0]}
  - encoder.col_map =>
       date_col : converted_type=numeric, method=cart
       sex : converted_type=category, method=cart
       age : converted_type=numeric, method=cart
       bmi : converted_type=numeric, method=cart
       bp : converted_type=numeric, method=cart
       s1 : converted_type=numeric, method=cart
       s2 : converted_type=category, method=cart
       category_label : converted_type=category, method=cart
       target : converted_type=category, method=cart
  - variable_selection_:
                date_col  sex  age  bmi  bp  s1  s2  categor

In [3]:
##############################################################################
# 3. Try out transform (encode)
##############################################################################
# encode() hands back a pair:
#   encoded_loader -> loader wrapping the transformed DataFrame
#   enc_dict       -> its "syn_seq_encoder" entry is the Syn_SeqEncoder object
encode_result = loader.encode()
encoded_loader, enc_dict = encode_result

print("\n--- After encode() ---")
encoded_df = encoded_loader.dataframe()
print(encoded_df)

print("\nEncoded loader columns:", encoded_loader.columns)

print("\nCheck the 'enc_dict':", enc_dict.keys())

# Keep a handle on the encoder so later cells can inspect it.
encoder_obj = enc_dict["syn_seq_encoder"]




[INFO] Syn_SeqDataLoader init complete:
  - syn_order: ['date_col', 'sex', 'age', 'bmi', 'bp', 's1', 's2', 'category_label', 'target']
  - special_value: {'bp': [-0.04], 'age': [23], 'target': [0]}
  - col_type: {'age': 'numeric', 'sex': 'category', 'date_col': 'date'}
  - data shape: (10, 9)

[DEBUG] After encoder.fit(), ready for preprocessing with detected info:
  - special_value: {'bp': [-0.04], 'age': [23], 'target': [0]}
  - encoder.col_map =>
       date_col : converted_type=numeric, method=cart
       sex : converted_type=category, method=cart
       age : converted_type=numeric, method=cart
       bmi : converted_type=numeric, method=cart
       bp : converted_type=numeric, method=cart
       s1 : converted_type=numeric, method=cart
       s2 : converted_type=category, method=cart
       category_label : converted_type=category, method=cart
       target : converted_type=category, method=cart
  - variable_selection_:
                date_col  sex  age  bmi  bp  s1  s2  categor

In [4]:
##############################################################################
# 4. Inspect the encoder's internal state
##############################################################################
# Each label names the attribute that is actually read on that line, so a
# label can be copied into a cell to look the value up again.
print("\n>>> encoder_obj.columns_special_values:", encoder_obj.columns_special_values)
print("\n>>> encoder_obj.syn_order:", encoder_obj.syn_order)
print(">>> encoder_obj.col_map:", encoder_obj.col_map)

# Only print the variable-selection table when the encoder has one.
if encoder_obj.variable_selection_ is not None:
    print("\n>>> variable_selection_:\n", encoder_obj.variable_selection_)

# 'method_assignments' has been removed from this example's structure;
# alternatively, the aggregator may decide it at actual training time.




>>> encoder_obj.special_val: {'bp': [-0.04], 'age': [23], 'target': [0]}

>>> encoder_obj.syn_order: ['date_col', 'sex', 'age_cat', 'age', 'bmi', 'bp_cat', 'bp', 's1', 's2', 'category_label', 'target']
>>> encoder_obj.col_map: {'date_col': {'original_dtype': 'datetime64[ns]', 'converted_type': 'numeric', 'method': 'cart'}, 'sex': {'original_dtype': 'object', 'converted_type': 'category', 'method': 'cart'}, 'age': {'original_dtype': 'int32', 'converted_type': 'numeric', 'method': 'cart'}, 'bmi': {'original_dtype': 'float64', 'converted_type': 'numeric', 'method': 'cart'}, 'bp': {'original_dtype': 'float64', 'converted_type': 'numeric', 'method': 'cart'}, 's1': {'original_dtype': 'int32', 'converted_type': 'numeric', 'method': 'cart'}, 's2': {'original_dtype': 'int32', 'converted_type': 'category', 'method': 'cart'}, 'category_label': {'original_dtype': 'object', 'converted_type': 'category', 'method': 'cart'}, 'target': {'original_dtype': 'int32', 'converted_type': 'category', 'method'

In [None]:
##############################################################################
# 5. update_user_custom example (syn_order, variable_selection, method)
##############################################################################
user_custom = dict(
    syn_order=[
        "sex",
        "bmi",
        "date_col",
        "age",
        "bp",
        "s1",
        "s2",
        "category_label",
        "target",
    ],
    variable_selection=dict(
        s1=["sex", "bmi", "age"],      # s1 is predicted from sex, bmi, age
        target=["sex", "bp", "s1"],    # target is predicted from sex, bp, s1
    ),
    # Only bp is given 'polyreg'; the rest are left to the aggregator default.
    method=dict(bp="polyreg"),
)

print("\n--- update_user_custom(...) example ---")
loader.update_user_custom(user_custom)
# No transform happens here; only the settings are applied.

print("syn_order after user_custom:", loader.syn_order)
# Falls back to None when the loader has no "_method" attribute.
print("loader._method:", getattr(loader, "_method", None))


##############################################################################
# 6. Call encode once more (e.g. what happens at aggregator.fit time)
##############################################################################
print("\n--- final encode() after user_custom ---")
second_encode = loader.encode()
enc_loader2, enc_dict2 = second_encode
print("new encoded df:\n", enc_loader2.dataframe())

encoder2 = enc_dict2["syn_seq_encoder"]
print("\n>>> updated variable_selection_:\n", encoder2.variable_selection_)

##############################################################################
# 7. decode example
##############################################################################
decoded_loader = enc_loader2.decode(enc_dict2)
df_decoded = decoded_loader.dataframe()
print("\n--- Decoded df:\n", df_decoded.head())

print("\nCheck if date_col, original dtype, special values are restored properly.")


##############################################################################
# 8. Sketch of actual aggregator usage (simplified)
##############################################################################
# Intended flow when aggregator.fit(loader) is actually called:
#   1) loader.encode() -> (encoded_loader, enc)
#   2) the aggregator trains one model per column internally
#   3) aggregator.generate() -> synthesis inside the aggregator
#      -> decode() => Syn_SeqDataLoader

print("\n[Done tutorial]")