0. 필요한 모듈 로드하기

In [1]:
# 제출 파일 생성 관련
import os
import zipfile

# 데이터 처리 및 분석
import pandas as pd
import numpy as np
from scipy import stats
from tqdm import tqdm

# To ignore all warnings
import warnings

warnings.filterwarnings("ignore")

1. 데이터 로드 / train, val 나누기

In [2]:
# Load the full labelled training table; later cells reuse train_all as the
# reference for the original column set and order.
train_all = pd.read_csv('/data/train.csv')

In [3]:
# Working copy without the "ID" column; drop() returns a new frame, so
# train_all itself keeps every original column.
train = train_all.drop(columns="ID")

In [4]:
from sklearn.model_selection import train_test_split

# Separate the target from the features, then hold out 10% of the rows for
# validation, stratified on the target so class proportions are preserved.
X_train = train.drop(columns="Fraud_Type")
y_train = train["Fraud_Type"]

X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    stratify=y_train,
    random_state=42,
)

In [5]:
# Put the target column back next to the features for each split.
train = pd.concat([X_train, y_train], axis="columns")
val = pd.concat([X_val, y_val], axis="columns")

2. 데이터 전처리

In [6]:
# One row per customer: True when every training row of that customer
# carries the label "m".
customer_is_only_m = (
    train.groupby("Customer_identification_number")["Fraud_Type"]
    .apply(lambda labels: set(labels) == {"m"})
    .reset_index()
)

# IDs of the customers whose rows are exclusively "m".
only_m_customers = customer_is_only_m.loc[
    customer_is_only_m["Fraud_Type"].eq(True), "Customer_identification_number"
]

# Remove every training row that belongs to one of those customers.
keep_row = ~train["Customer_identification_number"].isin(only_m_customers)
train = train[keep_row]

In [7]:
# Show the (rows, columns) of train after the "m"-only customers were removed.
train.shape

(24468, 63)

In [8]:
# Customer/account identity columns; de-duplicating them leaves one row per
# distinct combination found in the training split.
identity_columns = [
    "Customer_identification_number",
    "Customer_Birthyear",
    "Customer_Gender",
    "Customer_personal_identifier",
    "Customer_registration_datetime",
    "Account_account_number",
    "Account_creation_datetime",
]

hdb = train[identity_columns].drop_duplicates()

hdb.shape

(977, 7)

In [9]:
# Fuse credit rating, limit-release flag and daily limit into one categorical
# key of the form "<rating>_<flag>_<limit>" (e.g. "B_0_2000000").
# The cell previously executed this identical assignment twice in a row; the
# second copy was redundant and has been removed.
train["7_29_30"] = (
    train["Customer_credit_rating"]
    + "_"
    + train["Account_indicator_release_limit_excess"].astype(str)
    + "_"
    + train["Account_amount_daily_limit"].astype(str)
)
train["7_29_30"].value_counts()

7_29_30
B_0_2000000     8645
A_0_2000000     3898
C_0_1000000     3892
B_1_50000000    2270
D_0_1000000     1043
E_0_1000000     1003
A_1_50000000     988
S_0_2000000      984
C_1_10000000     956
S_1_50000000     283
E_1_10000000     260
D_1_10000000     246
Name: count, dtype: int64

In [10]:
# Combined "<channel>_<operating system>" key, later treated as a single
# categorical feature.
channel = train["Channel"]
operating_system = train["Operating_System"]
train["40_41"] = channel + "_" + operating_system
train["40_41"].value_counts()

40_41
Others_Windows      3370
Others_Others       3235
ATM_Others          3060
ATM_Windows         3010
mobile_Others       2017
mobile_iOS          1981
mobile_Android      1933
internet_Windows    1507
internet_Linux      1470
internet_Others     1444
internet_macOS      1441
Name: count, dtype: int64

In [11]:
# Ratio features, computed column-wise instead of with a Python-level
# row-by-row apply. Each ratio is defined as 0 wherever its denominator is 0;
# the raw division result (inf/NaN on those rows) is overwritten by where().
month_max = train["Account_one_month_max_amount"]
dawn_max = train["Account_dawn_one_month_max_amount"]

# one-month std dev relative to the one-month max amount
train["35/34"] = (train["Account_one_month_std_dev"] / month_max).where(
    month_max != 0, 0
)

# dawn std dev relative to the dawn max amount
train["37/36"] = (train["Account_dawn_one_month_std_dev"] / dawn_max).where(
    dawn_max != 0, 0
)

# dawn max amount relative to the overall one-month max amount
train["36/34"] = (dawn_max / month_max).where(month_max != 0, 0)

In [12]:
def convert_to_float_year(dt):
    """Express a timestamp as a fractional calendar year.

    The fraction is the share of the timestamp's own year that has elapsed,
    measured in seconds, so leap years are handled by construction
    (e.g. 2020-07-02 00:00:00 -> 2020.5).

    Args:
        dt: A ``pd.Timestamp`` (or compatible datetime). Missing values
            (``NaT``/``None``/NaN) are accepted.

    Returns:
        ``year + elapsed_fraction`` as a float, or NaN when ``dt`` is missing.
    """
    # A missing timestamp has no year; propagate it as NaN rather than
    # failing inside the pd.Timestamp constructor.
    if pd.isna(dt):
        return float("nan")

    year = dt.year
    year_start = pd.Timestamp(year=year, month=1, day=1)
    year_end = pd.Timestamp(year=year + 1, month=1, day=1)

    elapsed_seconds = (dt - year_start).total_seconds()
    seconds_in_year = (year_end - year_start).total_seconds()
    return year + elapsed_seconds / seconds_in_year


# Parse every timestamp column from its raw representation.
datetime_source_columns = [
    "Account_creation_datetime",
    "Transaction_Datetime",
    "Last_atm_transaction_datetime",
    "Last_bank_branch_transaction_datetime",
    "Transaction_resumed_date",
    "Customer_registration_datetime",
]
for source_column in datetime_source_columns:
    train[source_column] = pd.to_datetime(train[source_column])

# Hour of day of the transaction.
train["Transaction_hour"] = train["Transaction_Datetime"].apply(lambda ts: ts.hour)

# Fractional-year form of each timestamp (new column <- source column).
fractional_year_sources = {
    "거래재개일자_소수": "Transaction_resumed_date",
    "거래일자_소수": "Transaction_Datetime",
    "마지막ATM거래일자_소수": "Last_atm_transaction_datetime",
    "마지막영업점거래일자_소수": "Last_bank_branch_transaction_datetime",
}
for fractional_column, datetime_column in fractional_year_sources.items():
    train[fractional_column] = train[datetime_column].apply(convert_to_float_year)

# Elapsed time (in years) from each reference event up to the transaction,
# plus the customer's age at transaction time.
transaction_year = train["거래일자_소수"]
train["거래까지걸린시간_소수"] = transaction_year - train["거래재개일자_소수"]
train["거래까지걸린시간_ATM_소수"] = transaction_year - train["마지막ATM거래일자_소수"]
train["거래까지걸린시간_영업점_소수"] = (
    transaction_year - train["마지막영업점거래일자_소수"]
)
train["거래당시나이"] = transaction_year - train["Customer_Birthyear"]

In [13]:
# Columns not fed to the generator; they are filled with fixed placeholder
# values after generation.
unmodelled_columns = [
    "Customer_identification_number",
    "Customer_personal_identifier",
    "Account_account_number",
    "Location",
    "IP_Address",
    "MAC_Address",
    "Recipient_Account_Number",
    "Customer_Gender",
]

# Absolute statistics now represented by the "35/34", "37/36", "36/34" ratios.
ratio_source_columns = [
    "Account_one_month_std_dev",
    "Account_dawn_one_month_std_dev",
    "Account_dawn_one_month_max_amount",
]

# Parts of the combined "7_29_30" and "40_41" keys.
combined_key_columns = [
    "Customer_credit_rating",
    "Account_indicator_release_limit_excess",
    "Account_amount_daily_limit",
    "Channel",
    "Operating_System",
]

# Re-derived from Error_Code in the post-processing step.
rederived_columns = ["Transaction_Failure_Status"]

# Raw timestamps, replaced by the elapsed-time features.
raw_datetime_columns = [
    "Account_creation_datetime",
    "Transaction_Datetime",
    "Last_atm_transaction_datetime",
    "Last_bank_branch_transaction_datetime",
    "Transaction_resumed_date",
    "Customer_registration_datetime",
]

# Intermediate fractional-year columns, only needed to build the differences.
intermediate_year_columns = [
    "거래재개일자_소수",
    "거래일자_소수",
    "마지막ATM거래일자_소수",
    "마지막영업점거래일자_소수",
]

drop_columns = (
    unmodelled_columns
    + ratio_source_columns
    + combined_key_columns
    + rederived_columns
    + raw_datetime_columns
    + intermediate_year_columns
)

train.drop(drop_columns, axis="columns", inplace=True)

In [14]:
# Print the remaining columns with their non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24468 entries, 29502 to 8911
Data columns (total 50 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Customer_Birthyear                             24468 non-null  int64  
 1   Customer_flag_change_of_authentication_1       24468 non-null  int64  
 2   Customer_flag_change_of_authentication_2       24468 non-null  int64  
 3   Customer_flag_change_of_authentication_3       24468 non-null  int64  
 4   Customer_flag_change_of_authentication_4       24468 non-null  int64  
 5   Customer_rooting_jailbreak_indicator           24468 non-null  int64  
 6   Customer_mobile_roaming_indicator              24468 non-null  int64  
 7   Customer_VPN_Indicator                         24468 non-null  int64  
 8   Customer_loan_type                             24468 non-null  object 
 9   Customer_flag_terminal_malicious_behavior_1    24468

In [19]:
import numpy as np
from scipy import stats


def handle_outliers(series, n_std=3):
    """Cap values that lie more than ``n_std`` standard deviations from the mean.

    Values whose z-score exceeds ``n_std`` are replaced by
    ``mean + n_std * std``; values whose z-score is below ``-n_std`` are
    replaced by ``mean - n_std * std``. All other values are kept as they are.

    Args:
        series: Numeric ``pd.Series`` to cap.
        n_std: Half-width of the accepted band, in standard deviations.

    Returns:
        A new Series with the capped values; the input is not modified.
    """
    center = series.mean()
    spread = series.std()

    # z-scores are taken once, from the untouched input, and reused for both caps
    z = (series - center) / spread

    upper_cap = center + n_std * spread
    lower_cap = center - n_std * spread

    capped = series.mask(z > n_std, upper_cap)
    return capped.mask(z < -n_std, lower_cap)


# Numeric (seconds) form of the "Time_difference" column.
train["Time_difference_seconds"] = pd.to_timedelta(
    train["Time_difference"]
).dt.total_seconds()

## Outlier handling - time

one_year_seconds = 31536000  # 365 * 24 * 60 * 60
over_one_year = train["Time_difference_seconds"] > one_year_seconds

# Rows with a time difference above one year get the column mean of the
# elapsed-time feature (mean taken before any replacement).
mean_value = train["거래까지걸린시간_소수"].mean()
train.loc[over_one_year, "거래까지걸린시간_소수"] = mean_value

# The same rows get the mean of the seconds column itself.
mean_value = train["Time_difference_seconds"].mean()
train["Time_difference_seconds"] = train["Time_difference_seconds"].mask(
    over_one_year, mean_value
)

# Negative differences are replaced by the smallest non-negative value.
is_non_negative = train["Time_difference_seconds"] >= 0
min_positive_value = train.loc[is_non_negative, "Time_difference_seconds"].min()
train["Time_difference_seconds"] = train["Time_difference_seconds"].mask(
    train["Time_difference_seconds"] < 0, min_positive_value
)

## Outlier handling - amounts
cost_cols = [
    "Account_initial_balance",
    "Account_balance",
    "Account_remaining_amount_daily_limit_exceeded",
    "Account_one_month_max_amount",
    "Transaction_Amount",
]

for cost_col in cost_cols:
    train[cost_col] = handle_outliers(train[cost_col])

# Every Fraud_Type value present (including "m")
fraud_types = train["Fraud_Type"].unique()

# Accumulator for all synthetic rows
all_synthetic_data = pd.DataFrame()

In [20]:
# Columns handled as categorical: each one is cast to str and label-encoded
# before training, and decoded with the same encoder after generation.
# Includes the target ("Fraud_Type") and the combined keys "7_29_30"/"40_41".
all_cat_columns = [
    "Customer_flag_change_of_authentication_1",
    "Customer_flag_change_of_authentication_2",
    "Customer_flag_change_of_authentication_3",
    "Customer_flag_change_of_authentication_4",
    "Customer_rooting_jailbreak_indicator",
    "Customer_mobile_roaming_indicator",
    "Customer_VPN_Indicator",
    "Customer_loan_type",
    "Customer_flag_terminal_malicious_behavior_1",
    "Customer_flag_terminal_malicious_behavior_2",
    "Customer_flag_terminal_malicious_behavior_3",
    "Customer_flag_terminal_malicious_behavior_4",
    "Customer_flag_terminal_malicious_behavior_5",
    "Customer_flag_terminal_malicious_behavior_6",
    "Customer_inquery_atm_limit",
    "Customer_increase_atm_limit",
    "Account_indicator_Openbanking",
    "Account_release_suspention",
    "Another_Person_Account",
    "Unused_terminal_status",
    "Flag_deposit_more_than_tenMillion",
    "Unused_account_status",
    "Recipient_account_suspend_status",
    "First_time_iOS_by_vulnerable_user",
    "Error_Code",
    "Type_General_Automatic",
    "Transaction_num_connection_failure",
    "Number_of_transaction_with_the_account",
    "Transaction_history_with_the_account",
    "Fraud_Type",
    "7_29_30",
    "40_41",
    "Account_account_type",
    "Transaction_hour",
    "Access_Medium",
]

3. Train

In [21]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from ForestDiffusion import ForestDiffusionModel
from tqdm import tqdm

# Rows drawn per fraud type to fit each generator
# (original note: the train/validation ratio is 90:10).
N_SAMPLE = 90
# Synthetic rows generated per fraud type.
N_CLS_PER_GEN = 500
all_synthetic_data = pd.DataFrame()

# One LabelEncoder per categorical column, kept so the synthetic data can be
# decoded with the very same mapping later on.
le_dict = {cat_col: LabelEncoder() for cat_col in all_cat_columns}
for cat_col, encoder in le_dict.items():
    train[cat_col] = encoder.fit_transform(train[cat_col].astype(str))

# Encoded class labels to iterate over.
fraud_types = train["Fraud_Type"].unique()
total_types = len(fraud_types)

# The raw column is superseded by Time_difference_seconds.
train.drop(columns=["Time_difference"], inplace=True)
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 24468 entries, 29502 to 8911
Data columns (total 50 columns):
 #   Column                                         Non-Null Count  Dtype  
---  ------                                         --------------  -----  
 0   Customer_Birthyear                             24468 non-null  int64  
 1   Customer_flag_change_of_authentication_1       24468 non-null  int32  
 2   Customer_flag_change_of_authentication_2       24468 non-null  int32  
 3   Customer_flag_change_of_authentication_3       24468 non-null  int32  
 4   Customer_flag_change_of_authentication_4       24468 non-null  int32  
 5   Customer_rooting_jailbreak_indicator           24468 non-null  int32  
 6   Customer_mobile_roaming_indicator              24468 non-null  int32  
 7   Customer_VPN_Indicator                         24468 non-null  int32  
 8   Customer_loan_type                             24468 non-null  int32  
 9   Customer_flag_terminal_malicious_behavior_1    24468

In [None]:
# For every (label-encoded) fraud type: fit a ForestDiffusion model on a small
# sample of that class, generate N_CLS_PER_GEN synthetic rows, and append them
# to all_synthetic_data.
for i, fraud_type in enumerate(fraud_types, 1):
    original_fraud_type = le_dict["Fraud_Type"].inverse_transform([fraud_type])[0]
    print(f"Processing Fraud Type {original_fraud_type} ({i}/{total_types})")

    class_rows = train[train["Fraud_Type"] == fraud_type]
    # sample() without replacement raises when n exceeds the population, so a
    # class with fewer than N_SAMPLE rows simply contributes all of its rows.
    subset = class_rows.sample(
        n=min(N_SAMPLE, len(class_rows)), random_state=42
    )

    X = subset.drop(columns=["Fraud_Type"])
    y = subset["Fraud_Type"]

    print("Training model...")
    # The binary/categorical/integer index lists are deliberately left empty,
    # so no column is declared as such to the model.
    forest_model = ForestDiffusionModel(
        X.values,
        label_y=y.values,
        n_t=100,
        duplicate_K=100,
        bin_indexes=[],
        cat_indexes=[],
        int_indexes=[],
        diffusion_type="flow",
        n_jobs=-1,
    )

    print(f"Generating {N_CLS_PER_GEN} samples...")
    # NOTE(review): rows are drawn one at a time (batch_size=1); a single
    # batched call would presumably be faster - confirm before changing, since
    # it would alter the random draws.
    synthetic_data = []
    for _ in tqdm(
        range(N_CLS_PER_GEN), desc=f"Generating samples for Type {fraud_type}"
    ):
        synthetic_data.append(forest_model.generate(batch_size=1))
    synthetic_data = np.vstack(synthetic_data)

    # The generated array is labelled with the feature columns followed by the
    # label column.
    synthetic_df = pd.DataFrame(
        synthetic_data, columns=list(X.columns) + ["Fraud_Type"]
    )

    # Cap extreme amounts within this class's synthetic rows.
    for col in cost_cols:
        synthetic_df[col] = handle_outliers(synthetic_df[col])

    # Restore the original timedelta representation and drop the seconds helper.
    synthetic_df["Time_difference"] = pd.to_timedelta(
        synthetic_df["Time_difference_seconds"], unit="s"
    )
    synthetic_df = synthetic_df.drop("Time_difference_seconds", axis=1)

    # Append this class's rows to the running result.
    all_synthetic_data = pd.concat(
        [all_synthetic_data, synthetic_df], ignore_index=True
    )

    print(f"Completed Fraud Type {fraud_type}\n")

# Final result check, reported once after every fraud type has been processed.
print("\nFinal All Synthetic Data Shape:", all_synthetic_data.shape)

In [None]:
# Map the integer codes of every categorical column that survived generation
# back to the original labels, using the encoder fitted on the training data.
present_cat_columns = [
    name for name in all_cat_columns if name in all_synthetic_data.columns
]
for name in present_cat_columns:
    codes = all_synthetic_data[name].astype(int)
    all_synthetic_data[name] = le_dict[name].inverse_transform(codes)

5. 데이터 후처리

In [146]:
# Rebuild the absolute statistics from the generated ratio features.
# Order matters: the dawn std dev depends on the dawn max rebuilt just before.
synthetic_month_max = all_synthetic_data["Account_one_month_max_amount"]

one_month_std = all_synthetic_data["35/34"] * synthetic_month_max
all_synthetic_data["Account_one_month_std_dev"] = one_month_std.astype(int)

dawn_month_max = all_synthetic_data["36/34"] * synthetic_month_max
all_synthetic_data["Account_dawn_one_month_max_amount"] = dawn_month_max.astype(int)

dawn_month_std = (
    all_synthetic_data["37/36"]
    * all_synthetic_data["Account_dawn_one_month_max_amount"]
)
all_synthetic_data["Account_dawn_one_month_std_dev"] = dawn_month_std.astype(int)

# The ratios themselves are not part of the original schema.
ratio_columns = ["35/34", "36/34", "37/36"]
all_synthetic_data.drop(columns=ratio_columns, inplace=True)

In [148]:
# Split the combined keys back into their original component columns.
channel_and_os = all_synthetic_data["40_41"].str.split("_", expand=True)
all_synthetic_data[["Channel", "Operating_System"]] = channel_and_os

rating_flag_limit = all_synthetic_data["7_29_30"].str.split("_", expand=True)
all_synthetic_data[
    [
        "Customer_credit_rating",
        "Account_indicator_release_limit_excess",
        "Account_amount_daily_limit",
    ]
] = rating_flag_limit

# The split produces strings; these two components are integers.
for int_component in (
    "Account_indicator_release_limit_excess",
    "Account_amount_daily_limit",
):
    all_synthetic_data[int_component] = all_synthetic_data[int_component].astype(int)

# Failure status is 0 for error code "a" and 1 for anything else.
all_synthetic_data["Transaction_Failure_Status"] = (
    all_synthetic_data["Error_Code"].ne("a").astype("int64")
)

all_synthetic_data.drop(columns=["7_29_30", "40_41"], inplace=True)

In [149]:
# Schema check: first the columns only the synthetic table has, then a blank
# line, then the original columns the synthetic table still lacks.
synthetic_only = [
    name for name in all_synthetic_data.columns if name not in train_all.columns
]
original_only = [
    name for name in train_all.columns if name not in all_synthetic_data.columns
]

for name in synthetic_only:
    print(name)
print("")
for name in original_only:
    print(name)

Transaction_hour
거래까지걸린시간_소수
거래까지걸린시간_ATM_소수
거래까지걸린시간_영업점_소수
거래당시나이
등록부터 계좌개설까지 걸린시간_소수
등록당시나이

ID
Customer_Gender
Customer_personal_identifier
Customer_identification_number
Customer_registration_datetime
Account_account_number
Account_creation_datetime
Transaction_Datetime
IP_Address
MAC_Address
Location
Recipient_Account_Number
Last_atm_transaction_datetime
Last_bank_branch_transaction_datetime
Transaction_resumed_date


In [150]:
import math


def convert_from_float_year(float_year):
    """Turn a fractional calendar year back into a timestamp.

    Inverse of the fractional-year encoding: the integer part selects the
    year and the fractional part is the share of that year's seconds that
    has elapsed (e.g. 2020.5 -> 2020-07-02 00:00:00). The result is rounded
    to whole seconds.

    Args:
        float_year: Year as a float, e.g. ``2021.5``. Missing values (NaN /
            ``None``) are accepted.

    Returns:
        The corresponding ``pd.Timestamp``, or ``pd.NaT`` when the input is
        missing.
    """
    # math.floor cannot handle NaN; a missing year maps to a missing timestamp.
    if pd.isna(float_year):
        return pd.NaT

    year = int(math.floor(float_year))
    fraction_of_year = float_year - year

    year_start = pd.Timestamp(year=year, month=1, day=1)
    year_end = pd.Timestamp(year=year + 1, month=1, day=1)
    seconds_in_year = (year_end - year_start).total_seconds()

    # Whole seconds only: sub-second precision is not meaningful here.
    elapsed_seconds = round(fraction_of_year * seconds_in_year)
    return year_start + pd.to_timedelta(elapsed_seconds, unit="s")


# Undo the elapsed-time features: the transaction's fractional year is birth
# year plus age at transaction; every other date is that value minus the
# corresponding elapsed time.
transaction_year = (
    all_synthetic_data["Customer_Birthyear"] + all_synthetic_data["거래당시나이"]
)
all_synthetic_data["거래일자_소수"] = transaction_year

# absolute fractional-year column <- elapsed-time column it is rebuilt from
elapsed_sources = {
    "거래재개일자_소수": "거래까지걸린시간_소수",
    "마지막ATM거래일자_소수": "거래까지걸린시간_ATM_소수",
    "마지막영업점거래일자_소수": "거래까지걸린시간_영업점_소수",
}
for absolute_column, elapsed_column in elapsed_sources.items():
    all_synthetic_data[absolute_column] = (
        transaction_year - all_synthetic_data[elapsed_column]
    )

# original datetime column <- fractional-year column it is converted from
fractional_sources = {
    "Transaction_Datetime": "거래일자_소수",
    "Transaction_resumed_date": "거래재개일자_소수",
    "Last_atm_transaction_datetime": "마지막ATM거래일자_소수",
    "Last_bank_branch_transaction_datetime": "마지막영업점거래일자_소수",
}
for datetime_column, fractional_column in fractional_sources.items():
    all_synthetic_data[datetime_column] = all_synthetic_data[fractional_column].apply(
        convert_from_float_year
    )

# Helper columns are no longer needed once the datetimes exist.
helper_columns = [
    "거래일자_소수",
    "거래재개일자_소수",
    "마지막ATM거래일자_소수",
    "마지막영업점거래일자_소수",
    "거래까지걸린시간_소수",
    "거래까지걸린시간_ATM_소수",
    "거래까지걸린시간_영업점_소수",
    "거래당시나이",
]
all_synthetic_data.drop(columns=helper_columns, inplace=True)

In [151]:
# Columns that were never modelled get one constant placeholder value each,
# so the synthetic table has every column of the original schema.
placeholder_values = {
    "Customer_personal_identifier": "홍길동",
    "Customer_identification_number": "aaaaaa-aaaaaaa",
    "Account_account_number": "aaaaaaaaaa",
    "Customer_Gender": "male",
    "IP_Address": "38.117.123.196",
    "MAC_Address": "5e:76:37:86:60:c3",
    "Location": "서울특별시 강서구 가양동 37.568238 126.845059",
    "Recipient_Account_Number": "aaaaaaaaaa",
    "Customer_registration_datetime": "1999-12-31 23:59:59",
    "Account_creation_datetime": "1999-12-31 23:59:59",
}
for placeholder_column, constant in placeholder_values.items():
    all_synthetic_data[placeholder_column] = constant

In [152]:
# Preview the first rows of the assembled synthetic table.
all_synthetic_data.head()

Unnamed: 0,Customer_Birthyear,Customer_flag_change_of_authentication_1,Customer_flag_change_of_authentication_2,Customer_flag_change_of_authentication_3,Customer_flag_change_of_authentication_4,Customer_rooting_jailbreak_indicator,Customer_mobile_roaming_indicator,Customer_VPN_Indicator,Customer_loan_type,Customer_flag_terminal_malicious_behavior_1,...,Customer_personal_identifier,Customer_identification_number,Account_account_number,Customer_Gender,IP_Address,MAC_Address,Location,Recipient_Account_Number,Customer_registration_datetime,Account_creation_datetime
0,1974.199202,1,0,1,0,0,0,0,b,0,...,홍길동,aaaaaa-aaaaaaa,aaaaaaaaaa,male,38.117.123.196,5e:76:37:86:60:c3,서울특별시 강서구 가양동 37.568238 126.845059,aaaaaaaaaa,1999-12-31 23:59:59,1999-12-31 23:59:59
1,1987.205354,0,1,0,0,0,0,0,b,0,...,홍길동,aaaaaa-aaaaaaa,aaaaaaaaaa,male,38.117.123.196,5e:76:37:86:60:c3,서울특별시 강서구 가양동 37.568238 126.845059,aaaaaaaaaa,1999-12-31 23:59:59,1999-12-31 23:59:59
2,1976.145828,0,0,0,1,0,0,1,b,0,...,홍길동,aaaaaa-aaaaaaa,aaaaaaaaaa,male,38.117.123.196,5e:76:37:86:60:c3,서울특별시 강서구 가양동 37.568238 126.845059,aaaaaaaaaa,1999-12-31 23:59:59,1999-12-31 23:59:59
3,1990.037501,0,0,1,1,0,0,0,a,0,...,홍길동,aaaaaa-aaaaaaa,aaaaaaaaaa,male,38.117.123.196,5e:76:37:86:60:c3,서울특별시 강서구 가양동 37.568238 126.845059,aaaaaaaaaa,1999-12-31 23:59:59,1999-12-31 23:59:59
4,1996.618474,0,1,0,1,0,0,0,a,0,...,홍길동,aaaaaa-aaaaaaa,aaaaaaaaaa,male,38.117.123.196,5e:76:37:86:60:c3,서울특별시 강서구 가양동 37.568238 126.845059,aaaaaaaaaa,1999-12-31 23:59:59,1999-12-31 23:59:59


In [155]:
# Keep the derived hour aside: reindexing to the original schema drops it.
hour = all_synthetic_data["Transaction_hour"]
# Restrict and reorder the columns to match train_all's column order.
all_synthetic_data = all_synthetic_data.reindex(columns=train_all.columns)

import pandas as pd
import numpy as np

# Reference frame for target dtypes and column order. The training data is
# already in memory as train_all, so reuse it instead of re-reading the CSV
# from a machine-specific absolute path.
train = train_all.copy()

# Numeric features: round and cast to the dtype used in the original data
numeric_columns = [
    "Customer_Birthyear",
    "Account_initial_balance",
    "Account_balance",
    "Account_remaining_amount_daily_limit_exceeded",
    "Account_one_month_max_amount",
    "Transaction_Amount",
]

for col in numeric_columns:
    if col in all_synthetic_data.columns and col in train.columns:
        all_synthetic_data[col] = (
            all_synthetic_data[col].round().astype(train[col].dtype)
        )

# Widen int32 columns to int64
int32_to_int64_columns = [
    "Account_indicator_release_limit_excess",
    "Account_amount_daily_limit",
    "Account_one_month_std_dev",
    "Account_dawn_one_month_max_amount",
    "Account_dawn_one_month_std_dev",
]

for col in int32_to_int64_columns:
    if col in all_synthetic_data.columns and col in train.columns:
        all_synthetic_data[col] = all_synthetic_data[col].astype("int64")

# Normalise date/time columns to "YYYY-MM-DD HH:MM:SS" strings
datetime_columns = [
    "Customer_registration_datetime",
    "Account_creation_datetime",
    "Transaction_Datetime",
    "Last_atm_transaction_datetime",
    "Last_bank_branch_transaction_datetime",
    "Transaction_resumed_date",
]

for col in datetime_columns:
    if col in all_synthetic_data.columns and col in train.columns:
        all_synthetic_data[col] = pd.to_datetime(all_synthetic_data[col]).dt.strftime(
            "%Y-%m-%d %H:%M:%S"
        )

# Convert object columns to int64
binary_columns = [
    "Customer_flag_change_of_authentication_1",
    "Customer_flag_change_of_authentication_2",
    "Customer_flag_change_of_authentication_3",
    "Customer_flag_change_of_authentication_4",
    "Customer_rooting_jailbreak_indicator",
    "Customer_mobile_roaming_indicator",
    "Customer_VPN_Indicator",
    "Customer_flag_terminal_malicious_behavior_1",
    "Customer_flag_terminal_malicious_behavior_2",
    "Customer_flag_terminal_malicious_behavior_3",
    "Customer_flag_terminal_malicious_behavior_4",
    "Customer_flag_terminal_malicious_behavior_5",
    "Customer_flag_terminal_malicious_behavior_6",
    "Customer_inquery_atm_limit",
    "Customer_increase_atm_limit",
    "Account_indicator_Openbanking",
    "Account_release_suspention",
    "Another_Person_Account",
    "Unused_terminal_status",
    "Flag_deposit_more_than_tenMillion",
    "Unused_account_status",
    "Recipient_account_suspend_status",
    "First_time_iOS_by_vulnerable_user",
]

for col in binary_columns:
    if col in all_synthetic_data.columns and col in train.columns:
        # Label decoding yields strings; coerce to numbers before rounding so
        # the cast works for both string and numeric inputs.
        all_synthetic_data[col] = (
            pd.to_numeric(all_synthetic_data[col]).round().astype("int64")
        )

non_negative_int_columns = [
    "Transaction_num_connection_failure",
    "Number_of_transaction_with_the_account",
    "Transaction_history_with_the_account",
]

for col in non_negative_int_columns:
    if col in all_synthetic_data.columns and col in train.columns:
        # Same coercion as above: comparing decoded strings with 0 would fail.
        numeric_values = pd.to_numeric(all_synthetic_data[col]).round()
        all_synthetic_data[col] = np.maximum(numeric_values, 0).astype("int64")

# Store Time_difference as text
if (
    "Time_difference" in all_synthetic_data.columns
    and "Time_difference" in train.columns
):
    all_synthetic_data["Time_difference"] = all_synthetic_data[
        "Time_difference"
    ].astype(str)

# Align the column order with the reference frame
all_synthetic_data = all_synthetic_data.reindex(columns=train.columns)

# Save the standardised data
all_synthetic_data.to_csv(
    "./clf_concat_forestdiffusion.csv", encoding="UTF-8-sig", index=False
)

print("Data types have been standardized and saved.")
print(all_synthetic_data.dtypes)
print(all_synthetic_data.shape)


# For the second output: remove the (empty) ID column and put the derived
# transaction hour back as the first column.
all_synthetic_data.drop(columns="ID", inplace=True)
all_synthetic_data = pd.concat((hour, all_synthetic_data), axis=1)
print(all_synthetic_data.shape)

In [None]:
# Write the final table (with the leading Transaction_hour column) to disk.
# UTF-8 with BOM, no index column.
submission_path = "/data/ForestDiffusion_clf_submission.csv"
all_synthetic_data.to_csv(submission_path, index=False, encoding="UTF-8-sig")