0. 필요한 모듈 로드

In [4]:
# 제출 파일 생성 관련
import os
import zipfile

# 데이터 처리 및 분석
import torch
import pandas as pd
import numpy as np
from scipy import stats
from tqdm import tqdm
from gretel_synthetics.actgan import ACTGAN

# To ignore all warnings
import warnings
warnings.filterwarnings('ignore')

1. 데이터 로드 / train, val 나누기

In [5]:
# Load the full training table. `train_all` is not modified afterwards; later cells
# reuse its column list to compare against and to order the synthetic output.
train_all = pd.read_csv("/data/train.csv")

In [6]:
# Working table without the "ID" column; `train_all` itself keeps the column.
train = train_all.drop(columns="ID")

In [7]:
from sklearn.model_selection import train_test_split

# Separate the "Fraud_Type" label from the remaining columns, then hold out 10% of
# the rows for validation using a label-stratified split with a fixed seed.
all_features = train.drop(columns="Fraud_Type")
all_labels = train["Fraud_Type"]

X_train, X_val, y_train, y_val = train_test_split(
    all_features,
    all_labels,
    test_size=0.1,
    random_state=42,
    stratify=all_labels,
)

In [8]:
# Put the label column back next to the features so each split is a full table again.
train = pd.concat([X_train, y_train], axis="columns")
val = pd.concat([X_val, y_val], axis="columns")

2. 데이터 전처리

In [9]:
# For every customer, flag whether all of their training rows carry the label 'm'.
only_m_customers = (
    train.groupby('Customer_identification_number')['Fraud_Type']
    .apply(lambda labels: set(labels) == {'m'})
    .reset_index()
)

# Keep only the identification numbers of customers whose labels are exclusively 'm'.
only_m_customers = only_m_customers.loc[
    only_m_customers['Fraud_Type'].eq(True), 'Customer_identification_number'
]

# Remove every row belonging to those customers from train.
# The explicit .copy() makes `train` an independent frame: the following cells add
# many columns to it, and assigning into a boolean-filtered slice is the pattern
# that triggers pandas' SettingWithCopyWarning (silenced in this notebook).
train = train[~train['Customer_identification_number'].isin(only_m_customers)].copy()

In [10]:
# Distinct combinations of the customer/account identity fields (duplicates removed).
hdb_columns = [
    'Customer_identification_number',
    'Customer_Birthyear',
    'Customer_Gender',
    'Customer_personal_identifier',
    'Customer_registration_datetime',
    'Account_account_number',
    'Account_creation_datetime',
]
hdb = train[hdb_columns].drop_duplicates()

In [11]:
# Fuse credit rating, limit-release flag and daily limit into one '_'-joined
# categorical key. (The original cell repeated this identical statement twice.)
train['7_29_30'] = (
    train['Customer_credit_rating']
    + '_' + train['Account_indicator_release_limit_excess'].astype(str)
    + '_' + train['Account_amount_daily_limit'].astype(str)
)

In [12]:
# Fuse channel and operating system into one '_'-joined categorical key;
# the post-processing section splits it back on '_' after generation.
train['40_41'] = train['Channel'] + '_' + train['Operating_System']

In [13]:
def _safe_ratio(numerator, denominator):
    """Element-wise ``numerator / denominator`` that yields 0 where the denominator is 0.

    Vectorised replacement for a row-wise ``DataFrame.apply``: rows whose
    denominator equals 0 get 0; all other rows (including missing denominators,
    which compare unequal to 0) get the plain division result.
    """
    return numerator.div(denominator).where(denominator != 0, 0)


# Ratios that replace the raw amount columns during training; the raw columns are
# rebuilt from these ratios in the post-processing section.
train['35/34'] = _safe_ratio(train['Account_one_month_std_dev'], train['Account_one_month_max_amount'])
train['37/36'] = _safe_ratio(train['Account_dawn_one_month_std_dev'], train['Account_dawn_one_month_max_amount'])
train['36/34'] = _safe_ratio(train['Account_dawn_one_month_max_amount'], train['Account_one_month_max_amount'])

In [14]:
def convert_to_float_year(dt):
    """Convert a timestamp to a fractional year.

    The fractional part is the share of the calendar year that has elapsed at
    ``dt``, measured in seconds, so leap years are handled by construction
    (e.g. 2021-07-02 12:00 -> 2021.5).

    Parameters
    ----------
    dt : pandas.Timestamp or NaT
        Point in time to convert.

    Returns
    -------
    float
        ``year + elapsed_fraction``; NaN when ``dt`` is missing (previously a
        missing value raised while building ``pd.Timestamp``).
    """
    if pd.isna(dt):
        return float('nan')

    year = dt.year
    start_of_year = pd.Timestamp(year=year, month=1, day=1)
    next_year = pd.Timestamp(year=year + 1, month=1, day=1)
    year_elapsed = (dt - start_of_year).total_seconds()
    year_duration = (next_year - start_of_year).total_seconds()
    return year + year_elapsed / year_duration

# Parse every timestamp column into pandas datetimes.
datetime_columns = [
    'Account_creation_datetime',
    'Transaction_Datetime',
    'Last_atm_transaction_datetime',
    'Last_bank_branch_transaction_datetime',
    'Transaction_resumed_date',
    'Customer_registration_datetime',
]
for column in datetime_columns:
    train[column] = pd.to_datetime(train[column])

# Hour of day at which the transaction took place.
train['Transaction_hour'] = train['Transaction_Datetime'].apply(lambda ts: ts.hour)

# Fractional-year version of each event timestamp (new column -> source column).
fractional_year_sources = {
    '거래재개일자_소수': 'Transaction_resumed_date',
    '거래일자_소수': 'Transaction_Datetime',
    '마지막ATM거래일자_소수': 'Last_atm_transaction_datetime',
    '마지막영업점거래일자_소수': 'Last_bank_branch_transaction_datetime',
}
for year_column, source_column in fractional_year_sources.items():
    train[year_column] = train[source_column].apply(convert_to_float_year)

# Years elapsed between each earlier event and the transaction, plus the
# customer's age (in years) at transaction time.
transaction_year = train['거래일자_소수']
train['거래까지걸린시간_소수'] = transaction_year - train['거래재개일자_소수']
train['거래까지걸린시간_ATM_소수'] = transaction_year - train['마지막ATM거래일자_소수']
train['거래까지걸린시간_영업점_소수'] = transaction_year - train['마지막영업점거래일자_소수']
train['거래당시나이'] = transaction_year - train['Customer_Birthyear']

In [15]:
# Columns removed before modelling, grouped by the reason they are no longer needed.
drop_columns = [
    # Identity-like fields; the post-processing section fills them with fixed placeholders.
    'Customer_identification_number', 'Customer_personal_identifier', 'Account_account_number',
    'Location', 'IP_Address', 'MAC_Address', 'Recipient_Account_Number', 'Customer_Gender',
    # Amounts now represented by the '35/34', '37/36' and '36/34' ratio features.
    'Account_one_month_std_dev', 'Account_dawn_one_month_std_dev', 'Account_dawn_one_month_max_amount',
    # Parts of the combined '7_29_30' key.
    'Customer_credit_rating', 'Account_indicator_release_limit_excess', 'Account_amount_daily_limit',
    # Parts of the combined '40_41' key.
    'Channel', 'Operating_System',
    # Re-derived from 'Error_Code' after generation.
    'Transaction_Failure_Status',
    # Raw timestamps, replaced by the elapsed-time features.
    'Account_creation_datetime', 'Transaction_Datetime', 'Last_atm_transaction_datetime',
    'Last_bank_branch_transaction_datetime', 'Transaction_resumed_date', 'Customer_registration_datetime',
    # Intermediate fractional-year columns.
    '거래재개일자_소수', '거래일자_소수', '마지막ATM거래일자_소수', '마지막영업점거래일자_소수',
]

train = train.drop(columns=drop_columns)

3. Train

In [17]:
def handle_outliers(series, n_std=3):
    """Cap values that lie more than ``n_std`` standard deviations from the mean.

    Values whose z-score exceeds ``n_std`` are replaced by ``mean + n_std * std``;
    values whose z-score is below ``-n_std`` are replaced by ``mean - n_std * std``.
    Mean and standard deviation are taken from ``series`` itself. A new Series is
    returned.
    """
    center = series.mean()
    spread = series.std()
    upper_cap = center + n_std * spread
    lower_cap = center - n_std * spread

    # z-scores are computed once, from the untouched input values.
    z = (series - center) / spread

    capped_high = series.mask(z > n_std, upper_cap)
    return capped_high.mask(z < -n_std, lower_cap)


# Express the raw 'Time_difference' values as a number of seconds.
train['Time_difference_seconds'] = pd.to_timedelta(train['Time_difference']).dt.total_seconds()

# 31536000 seconds == 365 days; longer gaps are replaced by a column mean below.
one_year_seconds = 31536000
over_one_year = train['Time_difference_seconds'] > one_year_seconds

# For those rows, overwrite the elapsed-years feature with that column's mean.
mean_value = train['거래까지걸린시간_소수'].mean()
train.loc[over_one_year, '거래까지걸린시간_소수'] = mean_value

# Same replacement for the seconds column itself, using its own mean.
mean_value = train['Time_difference_seconds'].mean()
train['Time_difference_seconds'] = train['Time_difference_seconds'].mask(over_one_year, mean_value)

# Negative gaps are replaced by the smallest non-negative gap in the column.
seconds = train['Time_difference_seconds']
min_positive_value = seconds[seconds >= 0].min()
train['Time_difference_seconds'] = seconds.mask(seconds < 0, min_positive_value)



# Columns handed to the generator as categorical values (cast to str).
category_columns = [
    'Customer_flag_change_of_authentication_1',
    'Customer_flag_change_of_authentication_2',
    'Customer_flag_change_of_authentication_3',
    'Customer_flag_change_of_authentication_4',
    'Customer_rooting_jailbreak_indicator',
    'Customer_mobile_roaming_indicator',
    'Customer_VPN_Indicator',
    'Customer_loan_type',
    'Customer_flag_terminal_malicious_behavior_1',
    'Customer_flag_terminal_malicious_behavior_2',
    'Customer_flag_terminal_malicious_behavior_3',
    'Customer_flag_terminal_malicious_behavior_4',
    'Customer_flag_terminal_malicious_behavior_5',
    'Customer_flag_terminal_malicious_behavior_6',
    'Customer_inquery_atm_limit',
    'Customer_increase_atm_limit',
    'Account_indicator_Openbanking',
    'Account_release_suspention',
    'Error_Code',
    'Type_General_Automatic',
    'Access_Medium',
    'Transaction_num_connection_failure',
    'Another_Person_Account',
    'Unused_terminal_status',
    'Flag_deposit_more_than_tenMillion',
    'Unused_account_status',
    'Recipient_account_suspend_status',
    'Number_of_transaction_with_the_account',
    'Transaction_history_with_the_account',
    'First_time_iOS_by_vulnerable_user',
    'Fraud_Type',
    '7_29_30',
    '40_41',
]

# Numeric columns cast to int64.
int_columns = [
    'Account_initial_balance',
    'Account_balance',
    'Account_remaining_amount_daily_limit_exceeded',
    'Account_one_month_max_amount',
    'Transaction_Amount',
    'Distance',
    'Time_difference_seconds',
]

# Numeric columns cast to float64.
float_columns = [
    '35/34',
    '37/36',
    '36/34',
    '거래까지걸린시간_소수',
    '거래까지걸린시간_ATM_소수',
    '거래까지걸린시간_영업점_소수',
]

# Apply all three groups of casts in a single pass.
column_dtypes = {
    **dict.fromkeys(category_columns, str),
    **dict.fromkeys(int_columns, 'int64'),
    **dict.fromkeys(float_columns, 'float64'),
}
train = train.astype(column_dtypes)

## Outlier handling - monetary amounts
cost_cols = [
    'Account_initial_balance',
    'Account_balance',
    'Account_remaining_amount_daily_limit_exceeded',
    'Account_one_month_max_amount',
    'Transaction_Amount',
]

# Replace each amount column with its capped version (see handle_outliers).
train = train.assign(**{col: handle_outliers(train[col]) for col in cost_cols})

# Every label present in the training data (including 'm').
fraud_types = train['Fraud_Type'].unique()

# Tunable settings for the per-class generators.
N_SAMPLE = 90            # training rows sampled per fraud type
EPOCH = 100              # training epochs per generator
N_SYNTHETIC_ROWS = 500   # synthetic rows drawn from each fitted generator
SEED = 42

# Best-effort reproducibility for the stochastic training below.
# NOTE(review): assumes ACTGAN draws from the global numpy/torch RNGs; GPU kernels
# may still be non-deterministic - confirm if exact reproducibility is required.
np.random.seed(SEED)
torch.manual_seed(SEED)

# Fitted generator for each fraud type.
models_dict = {}

# Per-class synthetic frames are collected here and concatenated once at the end
# instead of re-copying a growing DataFrame on every iteration.
synthetic_frames = []
total_rows = 0

for fraud_type in tqdm(fraud_types):
    # The 'm' class is not synthesised.
    if fraud_type == 'm':
        continue

    print(f"Processing Fraud_Type: {fraud_type}")

    # Rows of the current fraud type only.
    subset = train[train["Fraud_Type"] == fraud_type]

    # Sample at most N_SAMPLE rows; a class with fewer rows is used in full
    # (sampling more rows than exist without replacement would raise).
    subset = subset.sample(n=min(N_SAMPLE, len(subset)), random_state=42)

    # Only the seconds version of the time difference is modelled.
    subset = subset.drop('Time_difference', axis=1)

    synthesizer = ACTGAN(
        epochs=EPOCH,
        cuda=True,
    )
    synthesizer.fit(subset)

    # Keep the fitted model for later inspection or re-sampling.
    models_dict[fraud_type] = synthesizer

    # Generate synthetic rows from the fitted model.
    synthetic_subset = synthesizer.sample(num_rows=N_SYNTHETIC_ROWS)

    # Cap extreme generated amounts the same way as the training amounts.
    for col in cost_cols:
        synthetic_subset[col] = handle_outliers(synthetic_subset[col])

    # Convert the seconds back into a timedelta column and drop the helper column.
    synthetic_subset['Time_difference'] = pd.to_timedelta(synthetic_subset['Time_difference_seconds'], unit='s')
    synthetic_subset = synthetic_subset.drop('Time_difference_seconds', axis=1)

    synthetic_frames.append(synthetic_subset)
    total_rows += len(synthetic_subset)

    # Running size of the combined synthetic data.
    print("\nCurrent All Synthetic Data Shape:", (total_rows, synthetic_subset.shape[1]))

# Combined synthetic data for all processed fraud types (empty if none were processed).
all_synthetic_data = (
    pd.concat(synthetic_frames, ignore_index=True) if synthetic_frames else pd.DataFrame()
)

print("\nFinal All Synthetic Data Shape:", all_synthetic_data.shape)

  0%|          | 0/13 [00:00<?, ?it/s]

Processing Fraud_Type: h


 15%|█▌        | 2/13 [00:08<00:44,  4.08s/it]


Current All Synthetic Data Shape: (500, 50)
Processing Fraud_Type: d


 23%|██▎       | 3/13 [00:16<00:59,  5.98s/it]


Current All Synthetic Data Shape: (1000, 50)
Processing Fraud_Type: f


 31%|███       | 4/13 [00:25<01:03,  7.02s/it]


Current All Synthetic Data Shape: (1500, 50)
Processing Fraud_Type: j


 38%|███▊      | 5/13 [00:34<01:01,  7.67s/it]


Current All Synthetic Data Shape: (2000, 50)
Processing Fraud_Type: i


 46%|████▌     | 6/13 [00:46<01:04,  9.17s/it]


Current All Synthetic Data Shape: (2500, 50)
Processing Fraud_Type: b


 54%|█████▍    | 7/13 [00:55<00:54,  9.14s/it]


Current All Synthetic Data Shape: (3000, 50)
Processing Fraud_Type: g


 62%|██████▏   | 8/13 [01:03<00:44,  8.85s/it]


Current All Synthetic Data Shape: (3500, 50)
Processing Fraud_Type: l


 69%|██████▉   | 9/13 [01:13<00:35,  8.93s/it]


Current All Synthetic Data Shape: (4000, 50)
Processing Fraud_Type: k


 77%|███████▋  | 10/13 [01:22<00:26,  8.99s/it]


Current All Synthetic Data Shape: (4500, 50)
Processing Fraud_Type: c


 85%|████████▍ | 11/13 [01:29<00:17,  8.57s/it]


Current All Synthetic Data Shape: (5000, 50)
Processing Fraud_Type: a


 92%|█████████▏| 12/13 [01:38<00:08,  8.71s/it]


Current All Synthetic Data Shape: (5500, 50)
Processing Fraud_Type: e


100%|██████████| 13/13 [01:47<00:00,  8.25s/it]


Current All Synthetic Data Shape: (6000, 50)

Final All Synthetic Data Shape: (6000, 50)





In [18]:
# Sanity check: number of synthetic rows per fraud type.
all_synthetic_data["Fraud_Type"].value_counts()

Unnamed: 0,Fraud_Type
h,500
d,500
f,500
j,500
i,500
b,500
g,500
l,500
k,500
c,500


4. 데이터 후처리

In [19]:
# Rebuild the raw amount columns from the generated ratio features.
month_max = all_synthetic_data['Account_one_month_max_amount']

all_synthetic_data['Account_one_month_std_dev'] = (all_synthetic_data['35/34'] * month_max).astype(int)

dawn_max = (all_synthetic_data['36/34'] * month_max).astype(int)
all_synthetic_data['Account_dawn_one_month_max_amount'] = dawn_max

# The dawn std-dev is scaled by the already integer-truncated dawn maximum.
all_synthetic_data['Account_dawn_one_month_std_dev'] = (all_synthetic_data['37/36'] * dawn_max).astype(int)

# The ratio features are no longer needed once the raw columns exist.
all_synthetic_data = all_synthetic_data.drop(columns=['35/34', '36/34', '37/36'])

In [20]:
# Split the '_'-joined combined keys back into their original columns.
channel_os_parts = all_synthetic_data['40_41'].str.split('_', expand=True)
all_synthetic_data[['Channel', 'Operating_System']] = channel_os_parts

rating_limit_parts = all_synthetic_data['7_29_30'].str.split('_', expand=True)
all_synthetic_data[['Customer_credit_rating', 'Account_indicator_release_limit_excess', 'Account_amount_daily_limit']] = rating_limit_parts

# The two limit-related parts come back as strings; restore them to integers.
for limit_column in ('Account_indicator_release_limit_excess', 'Account_amount_daily_limit'):
    all_synthetic_data[limit_column] = all_synthetic_data[limit_column].astype(int)

# Failure status is 0 when the error code is 'a' and 1 for any other code.
all_synthetic_data['Transaction_Failure_Status'] = all_synthetic_data['Error_Code'].map(
    lambda code: int(code != 'a')
)

all_synthetic_data = all_synthetic_data.drop(columns=['7_29_30', '40_41'])

In [21]:
# Columns present only in the synthetic data, then (after a blank line) columns
# present only in the original training table.
synthetic_only = [col for col in all_synthetic_data.columns if col not in train_all.columns]
original_only = [col for col in train_all.columns if col not in all_synthetic_data.columns]

for col in synthetic_only:
    print(col)
print("")
for col in original_only:
    print(col)

Transaction_hour
거래까지걸린시간_소수
거래까지걸린시간_ATM_소수
거래까지걸린시간_영업점_소수
거래당시나이

ID
Customer_Gender
Customer_personal_identifier
Customer_identification_number
Customer_registration_datetime
Account_account_number
Account_creation_datetime
Transaction_Datetime
IP_Address
MAC_Address
Location
Recipient_Account_Number
Last_atm_transaction_datetime
Last_bank_branch_transaction_datetime
Transaction_resumed_date


In [22]:
import math

def convert_from_float_year(float_year):
    """Convert a fractional year back to a timestamp.

    The integer part selects the calendar year; the fractional part is
    interpreted as the share of that year's length in seconds (so leap years are
    handled) and rounded to the nearest whole second.

    Parameters
    ----------
    float_year : float
        Fractional year, e.g. 2021.5.

    Returns
    -------
    pandas.Timestamp or NaT
        The reconstructed point in time; ``NaT`` when ``float_year`` is missing
        (previously a NaN input raised inside ``int(math.floor(...))``).
    """
    if pd.isna(float_year):
        return pd.NaT

    year = int(math.floor(float_year))
    fractional_part = float_year - year
    start_of_year = pd.Timestamp(year=year, month=1, day=1)
    next_year = pd.Timestamp(year=year + 1, month=1, day=1)
    year_duration = (next_year - start_of_year).total_seconds()
    elapsed_seconds = round(fractional_part * year_duration)  # round to whole seconds
    return start_of_year + pd.to_timedelta(elapsed_seconds, unit='s')

# Transaction time as a fractional year: birth year plus age at transaction time.
transaction_year = all_synthetic_data['Customer_Birthyear'] + all_synthetic_data['거래당시나이']
all_synthetic_data['Transaction_Datetime'] = transaction_year.apply(convert_from_float_year)

# Each earlier event lies "elapsed years" before the transaction
# (timestamp column -> elapsed-years feature it is rebuilt from).
elapsed_feature_by_datetime = {
    'Transaction_resumed_date': '거래까지걸린시간_소수',
    'Last_atm_transaction_datetime': '거래까지걸린시간_ATM_소수',
    'Last_bank_branch_transaction_datetime': '거래까지걸린시간_영업점_소수',
}
for datetime_column, elapsed_column in elapsed_feature_by_datetime.items():
    event_year = transaction_year - all_synthetic_data[elapsed_column]
    all_synthetic_data[datetime_column] = event_year.apply(convert_from_float_year)

# Remove the engineered helper features now that the timestamps are restored.
all_synthetic_data = all_synthetic_data.drop(
    columns=[*elapsed_feature_by_datetime.values(), '거래당시나이']
)

In [23]:
# Columns that were not modelled receive one fixed placeholder value on every row.
placeholder_values = {
    'Customer_personal_identifier': '홍길동',
    'Customer_identification_number': 'aaaaaa-aaaaaaa',
    'Account_account_number': 'aaaaaaaaaa',
    'Customer_Gender': 'male',
    'IP_Address': '38.117.123.196',
    'MAC_Address': '5e:76:37:86:60:c3',
    'Location': '서울특별시 강서구 가양동 37.568238 126.845059',
    'Recipient_Account_Number': 'aaaaaaaaaa',
    'Customer_registration_datetime': '1999-12-31 23:59:59',
    'Account_creation_datetime': '1999-12-31 23:59:59',
}
all_synthetic_data = all_synthetic_data.assign(**placeholder_values)

In [24]:
all_synthetic_data.head()

Unnamed: 0,Customer_Birthyear,Customer_flag_change_of_authentication_1,Customer_flag_change_of_authentication_2,Customer_flag_change_of_authentication_3,Customer_flag_change_of_authentication_4,Customer_rooting_jailbreak_indicator,Customer_mobile_roaming_indicator,Customer_VPN_Indicator,Customer_loan_type,Customer_flag_terminal_malicious_behavior_1,...,Customer_personal_identifier,Customer_identification_number,Account_account_number,Customer_Gender,IP_Address,MAC_Address,Location,Recipient_Account_Number,Customer_registration_datetime,Account_creation_datetime
0,1999,1,1,1,0,0,0,0,b,0,...,홍길동,aaaaaa-aaaaaaa,aaaaaaaaaa,male,38.117.123.196,5e:76:37:86:60:c3,서울특별시 강서구 가양동 37.568238 126.845059,aaaaaaaaaa,1999-12-31 23:59:59,1999-12-31 23:59:59
1,1971,1,0,1,1,0,0,0,c,0,...,홍길동,aaaaaa-aaaaaaa,aaaaaaaaaa,male,38.117.123.196,5e:76:37:86:60:c3,서울특별시 강서구 가양동 37.568238 126.845059,aaaaaaaaaa,1999-12-31 23:59:59,1999-12-31 23:59:59
2,2004,1,0,0,1,0,0,0,c,0,...,홍길동,aaaaaa-aaaaaaa,aaaaaaaaaa,male,38.117.123.196,5e:76:37:86:60:c3,서울특별시 강서구 가양동 37.568238 126.845059,aaaaaaaaaa,1999-12-31 23:59:59,1999-12-31 23:59:59
3,1998,1,1,0,0,0,0,0,c,0,...,홍길동,aaaaaa-aaaaaaa,aaaaaaaaaa,male,38.117.123.196,5e:76:37:86:60:c3,서울특별시 강서구 가양동 37.568238 126.845059,aaaaaaaaaa,1999-12-31 23:59:59,1999-12-31 23:59:59
4,2002,1,1,1,1,1,0,0,c,0,...,홍길동,aaaaaa-aaaaaaa,aaaaaaaaaa,male,38.117.123.196,5e:76:37:86:60:c3,서울특별시 강서구 가양동 37.568238 126.845059,aaaaaaaaaa,1999-12-31 23:59:59,1999-12-31 23:59:59


In [25]:
# all_synthetic_data의 열 순서를 test_all의 열 순서로 변경
hour = all_synthetic_data['Transaction_hour']
all_synthetic_data = all_synthetic_data.reindex(columns=train_all.columns)
all_synthetic_data.drop(columns='ID', inplace=True)
all_synthetic_data = pd.concat((hour, all_synthetic_data), axis=1)
print(all_synthetic_data.shape)

(6000, 64)


In [26]:
# Write the final synthetic table to CSV without the index column;
# the 'UTF-8-sig' encoding prepends a byte-order mark to the file.
all_synthetic_data.to_csv('/data/ACTGAN_clf_submission.csv', encoding='UTF-8-sig', index=False)