In [1]:
# Mount Google Drive so the CSV files under /content/drive are readable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Submission file creation
import os
import zipfile

# Data processing and analysis
import pandas as pd
import numpy as np
from scipy import stats
from tqdm import tqdm

# Machine-learning preprocessing
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

# Machine-learning model
import xgboost as xgb

# Silence every warning for the rest of the notebook
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Input locations. `save_path` is bound once, to the processed-data directory
# (the value it ends up with in the original cell), and every processed file
# is addressed relative to it.
path = '/content/drive/MyDrive/Colab Notebooks/fsi/open/'
save_path = '/content/drive/MyDrive/Colab Notebooks/fsi/processed_data'

# Raw competition files
train_all = pd.read_csv(path + "train.csv")
test_all = pd.read_csv(path + "test.csv")

# Data prepared before building the synthesis model (outlier handling only)
saved_train = pd.read_csv(save_path + '/1_all_pre_syn.csv')

# Data prepared before building the synthesis model
# (outlier handling + 30 good samples per label)
saved_train_m = pd.read_csv(save_path + '/1_30_sample_pre_syn.csv')

# Rows generated by the synthesis model
all_synthetic_data = pd.read_csv(save_path + '/filtered_syn_for_cls.csv')

train = train_all.drop(columns="ID")

# Displayed as the cell output: label distribution of the prepared training data
saved_train["Fraud_Type"].value_counts()

Unnamed: 0_level_0,count
Fraud_Type,Unnamed: 1_level_1
m,118800
a,100
j,100
h,100
k,100
c,100
g,100
i,100
b,100
f,100


In [4]:
# Select the synthetic rows that go into the training set.
# For every label, keep the 400 rows with the highest `predicted_max_values`
# (presumably a confidence score from the filtering classifier — confirm).
fraud_types = train_all.Fraud_Type.unique()

# Collect the per-label slices first and concatenate once, instead of
# re-copying a growing DataFrame on every loop iteration.
top_synthetic_per_type = [
    all_synthetic_data[all_synthetic_data.Fraud_Type == x]
    .sort_values('predicted_max_values', ascending=False)
    .head(400)
    for x in fraud_types
]

# The ranking/bookkeeping columns are not model features, so drop them here.
fin_500_syn_data = (
    pd.concat(top_synthetic_per_type, axis=0)
    .drop(columns=['predicted_max_values', 'model'])
)

In [5]:
# Build the training set: every non-'m' row, 500 sampled 'm' rows,
# and the selected synthetic rows of the non-'m' labels.
filtered_train = saved_train
is_majority = filtered_train.Fraud_Type == 'm'

filtered_train_m = filtered_train[is_majority].sample(n=500, random_state=42)
filtered_train_not_m = filtered_train[~is_majority]

# Merge the three sources into one training frame
synthetic_not_m = fin_500_syn_data[fin_500_syn_data.Fraud_Type != 'm']
train_total = pd.concat([filtered_train_not_m, filtered_train_m, synthetic_not_m])
train_total.Fraud_Type.value_counts()

Unnamed: 0_level_0,count
Fraud_Type,Unnamed: 1_level_1
a,500
j,500
h,500
k,500
c,500
g,500
i,500
b,500
f,500
d,500


In [6]:
# Pick the validation data: 200 'm' rows that were NOT sampled into the training set.
remaining_valid_data = filtered_train[filtered_train.Fraud_Type == 'm'].drop(filtered_train_m.index)
valid = remaining_valid_data.sample(n=200, random_state=42)

# For every other label, take synthetic rows ranked 401-600 by
# `predicted_max_values`, i.e. the slice right after the 400 rows used for training.
fraud_types = train_all.Fraud_Type.unique()

# Collect the slices and concatenate once rather than growing a frame in a loop.
next_synthetic_per_type = [
    all_synthetic_data[all_synthetic_data.Fraud_Type == x]
    .sort_values('predicted_max_values', ascending=False)
    .reset_index(drop=True)
    .iloc[400:600]
    for x in fraud_types
    if x != 'm'
]
fin_200_syn_data = (
    pd.concat(next_synthetic_per_type, axis=0)
    .drop(columns=['predicted_max_values', 'model'])
)

# NOTE: this mutates `all_synthetic_data` in place. After it runs, the selection
# cells cannot be re-executed without reloading the CSV, because they sort on
# 'predicted_max_values'.
all_synthetic_data.drop(columns=['predicted_max_values', 'model'], inplace=True)

# Merge the real 'm' rows with the synthetic rows of the other labels
valid = pd.concat([valid, fin_200_syn_data])
valid.Fraud_Type.value_counts()

Unnamed: 0_level_0,count
Fraud_Type,Unnamed: 1_level_1
m,200
a,200
j,200
h,200
k,200
c,200
g,200
i,200
b,200
f,200


In [7]:
# Check whether any row appears more than once across train and validation
combined_data = pd.concat([train_total, valid])
is_duplicated = combined_data.duplicated(keep=False)  # flag every copy, not only the repeats
duplicate_rows = combined_data.loc[is_duplicated]
print(duplicate_rows)

Empty DataFrame
Columns: [Customer_Birthyear, Customer_Gender, Customer_personal_identifier, Customer_identification_number, Customer_registration_datetime, Customer_credit_rating, Customer_flag_change_of_authentication_1, Customer_flag_change_of_authentication_2, Customer_flag_change_of_authentication_3, Customer_flag_change_of_authentication_4, Customer_rooting_jailbreak_indicator, Customer_mobile_roaming_indicator, Customer_VPN_Indicator, Customer_loan_type, Customer_flag_terminal_malicious_behavior_1, Customer_flag_terminal_malicious_behavior_2, Customer_flag_terminal_malicious_behavior_3, Customer_flag_terminal_malicious_behavior_4, Customer_flag_terminal_malicious_behavior_5, Customer_flag_terminal_malicious_behavior_6, Customer_inquery_atm_limit, Customer_increase_atm_limit, Account_account_number, Account_account_type, Account_creation_datetime, Account_initial_balance, Account_balance, Account_indicator_release_limit_excess, Account_amount_daily_limit, Account_indicator_Openba

# 전처리

In [8]:
from sklearn.utils import shuffle

# Shuffle the training frame: renumber the rows 0..n-1 first, then permute
# them with a fixed seed so the order is reproducible.
train_total = shuffle(train_total.reset_index(drop=True), random_state=42)

In [9]:
# Separate the features from the 'Fraud_Type' label for train and validation.
train_x, train_y = train_total.drop(columns=['Fraud_Type']), train_total['Fraud_Type']
valid_x, valid_y = valid.drop(columns=['Fraud_Type']), valid['Fraud_Type']

# The test frame only carries a row identifier that is not a feature.
test_all.drop(columns="ID", inplace=True)

In [10]:
# Convert Time_difference to a number of seconds
for frame in (train_x, valid_x, test_all):
    as_timedelta = pd.to_timedelta(frame['Time_difference'])
    frame['Time_difference_seconds'] = as_timedelta.dt.total_seconds()

In [11]:
# Remove identifier-like columns from all three frames.
identifier_columns = [
    # unique codes
    'Customer_personal_identifier',
    'Customer_identification_number',
    # nearly unique codes
    'Account_account_number',
    'Recipient_Account_Number',
]

for frame in (train_x, valid_x, test_all):
    frame.drop(columns=identifier_columns, inplace=True)

In [12]:
# Encode the labels: learn the class -> integer mapping on the training labels
# only, then apply that same mapping to both label vectors.
le_subclass = LabelEncoder()
le_subclass.fit(train_y)

train_y_encoded = le_subclass.transform(train_y)
valid_y_encoded = le_subclass.transform(valid_y)

# Print the learned mapping (original label -> encoded number)
for i, label in enumerate(le_subclass.classes_):
    print(f"원래 레이블: {label}, 변환된 숫자: {i}")

원래 레이블: a, 변환된 숫자: 0
원래 레이블: b, 변환된 숫자: 1
원래 레이블: c, 변환된 숫자: 2
원래 레이블: d, 변환된 숫자: 3
원래 레이블: e, 변환된 숫자: 4
원래 레이블: f, 변환된 숫자: 5
원래 레이블: g, 변환된 숫자: 6
원래 레이블: h, 변환된 숫자: 7
원래 레이블: i, 변환된 숫자: 8
원래 레이블: j, 변환된 숫자: 9
원래 레이블: k, 변환된 숫자: 10
원래 레이블: l, 변환된 숫자: 11
원래 레이블: m, 변환된 숫자: 12


In [13]:
# 변수 추가

# Bucket the transaction hour into a part of the day
def convert_timeline(hour):
    """Map an hour of the day to a coarse time-of-day code.

    Returns 0 for [0, 6) (dawn), 1 for [6, 12) (morning), 2 for [12, 18)
    (afternoon) and 3 (evening) for everything else, including values that
    fall outside all three ranges.
    """
    day_parts = ((0, 6), (6, 12), (12, 18))
    for code, (start, end) in enumerate(day_parts):
        if start <= hour < end:
            return code
    return 3  # evening / anything not matched above
# (new feature, reference datetime column): minutes elapsed between the
# reference event and the transaction itself.
elapsed_minutes_sources = (
    ('Minutes_since_last_ATM', 'Last_atm_transaction_datetime'),            # last ATM transaction
    ('Minutes_since_resumed', 'Transaction_resumed_date'),                  # transaction resumed
    ('Minutes_since_last_branch', 'Last_bank_branch_transaction_datetime'), # last branch transaction
)

for frame in (train_x, valid_x, test_all):
    transaction_time = pd.to_datetime(frame['Transaction_Datetime'])

    # Coarse time-of-day bucket of the transaction
    frame['transaction_time_of_day'] = transaction_time.dt.hour.apply(convert_timeline)

    for feature_name, reference_column in elapsed_minutes_sources:
        elapsed = transaction_time - pd.to_datetime(frame[reference_column])
        frame[feature_name] = elapsed.dt.total_seconds() / 60

In [14]:
# Column groups that are summed into aggregate features.
malicious_behaviors = [f'Customer_flag_terminal_malicious_behavior_{n}' for n in range(1, 7)]
flag_change = [f'Customer_flag_change_of_authentication_{n}' for n in range(1, 5)]
security_indicators = ['Customer_rooting_jailbreak_indicator',
                       'Customer_VPN_Indicator',
                       'Customer_mobile_roaming_indicator']

for frame in (train_x, valid_x, test_all):
    # Malicious terminal behaviours
    frame['Total_Malicious_Behaviors'] = frame[malicious_behaviors].sum(axis=1)

    # Day of the week of the transaction
    frame['Day_of_Week'] = pd.to_datetime(frame['Transaction_Datetime']).dt.dayofweek

    # Weekend indicator (dayofweek 5 and 6)
    frame['Is_Weekend'] = frame['Day_of_Week'].isin([5, 6]).astype(int)

    # Number of authentication-method changes
    frame['Auth_Change_Count'] = frame[flag_change].sum(axis=1)

    # Security-breach risk
    frame['Security_Risk'] = frame[security_indicators].sum(axis=1)

    # Amount / limit exceedance features
    amount = frame['Transaction_Amount']
    frame['Is_Above_Max_Amount'] = amount > frame['Account_one_month_max_amount']
    frame['Is_Above_Dawn_Max_Amount'] = amount > frame['Account_dawn_one_month_max_amount']
    frame['ATM_Limit_Exceeded'] = frame['Customer_inquery_atm_limit'] + frame['Customer_increase_atm_limit']

In [15]:
# Cast the 'Time_difference' column to string in every frame
for frame in (train_x, valid_x, test_all):
    frame['Time_difference'] = frame['Time_difference'].astype(str)

In [16]:
# 각 컬럼 중 numerical에서 object로 변환할 코드 찾기
#for x in train_x.columns:
#    print(x, train_x[x].dtype, train_x[x].unique())
#    print()

In [17]:
# Renumber the rows of every feature frame from 0, discarding the old index.
for frame in (train_x, valid_x, test_all):
    frame.reset_index(drop=True, inplace=True)

In [18]:
# Split the location string.
# 'Location' is a space-separated string; its first three tokens become the
# features location_first / location_second / location_last. The last two
# tokens (latitude / longitude) are not kept as features, so they are no longer
# materialised just to be dropped again. The string is split once per row
# instead of once per derived column.
location_parts = ('location_first', 'location_second', 'location_last')

for frame in (train_x, valid_x, test_all):
    tokens = frame['Location'].apply(lambda x: x.split(' '))
    for position, column in enumerate(location_parts):
        # `position=position` freezes the loop value inside the lambda
        frame[column] = tokens.apply(lambda parts, position=position: parts[position])

    # The raw string is no longer needed once it has been decomposed
    frame.drop('Location', axis=1, inplace=True)

In [19]:
# Split the datetime columns.
# Each column below is replaced by four numeric columns named
# '<column>_year', '<column>_month', '<column>_day' and '<column>_hour'.
datetime_columns = [
    'Customer_registration_datetime',
    'Account_creation_datetime',
    'Transaction_Datetime',
    'Last_atm_transaction_datetime',
    'Last_bank_branch_transaction_datetime',
    'Transaction_resumed_date',
]
datetime_parts = ('year', 'month', 'day', 'hour')

for frame in (train_x, valid_x, test_all):
    for column in datetime_columns:
        # Parse once per column instead of once per extracted component
        parsed = pd.to_datetime(frame[column])
        for part in datetime_parts:
            frame[f'{column}_{part}'] = getattr(parsed.dt, part)

    # The raw datetime strings are dropped after decomposition
    frame.drop(columns=datetime_columns, inplace=True)

In [20]:
# The raw 'Time_difference' column is removed from every frame.
for frame in (train_x, valid_x, test_all):
    frame.drop(columns='Time_difference', inplace=True)

In [21]:
# Re-declare the dtype of individual columns.

# Columns stored as numbers that should be treated as categorical
# (birth year, year, month, day and hour columns are excluded).
change_cols = [
    'Customer_flag_change_of_authentication_1',
    'Customer_flag_change_of_authentication_2',
    'Customer_flag_change_of_authentication_3',
    'Customer_flag_change_of_authentication_4',
    'Customer_rooting_jailbreak_indicator',
    'Customer_mobile_roaming_indicator',
    'Customer_VPN_Indicator',
    'Customer_flag_terminal_malicious_behavior_1',
    'Customer_flag_terminal_malicious_behavior_2',
    'Customer_flag_terminal_malicious_behavior_3',
    'Customer_flag_terminal_malicious_behavior_4',
    'Customer_flag_terminal_malicious_behavior_5',
    'Customer_flag_terminal_malicious_behavior_6',
    'Customer_inquery_atm_limit',
    'Customer_increase_atm_limit',
    'Account_indicator_release_limit_excess',
    'Account_indicator_Openbanking',
    'Account_release_suspention',
    'Transaction_Failure_Status',
    'Another_Person_Account',
    'Unused_terminal_status',
    'Flag_deposit_more_than_tenMillion',
    'Unused_account_status',
    'Recipient_account_suspend_status',
    'First_time_iOS_by_vulnerable_user',
    'transaction_time_of_day',
    'Day_of_Week',
    'Is_Weekend',
]

for frame in (train_x, valid_x, test_all):
    for x in change_cols:
        frame[x] = frame[x].astype('category')

In [22]:
# Binary-valued columns are one-hot encoded; drop_first=True keeps a single
# indicator column for each of them.
for binary_column in ('Customer_Gender', 'Type_General_Automatic'):
    train_x = pd.get_dummies(train_x, columns=[binary_column], drop_first=True)
    valid_x = pd.get_dummies(valid_x, columns=[binary_column], drop_first=True)
    test_all = pd.get_dummies(test_all, columns=[binary_column], drop_first=True)

In [23]:
# Customer_credit_rating is ordered, so ordinal encoding is applied with an
# explicit category order (E -> 0 ... S -> 5).
# Day_of_Week and transaction_time_of_day are already ordinal.
encoder_credit = OrdinalEncoder(categories=[['E', 'D', 'C', 'B', 'A', 'S']])

# Fit on the training frame only. Validation and test reuse the fitted encoder
# via transform(), so no held-out frame is ever used for fitting.
train_x['Customer_credit_rating_Encoded'] = encoder_credit.fit_transform(train_x[['Customer_credit_rating']])
valid_x['Customer_credit_rating_Encoded'] = encoder_credit.transform(valid_x[['Customer_credit_rating']])
test_all['Customer_credit_rating_Encoded'] = encoder_credit.transform(test_all[['Customer_credit_rating']])

# The raw rating is replaced by its encoded counterpart
train_x.drop('Customer_credit_rating', axis=1, inplace=True)
valid_x.drop('Customer_credit_rating', axis=1, inplace=True)
test_all.drop('Customer_credit_rating', axis=1, inplace=True)

In [24]:
# Frequency-encode the location categories: every value is replaced by how
# often it occurs in the TRAINING frame.
location_first_frequency_encoding = train_x['location_first'].value_counts()
location_second_frequency_encoding = train_x['location_second'].value_counts()
location_last_frequency_encoding = train_x['location_last'].value_counts()

location_frequency_tables = {
    'location_first': location_first_frequency_encoding,
    'location_second': location_second_frequency_encoding,
    'location_last': location_last_frequency_encoding,
}

for column, counts in location_frequency_tables.items():
    encoded_name = f'{column}_Encoded'
    train_x[encoded_name] = train_x[column].map(counts)
    # Values never seen in training get a frequency of 0
    for frame in (valid_x, test_all):
        frame[encoded_name] = frame[column].map(counts).fillna(0)
    # The raw categorical column is dropped once encoded
    for frame in (train_x, valid_x, test_all):
        frame.drop(column, axis=1, inplace=True)

In [25]:
# Vendor information could be extracted with `from maclookup import ApiClient`,
# but that relies on an external API and therefore cannot be used here.
# -> remove the column instead
for frame in (train_x, valid_x, test_all):
    frame.drop(columns='MAC_Address', inplace=True)

In [26]:
# 공용 IP vs 사설 IP 구분
import ipaddress

def is_private_ip(ip):
    """Tell whether `ip` is a private address.

    Returns the `is_private` flag of the parsed address, or None when
    `ipaddress.ip_address` rejects the value with a ValueError.
    """
    try:
        parsed = ipaddress.ip_address(ip)
    except ValueError:
        return None
    return parsed.is_private

# Replace the raw IP address by a private/public indicator in every frame.
for frame in (train_x, valid_x, test_all):
    frame['is_private_ip'] = frame['IP_Address'].apply(is_private_ip)
    frame.drop(columns='IP_Address', inplace=True)

In [27]:
# Frequency encoding of the remaining categorical columns (applied for now):
# every value is replaced by its number of occurrences in the TRAINING frame.
Error_Code_frequency_encoding = train_x['Error_Code'].value_counts()
Customer_loan_type_frequency_encoding = train_x['Customer_loan_type'].value_counts()
Account_account_type_frequency_encoding = train_x['Account_account_type'].value_counts()
Channel_frequency_encoding = train_x['Channel'].value_counts()
Operating_System_frequency_encoding = train_x['Operating_System'].value_counts()
Access_Medium_frequency_encoding = train_x['Access_Medium'].value_counts()

categorical_frequency_tables = {
    'Error_Code': Error_Code_frequency_encoding,
    'Customer_loan_type': Customer_loan_type_frequency_encoding,
    'Account_account_type': Account_account_type_frequency_encoding,
    'Channel': Channel_frequency_encoding,
    'Operating_System': Operating_System_frequency_encoding,
    'Access_Medium': Access_Medium_frequency_encoding,
}

for column, counts in categorical_frequency_tables.items():
    encoded_name = f'{column}_Encoded'
    train_x[encoded_name] = train_x[column].map(counts)
    # Values never seen in training get a frequency of 0
    for frame in (valid_x, test_all):
        frame[encoded_name] = frame[column].map(counts).fillna(0)
    # The raw categorical column is dropped once encoded
    for frame in (train_x, valid_x, test_all):
        frame.drop(column, axis=1, inplace=True)

In [28]:
# Check that the three frames expose the same columns: for every ordered pair
# of distinct frames, print the columns present in the first but not the second.
# Six empty sets mean the column sets are identical.
feature_frames = (train_x, valid_x, test_all)
for left in feature_frames:
    for right in feature_frames:
        if left is not right:
            print(set(left.columns) - set(right.columns))

set()
set()
set()
set()
set()
set()


In [29]:
# 수치
#for x in numeric_features:
#    print(x, train_x[x].dtype, train_x[x].unique())
#    print()

In [30]:
# Select the numeric feature columns (only the dtypes listed here are picked up)
numeric_dtypes = ['int64', 'float64', 'int32']
numeric_features = train_x.select_dtypes(include=numeric_dtypes).columns

# Columns treated as categorical -> TODO: switch these to target/frequency encoding
categorical_dtypes = ['object', 'category', 'boolean']
categorical_columns = train_x.select_dtypes(include=categorical_dtypes).columns

In [31]:
# Apply standard scaling
from sklearn.preprocessing import StandardScaler

# One scaler per numeric column: fitted on train_x only, then applied to all three frames.
for feature in numeric_features:
    scaler = StandardScaler().fit(train_x[[feature]])
    for frame in (train_x, valid_x, test_all):
        frame[feature] = scaler.transform(frame[[feature]])

In [32]:
# 카테고리컬
#for x in categorical_columns:
#    print(x, train_x[x].dtype, train_x[x].unique())
#    print()

In [33]:
# Cast every column selected as categorical to int64 in all three frames.
for cat_col in categorical_columns:
    for frame in (train_x, valid_x, test_all):
        frame[cat_col] = frame[cat_col].astype('int64')

In [34]:
# Show (rows, columns) of the train, validation and test frames, in that order.
for frame in (train_x, valid_x, test_all):
    print(frame.shape)

(6500, 89)
(2600, 89)
(120000, 89)


# 모델 적용

In [None]:
!pip install scikit-learn xgboost lightgbm



In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.metrics import f1_score
import xgboost as xgb
import lightgbm as lgb


# Base models (all share the same random seed).
clf1 = RandomForestClassifier(random_state=69)
# `use_label_encoder` is intentionally not passed: it is a deprecated XGBoost
# argument that current releases ignore with a "not used" warning.
# train_y_encoded is fitted below as-is, so it is assumed to already hold
# integer class ids — TODO confirm against the cell that builds it.
clf2 = xgb.XGBClassifier(random_state=69, eval_metric='mlogloss')
clf3 = lgb.LGBMClassifier(random_state=69)

# Hyper-parameter grids searched for each model.
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10]
}

param_grid_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0]
}

param_grid_lgb = {
    'n_estimators': [100, 200],
    'num_leaves': [31, 50],
    'learning_rate': [0.01, 0.1, 0.2],
    'feature_fraction': [0.8, 1.0]
}

# Shuffled, stratified 5-fold split shared by all three searches.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=69)

# Grid searches scored by macro F1, using all available cores.
grid_rf = GridSearchCV(estimator=clf1, param_grid=param_grid_rf, cv=kf, n_jobs=-1, scoring='f1_macro')
grid_xgb = GridSearchCV(estimator=clf2, param_grid=param_grid_xgb, cv=kf, n_jobs=-1, scoring='f1_macro')
grid_lgb = GridSearchCV(estimator=clf3, param_grid=param_grid_lgb, cv=kf, n_jobs=-1, scoring='f1_macro')

# Only the RandomForest search is fitted in this cell; the XGBoost and LightGBM
# searches are fitted in the next cell.
grid_rf.fit(train_x, train_y_encoded)

In [None]:
# Run the XGBoost and LightGBM grid searches on the training split.
for search in (grid_xgb, grid_lgb):
    search.fit(train_x, train_y_encoded)

# Print the best hyper-parameters found by each search.
fitted_searches = (
    ("RandomForest", grid_rf),
    ("XGBoost", grid_xgb),
    ("LightGBM", grid_lgb),
)
for model_label, search in fitted_searches:
    print(f"Best parameters for {model_label}: ", search.best_params_)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004856 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4567
[LightGBM] [Info] Number of data points in the train set: 6500, number of used features: 86
[LightGBM] [Info] Start training from score -2.564949
[LightGBM] [Info] Start training from score -2.564949
[LightGBM] [Info] Start training from score -2.564949
[LightGBM] [Info] Start training from score -2.564949
[LightGBM] [Info] Start training from score -2.564949
[LightGBM] [Info] Start training from score -2.564949
[LightGBM] [Info] Start training from score -2.564949
[LightGBM] [Info] Start training from score -2.564949
[LightGBM] [Info] Start training from score -2.564949
[LightGBM] [Info] Start training from score -2.564949
[LightGBM] [Info] Start training from score -2.564949
[LightGBM] [Info] Start training from score -2.564949


In [None]:
from sklearn.metrics import f1_score

# Soft-voting ensemble built from the best estimator of each grid search.
ensemble_members = [
    ('rf', grid_rf.best_estimator_),
    ('xgb', grid_xgb.best_estimator_),
    ('lgb', grid_lgb.best_estimator_),
]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft', n_jobs=-1)

# Fit the ensemble on the training split.
voting_clf.fit(train_x, train_y_encoded)

# This score is computed on the same data the ensemble was just fitted on
# (train_x), so it is a training score rather than a held-out estimate.
y_pred = voting_clf.predict(train_x)
f1_macro = f1_score(train_y_encoded, y_pred, average='macro')
print(f"Ensemble Model F1 Macro Score: {f1_macro:.4f}")

Ensemble Model F1 Macro Score: 1.0000


In [None]:
# Save the best individual models and the ensemble
import pickle

# Target file name -> fitted model; files are written relative to the current working directory.
models_to_save = {
    '6_rf_classifier.pkl': grid_rf.best_estimator_,
    '6_xgb_classifier.pkl': grid_xgb.best_estimator_,
    '6_lgb_classifier.pkl': grid_lgb.best_estimator_,
    '6_voting_classifier.pkl': voting_clf,
}
for file_name, fitted_model in models_to_save.items():
    with open(file_name, 'wb') as f:
        pickle.dump(fitted_model, f)

In [None]:
# Evaluate on the validation split
from sklearn.metrics import classification_report

# Display label -> fitted model, evaluated in this order. The ensemble goes last
# so that y_pred still holds its predictions when the report is built.
models_to_score = {
    "Random Forest": grid_rf.best_estimator_,
    "XGBoost": grid_xgb.best_estimator_,
    "LightGBM": grid_lgb.best_estimator_,
    "Ensemble": voting_clf,
}
for model_label, fitted_model in models_to_score.items():
    y_pred = fitted_model.predict(valid_x)
    f1_macro = f1_score(valid_y_encoded, y_pred, average='macro')
    print(f"{model_label} Model F1 Macro Score: {f1_macro:.4f}")

# Per-class precision/recall/F1 for the ensemble predictions.
report = classification_report(valid_y_encoded, y_pred)
print(report)

Random Forest Model F1 Macro Score: 0.9946
XGBoost Model F1 Macro Score: 0.9973
LightGBM Model F1 Macro Score: 0.9977
Ensemble Model F1 Macro Score: 0.9981
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       200
           1       0.99      1.00      0.99       200
           2       1.00      1.00      1.00       200
           3       1.00      1.00      1.00       200
           4       1.00      1.00      1.00       200
           5       1.00      1.00      1.00       200
           6       1.00      1.00      1.00       200
           7       1.00      1.00      1.00       200
           8       1.00      1.00      1.00       200
           9       1.00      1.00      1.00       200
          10       1.00      1.00      1.00       200
          11       1.00      1.00      1.00       200
          12       1.00      0.97      0.99       200

    accuracy                           1.00      2600
   macro avg       1.00      1.0

# Inference

In [None]:
# Test data encoding: keep a copy of the fully preprocessed test frame.
# NOTE(review): test_x_encoded is not referenced again in the visible cells;
# the prediction cell uses test_all directly — confirm whether this copy is still needed.
test_x_encoded = test_all.copy()

In [None]:
# Store the feature order of the training frame.
feature_order = train_x.columns.tolist()

# Put the test columns in the training order. The copy gives test_all its own
# data, so the per-column assignments below do not write into a view.
test_all = test_all[feature_order].copy()

# Match each test column's dtype to the corresponding training column. The cast
# must target train_x's dtype: casting a column to its own dtype does nothing.
for col in feature_order:
    test_all[col] = test_all[col].astype(train_x[col].dtype)

In [None]:
# Predict class ids for the test frame with the voting ensemble, then map them
# back to the original labels via le_subclass.
# NOTE(review): le_subclass is defined outside the visible cells — presumably the
# LabelEncoder fitted on the training labels; confirm.
predictions = voting_clf.predict(test_all)
predictions_label = le_subclass.inverse_transform(predictions)



In [None]:
# Submission DataFrame for the classification predictions.
# The submission file for the classification results must be named clf_submission.csv.
# NOTE(review): predictions are attached by position, so this assumes
# sample_submission.csv lists rows in the same order as test_all — confirm.
clf_submission = pd.read_csv(path + "sample_submission.csv").assign(Fraud_Type=predictions_label)
clf_submission.head()

Unnamed: 0,ID,Fraud_Type
0,TEST_000000,j
1,TEST_000001,m
2,TEST_000002,m
3,TEST_000003,m
4,TEST_000004,b


In [None]:
# Submission DataFrame for the synthetic data generation results.
# The submission file for the synthetic data must be named syn_submission.csv.
all_synthetic_data.head()

Unnamed: 0,Customer_Birthyear,Customer_Gender,Customer_personal_identifier,Customer_identification_number,Customer_registration_datetime,Customer_credit_rating,Customer_flag_change_of_authentication_1,Customer_flag_change_of_authentication_2,Customer_flag_change_of_authentication_3,Customer_flag_change_of_authentication_4,...,Last_bank_branch_transaction_datetime,Flag_deposit_more_than_tenMillion,Unused_account_status,Recipient_account_suspend_status,Number_of_transaction_with_the_account,Transaction_history_with_the_account,First_time_iOS_by_vulnerable_user,Fraud_Type,Transaction_resumed_date,Time_difference
0,1952,male,김주원,CKckSV-WcykjZO,2004-02-10 01:25:04,A,0,0,1,0,...,2018-07-15 14:28:23,0,0,0,0,5,0,m,2024-08-25 19:05:19,0 days 02:31:16
1,1959,male,박승현,qKrsRq-tazCNVN,2005-10-30 04:58:08,B,0,0,0,0,...,2044-11-25 13:12:14,0,1,0,0,0,0,m,2021-03-17 23:54:05,6 days 22:10:57
2,1975,male,권지훈,eFVRTS-AVwOBPh,2003-09-15 01:46:40,A,0,0,0,0,...,2015-09-26 12:02:11,0,0,0,1,0,0,m,2008-01-11 10:16:31,6 days 22:10:57
3,1951,male,허상현,QHZCxU-DWxmINt,2007-12-13 16:20:24,B,1,0,0,0,...,2014-08-27 18:09:28,1,0,0,0,0,0,m,2006-08-11 00:48:12,0 days 00:33:52
4,1951,male,이예지,UqHuXG-ybAYkRw,2003-09-11 05:30:53,A,0,0,0,0,...,2009-12-09 06:12:06,0,1,0,0,0,0,m,2023-10-15 19:19:56,0 days 01:27:46


In [None]:
# Additional sampling: list the distinct Fraud_Type labels present in the synthetic data.
class_m = all_synthetic_data['Fraud_Type'].unique()
class_m

array(['m', 'a', 'j', 'h', 'k', 'c', 'g', 'i', 'b', 'f', 'd', 'e', 'l'],
      dtype=object)

In [None]:
# Draw 1000 synthetic rows per Fraud_Type with a fixed seed. The samples are
# collected first and concatenated once, instead of growing a DataFrame from an
# empty seed frame inside the loop.
# sample() raises if a class has fewer than 1000 rows, which surfaces a shortfall
# rather than silently producing a smaller sample.
# NOTE(review): all_synthetic_data is loaded with helper columns
# ('predicted_max_values', 'model') — confirm they are not present here before
# this frame is written as the submission.
per_class_samples = [
    all_synthetic_data[all_synthetic_data['Fraud_Type'] == fraud_type].sample(n=1000, random_state=42)
    for fraud_type in class_m
]
fin_gen = pd.concat(per_class_samples)

fin_gen.reset_index(inplace=True, drop=True)

In [None]:
# Create the submission folder. The working directory is deliberately left
# unchanged so that re-running this cell writes to the same locations every time.
submission_dir = './submission'
os.makedirs(submission_dir, exist_ok=True)

# Save both submission frames as CSV files inside the folder.
clf_csv_path = os.path.join(submission_dir, 'clf_submission.csv')
syn_csv_path = os.path.join(submission_dir, 'syn_submission.csv')
clf_submission.to_csv(clf_csv_path, encoding='UTF-8-sig', index=False)
fin_gen.to_csv(syn_csv_path, encoding='UTF-8-sig', index=False)

# Build the ZIP next to the submission folder; arcname stores the bare file
# names in the archive rather than the 'submission/' path.
with zipfile.ZipFile("./0828_submission_soft_voting_filter_before_after_30_500.zip", 'w') as submission:
    submission.write(clf_csv_path, arcname='clf_submission.csv')
    submission.write(syn_csv_path, arcname='syn_submission.csv')

print('Done.')

Done.
