In [None]:
# Mount Google Drive at /content/drive; every data path used below lives under it.
# This import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# 제출 파일 생성 관련
import os
import zipfile

# 데이터 처리 및 분석
import pandas as pd
import numpy as np
from scipy import stats
from tqdm import tqdm

# 머신러닝 전처리
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

# 머신러닝 모델
import xgboost as xgb

# To ignore all warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# `path` is the folder with the raw competition files (note the trailing slash);
# `save_path` is the folder with preprocessed / synthetic CSVs (no trailing slash).
# `save_path` is bound once, to the directory, which is how later cells use it.
path = '/content/drive/MyDrive/Colab Notebooks/fsi/open/'
save_path = '/content/drive/MyDrive/Colab Notebooks/fsi/processed_data'

# Raw train / test files
train_all = pd.read_csv(path + "train.csv")
test_all = pd.read_csv(path + "test.csv")

# Previously preprocessed training data (full set and the sampled variant)
saved_train = pd.read_csv(save_path + '/1_all_pre_syn.csv')
saved_train_m = pd.read_csv(save_path + '/1_30_sample_pre_syn.csv')

train = train_all.drop(columns="ID")

# Class balance of the preprocessed training set
saved_train["Fraud_Type"].value_counts()

Unnamed: 0_level_0,count
Fraud_Type,Unnamed: 1_level_1
m,118800
a,100
j,100
h,100
k,100
c,100
g,100
i,100
b,100
f,100


In [None]:
# Prepare the real data: keep every non-'m' row and a fixed random sample
# of 1000 rows from the 'm' class.
is_class_m = saved_train["Fraud_Type"] == 'm'
filtered_train_m = saved_train.loc[is_class_m].sample(n=1000, random_state=69)
filtered_train = saved_train.loc[~is_class_m]
train_total_real = pd.concat([filtered_train, filtered_train_m])
train_total_real["Fraud_Type"].value_counts()

Unnamed: 0_level_0,count
Fraud_Type,Unnamed: 1_level_1
m,1000
a,100
j,100
h,100
k,100
c,100
g,100
i,100
b,100
f,100


In [None]:
# 원본 데이터 전처리

# 공용 IP vs 사설 IP 구분
import ipaddress
from sklearn.utils import shuffle

def is_private_ip(ip):
    """Tell whether `ip` is a private address.

    Returns the `is_private` flag of the parsed address, or None when
    `ipaddress.ip_address` rejects the value with a ValueError.
    """
    try:
        parsed = ipaddress.ip_address(ip)
    except ValueError:
        return None
    return parsed.is_private

# Bucket the transaction hour into a part of the day
def convert_timeline(hour):
    """Map an hour to 0 (0-5, dawn), 1 (6-11, morning), 2 (12-17, afternoon) or 3 (anything else, evening)."""
    day_parts = ((0, 6), (6, 12), (12, 18))
    for bucket, (start, end) in enumerate(day_parts):
        if start <= hour < end:
            return bucket
    # Every value outside the three ranges above falls through to the last bucket.
    return 3

# Identifier-like columns (unique or nearly unique per row) that are removed.
_IDENTIFIER_COLUMNS = [
    'Customer_personal_identifier',
    'Customer_identification_number',
    'Account_account_number',
    'Recipient_Account_Number',
]

# (new feature, reference timestamp column): minutes between that timestamp and the transaction.
_ELAPSED_MINUTE_FEATURES = [
    ('Minutes_since_last_ATM', 'Last_atm_transaction_datetime'),
    ('Minutes_since_resumed', 'Transaction_resumed_date'),
    ('Minutes_since_last_branch', 'Last_bank_branch_transaction_datetime'),
]

_MALICIOUS_BEHAVIOR_FLAGS = [
    f'Customer_flag_terminal_malicious_behavior_{n}' for n in range(1, 7)
]
_AUTH_CHANGE_FLAGS = [
    f'Customer_flag_change_of_authentication_{n}' for n in range(1, 5)
]
_SECURITY_RISK_FLAGS = [
    'Customer_rooting_jailbreak_indicator',
    'Customer_VPN_Indicator',
    'Customer_mobile_roaming_indicator',
]

# Names given to the 1st, 2nd and 3rd space-separated token of `Location`.
_LOCATION_PARTS = ['location_first', 'location_second', 'location_last']

# Timestamp columns that are expanded into year / month / day / hour and then removed.
_DATETIME_COLUMNS = [
    'Customer_registration_datetime',
    'Account_creation_datetime',
    'Transaction_Datetime',
    'Last_atm_transaction_datetime',
    'Last_bank_branch_transaction_datetime',
    'Transaction_resumed_date',
]

# Columns stored as numbers that are cast to the pandas `category` dtype.
_CATEGORY_COLUMNS = [
    'Customer_flag_change_of_authentication_1',
    'Customer_flag_change_of_authentication_2',
    'Customer_flag_change_of_authentication_3',
    'Customer_flag_change_of_authentication_4',
    'Customer_rooting_jailbreak_indicator',
    'Customer_mobile_roaming_indicator', 'Customer_VPN_Indicator',
    'Customer_flag_terminal_malicious_behavior_1',
    'Customer_flag_terminal_malicious_behavior_2',
    'Customer_flag_terminal_malicious_behavior_3',
    'Customer_flag_terminal_malicious_behavior_4',
    'Customer_flag_terminal_malicious_behavior_5',
    'Customer_flag_terminal_malicious_behavior_6',
    'Customer_inquery_atm_limit', 'Customer_increase_atm_limit',
    'Account_indicator_release_limit_excess',
    'Account_indicator_Openbanking', 'Account_release_suspention',
    'Transaction_Failure_Status',
    'Another_Person_Account', 'Unused_terminal_status',
    'Flag_deposit_more_than_tenMillion',
    'Unused_account_status', 'Recipient_account_suspend_status',
    'First_time_iOS_by_vulnerable_user', 'transaction_time_of_day',
    'Day_of_Week', 'Is_Weekend',
]

# Categorical columns replaced by their frequency in the training frame.
_FREQUENCY_ENCODED_COLUMNS = [
    'Error_Code',
    'Customer_loan_type',
    'Account_account_type',
    'Channel',
    'Operating_System',
    'Access_Medium',
]


def _add_row_level_features(frame):
    """Return a new frame with every feature that is computed row by row (no fitted state).

    The input is not modified: `reset_index(drop=True)` produces a fresh frame
    with a 0..n-1 index, and all later steps work on that frame.
    """
    df = frame.reset_index(drop=True)

    # Time_difference expressed in seconds
    df['Time_difference_seconds'] = pd.to_timedelta(df['Time_difference']).dt.total_seconds()
    df = df.drop(columns=_IDENTIFIER_COLUMNS)

    transaction_time = pd.to_datetime(df['Transaction_Datetime'])
    df['transaction_time_of_day'] = transaction_time.dt.hour.apply(convert_timeline)

    # Minutes elapsed between each reference timestamp and the transaction
    for feature, reference_col in _ELAPSED_MINUTE_FEATURES:
        elapsed = transaction_time - pd.to_datetime(df[reference_col])
        df[feature] = elapsed.dt.total_seconds() / 60

    # Row-wise sums of indicator columns
    df['Total_Malicious_Behaviors'] = df[_MALICIOUS_BEHAVIOR_FLAGS].sum(axis=1)

    # Day of week (pandas convention: Monday=0 ... Sunday=6) and weekend flag
    df['Day_of_Week'] = transaction_time.dt.dayofweek
    df['Is_Weekend'] = df['Day_of_Week'].isin([5, 6]).astype(int)

    df['Auth_Change_Count'] = df[_AUTH_CHANGE_FLAGS].sum(axis=1)
    df['Security_Risk'] = df[_SECURITY_RISK_FLAGS].sum(axis=1)

    # Amount compared with the account's one-month maxima, plus the ATM-limit indicators
    df['Is_Above_Max_Amount'] = df['Transaction_Amount'] > df['Account_one_month_max_amount']
    df['Is_Above_Dawn_Max_Amount'] = df['Transaction_Amount'] > df['Account_dawn_one_month_max_amount']
    df['ATM_Limit_Exceeded'] = df['Customer_inquery_atm_limit'] + df['Customer_increase_atm_limit']

    # First three space-separated tokens of THIS frame's own Location column.
    # The .str accessor yields NaN for a missing Location or a missing token
    # instead of raising, so a malformed row cannot abort the whole run.
    location_tokens = df['Location'].str.split(' ')
    for position, part_name in enumerate(_LOCATION_PARTS):
        df[part_name] = location_tokens.str[position]

    # Split every timestamp into year / month / day / hour
    for source_col in _DATETIME_COLUMNS:
        stamps = pd.to_datetime(df[source_col])
        for part in ('year', 'month', 'day', 'hour'):
            df[f'{source_col}_{part}'] = getattr(stamps.dt, part)

    # The raw columns have been fully replaced by the derived features above
    df = df.drop(columns=['Location', *_DATETIME_COLUMNS, 'Time_difference'])

    # Numeric codes that are really categories (dates and their parts excluded)
    return df.astype({col: 'category' for col in _CATEGORY_COLUMNS})


def _add_frequency_encoding(train_df, test_df, column):
    """Add `<column>_Encoded` to both frames (in place) using value counts from `train_df`.

    Values of `test_df` that never occur in `train_df` (or are missing) get 0.
    """
    counts = train_df[column].value_counts()
    train_df[f'{column}_Encoded'] = train_df[column].map(counts)
    test_df[f'{column}_Encoded'] = test_df[column].map(counts).fillna(0)


def process_data(train_x, test_all):
    """Turn a raw training frame and a raw test/synthetic frame into model-ready features.

    Both frames get the same row-level features. Everything that needs fitting
    (ordinal credit-rating codes, frequency encodings, standard scaling) is
    fitted on `train_x` only and then applied to `test_all`.

    Fixes relative to the previous version:
      * the `location_*` features of `test_all` are now derived from
        `test_all['Location']`; they used to be copied from `train_x['Location']`;
      * the input frames are no longer modified, so the function can be called
        again with the same objects.

    Parameters
    ----------
    train_x : pandas.DataFrame
        Raw training features (no target column).
    test_all : pandas.DataFrame
        Raw frame with the same feature columns as `train_x`.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The processed training frame and the processed test frame.

    Raises
    ------
    KeyError
        If an expected raw column is missing from either frame.
    """
    train_x = _add_row_level_features(train_x)
    test_all = _add_row_level_features(test_all)

    # One-hot encode the binary columns
    train_x = pd.get_dummies(train_x, columns=['Customer_Gender'], drop_first=True)
    if (train_x['Type_General_Automatic'] == 'General').all():
        # NOTE(review): this branch leaves the original string column in place and
        # uses a column name that may not match what get_dummies produces for
        # test_all below -- confirm the intended behaviour before relying on it.
        train_x['Type_General_Automatic_general'] = 1
    else:
        train_x = pd.get_dummies(train_x, columns=['Type_General_Automatic'], drop_first=True)

    test_all = pd.get_dummies(test_all, columns=['Customer_Gender'], drop_first=True)
    test_all = pd.get_dummies(test_all, columns=['Type_General_Automatic'], drop_first=True)

    # Customer_credit_rating is ordered, so encode it ordinally (E lowest ... S highest)
    encoder_credit = OrdinalEncoder(categories=[['E', 'D', 'C', 'B', 'A', 'S']])
    train_x['Customer_credit_rating_Encoded'] = encoder_credit.fit_transform(train_x[['Customer_credit_rating']])
    test_all['Customer_credit_rating_Encoded'] = encoder_credit.transform(test_all[['Customer_credit_rating']])

    # Frequency-encode the location tokens
    for column in _LOCATION_PARTS:
        _add_frequency_encoding(train_x, test_all, column)

    # MAC_Address is dropped below without deriving anything from it: a vendor
    # lookup would need an external API, which cannot be used here.
    train_x['is_private_ip'] = train_x['IP_Address'].apply(is_private_ip)
    test_all['is_private_ip'] = test_all['IP_Address'].apply(is_private_ip)

    # Frequency-encode the remaining categorical columns
    for column in _FREQUENCY_ENCODED_COLUMNS:
        _add_frequency_encoding(train_x, test_all, column)

    # Remove every raw column that now has an encoded replacement (or is unused)
    consumed_columns = [
        'Customer_credit_rating',
        *_LOCATION_PARTS,
        'MAC_Address',
        'IP_Address',
        *_FREQUENCY_ENCODED_COLUMNS,
    ]
    train_x = train_x.drop(columns=consumed_columns)
    test_all = test_all.drop(columns=consumed_columns)

    # Column groups are decided from the training frame's dtypes
    numeric_features = train_x.select_dtypes(include=['int64', 'float64', 'int32']).columns
    categorical_columns = train_x.select_dtypes(include=['object', 'category', 'boolean']).columns

    # Standard-scale each numeric column with statistics fitted on the training frame
    from sklearn.preprocessing import StandardScaler

    for x in numeric_features:
        scaler = StandardScaler()
        train_x[x] = scaler.fit_transform(train_x[[x]])
        test_all[x] = scaler.transform(test_all[[x]])

    # Remaining categorical / boolean columns become plain integers
    for x in categorical_columns:
        train_x[x] = train_x[x].astype('int64')
        test_all[x] = test_all[x].astype('int64')

    return train_x, test_all

In [None]:
# Shuffle the rows, then split the target column off the feature frame
train_total_real = shuffle(train_total_real, random_state=69)
train_y = train_total_real.pop('Fraud_Type')
train_real_input = train_total_real.copy()

# Encode the fraud-type labels as integers
le_subclass = LabelEncoder().fit(train_y)
train_y_encoded = le_subclass.transform(train_y)

# Preprocess a fresh copy of the test file together with the real training rows
test_all = pd.read_csv(path + "test.csv")
processed_train_real, test_all = process_data(train_real_input, test_all)

In [None]:
# Print each original fraud-type label next to the integer the LabelEncoder assigned to it.
for i, label in enumerate(le_subclass.classes_):
    print(f"원래 레이블: {label}, 변환된 숫자: {i}")

원래 레이블: a, 변환된 숫자: 0
원래 레이블: b, 변환된 숫자: 1
원래 레이블: c, 변환된 숫자: 2
원래 레이블: d, 변환된 숫자: 3
원래 레이블: e, 변환된 숫자: 4
원래 레이블: f, 변환된 숫자: 5
원래 레이블: g, 변환된 숫자: 6
원래 레이블: h, 변환된 숫자: 7
원래 레이블: i, 변환된 숫자: 8
원래 레이블: j, 변환된 숫자: 9
원래 레이블: k, 변환된 숫자: 10
원래 레이블: l, 변환된 숫자: 11
원래 레이블: m, 변환된 숫자: 12


In [None]:
# Per-class weight: the largest class count divided by that class's own count
class_counts = np.bincount(train_y_encoded.astype(int))
class_weights = np.max(class_counts) / class_counts

# Per-sample weight: look up the weight of each sample's class
sample_weights = np.take(class_weights, train_y_encoded.astype(int))

In [None]:
# 원본 데이터만 모델 학습
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split

# Search space for the randomized hyper-parameter search
param_dist = {
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.001, 0.01, 0.1, 0.2],
    'max_depth': [4, 6, 8, 10],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3],
    'reg_alpha': [0, 0.1, 0.5, 1.0],
    'reg_lambda': [0.1, 1.0, 10.0]
}

# Multi-class XGBoost classifier. The class count is taken from the fitted
# label encoder instead of being hard-coded, so it follows the data.
model = xgb.XGBClassifier(
    random_state=69,
    num_class=len(le_subclass.classes_),
    objective='multi:softmax',
    use_label_encoder=False,
    eval_metric='mlogloss'
)

# Randomized search scored by macro-averaged F1 with 3-fold cross-validation
random_search = RandomizedSearchCV(
    model,
    param_distributions=param_dist,
    n_iter=100,  # number of hyper-parameter combinations to try
    scoring='f1_macro',
    n_jobs=-1,
    cv=3,
    verbose=2,
    random_state=69
)

# Run the search; sample weights are passed through to every fit
random_search.fit(processed_train_real, train_y_encoded, sample_weight=sample_weights)

# Report the best configuration. best_score_ is the mean cross-validated
# value of `scoring`, i.e. macro F1 -- not accuracy.
print(f"Best parameters found: {random_search.best_params_}")
print(f"Best cross-validation macro F1: {random_search.best_score_}")

# Keep the best estimator found by the search for the cells below
best_model = random_search.best_estimator_

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Best parameters found: {'subsample': 1.0, 'reg_lambda': 0.1, 'reg_alpha': 0.1, 'n_estimators': 200, 'min_child_weight': 1, 'max_depth': 4, 'learning_rate': 0.2, 'gamma': 0, 'colsample_bytree': 0.8}
Best cross-validation accuracy: 0.7604763973469595


In [None]:
import pickle

# Output file name
filename = '5_filtered_xgb_model.pkl'

# Save the model; the context manager closes the file handle even if dump() raises
with open(filename, 'wb') as model_file:
    pickle.dump(best_model, model_file)

# 생성 데이터 평가 - copulagan

In [None]:
candi1_syn = pd.read_csv(save_path + '/copulagan_syn_data.csv')

# Mean of Account_remaining_amount_daily_limit_exceeded for each Fraud_Type in candi1_syn
fraud_type_means = candi1_syn.groupby('Fraud_Type')['Account_remaining_amount_daily_limit_exceeded'].mean()
# Fill each NaN with the mean of the row's own Fraud_Type
candi1_syn['Account_remaining_amount_daily_limit_exceeded'] = (
    candi1_syn['Account_remaining_amount_daily_limit_exceeded']
    .fillna(candi1_syn['Fraud_Type'].map(fraud_type_means))
)

# Split the target off the synthetic features and encode it with the fitted encoder
train_syn_y = candi1_syn.pop('Fraud_Type')
syn_train_y_encoded = le_subclass.transform(train_syn_y)

test_all = pd.read_csv(path + "test.csv")

# The synthetic rows take the "test" slot, so they are transformed with
# statistics fitted on the real training rows.
train_real_input = train_total_real.copy()
_, processed_candi1_syn = process_data(train_real_input, candi1_syn)

In [None]:
# 합성 데이터의 예측 값 선별
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

# Class probabilities per synthetic row; keep the winning class index and its probability
predicted_proba = best_model.predict_proba(processed_candi1_syn)
max_indices = predicted_proba.argmax(axis=1)
max_values = predicted_proba.max(axis=1)

# Hard predictions, translated back to the original label names
predictions = best_model.predict(processed_candi1_syn)
predictions_label = le_subclass.inverse_transform(predictions)

# One row per synthetic sample: prediction, confidence, intended label, generator name
candi1 = pd.DataFrame({
    'predicted_max_indices': max_indices,
    'predicted_max_values': max_values,
    'candi_label': syn_train_y_encoded,
    'transed_label': predictions_label,
    'model': 'copulagan',
})

print(f1_score(syn_train_y_encoded, max_indices, average='macro'))
report = classification_report(syn_train_y_encoded, max_indices)
print(report)

0.5171247804410861
              precision    recall  f1-score   support

           0       0.65      0.99      0.78     10000
           1       0.96      0.36      0.53     10000
           2       0.25      0.25      0.25     10000
           3       0.47      0.19      0.27     10000
           4       0.68      0.85      0.76     10000
           5       0.68      0.99      0.81     10000
           6       0.71      0.09      0.16     10000
           7       0.62      0.73      0.67     10000
           8       0.31      0.51      0.39     10000
           9       0.85      0.47      0.61     10000
          10       0.63      0.82      0.72     10000
          11       0.54      0.26      0.36     10000
          12       0.35      0.60      0.44     10000

    accuracy                           0.55    130000
   macro avg       0.59      0.55      0.52    130000
weighted avg       0.59      0.55      0.52    130000



In [None]:
# Flag rows whose predicted class equals the label they were generated for
candi1['correct'] = candi1['candi_label'].eq(candi1['predicted_max_indices'])
candi1_correct = candi1.loc[candi1['correct']]

# Per predicted label: number of correct rows and their mean winning probability
candi1_transed = candi1_correct['transed_label'].value_counts().reset_index()
candi1_pb = candi1_correct.groupby('transed_label', as_index=False)['predicted_max_values'].mean()
candi1_fin = candi1_transed.merge(candi1_pb, on='transed_label')
candi1_fin

Unnamed: 0,transed_label,count,predicted_max_values
0,a,9943,0.990269
1,f,9855,0.975367
2,e,8521,0.883253
3,k,8224,0.893761
4,h,7284,0.845022
5,m,5998,0.752158
6,i,5119,0.795533
7,j,4711,0.720906
8,b,3623,0.925227
9,l,2645,0.747582


# 생성 데이터 평가 - tvae

In [None]:
# Preprocess the generated (TVAE) data
candi2_syn = pd.read_csv(save_path + '/tvae_syn_data.csv')

# Split the target off the synthetic features and encode it with the fitted encoder
train_syn_y = candi2_syn.pop('Fraud_Type')
syn_train_y_encoded = le_subclass.transform(train_syn_y)

test_all = pd.read_csv(path + "test.csv")

# The synthetic rows take the "test" slot of process_data
train_real_input = train_total_real.copy()
_, processed_candi2_syn = process_data(train_real_input, candi2_syn)

In [None]:
# 합성 데이터의 예측 값 선별
from sklearn.metrics import f1_score

# Class probabilities per synthetic row; keep the winning class index and its probability
predicted_proba = best_model.predict_proba(processed_candi2_syn)
max_indices = predicted_proba.argmax(axis=1)
max_values = predicted_proba.max(axis=1)

# Hard predictions, translated back to the original label names
predictions = best_model.predict(processed_candi2_syn)
predictions_label = le_subclass.inverse_transform(predictions)

# One row per synthetic sample: prediction, confidence, intended label, generator name
candi2 = pd.DataFrame({
    'predicted_max_indices': max_indices,
    'predicted_max_values': max_values,
    'candi_label': syn_train_y_encoded,
    'transed_label': predictions_label,
    'model': 'tvae',
})

print(f1_score(syn_train_y_encoded, max_indices, average='macro'))
report = classification_report(syn_train_y_encoded, max_indices)
print(report)

0.6286918804376316
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     10000
           1       1.00      0.69      0.81     10000
           2       0.92      0.49      0.64     10000
           3       0.60      0.31      0.41     10000
           4       0.50      0.93      0.65     10000
           5       0.61      0.97      0.75     10000
           6       0.59      0.07      0.13     10000
           7       0.95      0.79      0.87     10000
           8       0.29      0.88      0.44     10000
           9       0.84      0.30      0.45     10000
          10       0.96      0.53      0.68     10000
          11       1.00      0.75      0.86     10000
          12       0.47      0.54      0.50     10000

    accuracy                           0.64    130000
   macro avg       0.75      0.64      0.63    130000
weighted avg       0.75      0.64      0.63    130000



In [None]:
# Flag rows whose predicted class equals the label they were generated for
candi2['correct'] = candi2['candi_label'].eq(candi2['predicted_max_indices'])
candi2_correct = candi2.loc[candi2['correct']]

# Per predicted label: number of correct rows and their mean winning probability
candi2_transed = candi2_correct['transed_label'].value_counts().reset_index()
candi2_pb = candi2_correct.groupby('transed_label', as_index=False)['predicted_max_values'].mean()
candi2_fin = candi2_transed.merge(candi2_pb, on='transed_label')
candi2_fin

Unnamed: 0,transed_label,count,predicted_max_values
0,a,10000,0.999306
1,f,9677,0.997164
2,e,9344,0.965291
3,i,8774,0.916796
4,h,7920,0.834107
5,l,7522,0.884727
6,b,6852,0.85559
7,m,5362,0.800444
8,k,5308,0.838756
9,c,4912,0.763007


# 생성 데이터 평가 - ctgan

In [None]:
# Preprocess the generated (CTGAN) data
candi3_syn = pd.read_csv(save_path + '/ctgan_syn_data.csv')

# Split the target off the synthetic features and encode it with the fitted encoder
train_syn_y = candi3_syn.pop('Fraud_Type')
syn_train_y_encoded = le_subclass.transform(train_syn_y)

test_all = pd.read_csv(path + "test.csv")

# The synthetic rows take the "test" slot of process_data
train_real_input = train_total_real.copy()
_, processed_candi3_syn = process_data(train_real_input, candi3_syn)

In [None]:
# Score the synthetic (CTGAN) rows with the trained classifier.
from sklearn.metrics import f1_score

# Class probabilities per row, the most likely class index and its probability.
predicted_proba = best_model.predict_proba(processed_candi3_syn)
max_indices = predicted_proba.argmax(axis=1)
max_values = predicted_proba.max(axis=1)

# Hard predictions, decoded back to the original label strings.
predictions = best_model.predict(processed_candi3_syn)
predictions_label = le_subclass.inverse_transform(predictions)

# One row per synthetic sample; 'model' tags the generator that produced it.
candi3 = pd.DataFrame({
    'predicted_max_indices': max_indices,
    'predicted_max_values': max_values,
    'candi_label': syn_train_y_encoded,
    'transed_label': predictions_label,
    'model': 'ctgan',
})

# Macro F1 followed by the per-class report: predicted class index vs. encoded generator label.
macro_f1 = f1_score(syn_train_y_encoded, max_indices, average='macro')
print(macro_f1)
report = classification_report(syn_train_y_encoded, max_indices)
print(report)

0.5731336493389321
              precision    recall  f1-score   support

           0       0.67      0.99      0.80     10000
           1       0.98      0.67      0.80     10000
           2       0.57      0.33      0.42     10000
           3       0.28      0.18      0.22     10000
           4       0.47      0.83      0.60     10000
           5       0.69      0.99      0.81     10000
           6       0.74      0.12      0.20     10000
           7       0.87      0.78      0.82     10000
           8       0.42      0.66      0.51     10000
           9       0.81      0.51      0.62     10000
          10       0.74      0.53      0.62     10000
          11       0.93      0.55      0.69     10000
          12       0.26      0.48      0.34     10000

    accuracy                           0.59    130000
   macro avg       0.65      0.59      0.57    130000
weighted avg       0.65      0.59      0.57    130000



In [None]:
# A row counts as "correct" when the predicted class index equals the encoded generator label.
candi3['correct'] = candi3['predicted_max_indices'] == candi3['candi_label']
candi3_correct = candi3.loc[candi3['correct'] == True]

# Number of correct rows per predicted label (value_counts orders by count, descending).
candi3_transed = candi3_correct['transed_label'].value_counts().reset_index()

# Mean top-class probability of the correct rows, per predicted label.
candi3_pb = (
    candi3_correct
    .groupby('transed_label')['predicted_max_values']
    .mean()
    .reset_index()
)

# Combine both statistics into one summary table keyed by label.
candi3_fin = pd.merge(left=candi3_transed, right=candi3_pb, on='transed_label')
candi3_fin

Unnamed: 0,transed_label,count,predicted_max_values
0,a,9900,0.980975
1,f,9896,0.981955
2,e,8289,0.910458
3,h,7753,0.861534
4,b,6689,0.876772
5,i,6636,0.851106
6,l,5519,0.831348
7,k,5314,0.823665
8,j,5063,0.767572
9,m,4840,0.776077


# 고품질 데이터 1천개 선택

In [None]:
# For every fraud label: pick the generator whose synthetic rows were classified correctly
# most often, then keep that generator's correctly classified rows for the label with the
# highest predicted probability (top TOP_N_PER_LABEL rows).
TOP_N_PER_LABEL = 1000
PREDICTION_COLUMNS = ['predicted_max_values', 'correct', 'model']


def _correct_count(summary, label):
    """Return the number of correctly classified rows `summary` records for `label`.

    `summary` is one of the candiN_fin tables (columns 'transed_label' and 'count').
    A label that does not appear in the table has no correctly classified rows for
    that generator, so 0 is returned instead of raising.
    """
    matched = summary.loc[summary['transed_label'] == label, 'count']
    return 0 if matched.empty else int(matched.iloc[0])


fraud_types = train_all.Fraud_Type.unique()
result = []

candi1_synthetic_data = pd.read_csv(save_path + '/copulagan_syn_data.csv')
# Fill NaNs in the daily-limit column with the mean of the row's own Fraud_Type.
limit_col = 'Account_remaining_amount_daily_limit_exceeded'
fraud_type_means = candi1_synthetic_data.groupby('Fraud_Type')[limit_col].mean()
candi1_synthetic_data[limit_col] = candi1_synthetic_data[limit_col].fillna(
    candi1_synthetic_data['Fraud_Type'].map(fraud_type_means)
)

candi2_synthetic_data = pd.read_csv(save_path + '/tvae_syn_data.csv')
candi3_synthetic_data = pd.read_csv(save_path + '/ctgan_syn_data.csv')

# (name reported in `result`, per-label summary, raw synthetic rows, per-row predictions).
# The order matters: on a tie the earliest generator in this list wins.
candidates = [
    ('candi1_fin', candi1_fin, candi1_synthetic_data, candi1),
    ('candi2_fin', candi2_fin, candi2_synthetic_data, candi2),
    ('candi3_fin', candi3_fin, candi3_synthetic_data, candi3),
]

# Attach the prediction columns to the raw rows once per generator (joined on the index)
# and keep only the correctly classified rows; this does not depend on the label.
correct_rows = {}
for candi_name, _summary, syn_data, predictions_frame in candidates:
    scored = pd.concat([syn_data, predictions_frame[PREDICTION_COLUMNS]], axis=1)
    correct_rows[candi_name] = scored[scored['correct'] == True]

selected_parts = []
for x in fraud_types:
    counts = [(candi_name, _correct_count(summary, x)) for candi_name, summary, _syn, _pred in candidates]
    # max() returns the first maximal entry, i.e. candi1 before candi2 before candi3 on ties.
    best_name, max_count = max(counts, key=lambda item: item[1])
    print(x, max_count)

    # Choose the generator with the most matches and take its most confident rows for this label.
    pool = correct_rows[best_name]
    selected_parts.append(
        pool[pool['Fraud_Type'] == x]
        .sort_values('predicted_max_values', ascending=False)
        .head(TOP_N_PER_LABEL)
    )
    result.append((best_name, x, max_count))

# Concatenate once at the end instead of growing the frame inside the loop.
fin_syn_data = pd.concat(selected_parts, axis=0) if selected_parts else pd.DataFrame()

print(result)


m 5998
a 10000
j 5063
h 7920
k 8224
c 4912
g 1163
i 8774
b 6852
f 9896
d 3106
e 9344
l 7522
[('candi1_fin', 'm', 5998), ('candi2_fin', 'a', 10000), ('candi3_fin', 'j', 5063), ('candi2_fin', 'h', 7920), ('candi1_fin', 'k', 8224), ('candi2_fin', 'c', 4912), ('candi3_fin', 'g', 1163), ('candi2_fin', 'i', 8774), ('candi2_fin', 'b', 6852), ('candi3_fin', 'f', 9896), ('candi2_fin', 'd', 3106), ('candi2_fin', 'e', 9344), ('candi2_fin', 'l', 7522)]


In [None]:
# Sanity check: list any selected rows not flagged as correct (the selection keeps only correct rows).
fin_syn_data.loc[fin_syn_data['correct'] == False]

Unnamed: 0,Customer_Birthyear,Customer_Gender,Customer_personal_identifier,Customer_identification_number,Customer_registration_datetime,Customer_credit_rating,Customer_flag_change_of_authentication_1,Customer_flag_change_of_authentication_2,Customer_flag_change_of_authentication_3,Customer_flag_change_of_authentication_4,...,Recipient_account_suspend_status,Number_of_transaction_with_the_account,Transaction_history_with_the_account,First_time_iOS_by_vulnerable_user,Fraud_Type,Transaction_resumed_date,Time_difference,predicted_max_values,correct,model


In [None]:
# Remove the helper column that is no longer needed.
# Rebinding the name (rather than inplace=True) together with errors='ignore' keeps this
# cell safe to run more than once: a second run is a no-op instead of a KeyError.
fin_syn_data = fin_syn_data.drop(columns='correct', errors='ignore')

In [None]:
# Number of selected rows per fraud label.
fin_syn_data['Fraud_Type'].value_counts()

Unnamed: 0_level_0,count
Fraud_Type,Unnamed: 1_level_1
m,1000
a,1000
j,1000
h,1000
k,1000
c,1000
g,1000
i,1000
b,1000
f,1000


In [None]:
# Number of selected rows contributed by each generator.
fin_syn_data['model'].value_counts()

Unnamed: 0_level_0,count
model,Unnamed: 1_level_1
tvae,8000
ctgan,3000
copulagan,2000


In [None]:
# Generator chosen for each label: the distinct (Fraud_Type, model) pairs in the selection.
fin_syn_data.loc[:, ['Fraud_Type', 'model']].drop_duplicates()

Unnamed: 0,Fraud_Type,model
9678,m,copulagan
11087,a,tvae
28136,j,ctgan
38305,h,tvae
43702,k,copulagan
53000,c,tvae
60089,g,ctgan
76303,i,tvae
83606,b,tvae
96895,f,ctgan


In [None]:
# Persist the selected synthetic rows (without the index) under save_path.
fin_syn_data.to_csv(path_or_buf=save_path + '/filtered_syn_for_cls.csv', index=False)