# 구글 마운트

In [None]:
# Mount Google Drive at /content/drive; the CSV paths used later in this
# notebook all live under that mount point.
from google.colab import drive

drive.mount('/content/drive')

# **라이브러리 불러오기**

In [None]:
! pip install kaggler
import warnings
import numpy as np
import pandas as pd   
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import os
from kaggler.model import AutoLGB
import lightgbm as lgb

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.cluster import KMeans

warnings.filterwarnings('ignore')

In [None]:
# Show the working directory of the notebook kernel.
print('현재경로: {}'.format(os.getcwd()))

# Read the competition files from the mounted Drive folder.
train = pd.read_csv("/content/drive/MyDrive/dacon_card_predict/data/train.csv")
test = pd.read_csv("/content/drive/MyDrive/dacon_card_predict/data/test.csv")
submission = pd.read_csv("/content/drive/MyDrive/dacon_card_predict/data/sample_submission.csv")

# 'index' carries no information and 'FLAG_MOBIL' holds a single value for
# every row, so both are removed from train and test alike.
_unused_columns = ['index', 'FLAG_MOBIL']
for _frame in (train, test):
    _frame.drop(_unused_columns, axis=1, inplace=True)

# Keep untouched copies: the feature-engineering functions below modify the
# frames they are given.
train_original, train_original2 = train.copy(), train.copy()
test_original, test_original2 = test.copy(), test.copy()

print('train의 Shape: {}'.format(train.shape))
print('test의 Shape: {}'.format(test.shape))
print('submission의 Shape: {}'.format(submission.shape))

## FIT 객체 정의

In [None]:
def _fit_credit_score_map(frame, column):
    """Map every category of ``column`` to a min-max scaled credit score.

    For each category the share of every ``credit`` class is computed and
    rounded to three decimals. The score is the share-weighted sum of the
    class values (``p1 * 1 + p2 * 2`` for classes 0/1/2), which is then scaled
    to [0, 1] across the categories with ``MinMaxScaler``.

    Counting with ``groupby(...).size()`` does not depend on the name pandas
    gives to a ``value_counts`` result, and a class that never occurs for a
    category contributes a share of 0 instead of NaN.
    """
    counts = frame.groupby([column, 'credit']).size().unstack(fill_value=0)
    shares = counts.div(counts.sum(axis=1), axis=0).round(3)
    score = sum(level * shares[level] for level in shares.columns)
    scaled = MinMaxScaler().fit_transform(score.values.reshape(-1, 1)).ravel()
    return pd.Series(scaled, index=score.index).to_dict()


def _fit_income_stats(frame, key):
    """Return (mean, max, min) dicts of ``income_total`` per value of ``key``."""
    # Select the column before aggregating so that the string columns still
    # present in the frame are never passed to ``mean``.
    stats = frame.groupby(key)['income_total'].agg(['mean', 'max', 'min'])
    return tuple(stats[name].to_dict() for name in ('mean', 'max', 'min'))


def _fit_lookup(series, table):
    """Translate ``series`` through ``table``; unknown values become 0."""
    return series.apply(lambda key: table.get(key, 0))


def _fit_select(conditions, labels, default, index):
    """Return a Series holding the label of the first matching condition."""
    masks = [np.asarray(mask, dtype=bool) for mask in conditions]
    return pd.Series(np.select(masks, labels, default=default), index=index)


def _fit_age_bins(days):
    """Bin positive day counts into decades: 20s -> 1 ... 60s -> 5, else 9999."""
    decades = (20, 30, 40, 50, 60, 70)
    masks = [(365 * lo <= days) & (days < 365 * hi)
             for lo, hi in zip(decades[:-1], decades[1:])]
    return _fit_select(masks, list(range(1, len(masks) + 1)), 9999, days.index)


def _fit_employment_bins(days):
    """Bin sign-flipped DAYS_EMPLOYED into 0 (negative) ... 9, else 9999.

    Bin 1 uses a strict lower bound, so a value of exactly 0 matches no bin
    and keeps the default 9999.
    """
    years = (5, 10, 20, 30, 40, 50, 60, 70, 80)
    masks = [days < 0, (0 < days) & (days < 365 * 5)]
    masks += [(365 * lo <= days) & (days < 365 * hi)
              for lo, hi in zip(years[:-1], years[1:])]
    return _fit_select(masks, list(range(len(masks))), 9999, days.index)


def make_fit_instance(train):
    """Fit every lookup table, encoder, scaler and the KMeans model.

    The same feature pipeline as ``preprocessing`` is run on a copy of
    ``train`` (the caller's frame is left untouched) and each fitted artefact
    is collected along the way.

    Args:
        train: training frame containing the raw columns used below and the
            ``credit`` target.

    Returns:
        A 23-tuple, in this order: the credit-score dicts for occyp_type,
        income_type, house_type and edu_type; the value-count dicts for
        income_type and house_type; (mean, max, min) income dicts for
        DAYS_BIRTH_bin, DAYS_EMPLOYED_bin, house_type and
        edu_type_labelencoding; the two fitted OneHotEncoders (income_type,
        house_type); the fitted StandardScaler and MinMaxScaler for
        income_total; and the fitted KMeans model.
    """
    train = train.copy()

    # Missing values: a positive DAYS_EMPLOYED is labelled 'NoJop', remaining
    # gaps in occyp_type become the category 'None'.
    train.loc[train['DAYS_EMPLOYED'] > 0, 'occyp_type'] = 'NoJop'
    train['occyp_type'] = train['occyp_type'].fillna('None')

    # 1. Credit-score encodings of the categorical columns.
    dict_occyp_type = _fit_credit_score_map(train, 'occyp_type')
    dict_income_type = _fit_credit_score_map(train, 'income_type')
    dict_house_type = _fit_credit_score_map(train, 'house_type')
    dict_edu_type = _fit_credit_score_map(train, 'edu_type')
    score_tables = (
        ('occyp_type', dict_occyp_type),
        ('income_type', dict_income_type),
        ('house_type', dict_house_type),
        ('edu_type', dict_edu_type),
    )
    for column, table in score_tables:
        train[column + '_labelencoding'] = _fit_lookup(train[column], table)
    # income_type and house_type are still needed for counts and one-hot.
    train.drop(['occyp_type', 'edu_type'], axis=1, inplace=True)

    # 2. Car and real estate are both expensive assets: count how many of the
    #    two a person owns (0, 1 or 2).
    train['gender'] = train['gender'].replace(['F', 'M'], [0, 1])
    train['car'] = train['car'].replace(['N', 'Y'], [0, 1])
    train['reality'] = train['reality'].replace(['N', 'Y'], [0, 1])
    train['car_reality'] = train['car'] + train['reality']

    # 3. Age in days (sign flipped to positive) binned by decade.
    train['DAYS_BIRTH'] = train['DAYS_BIRTH'] * -1
    train['DAYS_BIRTH_bin'] = _fit_age_bins(train['DAYS_BIRTH'])

    # 4. Number of children: none / 1-2 / 3 or more.
    child_num = train['child_num']
    train['child_num_group'] = _fit_select(
        [child_num == 0, child_num.isin([1, 2]), child_num > 2],
        [0, 1, 2], 99, train.index)
    train.drop('child_num', axis=1, inplace=True)

    # 5. Family size: 1 / 2-4 / 5 or more.
    family_size = train['family_size']
    train['family_size_group'] = _fit_select(
        [family_size == 1, family_size.isin([2, 3, 4]), family_size > 4],
        [0, 1, 2], 99, train.index)
    train.drop('family_size', axis=1, inplace=True)

    # 6. Marital status: married / single / separated or widowed.
    family_type = train['family_type']
    train['family_type_group'] = _fit_select(
        [family_type.isin(['Married', 'Civil marriage']),
         family_type.isin(['Single / not married']),
         family_type.isin(['Separated', 'Widow'])],
        [0, 1, 2], 999, train.index)
    train.drop('family_type', axis=1, inplace=True)

    # 8. Employment length (sign flipped) binned; negative values form bin 0.
    train['DAYS_EMPLOYED'] = train['DAYS_EMPLOYED'] * -1
    train['DAYS_EMPLOYED_bin'] = _fit_employment_bins(train['DAYS_EMPLOYED'])

    # 9. Income weighted by employment bin. Bins without an explicit weight
    #    (7, 8, 9 and the default 9999) keep the original factor of 9999.
    income_weights = {0: 0, 1: 6/21, 2: 5/21, 3: 4/21, 4: 3/21, 5: 2/21, 6: 1/21}
    train['EMPLOYED_INCOME'] = (
        train['DAYS_EMPLOYED_bin'].map(income_weights).fillna(9999)
        * train['income_total']
    )

    # Frequency of each income_type / house_type category.
    dict_income_type_valuecount = train['income_type'].value_counts().to_dict()
    dict_house_type_valuecount = train['house_type'].value_counts().to_dict()
    train['income_type_count'] = _fit_lookup(train['income_type'], dict_income_type_valuecount)
    train['house_type_count'] = _fit_lookup(train['house_type'], dict_house_type_valuecount)

    # Mean / max / min of income_total per group.
    (dict_DAYS_BIRTH_bin_mean,
     dict_DAYS_BIRTH_bin_max,
     dict_DAYS_BIRTH_bin_min) = _fit_income_stats(train, 'DAYS_BIRTH_bin')
    train['averageincome'] = _fit_lookup(train['DAYS_BIRTH_bin'], dict_DAYS_BIRTH_bin_mean)
    train['maxincome'] = _fit_lookup(train['DAYS_BIRTH_bin'], dict_DAYS_BIRTH_bin_max)
    train['minincome'] = _fit_lookup(train['DAYS_BIRTH_bin'], dict_DAYS_BIRTH_bin_min)

    (dict_DAYS_EMPLOYED_bin_mean,
     dict_DAYS_EMPLOYED_bin_max,
     dict_DAYS_EMPLOYED_bin_min) = _fit_income_stats(train, 'DAYS_EMPLOYED_bin')
    train['averagehouse'] = _fit_lookup(train['DAYS_EMPLOYED_bin'], dict_DAYS_EMPLOYED_bin_mean)
    train['maxinhouse'] = _fit_lookup(train['DAYS_EMPLOYED_bin'], dict_DAYS_EMPLOYED_bin_max)
    train['mininhouse'] = _fit_lookup(train['DAYS_EMPLOYED_bin'], dict_DAYS_EMPLOYED_bin_min)

    (dict_house_type_mean,
     dict_house_type_max,
     dict_house_type_min) = _fit_income_stats(train, 'house_type')
    train['averagerealhouse'] = _fit_lookup(train['house_type'], dict_house_type_mean)
    train['maxrealhouse'] = _fit_lookup(train['house_type'], dict_house_type_max)
    train['minrealhouse'] = _fit_lookup(train['house_type'], dict_house_type_min)

    (dict_edu_type_labelencoding_mean,
     dict_edu_type_labelencoding_max,
     dict_edu_type_labelencoding_min) = _fit_income_stats(train, 'edu_type_labelencoding')
    train['averageedu'] = _fit_lookup(train['edu_type_labelencoding'], dict_edu_type_labelencoding_mean)
    train['maxedu'] = _fit_lookup(train['edu_type_labelencoding'], dict_edu_type_labelencoding_max)
    train['minedu'] = _fit_lookup(train['edu_type_labelencoding'], dict_edu_type_labelencoding_min)

    # One-hot encoders (each fitted exactly once); the encoded columns are
    # appended under integer column names and replace the source column.
    OH_encoder1 = OneHotEncoder(handle_unknown='ignore', sparse=False)
    OH_cols_train1 = pd.DataFrame(OH_encoder1.fit_transform(train[['income_type']]), index=train.index)
    train = pd.concat([train.drop('income_type', axis=1), OH_cols_train1], axis=1)

    OH_encoder2 = OneHotEncoder(handle_unknown='ignore', sparse=False)
    OH_cols_train2 = pd.DataFrame(OH_encoder2.fit_transform(train[['house_type']]), index=train.index)
    train = pd.concat([train.drop('house_type', axis=1), OH_cols_train2], axis=1)

    # Sum of the binary indicator columns.
    binary = ['gender', 'car', 'reality', 'work_phone', 'phone', 'email']
    train['bin_sum'] = train[binary].sum(axis=1)

    # Scalers for income_total.
    standardscaler = StandardScaler()
    train['income_stand'] = standardscaler.fit_transform(train[['income_total']])

    minmaxscaler = MinMaxScaler()
    train['income_minmax'] = minmaxscaler.fit_transform(train[['income_total']])

    # KMeans on every engineered feature except the target. The frame mixes
    # integer (one-hot) and string column names, so a plain array is passed
    # to keep scikit-learn's feature-name validation out of the way.
    train_x = train.drop(["credit"], axis=1)
    kmeans = KMeans(n_clusters=10, n_init=10, random_state=0)
    kmeans.fit(train_x.values)

    return (
        dict_occyp_type, dict_income_type, dict_house_type, dict_edu_type,
        dict_income_type_valuecount, dict_house_type_valuecount,
        dict_DAYS_BIRTH_bin_mean, dict_DAYS_BIRTH_bin_max, dict_DAYS_BIRTH_bin_min,
        dict_DAYS_EMPLOYED_bin_mean, dict_DAYS_EMPLOYED_bin_max, dict_DAYS_EMPLOYED_bin_min,
        dict_house_type_mean, dict_house_type_max, dict_house_type_min,
        dict_edu_type_labelencoding_mean, dict_edu_type_labelencoding_max,
        dict_edu_type_labelencoding_min,
        OH_encoder1, OH_encoder2, standardscaler, minmaxscaler, kmeans,
    )

In [None]:
# Fit all lookup tables, encoders, scalers and the KMeans model on the
# training data; `preprocessing` reads these names as module-level globals.
(
    dict_occyp_type,
    dict_income_type,
    dict_house_type,
    dict_edu_type,
    dict_income_type_valuecount,
    dict_house_type_valuecount,
    dict_DAYS_BIRTH_bin_mean,
    dict_DAYS_BIRTH_bin_max,
    dict_DAYS_BIRTH_bin_min,
    dict_DAYS_EMPLOYED_bin_mean,
    dict_DAYS_EMPLOYED_bin_max,
    dict_DAYS_EMPLOYED_bin_min,
    dict_house_type_mean,
    dict_house_type_max,
    dict_house_type_min,
    dict_edu_type_labelencoding_mean,
    dict_edu_type_labelencoding_max,
    dict_edu_type_labelencoding_min,
    OH_encoder1,
    OH_encoder2,
    standardscaler,
    minmaxscaler,
    kmeans,
) = make_fit_instance(train)

## 전처리 함수 정의

In [None]:
def _prep_lookup(series, table):
    """Translate ``series`` through ``table``; unknown values become 0."""
    return series.apply(lambda key: table.get(key, 0))


def _prep_select(conditions, labels, default, index):
    """Return a Series holding the label of the first matching condition."""
    masks = [np.asarray(mask, dtype=bool) for mask in conditions]
    return pd.Series(np.select(masks, labels, default=default), index=index)


def _prep_age_bins(days):
    """Bin positive day counts into decades: 20s -> 1 ... 60s -> 5, else 9999."""
    decades = (20, 30, 40, 50, 60, 70)
    masks = [(365 * lo <= days) & (days < 365 * hi)
             for lo, hi in zip(decades[:-1], decades[1:])]
    return _prep_select(masks, list(range(1, len(masks) + 1)), 9999, days.index)


def _prep_employment_bins(days):
    """Bin sign-flipped DAYS_EMPLOYED into 0 (negative) ... 9, else 9999.

    Bin 1 uses a strict lower bound, so a value of exactly 0 matches no bin
    and keeps the default 9999.
    """
    years = (5, 10, 20, 30, 40, 50, 60, 70, 80)
    masks = [days < 0, (0 < days) & (days < 365 * 5)]
    masks += [(365 * lo <= days) & (days < 365 * hi)
              for lo, hi in zip(years[:-1], years[1:])]
    return _prep_select(masks, list(range(len(masks))), 9999, days.index)


def _prep_engineer(frame):
    """Apply every per-row feature step up to the income scalers to ``frame``.

    Reads the fitted lookup tables, encoders and scalers from module scope.
    ``frame`` is modified and the resulting frame is returned (the one-hot
    step builds a new object).
    """
    # Missing values: a positive DAYS_EMPLOYED is labelled 'NoJop', remaining
    # gaps in occyp_type become the category 'None'.
    frame.loc[frame['DAYS_EMPLOYED'] > 0, 'occyp_type'] = 'NoJop'
    frame['occyp_type'] = frame['occyp_type'].fillna('None')

    # 1. Credit-score encodings; every column uses its OWN fitted table.
    score_tables = (
        ('occyp_type', dict_occyp_type),
        ('income_type', dict_income_type),
        ('house_type', dict_house_type),
        ('edu_type', dict_edu_type),
    )
    for column, table in score_tables:
        frame[column + '_labelencoding'] = _prep_lookup(frame[column], table)
    # edu_type_labelencoding keeps the credit-score values: the
    # dict_edu_type_labelencoding_* tables used below are keyed by them.
    frame.drop(['occyp_type', 'edu_type'], axis=1, inplace=True)

    # 2. Car and real estate are both expensive assets: count how many of the
    #    two a person owns (0, 1 or 2).
    frame['gender'] = frame['gender'].replace(['F', 'M'], [0, 1])
    frame['car'] = frame['car'].replace(['N', 'Y'], [0, 1])
    frame['reality'] = frame['reality'].replace(['N', 'Y'], [0, 1])
    frame['car_reality'] = frame['car'] + frame['reality']

    # 3. Age in days (sign flipped to positive) binned by decade.
    frame['DAYS_BIRTH'] = frame['DAYS_BIRTH'] * -1
    frame['DAYS_BIRTH_bin'] = _prep_age_bins(frame['DAYS_BIRTH'])

    # 4. Number of children: none / 1-2 / 3 or more.
    child_num = frame['child_num']
    frame['child_num_group'] = _prep_select(
        [child_num == 0, child_num.isin([1, 2]), child_num > 2],
        [0, 1, 2], 99, frame.index)
    frame.drop('child_num', axis=1, inplace=True)

    # 5. Family size: 1 / 2-4 / 5 or more.
    family_size = frame['family_size']
    frame['family_size_group'] = _prep_select(
        [family_size == 1, family_size.isin([2, 3, 4]), family_size > 4],
        [0, 1, 2], 99, frame.index)
    frame.drop('family_size', axis=1, inplace=True)

    # 6. Marital status: married / single / separated or widowed.
    family_type = frame['family_type']
    frame['family_type_group'] = _prep_select(
        [family_type.isin(['Married', 'Civil marriage']),
         family_type.isin(['Single / not married']),
         family_type.isin(['Separated', 'Widow'])],
        [0, 1, 2], 999, frame.index)
    frame.drop('family_type', axis=1, inplace=True)

    # 8. Employment length (sign flipped) binned; negative values form bin 0.
    frame['DAYS_EMPLOYED'] = frame['DAYS_EMPLOYED'] * -1
    frame['DAYS_EMPLOYED_bin'] = _prep_employment_bins(frame['DAYS_EMPLOYED'])

    # 9. Income weighted by employment bin. Bins without an explicit weight
    #    (7, 8, 9 and the default 9999) keep the original factor of 9999.
    income_weights = {0: 0, 1: 6/21, 2: 5/21, 3: 4/21, 4: 3/21, 5: 2/21, 6: 1/21}
    frame['EMPLOYED_INCOME'] = (
        frame['DAYS_EMPLOYED_bin'].map(income_weights).fillna(9999)
        * frame['income_total']
    )

    # 11. Category frequencies taken from the fitted tables.
    frame['income_type_count'] = _prep_lookup(frame['income_type'], dict_income_type_valuecount)
    frame['house_type_count'] = _prep_lookup(frame['house_type'], dict_house_type_valuecount)

    # Mean / max / min of income_total per group, from the fitted tables.
    group_stats = (
        ('DAYS_BIRTH_bin',
         ('averageincome', dict_DAYS_BIRTH_bin_mean),
         ('maxincome', dict_DAYS_BIRTH_bin_max),
         ('minincome', dict_DAYS_BIRTH_bin_min)),
        ('DAYS_EMPLOYED_bin',
         ('averagehouse', dict_DAYS_EMPLOYED_bin_mean),
         ('maxinhouse', dict_DAYS_EMPLOYED_bin_max),
         ('mininhouse', dict_DAYS_EMPLOYED_bin_min)),
        ('house_type',
         ('averagerealhouse', dict_house_type_mean),
         ('maxrealhouse', dict_house_type_max),
         ('minrealhouse', dict_house_type_min)),
        ('edu_type_labelencoding',
         ('averageedu', dict_edu_type_labelencoding_mean),
         ('maxedu', dict_edu_type_labelencoding_max),
         ('minedu', dict_edu_type_labelencoding_min)),
    )
    for key, *targets in group_stats:
        for new_column, table in targets:
            frame[new_column] = _prep_lookup(frame[key], table)

    # One-hot encode income_type, then house_type; the encoded columns are
    # appended under integer column names and replace the source column.
    for encoder, column in ((OH_encoder1, 'income_type'), (OH_encoder2, 'house_type')):
        encoded = pd.DataFrame(encoder.transform(frame[[column]]), index=frame.index)
        frame = pd.concat([frame.drop(column, axis=1), encoded], axis=1)

    # Sum of the binary indicator columns.
    binary = ['gender', 'car', 'reality', 'work_phone', 'phone', 'email']
    frame['bin_sum'] = frame[binary].sum(axis=1)

    # Standardised and min-max scaled income.
    frame['income_stand'] = standardscaler.transform(frame[['income_total']])
    frame['income_minmax'] = minmaxscaler.transform(frame[['income_total']])

    return frame


def _prep_add_centroids(frame, features):
    """Append the KMeans ``transform`` output for ``features`` as Centroid_i columns."""
    # A plain array is passed because the frame mixes integer (one-hot) and
    # string column names.
    distances = kmeans.transform(features.values)
    columns = [f"Centroid_{i+1}" for i in range(distances.shape[1])]
    centroid_frame = pd.DataFrame(distances, columns=columns, index=features.index)
    return pd.concat([frame, centroid_frame], axis=1)


def _prep_add_date_parts(frame, source):
    """Add ``<source>_year``, ``_month`` and ``_week`` columns derived from ``source``.

    NOTE(review): DAYS_BIRTH and DAYS_EMPLOYED were already multiplied by -1
    earlier in the pipeline, so the negation below produces negative values
    for positive inputs. The original arithmetic is kept unchanged; confirm
    whether the sign is intended.
    """
    negated = -frame[source]
    months = np.floor(negated / 30)
    weeks = np.floor(negated / 7)
    frame[source + '_year'] = np.floor(negated / 365)
    frame[source + '_month'] = months - (months / 12).astype(int) * 12
    frame[source + '_week'] = weeks - (weeks / 4).astype(int) * 4


def preprocessing(train, test, mode):
    """Build the model features for a train frame and a second frame.

    Both frames go through the same pipeline, driven by the lookup tables,
    encoders, scalers and KMeans model fitted by ``make_fit_instance`` (read
    from module scope). The inputs are copied first, so the caller's frames
    are not modified.

    Args:
        train: training frame including the ``credit`` column.
        test: validation or test frame with the same raw columns.
        mode: ``'valid'`` when ``test`` contains ``credit`` (it is excluded
            from the KMeans input), ``'test'`` when it does not.

    Returns:
        ``(train, test)`` with all engineered columns appended.

    Raises:
        ValueError: if ``mode`` is neither ``'valid'`` nor ``'test'``.
    """
    if mode not in ('valid', 'test'):
        raise ValueError("mode must be 'valid' or 'test', got {!r}".format(mode))

    train = _prep_engineer(train.copy())
    test = _prep_engineer(test.copy())

    # KMeans centroid features; the target column is never part of the input.
    train_x = train.drop(["credit"], axis=1)
    test_x = test.drop(["credit"], axis=1) if mode == 'valid' else test.copy()
    train = _prep_add_centroids(train, train_x)
    test = _prep_add_centroids(test, test_x)

    # Date-derived features.
    for frame in (train, test):
        _prep_add_date_parts(frame, 'DAYS_BIRTH')
        _prep_add_date_parts(frame, 'DAYS_EMPLOYED')
        frame['before_EMPLOYED'] = frame['DAYS_BIRTH'] - frame['DAYS_EMPLOYED']
        _prep_add_date_parts(frame, 'before_EMPLOYED')

    return train, test

## 하이퍼파라미터 찾기

In [None]:
# Preprocess the full training set together with the real test set.
train_tuning, test = preprocessing(train_original2, test_original2, 'test')

# Separate the target from the features.
train_tuning_y = train_tuning['credit']
train_tuning_x = train_tuning.drop(columns=["credit"])

# Stratified 80/20 hold-out split used for hyper-parameter tuning.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_tuning_x,
    train_tuning_y,
    test_size=0.2,
    stratify=train_tuning_y,
    random_state=0,
)

In [None]:
# Derive the LightGBM parameters with AutoLGB, starting from these settings.
params = {
    'num_class': 3,
    'learning_rate': 0.01,
}

clf = AutoLGB(
    objective='multiclass',
    metric='multi_logloss',
    params=params,
    feature_selection=False,
    n_est=10000,
)
clf.tune(X_train, y_train)

# Results exposed by the tuner: iteration count, features used, parameters.
n_best, features, params = clf.n_best, clf.features, clf.params

print(f'best iteration: {n_best}')
print(f'selected features ({len(features)}): {features}') 
print(f'params: {params}')

## KFOLD 모델링

In [None]:
# Train one LightGBM model per stratified fold.

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
folds = list(skf.split(train_original, train_original['credit']))

lgb_models = {}
for fold, (train_idx, valid_idx) in enumerate(folds):
    print(f'===================================={fold+1}============================================')

    # Slice the raw rows of this fold and run the shared feature pipeline;
    # 'valid' tells it that the second frame also carries the target.
    TRAIN, VALID = preprocessing(
        train_original.iloc[train_idx],
        train_original.iloc[valid_idx],
        'valid',
    )

    y_train, y_valid = TRAIN['credit'].values, VALID['credit'].values
    X_train = TRAIN.drop(columns=['credit']).values
    X_valid = VALID.drop(columns=['credit']).values

    lgb_dtrain = lgb.Dataset(data=X_train, label=y_train)
    lgb_dvalid = lgb.Dataset(data=X_valid, label=y_valid)

    # Up to 10000 rounds, stopping once the validation metric has not
    # improved for 100 rounds; progress is printed every 200 rounds.
    lgb_model = lgb.train(
        params,
        lgb_dtrain,
        10000,
        valid_sets=[lgb_dvalid],
        early_stopping_rounds=100,
        verbose_eval=200,
    )
    lgb_models[fold] = lgb_model

    print(f'================================================================================\n\n')

## submission

In [None]:
# Average the predictions of all fold models into the submission columns
# (every column after the first). The divisor is taken from the number of
# trained models instead of a hard-coded 5, so it cannot drift out of sync
# with the cross-validation setup.
n_models = len(lgb_models)
submission.iloc[:, 1:] = 0
for fold in lgb_models:
    submission.iloc[:, 1:] += lgb_models[fold].predict(test) / n_models

submission.head(20)

In [None]:
# Save the averaged predictions to Drive; the DataFrame index is not written.
submission.to_csv("/content/drive/MyDrive/dacon_card_predict/submission/0510_3.csv", index = False)