In [5]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd   
import matplotlib.pyplot as plt
import seaborn as sns
import itertools

from sklearn.preprocessing import LabelEncoder

In [7]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')
data = pd.concat([train, test]).drop(['index'], axis=1)

In [8]:
# 모든 사람은 핸드폰을 가지고 있다.
data = data.drop(['FLAG_MOBIL'], axis=1)

# 음수들 양수로 변환 변환
data['DAYS_BIRTH'] = -data['DAYS_BIRTH']
data['DAYS_EMPLOYED'] = -data['DAYS_EMPLOYED']
data['begin_month'] = -data['begin_month']

# DAYS_EMPLOYED : 일하고 있지 않은 사람들 0으로
data['DAYS_EMPLOYED'] = data['DAYS_EMPLOYED'].apply(lambda x : x if x >=0 else 0)

data.head()

Unnamed: 0,DAYS_BIRTH,DAYS_EMPLOYED,begin_month,car,child_num,credit,edu_type,email,family_size,family_type,gender,house_type,income_total,income_type,occyp_type,phone,reality,work_phone
0,13899,4709,6.0,N,0,1.0,Higher education,0,2.0,Married,F,Municipal apartment,202500.0,Commercial associate,,0,N,0
1,11380,1540,5.0,N,1,1.0,Secondary / secondary special,1,3.0,Civil marriage,F,House / apartment,247500.0,Commercial associate,Laborers,0,Y,0
2,19087,4434,22.0,Y,0,2.0,Higher education,0,2.0,Married,M,House / apartment,450000.0,Working,Managers,1,Y,0
3,15088,2092,37.0,N,0,0.0,Secondary / secondary special,0,2.0,Married,F,House / apartment,202500.0,Commercial associate,Sales staff,1,Y,0
4,15037,2105,26.0,Y,0,2.0,Higher education,0,2.0,Married,F,House / apartment,157500.0,State servant,Managers,0,Y,0


# 데이터셋
### 이산형 자료형
1. gender(성별)
2. car(차 소유 유무)
3. reality(부동산 소유 우무)
4. income_type(소득 분류) 
    - Commercial associate : 상업 관계자..? 
    - Pensioner : 연금 수령자 
5. edu_type(교육 수준)
6. familly_type(결혼 여부)
7. house_type(생활 방식)
8. FLAG_MOBIL(핸드폰 소유 여부)
9. work_phone(업무용 전화 소유 여부)
10. phone(전화 소유 여부)
11. email(이메일 소유 여부)
12. occyp_type(직업 유형)

### 연속형
1. child_num(자녀 수)
2. income_total(연간 소득)
3. DAYS_BIRTH(출생일)
4. DAYS_EMPLOYED(업무 시작일)
5. family_size(가족 규모)
6. begin_month(신용카드 발급 월)

# 데이터 전처리

- Log-transformation of the target variable
- 이상치를 모두 제거하는 대신에 후에 모델에서 이런 데이터를 제어하는 방법을 배울 것입니다.
- RL이 최빈값으로 빈 부분은 RL로 채웁니다. mode 메서드는 가장 많이 나타나는 값을 자동으로 선택해줍니다.
- occyp_type : 결측이 존재 : 결측에 수입이 있는걸로 봐서 아에 no_job은 아닌듯! -> 클러스터링 등의 방법으로 결측을 채워보자! work_phone, phone, email, car, reality, edu_type, income_total 등으로 채워보자 !
- Box-Cox Transformation은 정규 분포가 아닌 데이터를 정규 분포 형태로 변환하는 방법 중 하나

In [63]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')
data = pd.concat([train, test]).drop(['index'], axis=1)
data.shape

(36457, 19)

In [64]:
# 모든 사람은 핸드폰을 가지고 있다.
data = data.drop(['FLAG_MOBIL'], axis=1)

# 음수들 양수로 변환 변환
data['DAYS_BIRTH'] = -data['DAYS_BIRTH']
data['DAYS_EMPLOYED'] = -data['DAYS_EMPLOYED']
data['begin_month'] = -data['begin_month']

# DAYS_EMPLOYED : 일하고 있지 않은 사람들 0으로
data['DAYS_EMPLOYED'] = data['DAYS_EMPLOYED'].apply(lambda x : x if x >=0 else 0)

# 무직자들(DAYS_EMPLOYED < 0)은 모두 occyp_type이 NaN이다! -> noJob
occyp_type = []
for i, x in data.iterrows():
    if x.DAYS_EMPLOYED < 0:
        occyp_type.append('noJob')
    else:
        occyp_type.append(x.occyp_type)
data['occyp_type'] = occyp_type

# 남은 NaN값들을 채워주자! -> 클러스터링


# 어른 1명당 부양가족
data['adult'] = data['family_size'] - data['child_num']
adult_family_ratio = []
for i, x in data.iterrows():
    if x.adult > 0:
        adult_family_ratio.append(x.family_size / x.adult)
    else:
        adult_family_ratio.append(5)
data['adult_family_ratio'] = adult_family_ratio
data['adult_family_ratio'] = data['adult_family_ratio'].apply(lambda x: x if x<=4 else 5)

data = data.drop(['family_size', 'child_num'], axis = 1)

# edu_type 순서화
edu_dict = {'Secondary / secondary special' : 2, 'Higher education' : 4,'Incomplete higher' : 3, 'Lower secondary' : 1, 'Academic degree' : 5}
data['edu_type'] = data['edu_type'].apply(lambda x: edu_dict[x])

# 연속형 변수 스케일링
# income_total : robust scaling
# DAYS_BIRTH, begin_month : standard scaling
#rbScaler = RobustScaler()
#data['income_total'] = rbScaler.fit_transform(data[['income_total']])
#sdScaler = StandardScaler()
#data[['DAYS_BIRTH', 'begin_month']] = sdScaler.fit_transform(data[['DAYS_BIRTH', 'begin_month']])

# 범주형 변수 인코딩
category_list = ['car', 'email', 'gender', 'phone','reality', 'work_phone',
                 'edu_type', 'family_type', 'house_type', 'income_type', 'occyp_type']
from sklearn.preprocessing import LabelEncoder
for cat in category_list:
    lbl = LabelEncoder()
    if cat == 'occyp_type':
        pass
        data['occyp_type'] = data['occyp_type'].fillna('NaN')
        data['occyp_type'] = lbl.fit_transform(data['occyp_type'].values)
        #data['occyp_type'] = data['occyp_type'].replace(12, np.NaN)
    else:
        data[cat] = lbl.fit_transform(data[cat].values)


print(data.shape)
data.head(5)

(36457, 18)


Unnamed: 0,DAYS_BIRTH,DAYS_EMPLOYED,begin_month,car,credit,edu_type,email,family_type,gender,house_type,income_total,income_type,occyp_type,phone,reality,work_phone,adult,adult_family_ratio
0,13899,4709,6.0,0,1.0,3,0,1,0,2,202500.0,0,12,0,0,0,2.0,1.0
1,11380,1540,5.0,0,1.0,1,1,0,0,1,247500.0,0,8,0,1,0,2.0,1.5
2,19087,4434,22.0,1,2.0,3,0,1,1,1,450000.0,4,10,1,1,0,2.0,1.0
3,15088,2092,37.0,0,0.0,1,0,1,0,1,202500.0,0,15,1,1,0,2.0,1.0
4,15037,2105,26.0,1,2.0,3,0,1,0,1,157500.0,2,10,0,1,0,2.0,1.0


In [65]:
credit = data['credit']
del data['credit']

# https://datawig.readthedocs.io/en/latest/index.html
import datawig

cols = list(data.columns)
cols.remove('occyp_type')

#Initialize a SimpleImputer model
imputer = datawig.SimpleImputer(
    input_columns= cols, # column(s) containing information about the column we want to impute
    output_column='occyp_type', # the column we'd like to impute values for
    )

#Fit an imputer model on the train data
imputer.fit(train_df=data, num_epochs=50)

#Impute missing values and return original dataframe with predictions
imputed = imputer.predict(data)
imputed

### PCA

In [32]:
from sklearn.decomposition import PCA, SparsePCA 
pca = SparsePCA(n_components = 5)
data = pca.fit_transform(data)
data

array([[ 1.56610994e+04, -2.04283492e+03, -2.42620490e+03,
         1.99638687e+01,  2.87304942e+00],
       [ 6.02194862e+04, -4.37454263e+03,  8.29872775e+02,
         2.09529295e+01, -1.06579165e+00],
       [ 2.60702649e+05,  3.77513226e+03, -1.75188899e+03,
         4.12121369e+00,  7.45828312e-01],
       ...,
       [ 1.04759175e+05,  5.13580671e+03, -1.15649500e+04,
        -2.85502161e+01,  1.80339250e+00],
       [-6.62049089e+03,  5.55515523e+02,  1.08401259e+03,
        -6.76920174e+00,  2.87045949e+00],
       [ 8.25002235e+04, -6.50008713e+03,  2.24146061e+03,
         1.50128866e+01, -1.20369232e+00]])

from sklearn.preprocessing import LabelEncoder

lbl = LabelEncoder()
imputed['occyp_type'] = lbl.fit_transform(imputed['occyp_type_imputed'])
data = imputed.drop(['occyp_type_imputed', 'occyp_type_imputed_proba'],axis=1)

In [67]:
X_train = data[credit.notnull()]
X_test = data[credit.isnull()]
y_train = credit.dropna().astype(int)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)

(26457, 17)
(10000, 17)
(26457,)


# 모델링

## 스태킹
- XGBoost
- LightGBM
- CatBoost

### XGBOOST

In [37]:
from sklearn.ensemble import RandomForestClassifier,  GradientBoostingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

from bayes_opt import BayesianOptimization

In [38]:
from bayes_opt import BayesianOptimization
from sklearn.metrics import log_loss
from lightgbm import LGBMClassifier

### LightGBM

In [68]:
train_x, valid_x, train_y, valid_y = train_test_split(X_train, y_train, test_size=0.3, random_state=2020)

#### BAYESIAN_PARAMETER_OPTIMIZATION

In [69]:
# parameter 별로 search할 범위를 설정. 
bayesian_params = {
    'max_depth': (1, 5), 
    'num_leaves': (24, 64), 
    'min_child_samples': (10, 200), 
    'min_child_weight':(1, 50),
    'subsample':(0.5, 1.0),
    'colsample_bytree': (0.5, 1.0),
    'max_bin':(10, 500),
    'reg_lambda':(0.001, 10),
    'reg_alpha': (0.001, 10) 
}

In [70]:
def lgb_roc_eval(max_depth, num_leaves, min_child_samples, min_child_weight, subsample, 
                colsample_bytree,max_bin, reg_lambda, reg_alpha):
    params = {
        'objective': 'multiclass',
        "n_estimators":500, "learning_rate":0.02,
        'max_depth': int(round(max_depth)), #  호출 시 실수형 값이 들어오므로 정수형 하이퍼 파라미터는 정수형으로 변경 
        'num_leaves': int(round(num_leaves)), 
        'min_child_samples': int(round(min_child_samples)),
        'min_child_weight': int(round(min_child_weight)),
        'subsample': max(min(subsample, 1), 0), 
        'colsample_bytree': max(min(colsample_bytree, 1), 0),
        'max_bin':  max(int(round(max_bin)),10),
        'reg_lambda': max(reg_lambda,0),
        'reg_alpha': max(reg_alpha, 0)
    }
    lgb_model = LGBMClassifier(**params)
    lgb_model.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric= 'logloss', verbose= 100, 
                early_stopping_rounds= 100)
    valid_proba = lgb_model.predict_proba(valid_x)
    logLoss = log_loss(valid_y, valid_proba)
    
    return -logLoss

In [71]:
# BayesianOptimization객체를 수행할 함수와 search할 parameter 범위를 설정하여 생성. 
lgbBO = BayesianOptimization(lgb_roc_eval,bayesian_params , random_state=0)
# 함수 반환값이 최대가 되는 입력값 유추를 위한 iteration 수행. 
lgbBO.maximize(init_points=5, n_iter=25)

|   iter    |  target   | colsam... |  max_bin  | max_depth | min_ch... | min_ch... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.808849	valid_1's multi_logloss: 0.816704
[200]	training's multi_logloss: 0.797531	valid_1's multi_logloss: 0.808195
[300]	training's multi_logloss: 0.791058	valid_1's multi_logloss: 0.804895
[400]	training's multi_logloss: 0.786271	valid_1's multi_logloss: 0.802554
[500]	training's multi_logloss: 0.782319	valid_1's multi_logloss: 0.800966
Did not meet early stopping. Best iteration is:
[500]	training's multi_logloss: 0.782319	valid_1's multi_logloss: 0.800966
| [0m 1       [0m | [0m-0.801   [0m | [0m 0.7744  [0m | [0m 360.4   [0m | [0m 3.411   [0m | [0m 113.5   [0m | [0m 21.76   [0m | [0m 49.84   [0m | [0m 4.376

[200]	training's multi_logloss: 0.807813	valid_1's multi_logloss: 0.812552
[300]	training's multi_logloss: 0.805655	valid_1's multi_logloss: 0.810808
[400]	training's multi_logloss: 0.804594	valid_1's multi_logloss: 0.810325
[500]	training's multi_logloss: 0.80391	valid_1's multi_logloss: 0.809945
Did not meet early stopping. Best iteration is:
[500]	training's multi_logloss: 0.80391	valid_1's multi_logloss: 0.809945
| [0m 11      [0m | [0m-0.8099  [0m | [0m 0.8876  [0m | [0m 478.8   [0m | [0m 1.239   [0m | [0m 196.2   [0m | [0m 1.81    [0m | [0m 31.77   [0m | [0m 9.662   [0m | [0m 8.428   [0m | [0m 0.9965  [0m |
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.811025	valid_1's multi_logloss: 0.819099
[200]	training's multi_logloss: 0.797855	valid_1's multi_logloss: 0.808455
[300]	training's multi_logloss: 0.791509	valid_1's multi_logloss: 0.80513
[400]	training's multi_logloss: 0.786705	valid_1's multi_logloss: 0.802

| [0m 21      [0m | [0m-0.7881  [0m | [0m 0.646   [0m | [0m 390.3   [0m | [0m 4.547   [0m | [0m 13.3    [0m | [0m 21.04   [0m | [0m 62.31   [0m | [0m 0.6139  [0m | [0m 3.969   [0m | [0m 0.8906  [0m |
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.790084	valid_1's multi_logloss: 0.808096
[200]	training's multi_logloss: 0.766995	valid_1's multi_logloss: 0.79817
[300]	training's multi_logloss: 0.750884	valid_1's multi_logloss: 0.792347
[400]	training's multi_logloss: 0.736182	valid_1's multi_logloss: 0.787318
[500]	training's multi_logloss: 0.723057	valid_1's multi_logloss: 0.783736
Did not meet early stopping. Best iteration is:
[500]	training's multi_logloss: 0.723057	valid_1's multi_logloss: 0.783736
| [95m 22      [0m | [95m-0.7837  [0m | [95m 0.9648  [0m | [95m 491.6   [0m | [95m 4.885   [0m | [95m 41.0    [0m | [95m 2.127   [0m | [95m 57.85   [0m | [95m 2.011   [0m | [95m 1.687   [0m | [95m 0

In [72]:
# dictionary에 있는 target값을 모두 추출
target_list = []
for result in lgbBO.res:
    target = result['target']
    target_list.append(target)
print(target_list)
# 가장 큰 target 값을 가지는 순번(index)를 추출
print('maximum target index:', np.argmax(np.array(target_list)))

[-0.8009663514311595, -0.8003783702891311, -0.7904659330873844, -0.8009694054166153, -0.7918341445927425, -0.8131934502724841, -0.8230763607919699, -0.7985552283502667, -0.7893915424914775, -0.8003801907583233, -0.8099447420724155, -0.8013878184960946, -0.7944982164674612, -0.8007872642884186, -0.7948768094331106, -0.7933085573225015, -0.7989108640691488, -0.7937072145318373, -0.7896682520998842, -0.7851798428519491, -0.7880940364573811, -0.7837359820129789, -0.7993830534547174, -0.7924223367422328, -0.7896253673751455, -0.7849264069370321, -0.7955961712356449, -0.7841447168055898, -0.7846951975429296, -0.7841530973483751]
maximum target index: 21


In [74]:
# 가장 큰 target값을 가지는 index값을 기준으로 res에서 해당 parameter 추출. 
max_dict = lgbBO.res[np.argmax(np.array(target_list))]
max_params = max_dict['params']
print(max_dict['target'])
print(max_params)

-0.7837359820129789
{'colsample_bytree': 0.9647911988231949, 'max_bin': 491.60363048973545, 'max_depth': 4.885138936967211, 'min_child_samples': 40.99969541540217, 'min_child_weight': 2.1273123403234457, 'num_leaves': 57.853171850472386, 'reg_alpha': 2.011353943228025, 'reg_lambda': 1.687483832390387, 'subsample': 0.9763826910182591}


In [78]:
params = {'colsample_bytree': round(max_params['colsample_bytree'], 3),
 'max_bin': int(max_params['max_bin']),
 'max_depth': int(max_params['max_depth']),
 'min_child_samples': int(max_params['min_child_samples']),
 'min_child_weight': int(max_params['min_child_weight']),
 'num_leaves': int(max_params['num_leaves']),
 'reg_alpha': round(max_params['reg_alpha'], 3),
 'reg_lambda': round(max_params['reg_lambda'], 3),
 'subsample': round(max_params['subsample'], 3),}

In [82]:
d_train = lgb.Dataset(X_train, label=y_train)
lgb_model = LGBMClassifier(**params)
lgb_model.fit(X_train, y_train, verbose= 100)
pred_lgb = lgb_model.predict_proba(X_test)
pred_lgb

array([[0.07888235, 0.17946021, 0.74165744],
       [0.12853917, 0.17754921, 0.69391162],
       [0.10740123, 0.20328282, 0.68931595],
       ...,
       [0.09862079, 0.13586269, 0.76551652],
       [0.07610688, 0.1716816 , 0.75221152],
       [0.12765025, 0.19571999, 0.67662977]])

### RANDOM FOREST

In [83]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=0, n_jobs=-1)
rf.fit(X_train , y_train)
pred_rf = rf.predict_proba(X_test)
pred_rf

array([[0.055     , 0.22      , 0.725     ],
       [0.49      , 0.21      , 0.3       ],
       [0.05      , 0.1       , 0.85      ],
       ...,
       [0.        , 0.03571429, 0.96428571],
       [0.55      , 0.27      , 0.18      ],
       [0.09      , 0.344     , 0.566     ]])

from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier

#n_estimators : 생성할 tree의 개수

#max_features : 최대 선택할 변수의 수
#max_features 값을 크게 하면 random forest의 tree들은 같은 변수를 고려하므로 
#tree들이 비슷해지고 가장 두드러진 변수를 이용해 데이터에 잘 맞춰짐

#max_features를 낮추면 
#random forest tree들은 많이 달라지고 각 tree는 데이터에 맞추기 위해 tree의 깊이가 깊어집니다.

#max_depth : 랜덤포레스트 안에 있는 각 의사결정나무의 깊이를 설정. 
# 트리가 깊어질수록 더 잘게 분류를 시키므로 일반적으론 정확도가 높아진다. 하지만 오버피팅의 위험이 존재

params = {
    'n_estimators':[100, 300, 500]
}

#Revenue의 T, F의 비율이 8,5 : 1.5로 치우처져있으므로 그 비율에 맞게 sampling하는 StratifiedKFold사용
cv = StratifiedKFold(n_splits=5, random_state=0)

# 랜덤포레스트 객체 생성 
rf = RandomForestClassifier(random_state=0, n_jobs=-1)

# f1 스코어 기준으로 GridSearchCV 수행
# refit = True :  best estimator로 자동으로 수정됨
# n_jobs = -1 : 모든 cpu의 코어를 사용
grid_cv = GridSearchCV(rf , param_grid=params , cv=cv, scoring="logloss", n_jobs=-1, refit = True)

# 모델 학습
grid_cv.fit(X_train , y_train)

print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
print('최고 f1: {0:.4f}'.format(grid_cv.best_score_))

# 최적의 파라미터로 모델 생성 및 예측
model = grid_cv.best_estimator_
pred_rf = model.predict(X_test)

# Evaluation
print('f1: {0:.4f}'.format(metrics.f1_score(y_test , pred_rf)))
print('accuracy: {0:.4f}'.format(metrics.accuracy_score(y_test , pred_rf)))
print('precision: {0:.4f}'.format(metrics.precision_score(y_test , pred_rf)))
print('recall: {0:.4f}'.format(metrics.recall_score(y_test , pred_rf)))

# Confusion Matrix
metrics.plot_confusion_matrix(model, X_test, y_test, cmap="Blues")
plt.show()

# 제출

In [84]:
pred = (pred_lgb + pred_rf)/n

In [85]:
submission=pd.read_csv('sample_submission.csv')
submission[['0','1','2']] = pd.DataFrame(pred)
submission

Unnamed: 0,index,0,1,2
0,26457,0.133882,0.399460,1.466657
1,26458,0.618539,0.387549,0.993912
2,26459,0.157401,0.303283,1.539316
3,26460,0.167866,0.198452,1.633681
4,26461,0.172979,0.398236,1.428785
...,...,...,...,...
9995,36452,0.185992,0.574653,1.239355
9996,36453,0.393995,0.770766,0.835239
9997,36454,0.098621,0.171577,1.729802
9998,36455,0.626107,0.441682,0.932212


In [86]:
submission.to_csv('submission_3.csv', index=False)