# 사용자의 신용카드 대금 연체 정도 예측

## 변수 설명

- index  
- gender: 성별  
- car: 차량 소유 여부
- reality: 부동산 소유 여부
- child_num: 자녀 수
- income_total: 연간 소득
- income_type: 소득 분류 => ['Commercial associate', 'Working', 'State servant', 'Pensioner', 'Student']
- edu_type: 교육 수준 => ['Higher education' ,'Secondary / secondary special', 'Incomplete higher', 'Lower secondary', 'Academic degree']
- family_type: 결혼 여부 => ['Married', 'Civil marriage', 'Separated', 'Single / not married', 'Widow']

- house_type: 생활 방식 => ['Municipal apartment', 'House / apartment', 'With parents', 'Co-op apartment', 'Rented apartment', 'Office apartment']
- DAYS_BIRTH: 출생일 => 데이터 수집 당시 0부터 역으로 셈, 즉, -1은 데이터 수집일 하루 전에 태어났음을 의미
- DAYS_EMPLOYED: 업무 시작일 => 데이터 수집 당시 0부터 역으로 셈, 즉, -1은 데이터 수집일 하루 전부터 일을 시작함을 의미. 양수 값은 고용되지 않은 상태를 의미함
- FLAG_MOBIL: 핸드폰 소유 여부
- work_phone: 업무용 전화 소유 여부
- phone: 집 전화 소유 여부
- email: 이메일 소유 여부
- occyp_type: 직업 유형													
- family_size: 가족 규모
- begin_month: 신용카드 발급 월 => 데이터 수집 당시 0부터 역으로 셈, 즉, -1은 데이터 수집일 한 달 전에 신용카드를 발급함을 의미
- credit: 사용자의 신용카드 대금 연체를 기준으로 한 신용도 => 낮을 수록 높은 신용의 신용카드 사용자를 의미함

In [189]:
# 모듈 로딩
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_validate
from sklearn.utils import all_estimators
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# warning 무시
import warnings
warnings.filterwarnings('ignore')

### [1] 데이터 로딩 & 전처리

In [209]:
df=pd.read_csv('./file/train.csv')

In [210]:
df

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,,2.0,-6.0,1.0
1,1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0,1.0
2,2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,2.0,-22.0,2.0
3,3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,1,0,1,0,Sales staff,2.0,-37.0,0.0
4,4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,1,0,0,0,Managers,2.0,-26.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,26452,F,N,N,2,225000.0,State servant,Secondary / secondary special,Married,House / apartment,-12079,-1984,1,0,0,0,Core staff,4.0,-2.0,1.0
26453,26453,F,N,Y,1,180000.0,Working,Higher education,Separated,House / apartment,-15291,-2475,1,0,0,0,,2.0,-47.0,2.0
26454,26454,F,Y,N,0,292500.0,Working,Secondary / secondary special,Civil marriage,With parents,-10082,-2015,1,0,0,0,Core staff,2.0,-25.0,2.0
26455,26455,M,N,Y,0,171000.0,Working,Incomplete higher,Single / not married,House / apartment,-10145,-107,1,0,0,0,Laborers,1.0,-59.0,2.0


In [211]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26457 entries, 0 to 26456
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          26457 non-null  int64  
 1   gender         26457 non-null  object 
 2   car            26457 non-null  object 
 3   reality        26457 non-null  object 
 4   child_num      26457 non-null  int64  
 5   income_total   26457 non-null  float64
 6   income_type    26457 non-null  object 
 7   edu_type       26457 non-null  object 
 8   family_type    26457 non-null  object 
 9   house_type     26457 non-null  object 
 10  DAYS_BIRTH     26457 non-null  int64  
 11  DAYS_EMPLOYED  26457 non-null  int64  
 12  FLAG_MOBIL     26457 non-null  int64  
 13  work_phone     26457 non-null  int64  
 14  phone          26457 non-null  int64  
 15  email          26457 non-null  int64  
 16  occyp_type     18286 non-null  object 
 17  family_size    26457 non-null  float64
 18  begin_

In [212]:
df.describe()

Unnamed: 0,index,child_num,income_total,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,family_size,begin_month,credit
count,26457.0,26457.0,26457.0,26457.0,26457.0,26457.0,26457.0,26457.0,26457.0,26457.0,26457.0,26457.0
mean,13228.0,0.428658,187306.5,-15958.053899,59068.750728,1.0,0.224742,0.294251,0.09128,2.196848,-26.123294,1.51956
std,7637.622372,0.747326,101878.4,4201.589022,137475.427503,0.0,0.41742,0.455714,0.288013,0.916717,16.55955,0.702283
min,0.0,0.0,27000.0,-25152.0,-15713.0,1.0,0.0,0.0,0.0,1.0,-60.0,0.0
25%,6614.0,0.0,121500.0,-19431.0,-3153.0,1.0,0.0,0.0,0.0,2.0,-39.0,1.0
50%,13228.0,0.0,157500.0,-15547.0,-1539.0,1.0,0.0,0.0,0.0,2.0,-24.0,2.0
75%,19842.0,1.0,225000.0,-12446.0,-407.0,1.0,0.0,1.0,0.0,3.0,-12.0,2.0
max,26456.0,19.0,1575000.0,-7705.0,365243.0,1.0,1.0,1.0,1.0,20.0,0.0,2.0


In [213]:
df.shape

(26457, 20)

In [214]:
df['credit'].value_counts()

2.0    16968
1.0     6267
0.0     3222
Name: credit, dtype: int64

In [215]:
df['gender'].unique()

array(['F', 'M'], dtype=object)

In [216]:
df['car'].unique()

array(['N', 'Y'], dtype=object)

In [217]:
df['reality'].unique()

array(['N', 'Y'], dtype=object)

In [219]:
df['income_type'].unique()

array(['Commercial associate', 'Working', 'State servant', 'Pensioner',
       'Student'], dtype=object)

In [220]:
df['edu_type'].unique()

array(['Higher education', 'Secondary / secondary special',
       'Incomplete higher', 'Lower secondary', 'Academic degree'],
      dtype=object)

In [221]:
df['family_type'].unique()

array(['Married', 'Civil marriage', 'Separated', 'Single / not married',
       'Widow'], dtype=object)

In [222]:
df['house_type'].unique()

array(['Municipal apartment', 'House / apartment', 'With parents',
       'Co-op apartment', 'Rented apartment', 'Office apartment'],
      dtype=object)

In [223]:
# DAYS_BIRTH가 0보다 큰 사람은 없음 => 잘못 입력된 값 없음
df[df['DAYS_BIRTH']>0]['DAYS_BIRTH']

Series([], Name: DAYS_BIRTH, dtype: int64)

In [224]:
# DAYS_BIRTH 컬럼 양수로 변경
df['DAYS_BIRTH']=abs(df['DAYS_BIRTH'])

In [225]:
# 나이 컬럼 추가
df['age']=abs(df['DAYS_BIRTH'])//365

In [226]:
# DAYS_BIRTH 컬럼 제거
df=df.drop(columns=['DAYS_BIRTH'])

In [227]:
# 0 이상인 값 => 입사하지 않은 사람의 값
df[df['DAYS_EMPLOYED']>0]['DAYS_EMPLOYED']

14       365243
18       365243
21       365243
24       365243
46       365243
          ...  
26431    365243
26432    365243
26439    365243
26441    365243
26443    365243
Name: DAYS_EMPLOYED, Length: 4438, dtype: int64

In [228]:
# 0보다 큰 값을 입사하지 않은 사람으로 간주하여 0으로 변환
df['DAYS_EMPLOYED']=df['DAYS_EMPLOYED'].map(lambda x:0 if x>0 else x)

In [229]:
# DAYS_EMPLOYED 컬럼 양수로 변경
df['DAYS_EMPLOYED']=abs(df['DAYS_EMPLOYED'])

In [230]:
df['FLAG_MOBIL'].unique()

array([1], dtype=int64)

In [231]:
# FLAG_MOBIL(핸드폰 소유 여부) 열의 값이 1뿐이므로 열 삭제
df=df.drop(columns=['FLAG_MOBIL'])

In [232]:
df['work_phone'].unique()

array([0, 1], dtype=int64)

In [233]:
df['phone'].unique()

array([0, 1], dtype=int64)

In [234]:
df['email'].unique()

array([0, 1], dtype=int64)

In [235]:
df['occyp_type'].unique()

array([nan, 'Laborers', 'Managers', 'Sales staff',
       'High skill tech staff', 'Core staff', 'Drivers', 'Medicine staff',
       'Accountants', 'Realty agents', 'Security staff', 'Cleaning staff',
       'Private service staff', 'Cooking staff', 'Secretaries',
       'HR staff', 'IT staff', 'Low-skill Laborers',
       'Waiters/barmen staff'], dtype=object)

In [236]:
# occyp_type 열의 결측치를 Jobless로 변경
df['occyp_type']=df['occyp_type'].fillna('Jobless')

In [237]:
# family_size를 정수형으로 바꾸기
df['family_size']=df['family_size'].astype('int')

In [238]:
df['family_size'].unique()

array([ 2,  3,  4,  1,  5,  6,  7, 15, 20,  9])

In [239]:
df['begin_month']=abs(df['begin_month'])
df['begin_month']

0         6.0
1         5.0
2        22.0
3        37.0
4        26.0
         ... 
26452     2.0
26453    47.0
26454    25.0
26455    59.0
26456     9.0
Name: begin_month, Length: 26457, dtype: float64

In [240]:
df['credit'].value_counts()

2.0    16968
1.0     6267
0.0     3222
Name: credit, dtype: int64

In [241]:
# Index 열 필요없으니 제거
df=df.drop(columns=['index'])

In [242]:
# 컬럼 순서 바꾸기
credit=df['credit']
credit

0        1.0
1        1.0
2        2.0
3        0.0
4        2.0
        ... 
26452    1.0
26453    2.0
26454    2.0
26455    2.0
26456    2.0
Name: credit, Length: 26457, dtype: float64

In [243]:
df=df.drop(columns=['credit'])

In [244]:
df=pd.concat([df, credit], axis=1)

In [245]:
df.corr()

Unnamed: 0,child_num,income_total,DAYS_EMPLOYED,work_phone,phone,email,family_size,begin_month,age,credit
child_num,1.0,0.032186,0.043122,0.051521,-0.010555,0.01612,0.89053,0.007229,-0.332831,0.004081
income_total,0.032186,1.0,0.086899,-0.034207,0.019013,0.089882,0.023839,0.018047,-0.064009,0.008555
DAYS_EMPLOYED,0.043122,0.086899,1.0,0.108538,0.041886,0.002904,0.055657,0.082669,-0.020645,0.022824
work_phone,0.051521,-0.034207,0.108538,1.0,0.310256,-0.031048,0.068705,0.008563,-0.179204,-0.003134
phone,-0.010555,0.019013,0.041886,0.310256,1.0,0.012494,-0.000132,0.014286,0.02909,0.003452
email,0.01612,0.089882,0.002904,-0.031048,0.012494,1.0,0.016122,-0.00232,-0.111189,0.014812
family_size,0.89053,0.023839,0.055657,0.068705,-0.000132,0.016122,1.0,0.023032,-0.298081,0.008227
begin_month,0.007229,0.018047,0.082669,0.008563,0.014286,-0.00232,0.023032,1.0,0.057365,0.147477
age,-0.332831,-0.064009,-0.020645,-0.179204,0.02909,-0.111189,-0.298081,0.057365,1.0,0.025059
credit,0.004081,0.008555,0.022824,-0.003134,0.003452,0.014812,0.008227,0.147477,0.025059,1.0


child_num과 family_size의 상관계수가 0.890530으로 높아서 child_num 컬럼 제거

In [246]:
df=df.drop(columns=['child_num'])

In [357]:
df

Unnamed: 0,gender,car,reality,income_total,income_type,edu_type,family_type,house_type,DAYS_EMPLOYED,work_phone,phone,email,occyp_type,family_size,begin_month,age,credit
0,0,0,0,202500.0,0,1,1,2,4709,0,0,0,8,2,6.0,38,1.0
1,0,0,1,247500.0,0,4,0,1,1540,0,0,1,9,3,5.0,31,1.0
2,1,1,1,450000.0,4,1,1,1,4434,0,1,0,11,2,22.0,52,2.0
3,0,0,1,202500.0,0,4,1,1,2092,0,1,0,15,2,37.0,41,0.0
4,0,1,1,157500.0,2,1,1,1,2105,0,0,0,11,2,26.0,41,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,0,0,0,225000.0,2,4,1,1,1984,0,0,0,3,4,2.0,33,1.0
26453,0,0,1,180000.0,4,1,2,1,2475,0,0,0,8,2,47.0,41,2.0
26454,0,1,0,292500.0,4,4,0,5,2015,0,0,0,3,2,25.0,27,2.0
26455,1,0,1,171000.0,4,2,3,1,107,0,0,0,9,1,59.0,27,2.0


#### 인코딩

In [247]:
encoder=LabelEncoder()

In [248]:
col_list=['gender', 'car', 'reality', 'income_type', 'edu_type', 'family_type', 'house_type', 'occyp_type']

In [249]:
for col in col_list:
    df[col]=encoder.fit_transform(df[col])

In [250]:
df

Unnamed: 0,gender,car,reality,income_total,income_type,edu_type,family_type,house_type,DAYS_EMPLOYED,work_phone,phone,email,occyp_type,family_size,begin_month,age,credit
0,0,0,0,202500.0,0,1,1,2,4709,0,0,0,8,2,6.0,38,1.0
1,0,0,1,247500.0,0,4,0,1,1540,0,0,1,9,3,5.0,31,1.0
2,1,1,1,450000.0,4,1,1,1,4434,0,1,0,11,2,22.0,52,2.0
3,0,0,1,202500.0,0,4,1,1,2092,0,1,0,15,2,37.0,41,0.0
4,0,1,1,157500.0,2,1,1,1,2105,0,0,0,11,2,26.0,41,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,0,0,0,225000.0,2,4,1,1,1984,0,0,0,3,4,2.0,33,1.0
26453,0,0,1,180000.0,4,1,2,1,2475,0,0,0,8,2,47.0,41,2.0
26454,0,1,0,292500.0,4,4,0,5,2015,0,0,0,3,2,25.0,27,2.0
26455,1,0,1,171000.0,4,2,3,1,107,0,0,0,9,1,59.0,27,2.0


In [251]:
df.describe()

Unnamed: 0,gender,car,reality,income_total,income_type,edu_type,family_type,house_type,DAYS_EMPLOYED,work_phone,phone,email,occyp_type,family_size,begin_month,age,credit
count,26457.0,26457.0,26457.0,26457.0,26457.0,26457.0,26457.0,26457.0,26457.0,26457.0,26457.0,26457.0,26457.0,26457.0,26457.0,26457.0,26457.0
mean,0.331103,0.379748,0.673924,187306.5,2.394754,3.097592,1.367275,1.279813,2198.529538,0.224742,0.294251,0.09128,8.200514,2.196848,26.123294,43.213478,1.51956
std,0.470619,0.485333,0.468784,101878.4,1.7359,1.341602,0.952536,0.944273,2370.14053,0.41742,0.455714,0.288013,3.951175,0.916717,16.55955,11.51359,0.702283
min,0.0,0.0,0.0,27000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,21.0,0.0
25%,0.0,0.0,0.0,121500.0,1.0,1.0,1.0,1.0,407.0,0.0,0.0,0.0,6.0,2.0,12.0,34.0,1.0
50%,0.0,0.0,1.0,157500.0,4.0,4.0,1.0,1.0,1539.0,0.0,0.0,0.0,8.0,2.0,24.0,42.0,2.0
75%,1.0,1.0,1.0,225000.0,4.0,4.0,1.0,1.0,3153.0,0.0,1.0,0.0,10.0,3.0,39.0,53.0,2.0
max,1.0,1.0,1.0,1575000.0,4.0,4.0,4.0,5.0,15713.0,1.0,1.0,1.0,18.0,20.0,60.0,68.0,2.0


In [252]:
df

Unnamed: 0,gender,car,reality,income_total,income_type,edu_type,family_type,house_type,DAYS_EMPLOYED,work_phone,phone,email,occyp_type,family_size,begin_month,age,credit
0,0,0,0,202500.0,0,1,1,2,4709,0,0,0,8,2,6.0,38,1.0
1,0,0,1,247500.0,0,4,0,1,1540,0,0,1,9,3,5.0,31,1.0
2,1,1,1,450000.0,4,1,1,1,4434,0,1,0,11,2,22.0,52,2.0
3,0,0,1,202500.0,0,4,1,1,2092,0,1,0,15,2,37.0,41,0.0
4,0,1,1,157500.0,2,1,1,1,2105,0,0,0,11,2,26.0,41,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26452,0,0,0,225000.0,2,4,1,1,1984,0,0,0,3,4,2.0,33,1.0
26453,0,0,1,180000.0,4,1,2,1,2475,0,0,0,8,2,47.0,41,2.0
26454,0,1,0,292500.0,4,4,0,5,2015,0,0,0,3,2,25.0,27,2.0
26455,1,0,1,171000.0,4,2,3,1,107,0,0,0,9,1,59.0,27,2.0


### [2] 학습 & 테스트 데이터 분리

In [438]:
# data, target 분리
data=df.iloc[:, :-1]
target=df.iloc[:, -1]

In [439]:
x_train, x_test, y_train, y_test=train_test_split(
data, target, test_size=0.2, stratify=target, random_state=42)

In [440]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((21165, 16), (5292, 16), (21165,), (5292,))

In [441]:
y_train.value_counts()

2.0    13574
1.0     5013
0.0     2578
Name: credit, dtype: int64

In [442]:
y_test.value_counts()

2.0    3394
1.0    1254
0.0     644
Name: credit, dtype: int64

### 모델 찾기

In [56]:
models=all_estimators(type_filter='classifier')

In [406]:
# scores=[]

# for name, model in models:
#     try:
#         # 모델 객체 생성
#         md=model()
#         # 학습
#         md.fit(x_train, y_train)
#         # 평가
#         train_result=md.score(x_train, y_train)
#         result=md.score(x_test, y_test)
#         scores.append((name, train_result, result))
        
#     except:
#         pass

In [407]:
scores

# GradientBoostingClassifier, RandomforestClassifier

[('AdaBoostClassifier', 0.692322230096858, 0.6832955404383976),
 ('BaggingClassifier', 0.7734939759036145, 0.6360544217687075),
 ('BernoulliNB', 0.6413418379399953, 0.6413454270597128),
 ('CalibratedClassifierCV', 0.6429482636428066, 0.6424792139077853),
 ('DecisionTreeClassifier', 0.7221356012284432, 0.5595238095238095),
 ('DummyClassifier', 0.6413418379399953, 0.6413454270597128),
 ('ExtraTreeClassifier', 0.7258209307819513, 0.5574452003023431),
 ('ExtraTreesClassifier', 0.7929600755965036, 0.6678004535147393),
 ('GaussianNB', 0.6373257736829672, 0.6271730914588057),
 ('GaussianProcessClassifier', 0.714103472714387, 0.6402116402116402),
 ('GradientBoostingClassifier', 0.6952515946137491, 0.6842403628117913),
 ('HistGradientBoostingClassifier', 0.735648476257973, 0.6832955404383976),
 ('KNeighborsClassifier', 0.6331679659815733, 0.5755857898715041),
 ('LabelPropagation', 0.7313961729270021, 0.5844671201814059),
 ('LabelSpreading', 0.731301677297425, 0.5848450491307634),
 ('LinearDiscr

## 모델1 - RandomForestClassifier

### 그리드서치

In [279]:
params={'max_depth':[17, 18, 19, 20],
       'min_samples_split':[2, 3, 4, 5]}
dtc=RandomForestClassifier()
grid_tree=GridSearchCV(dtc, param_grid=params, cv=3, refit=True)
grid_tree.fit(x_train2, y_train2)

GridSearchCV(cv=3, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [17, 18, 19, 20],
                         'min_samples_split': [2, 3, 4, 5]})

In [280]:
print('best parameters :', grid_tree.best_params_)
print('best score :', grid_tree.best_score_)
em=grid_tree.best_estimator_

best parameters : {'max_depth': 20, 'min_samples_split': 2}
best score : 0.7143869596031184


In [495]:
rfModel=RandomForestClassifier(max_depth=20, min_samples_split=2, random_state=42)

In [496]:
rfModel.fit(x_train, y_train)

RandomForestClassifier(max_depth=20, random_state=42)

In [497]:
print(rfModel.score(x_train, y_train))
print(rfModel.score(x_test, y_test))

0.8877391920623671
0.7173091458805745


## 교차검증

In [262]:
sSplitter=StratifiedKFold(n_splits=5, shuffle=True)

pd.DataFrame(cross_validate(rfModel, x_train, y_train,
                           return_train_score=True,
                           cv=sSplitter))

Unnamed: 0,fit_time,score_time,test_score,train_score
0,4.086589,0.203974,0.701866,0.819691
1,4.396495,0.206197,0.710135,0.815143
2,3.94498,0.099495,0.7073,0.811068
3,3.248413,0.13496,0.699976,0.816442
4,3.559899,0.158988,0.703756,0.810773


In [491]:
rfModel.feature_importances_

array([0.02734393, 0.0353121 , 0.03915224, 0.18541755, 0.04102852,
       0.037048  , 0.05283464, 0.03209398, 0.19438753, 0.02857041,
       0.03954652, 0.02421988, 0.09981219, 0.04420249, 0.08370193,
       0.0353281 ])

In [317]:
from sklearn.metrics import classification_report
print(classification_report(y_test, rfModel.predict(x_test)))

              precision    recall  f1-score   support

         0.0       0.54      0.08      0.14       644
         1.0       0.72      0.32      0.45      1254
         2.0       0.71      0.97      0.82      3394

    accuracy                           0.71      5292
   macro avg       0.66      0.46      0.47      5292
weighted avg       0.69      0.71      0.65      5292



## predict

In [498]:
pred2=pd.DataFrame(rfModel.predict(x_test))
raw2=y_test.reset_index(drop=True)
pr2=pd.concat([pred2, raw2], axis=1)
pr2.columns=['pred', 'credit']
pr2
new_pr2=pr2[pr2['pred']==pr2['credit']]
new_pr2
new_pr2.shape[0]/pr2.shape[0]

0.7173091458805745

In [524]:
pd.DataFrame(rfModel.predict(test_df), columns=['pred'])

Unnamed: 0,pred
0,2.0
1,2.0
2,2.0
3,2.0
4,2.0
...,...
9995,2.0
9996,2.0
9997,2.0
9998,2.0


In [526]:
np.unique(np.array(rfModel.predict(test_df)))

array([0., 1., 2.])

# test.csv ---------------------------------------------------------

In [284]:
test_df=pd.read_csv('./file/test.csv')

In [285]:
# DAYS_BIRTH가 0보다 큰 사람은 없음 => 잘못 입력된 값 없음
test_df[test_df['DAYS_BIRTH']>0]['DAYS_BIRTH']

Series([], Name: DAYS_BIRTH, dtype: int64)

In [286]:
# DAYS_BIRTH 컬럼 양수로 변경
test_df['DAYS_BIRTH']=abs(test_df['DAYS_BIRTH'])

In [287]:
# 나이 컬럼 추가
test_df['age']=abs(test_df['DAYS_BIRTH'])//365

In [288]:
# DAYS_BIRTH 컬럼 제거
test_df=test_df.drop(columns=['DAYS_BIRTH'])

In [289]:
# 0보다 큰 값을 입사하지 않은 사람으로 간주하여 0으로 변환
test_df['DAYS_EMPLOYED']=test_df['DAYS_EMPLOYED'].map(lambda x:0 if x>0 else x)

In [290]:
# DAYS_EMPLOYED 컬럼 양수로 변경
test_df['DAYS_EMPLOYED']=abs(test_df['DAYS_EMPLOYED'])

In [291]:
# FLAG_MOBIL 열의 값이 1뿐이므로 열 삭제
test_df=test_df.drop(columns=['FLAG_MOBIL'])

In [292]:
# occyp_type 열의 결측치를 Jobless로 변경
test_df['occyp_type']=test_df['occyp_type'].fillna('Jobless')

In [293]:
# family_size를 정수형으로 바꾸기
test_df['family_size']=test_df['family_size'].astype('int')

In [294]:
test_df['begin_month']=abs(test_df['begin_month'])
test_df['begin_month']

0       60.0
1       36.0
2       40.0
3       41.0
4        8.0
        ... 
9995    19.0
9996    34.0
9997    55.0
9998    33.0
9999    11.0
Name: begin_month, Length: 10000, dtype: float64

In [295]:
encoder=LabelEncoder()

In [296]:
col_list=['gender', 'car', 'reality', 'income_type', 'edu_type', 'family_type', 'house_type', 'occyp_type']

In [297]:
for col in col_list:
    test_df[col]=encoder.fit_transform(test_df[col])

In [298]:
# Index 열 필요없으니 제거
test_df=test_df.drop(columns=['index'])

In [299]:
test_df=test_df.drop(columns=['child_num'])

In [306]:
new_test=pd.DataFrame(rfModel.predict_proba(test_df))

In [307]:
sub=pd.read_csv('./file/sample_submission.csv')

In [308]:
sub=sub.drop(columns=['0', '1', '2'])

In [309]:
sub

Unnamed: 0,index
0,26457
1,26458
2,26459
3,26460
4,26461
...,...
9995,36452
9996,36453
9997,36454
9998,36455


In [316]:
pd.concat([sub, new_test], axis=1).to_csv('result4.csv', index=False, encoding='utf-8')

# ------------------------------------------------------------------

---
# 모델2 - HistGradientBoostingClassifier

#### 스케일링

In [318]:
scaler=StandardScaler()

In [319]:
x_train=pd.DataFrame(scaler.fit_transform(x_train), columns=x_train.columns)
x_test=pd.DataFrame(scaler.transform(x_test), columns=x_test.columns)

In [320]:
x_train

Unnamed: 0,gender,car,reality,income_total,income_type,edu_type,family_type,house_type,DAYS_EMPLOYED,work_phone,phone,email,occyp_type,family_size,begin_month,age
0,1.422827,1.277038,-1.438616,1.237050,-1.384358,0.672876,-1.433832,-0.295832,-0.514469,-0.538807,-0.644983,-0.316852,0.707366,-0.213712,0.960141,-0.453565
1,-0.702826,-0.783062,0.695113,1.586673,-0.230340,0.672876,1.713767,-0.295832,0.836135,-0.538807,-0.644983,-0.316852,0.960168,-1.299980,-1.096140,0.328043
2,-0.702826,1.277038,-1.438616,-0.685875,-0.807349,0.672876,-1.433832,-0.295832,-0.926475,-0.538807,1.550428,-0.316852,-0.051038,-0.213712,-1.217098,1.370186
3,1.422827,-0.783062,0.695113,-1.035498,0.923677,0.672876,-0.384632,-0.295832,0.617072,-0.538807,1.550428,-0.316852,0.201763,0.872556,-0.914704,-0.019339
4,-0.702826,-0.783062,0.695113,-0.423658,-0.807349,0.672876,-0.384632,-0.295832,-0.926475,-0.538807,-0.644983,-0.316852,-0.051038,-0.213712,-1.217098,1.717567
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21160,1.422827,1.277038,0.695113,-0.511063,0.923677,0.672876,-0.384632,-0.295832,-0.068761,-0.538807,-0.644983,-0.316852,-0.051038,-0.213712,0.839183,-1.408863
21161,1.422827,-0.783062,0.695113,0.362993,0.923677,0.672876,-0.384632,-0.295832,0.309542,1.855953,-0.644983,-0.316852,0.201763,-0.213712,1.262535,0.675424
21162,1.422827,-0.783062,-1.438616,0.450399,0.923677,0.672876,-0.384632,-0.295832,0.986530,1.855953,-0.644983,-0.316852,-1.062244,-0.213712,0.839183,0.067507
21163,-0.702826,-0.783062,-1.438616,-0.471731,-0.807349,0.672876,-0.384632,0.766122,-0.926475,-0.538807,-0.644983,-0.316852,-0.051038,0.872556,-0.612310,1.196496


### 그리드서치

In [459]:
params={'learning_rate':[0.05, 0.1, 0.2, 0.3],
       'max_depth':[13, 14, 15, 16]}
dtc=HistGradientBoostingClassifier()
grid_tree=GridSearchCV(dtc, param_grid=params, cv=3, refit=True)
grid_tree.fit(x_train, y_train)

GridSearchCV(cv=3, estimator=HistGradientBoostingClassifier(),
             param_grid={'learning_rate': [0.05, 0.1, 0.2, 0.3],
                         'max_depth': [13, 14, 15, 16]})

In [502]:
print('best parameters :', grid_tree.best_params_)
print('best score :', grid_tree.best_score_)

best parameters : {'learning_rate': 0.1, 'max_depth': 14}
best score : 0.6988896763524687


In [500]:
hgModel=HistGradientBoostingClassifier(max_depth=15, random_state=42)

In [501]:
hgModel.fit(x_train, y_train)

HistGradientBoostingClassifier(max_depth=15, random_state=42)

In [503]:
print(hgModel.score(x_train, y_train))
print(hgModel.score(x_test, y_test))

0.7428301441058351
0.703514739229025


## 교차검증

In [504]:
sSplitter=StratifiedKFold(n_splits=5, shuffle=True)

pd.DataFrame(cross_validate(hgModel, x_train, y_train,
                           return_train_score=True,
                           cv=sSplitter))

Unnamed: 0,fit_time,score_time,test_score,train_score
0,2.588675,0.026227,0.699504,0.752835
1,2.543849,0.031127,0.706355,0.748405
2,2.575802,0.027925,0.698086,0.752599
3,2.593009,0.033909,0.696669,0.754961
4,2.53877,0.031017,0.702103,0.756378


In [505]:
print(classification_report(y_test, hgModel.predict(x_test)))

              precision    recall  f1-score   support

         0.0       0.55      0.07      0.12       644
         1.0       0.73      0.29      0.41      1254
         2.0       0.70      0.98      0.82      3394

    accuracy                           0.70      5292
   macro avg       0.66      0.44      0.45      5292
weighted avg       0.69      0.70      0.64      5292



## predict

In [515]:
pd.DataFrame(hgModel.predict(test_df))

Unnamed: 0,0
0,2.0
1,2.0
2,2.0
3,2.0
4,2.0
...,...
9995,2.0
9996,2.0
9997,2.0
9998,2.0


In [521]:
np.unique(np.array(hgModel.predict(test_df)))

array([0., 1., 2.])