# [作業目標]
- 使用 Day 17 剛學到的方法, 對較完整的資料生成離散化特徵
- 觀察上述離散化特徵, 對於目標值的預測有沒有幫助

# [作業重點]
- 仿照 Day 17 的語法, 將年齡資料 ('DAYS_BIRTH' 除以 365) 離散化
- 繪製上述的 "離散化標籤" 與目標值 ('TARGET') 的長條圖

In [3]:
# 載入需要的套件
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# 設定 data_path
dir_data = './data/'

### 之前做過的處理

In [4]:
# 讀取資料檔
f_app_train = os.path.join(dir_data, 'application_train.csv')
app_train = pd.read_csv(f_app_train)
app_train.shape

(307511, 122)

In [5]:
# len(list(app_train["SK_ID_CURR"].unique()))
app_train["SK_ID_CURR"].nunique()

307511

In [6]:
# 將只有兩種值的類別型欄位, 做 Label Encoder, 計算相關係數時讓這些欄位可以被包含在內
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# 檢查每一個 column
for col in app_train:
    if app_train[col].dtype == 'object':
        # 如果只有兩種值的類別型欄位
        if len( list(app_train[col].unique()) ) <= 2:
            # 就做 Label Encoder, 以加入相關係數檢查
            app_train[col] = le.fit_transform(app_train[col])            
print(app_train.shape)
app_train.head()

(307511, 122)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,0,M,0,1,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,0,F,0,0,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,1,M,1,1,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,0,F,0,1,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,0,M,0,1,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# 受雇日數為異常值的資料, 另外設一個欄位記錄, 並將異常的日數轉成空值 (np.nan)
app_train['DAYS_EMPLOYED_ANOM'] = app_train["DAYS_EMPLOYED"] == 365243
app_train['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace = True)

# 出生日數 (DAYS_BIRTH) 取絕對值 
app_train['DAYS_BIRTH'] = abs(app_train['DAYS_BIRTH'])

## 練習時間
參考 Day 17 範例程式，離散化你覺得有興趣的欄位，並嘗試找出有趣的訊息

In [8]:
app_train['DAYS_BIRTH'] = app_train['DAYS_BIRTH'] / 365

In [9]:
app_train['DAYS_BIRTH'].sort_values(ascending=False)

265026    69.120548
63316     69.043836
124430    69.043836
143266    69.041096
130108    69.032877
169562    69.032877
169823    69.030137
173470    69.030137
226213    69.030137
282013    69.030137
79852     69.027397
172581    69.019178
276202    69.019178
140738    69.016438
271258    69.005479
65237     69.002740
49288     68.997260
20863     68.991781
119085    68.986301
130865    68.983562
187478    68.978082
216470    68.975342
282656    68.972603
228384    68.972603
76783     68.967123
298564    68.958904
274276    68.958904
164069    68.958904
263141    68.956164
120399    68.953425
            ...    
227157    21.063014
74216     21.063014
271897    21.060274
58541     21.060274
37558     21.060274
269022    21.057534
50995     21.054795
169219    21.052055
233741    21.052055
186834    21.049315
114779    21.049315
35956     21.049315
170882    21.046575
224890    21.043836
80769     21.043836
249615    21.041096
124856    21.041096
48401     21.041096
183033    21.041096


In [10]:
# 將數據離散化 (把連續數值作區間分割)
app_train['DAYS_BIRTH_cut'] = pd.cut(app_train['DAYS_BIRTH'], 4, precision=0) 
app_train['DAYS_BIRTH_cut']

0         (20.0, 33.0]
1         (45.0, 57.0]
2         (45.0, 57.0]
3         (45.0, 57.0]
4         (45.0, 57.0]
5         (45.0, 57.0]
6         (33.0, 45.0]
7         (45.0, 57.0]
8         (45.0, 57.0]
9         (33.0, 45.0]
10        (20.0, 33.0]
11        (45.0, 57.0]
12        (33.0, 45.0]
13        (33.0, 45.0]
14        (33.0, 45.0]
15        (20.0, 33.0]
16        (33.0, 45.0]
17        (20.0, 33.0]
18        (45.0, 57.0]
19        (20.0, 33.0]
20        (45.0, 57.0]
21        (33.0, 45.0]
22        (20.0, 33.0]
23        (57.0, 69.0]
24        (20.0, 33.0]
25        (45.0, 57.0]
26        (45.0, 57.0]
27        (33.0, 45.0]
28        (20.0, 33.0]
29        (20.0, 33.0]
              ...     
307481    (45.0, 57.0]
307482    (33.0, 45.0]
307483    (57.0, 69.0]
307484    (33.0, 45.0]
307485    (33.0, 45.0]
307486    (33.0, 45.0]
307487    (57.0, 69.0]
307488    (20.0, 33.0]
307489    (45.0, 57.0]
307490    (20.0, 33.0]
307491    (20.0, 33.0]
307492    (57.0, 69.0]
307493    (

In [11]:
app_train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,DAYS_EMPLOYED_ANOM,DAYS_BIRTH_cut
0,100002,1,0,M,0,1,0,202500.0,406597.5,24700.5,...,0,0,0.0,0.0,0.0,0.0,0.0,1.0,False,"(20.0, 33.0]"
1,100003,0,0,F,0,0,0,270000.0,1293502.5,35698.5,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,False,"(45.0, 57.0]"
2,100004,0,1,M,1,1,0,67500.0,135000.0,6750.0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,False,"(45.0, 57.0]"
3,100006,0,0,F,0,1,0,135000.0,312682.5,29686.5,...,0,0,,,,,,,False,"(45.0, 57.0]"
4,100007,0,0,M,0,1,0,121500.0,513000.0,21865.5,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,False,"(45.0, 57.0]"


In [12]:
app_train['DAYS_BIRTH_cut'].value_counts()

(33.0, 45.0]    100808
(45.0, 57.0]     84685
(20.0, 33.0]     66536
(57.0, 69.0]     55482
Name: DAYS_BIRTH_cut, dtype: int64

In [13]:
lab_name = ["二十","三十","四十","五十"]
bins=[20,33,45,57,69]
app_train['cut_name'] = pd.cut(app_train['DAYS_BIRTH'], bins, labels=lab_name)
app_train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,DAYS_EMPLOYED_ANOM,DAYS_BIRTH_cut,cut_name
0,100002,1,0,M,0,1,0,202500.0,406597.5,24700.5,...,0,0.0,0.0,0.0,0.0,0.0,1.0,False,"(20.0, 33.0]",二十
1,100003,0,0,F,0,0,0,270000.0,1293502.5,35698.5,...,0,0.0,0.0,0.0,0.0,0.0,0.0,False,"(45.0, 57.0]",四十
2,100004,0,1,M,1,1,0,67500.0,135000.0,6750.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,False,"(45.0, 57.0]",四十
3,100006,0,0,F,0,1,0,135000.0,312682.5,29686.5,...,0,,,,,,,False,"(45.0, 57.0]",四十
4,100007,0,0,M,0,1,0,121500.0,513000.0,21865.5,...,0,0.0,0.0,0.0,0.0,0.0,0.0,False,"(45.0, 57.0]",四十


In [14]:
lab_name = ["二十","三十","四十","五十"]
app_train['lab_name'] = pd.cut(app_train['DAYS_BIRTH'], 4, labels=lab_name) 
app_train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,DAYS_EMPLOYED_ANOM,DAYS_BIRTH_cut,cut_name,lab_name
0,100002,1,0,M,0,1,0,202500.0,406597.5,24700.5,...,0.0,0.0,0.0,0.0,0.0,1.0,False,"(20.0, 33.0]",二十,二十
1,100003,0,0,F,0,0,0,270000.0,1293502.5,35698.5,...,0.0,0.0,0.0,0.0,0.0,0.0,False,"(45.0, 57.0]",四十,四十
2,100004,0,1,M,1,1,0,67500.0,135000.0,6750.0,...,0.0,0.0,0.0,0.0,0.0,0.0,False,"(45.0, 57.0]",四十,四十
3,100006,0,0,F,0,1,0,135000.0,312682.5,29686.5,...,,,,,,,False,"(45.0, 57.0]",四十,四十
4,100007,0,0,M,0,1,0,121500.0,513000.0,21865.5,...,0.0,0.0,0.0,0.0,0.0,0.0,False,"(45.0, 57.0]",四十,四十


In [15]:
# DAY_16
# plt.hist(app_train['DAYS_BIRTH'] / 365, edgecolor = 'k', bins = 25)
# plt.title('Age of Client'); plt.xlabel('Age (years)'); plt.ylabel('Count');
# plt.show()

plt.bar(app_train['DAYS_BIRTH_cut'])
# plt.hist(app_train['DAYS_BIRTH_cut'])

TypeError: bar() missing 1 required positional argument: 'height'

In [28]:
type(app_train['DAYS_BIRTH'])

pandas.core.series.Series

In [29]:
type(app_train['DAYS_BIRTH_cut'])

pandas.core.series.Series