# Load data
 - HN16_ALL.sas7bdat : 설문지 데이터

In [40]:
import pandas as pd

# 식품 섭취 데이터는 사용하지 않는 것으로 결정(2018-11-15)
df_ALL = pd.read_sas("./HN16_ALL.sas7bdat", format = 'sas7bdat', encoding='iso-8859-1')

# 만 19~79세 제한
df_data = df_ALL.loc[(18 < df_ALL.age), :]
df_data = df_data.loc[(80 > df_data.age), :]

# 혈압치료중인 대상 제외
df_data = df_data.loc[(1 < df_data.DI1_pt) | (df_data.DI1_pt < 1), :]

# 모든 값이 NaN값이 컬럼 삭제
df_data = df_data.dropna(axis=1, how='all')

# 고혈압 유병여부 NaN값 대상 삭제
df_data = df_data.dropna(subset=['HE_HP'])

# 평균동맥압 파생변수 추가
df_data["HE_MAP"] = df_data["HE_dbp"] + (df_data["HE_sbp"] - df_data["HE_dbp"]) / 3
df_data.shape

(4585, 715)

In [41]:
# 남자만으로 dataframe 구성
df_data_sex_1 = df_data.loc[(1 == df_data.sex), :]
df_data_sex_1.shape

# 여자만으로 dataframe 구성
# df_data_sex_2 = df_data.loc[(2 == df_data.sex), :]
# df_data_sex_2.shape

# df 이름 변경
df_data = df_data_sex_1
df_data.shape

(1945, 715)

# 라이브러리

In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#%matplotlib notebook
#import seaborn as sns

from sklearn.impute import SimpleImputer
#from sklearn.preprocessing import Imputer
# from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
# from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
# from sklearn.metrics import classification_report
from sklearn import tree
from sklearn.model_selection import train_test_split

import graphviz
import time

In [43]:
df_data.head()

Unnamed: 0,mod_d,ID,ID_fam,year,region,town_t,apt_t,psu,sex,age,...,N_CAROT,N_RETIN,N_B1,N_B2,N_NIAC,N_VITC,LF_secur_y,LF_BUYER,LF_SAFE,HE_MAP
3,2018.02.01.,A651183002,A6511830,2016.0,1.0,1.0,2.0,A651,1.0,39.0,...,,,,,,,,,,111.666667
8,2018.02.01.,A651205101,A6512051,2016.0,1.0,1.0,2.0,A651,1.0,38.0,...,2051.326118,296.028559,5.408978,2.145862,60.347972,43.23904,1.0,1.0,3.0,91.666667
14,2018.02.01.,A651249301,A6512493,2016.0,1.0,1.0,2.0,A651,1.0,49.0,...,,,,,,,,,,91.666667
18,2018.02.01.,A651268001,A6512680,2016.0,1.0,1.0,2.0,A651,1.0,49.0,...,2401.220159,192.14506,2.160682,1.736965,14.356613,25.878406,1.0,1.0,2.0,104.666667
22,2018.02.01.,A651281601,A6512816,2016.0,1.0,1.0,2.0,A651,1.0,42.0,...,,,,,,,,,,100.0


# 데이터 전처리

In [44]:
df_data_cut = pd.DataFrame(df_data, columns=[
    df_data.HE_MAP.name ,
#     df_data.region.name ,
#     df_data.sex.name ,
#     df_data.age.name ,
    df_data.incm.name ,
    df_data.ho_incm.name ,
    df_data.edu.name ,
    df_data.occp.name ,
    df_data.cfam.name ,
    df_data.allownc.name ,
    df_data.house.name ,
    df_data.live_t.name ,
    df_data.ainc_1.name ,
    df_data.ainc.name ,
    df_data.marri_1.name ,
    df_data.marri_2.name ,
    df_data.tins.name ,
    df_data.npins.name ,
    df_data.D_1_1.name ,
    df_data.D_2_1.name ,
    df_data.D_2_wk.name ,
#     df_data.DI1_dg.name ,
#     df_data.DI1_pr.name ,
#     df_data.DI1_pt.name ,
#     df_data.DI1_2.name ,
    df_data.DI2_dg.name ,
    df_data.DI2_pr.name ,
    df_data.DI3_dg.name ,
    df_data.DI3_pr.name ,
    df_data.DI4_dg.name ,
    df_data.DI4_pr.name ,
    df_data.DI5_dg.name ,
    df_data.DI5_pr.name ,
    df_data.DI6_dg.name ,
    df_data.DI6_pr.name ,
    df_data.DM1_dg.name ,
    df_data.DM1_pr.name ,
    df_data.DM2_dg.name ,
    df_data.DM2_pr.name ,
    df_data.DM3_dg.name ,
    df_data.DM3_pr.name ,
    df_data.DM4_dg.name ,
    df_data.DM4_pr.name ,
    df_data.DJ2_dg.name ,
    df_data.DJ2_pr.name ,
    df_data.DJ4_dg.name ,
    df_data.DJ4_pr.name ,
    df_data.DE1_dg.name ,
    df_data.DE1_pr.name ,
    df_data.DE2_dg.name ,
    df_data.DE2_pr.name ,
    df_data.DC1_dg.name ,
    df_data.DC1_pr.name ,
    df_data.DC2_dg.name ,
    df_data.DC2_pr.name ,
    df_data.DC3_dg.name ,
    df_data.DC3_pr.name ,
    df_data.DC4_dg.name ,
    df_data.DC4_pr.name ,
    df_data.DC5_dg.name ,
    df_data.DC5_pr.name ,
    df_data.DC6_dg.name ,
    df_data.DC6_pr.name ,
    df_data.DC7_dg.name ,
    df_data.DC7_pr.name ,
    df_data.DC11_dg.name ,
    df_data.DC11_pr.name ,
    df_data.DF2_dg.name ,
    df_data.DF2_pr.name ,
    df_data.DL1_dg.name ,
    df_data.DL1_pr.name ,
    df_data.DJ8_dg.name ,
    df_data.DJ8_pr.name ,
    df_data.DJ6_dg.name ,
    df_data.DJ6_pr.name ,
    df_data.DH4_dg.name ,
    df_data.DH4_pr.name ,
    df_data.DH2_dg.name ,
    df_data.DH2_pr.name ,
    df_data.DH3_dg.name ,
    df_data.DH3_pr.name ,
    df_data.DH6_dg.name ,
    df_data.DH6_pr.name ,
    df_data.DN1_dg.name ,
    df_data.DN1_pr.name ,
    df_data.DK8_dg.name ,
    df_data.DK8_pr.name ,
    df_data.DK9_dg.name ,
    df_data.DK9_pr.name ,
    df_data.DK4_dg.name ,
    df_data.DK4_pr.name ,
    df_data.BH9_11.name ,
    df_data.BH1.name ,
    df_data.BH1_1.name ,
    df_data.BH1_2.name ,
    df_data.BH1_3.name ,
    df_data.BH1_8.name ,
    df_data.BH1_6.name ,
    df_data.BH2_61.name ,
    df_data.BH2_62.name ,
    df_data.BH2_63.name ,
    df_data.BH2_66.name ,
    df_data.BH2_67.name ,
    df_data.BH2_64.name ,
    df_data.educ.name ,
    df_data.graduat.name ,
    df_data.EC1_1.name ,
    df_data.EC1_2.name ,
    df_data.EC_occp.name ,
    df_data.EC_stt_1.name ,
    df_data.EC_stt_2.name ,
    df_data.EC_wh.name ,
    df_data.EC_wht_0.name ,
    df_data.EC_wht_23.name ,
    df_data.EC_wht_5.name ,
    df_data.EC_lgw_2.name ,
    df_data.EC_lgw_4.name ,
    df_data.EC_lgw_5.name ,
    df_data.EC_pedu_1.name ,
    df_data.EC_pedu_2.name
])
total_column_count = df_data_cut.columns.size
print(total_column_count)
print(df_data_cut.shape)
df_data_cut.head()

113
(1945, 113)


Unnamed: 0,HE_MAP,incm,ho_incm,edu,occp,cfam,allownc,house,live_t,ainc_1,...,EC_stt_2,EC_wh,EC_wht_0,EC_wht_23,EC_wht_5,EC_lgw_2,EC_lgw_4,EC_lgw_5,EC_pedu_1,EC_pedu_2
3,111.666667,3.0,3.0,4.0,3.0,5.0,20.0,3.0,2.0,600.0,...,1.0,1.0,1.0,61.0,1.0,5.0,1.0,1.0,8.0,5.0
8,91.666667,1.0,2.0,4.0,1.0,4.0,20.0,1.0,2.0,300.0,...,1.0,1.0,1.0,56.0,1.0,2.0,1.0,1.0,7.0,5.0
14,91.666667,3.0,4.0,4.0,1.0,3.0,20.0,3.0,2.0,7000.0,...,8.0,8.0,8.0,40.0,1.0,2.0,2.0,8.0,7.0,5.0
18,104.666667,4.0,4.0,4.0,2.0,4.0,20.0,3.0,2.0,750.0,...,1.0,1.0,1.0,40.0,1.0,3.0,1.0,1.0,5.0,4.0
22,100.0,3.0,4.0,4.0,3.0,3.0,20.0,1.0,2.0,550.0,...,8.0,8.0,8.0,45.0,1.0,5.0,2.0,8.0,8.0,7.0


## 특정 value 값을 NaN 처리

In [45]:
def setNanValues(in_df, name, nan_values) :
    
    start_na_count = in_df[name].isna().sum()
    
    uniq_values = sorted(map(float,in_df[name].dropna().unique()))
    
    in_df[name] = in_df[name].map(lambda x: np.nan if x in nan_values else x)
        
    #print(sorted(map(float,in_df[name].dropna().unique())))
        
    end_na_count = in_df[name].isna().sum()
    
    print("%s NaN count %s -> %s" % (name,start_na_count,end_na_count))
    
    return in_df

nan_df_data = df_data_cut.copy(deep=True)
nan_df_data = setNanValues(nan_df_data, nan_df_data.marri_1.name, [9.0])
nan_df_data = setNanValues(nan_df_data, nan_df_data.marri_2.name, [88.0, 99.0])
nan_df_data = setNanValues(nan_df_data, nan_df_data.EC_pedu_1.name, [88.0, 99.0])
nan_df_data = setNanValues(nan_df_data, nan_df_data.EC_pedu_2.name, [88.0, 99.0])
# 'DI4_dg', 'DI4_pr', 'DM1_dg', 'DM1_pr'
nan_df_data = setNanValues(nan_df_data, nan_df_data.DI4_dg.name, [8.0])
nan_df_data = setNanValues(nan_df_data, nan_df_data.DI4_pr.name, [8.0])
nan_df_data = setNanValues(nan_df_data, nan_df_data.DM1_dg.name, [8.0])
nan_df_data = setNanValues(nan_df_data, nan_df_data.DM1_pr.name, [8.0])
print(nan_df_data.shape)
nan_df_data.columns

marri_1 NaN count 0 -> 0
marri_2 NaN count 0 -> 478
EC_pedu_1 NaN count 0 -> 278
EC_pedu_2 NaN count 0 -> 251
DI4_dg NaN count 107 -> 107
DI4_pr NaN count 107 -> 1907
DM1_dg NaN count 107 -> 107
DM1_pr NaN count 107 -> 1878
(1945, 113)


Index(['HE_MAP', 'incm', 'ho_incm', 'edu', 'occp', 'cfam', 'allownc', 'house',
       'live_t', 'ainc_1',
       ...
       'EC_stt_2', 'EC_wh', 'EC_wht_0', 'EC_wht_23', 'EC_wht_5', 'EC_lgw_2',
       'EC_lgw_4', 'EC_lgw_5', 'EC_pedu_1', 'EC_pedu_2'],
      dtype='object', length=113)

## NaN 값 처리

In [46]:
'''
strategy = mean, median, most_frequent, constant
'''
def setNanImputer(in_df, name, strategy="mean") :
    
    tmp_df = pd.DataFrame(in_df[name], columns=[name])
    
    imp = SimpleImputer(missing_values=np.nan, strategy=strategy)
    in_df[name] = imp.fit_transform(tmp_df)
        
    print('%s null count %s, na count %s' % (name, in_df[name].isnull().sum(), in_df[name].isna().sum()))
    
    return in_df


imp_df = nan_df_data.copy(deep=True)

imputer_infos = [
      ['ainc', 'mean']
]

for name, strategy in imputer_infos :
    imp_df = setNanImputer(imp_df, name, strategy=strategy)

ainc null count 0, na count 0


## One-Hot Encoding

In [47]:
def addOneHotEncodingColumns(in_df, name) :
    
    uniq_values = sorted(map(float,in_df[name].dropna().unique()))
    
    dummie = pd.get_dummies(in_df[name], dummy_na=True)
    dummie.columns=['%s_ONE_%s' % (name,str(n)) for n in uniq_values] + ['%s_ONE_NaN' % name]
    
    one_hot_df = pd.concat([in_df,dummie],axis=1)    
    one_hot_df.drop(columns=[name], inplace=True)
    
    return one_hot_df

one_hot_df = imp_df.copy(deep=True)
print(one_hot_df.shape)
# one_hot_df = addOneHotEncodingColumns(one_hot_df, nan_df_data.region.name)
one_hot_df = addOneHotEncodingColumns(one_hot_df, nan_df_data.incm.name)
one_hot_df = addOneHotEncodingColumns(one_hot_df, nan_df_data.ho_incm.name)
one_hot_df = addOneHotEncodingColumns(one_hot_df, nan_df_data.edu.name)
one_hot_df = addOneHotEncodingColumns(one_hot_df, nan_df_data.occp.name)
one_hot_df = addOneHotEncodingColumns(one_hot_df, nan_df_data.EC_pedu_1.name)
one_hot_df = addOneHotEncodingColumns(one_hot_df, nan_df_data.EC_pedu_2.name)
one_hot_df = addOneHotEncodingColumns(one_hot_df, nan_df_data.marri_1.name)
one_hot_df = addOneHotEncodingColumns(one_hot_df, nan_df_data.marri_2.name)
# 'DI4_dg', 'DI4_pr', 'DM1_dg', 'DM1_pr'
one_hot_df = addOneHotEncodingColumns(one_hot_df, nan_df_data.DI4_dg.name)
one_hot_df = addOneHotEncodingColumns(one_hot_df, nan_df_data.DI4_pr.name)
one_hot_df = addOneHotEncodingColumns(one_hot_df, nan_df_data.DM1_dg.name)
one_hot_df = addOneHotEncodingColumns(one_hot_df, nan_df_data.DM1_pr.name)
print(one_hot_df.shape)
print(one_hot_df.columns)
one_hot_df.tail()

(1945, 113)
(1945, 163)
Index(['HE_MAP', 'cfam', 'allownc', 'house', 'live_t', 'ainc_1', 'ainc',
       'tins', 'npins', 'D_1_1',
       ...
       'DI4_dg_ONE_NaN', 'DI4_pr_ONE_0.0', 'DI4_pr_ONE_1.0', 'DI4_pr_ONE_NaN',
       'DM1_dg_ONE_0.0', 'DM1_dg_ONE_1.0', 'DM1_dg_ONE_NaN', 'DM1_pr_ONE_0.0',
       'DM1_pr_ONE_1.0', 'DM1_pr_ONE_NaN'],
      dtype='object', length=163)


Unnamed: 0,HE_MAP,cfam,allownc,house,live_t,ainc_1,ainc,tins,npins,D_1_1,...,DI4_dg_ONE_NaN,DI4_pr_ONE_0.0,DI4_pr_ONE_1.0,DI4_pr_ONE_NaN,DM1_dg_ONE_0.0,DM1_dg_ONE_1.0,DM1_dg_ONE_NaN,DM1_pr_ONE_0.0,DM1_pr_ONE_1.0,DM1_pr_ONE_NaN
8126,89.0,4.0,20.0,2.0,2.0,8000.0,666.666667,20.0,1.0,2.0,...,0,0,0,1,1,0,0,0,0,1
8131,88.0,3.0,20.0,2.0,2.0,6940.0,578.333333,20.0,1.0,2.0,...,0,0,0,1,1,0,0,0,0,1
8134,94.666667,4.0,20.0,3.0,2.0,8000.0,666.666667,20.0,1.0,3.0,...,0,0,0,1,1,0,0,0,0,1
8138,83.666667,4.0,10.0,3.0,2.0,7000.0,583.333333,20.0,1.0,2.0,...,0,0,0,1,1,0,0,0,0,1
8142,96.0,3.0,20.0,2.0,2.0,6000.0,500.0,10.0,1.0,1.0,...,0,0,0,1,1,0,0,0,0,1


In [27]:
# one_hot_df[one_hot_df.sex.name] = one_hot_df[one_hot_df.sex.name].astype(int)

## 결측치가 하나 이상 있는 컬럼 확인

In [48]:
nan_names = one_hot_df.columns[one_hot_df.isna().any()]
nan_names

Index([], dtype='object')

In [49]:
for index, name in enumerate(nan_names) :
    value_count = pd.value_counts(one_hot_df[name], dropna=False)
    nan_count = value_count[np.nan]
    print("[ %3d ] %s NaN count %s" % (index,name,nan_count))
    value_count.sort_index().plot(kind='bar', figsize=(15,5))
    plt.show()

# Train, Test data split

In [50]:
train_df, test_df = train_test_split(one_hot_df, test_size=0.25)
print(train_df.shape)
print(test_df.shape)

Y = one_hot_df.HE_MAP.name
X = one_hot_df.columns.drop(Y).tolist()

(1458, 163)
(487, 163)


# Decision Tree Regressor

In [51]:
def runDTR(x_name_list, y_name, train_df, test_df, max_depth=None) :
    
    stime = time.time()

    x_train_df = train_df[x_name_list]
    y_train_df = train_df[y_name]
    
    x_test_df = test_df[x_name_list]
    y_test_df = test_df[y_name]

    model = DecisionTreeRegressor(
          criterion = 'mse'
        , max_depth=max_depth
    ).fit(x_train_df,y_train_df)
    
    pdf_name = "./pdf_decision_tree_regressor_mse/target-%s_feature_count-%s_max_depth-%s" % (y_name,x_name_list.__len__(),max_depth)

    dot_data = tree.export_graphviz(model,
                                    out_file=None, 
                                    feature_names=x_name_list, 
                                    class_names=[y_name]
                                   )

    graph = graphviz.Source(dot_data) 
    graph.render(pdf_name)
    #graph.render("%s+%s" % (x_name,y_name))

    modelPrediction = model.predict(x_train_df)
    mse = mean_squared_error(y_train_df, modelPrediction)
#    mse_list.append([x_name, y_name, mse])
#     accuracyRate = accuracy_score(
#         y_true=y_test_df
#     ,   y_pred=modelPrediction
#     )

    etime = time.time()
    
    print(pdf_name,', 손실함수(%s) 소요시간(%s)' % (mse,etime-stime))
    
    return mse

In [52]:
'''
maxdepth가 높으면 처리 속도가 늦음
maxdepth가 낮으면 처리 속도가 빠름
'''
for depth in range(one_hot_df.columns.size, 5, -1) :
    mse = runDTR(X, Y, train_df, test_df, depth)

./pdf_decision_tree_regressor_mse/target-HE_MAP_feature_count-162_max_depth-163 , 손실함수(0.1602270995275112) 소요시간(4.42219090461731)
./pdf_decision_tree_regressor_mse/target-HE_MAP_feature_count-162_max_depth-162 , 손실함수(0.1602270995275112) 소요시간(4.033601999282837)
./pdf_decision_tree_regressor_mse/target-HE_MAP_feature_count-162_max_depth-161 , 손실함수(0.1602270995275112) 소요시간(4.342919111251831)
./pdf_decision_tree_regressor_mse/target-HE_MAP_feature_count-162_max_depth-160 , 손실함수(0.1602270995275112) 소요시간(5.0185229778289795)
./pdf_decision_tree_regressor_mse/target-HE_MAP_feature_count-162_max_depth-159 , 손실함수(0.1602270995275112) 소요시간(3.996702194213867)
./pdf_decision_tree_regressor_mse/target-HE_MAP_feature_count-162_max_depth-158 , 손실함수(0.1602270995275112) 소요시간(4.165160179138184)


KeyboardInterrupt: 

# Random Forest Regressor

In [53]:
from sklearn.ensemble import RandomForestRegressor

clf = RandomForestRegressor(
      n_estimators=100
    , max_depth=20
    , random_state=0
)
clf.fit(train_df[X], train_df[Y])

for imp in zip(X,clf.feature_importances_) :
    print(imp)
print(clf.predict(test_df[X]))

('cfam', 0.03352658688843998)
('allownc', 0.0022185369888651813)
('house', 0.021976338010454496)
('live_t', 0.025702751113839847)
('ainc_1', 0.09050717911680142)
('ainc', 0.08134249868219913)
('tins', 0.015084318032415658)
('npins', 0.007558782933139906)
('D_1_1', 0.02620468628026756)
('D_2_1', 0.0031429198328782044)
('D_2_wk', 0.012371747751846955)
('DI2_dg', 0.0036596361154557274)
('DI2_pr', 0.008745543886213234)
('DI3_dg', 0.00036012605671314666)
('DI3_pr', 0.0002897157729278486)
('DI5_dg', 0.000984475780938376)
('DI5_pr', 0.0027511898469428043)
('DI6_dg', 0.000380018639870272)
('DI6_pr', 0.004601458634245461)
('DM2_dg', 0.0016984415535572406)
('DM2_pr', 0.0010782315905815056)
('DM3_dg', 0.0003574132974404798)
('DM3_pr', 0.0005209128234162208)
('DM4_dg', 0.0005498249984844186)
('DM4_pr', 0.000653217328392885)
('DJ2_dg', 0.0025365298607271814)
('DJ2_pr', 0.002217210790909247)
('DJ4_dg', 0.0011188565417650344)
('DJ4_pr', 0.002112886103264233)
('DE1_dg', 0.00334067210710964)
('DE1_pr',