In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pp1st_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
# Set pandas' display limits: show up to 100 columns and 50 rows.
for option_name, option_value in (
    ('display.max_columns', 100),
    ('display.max_rows', 50),
):
    pd.set_option(option_name, option_value)

In [4]:
# Re-import pp1st_pipeline so that edits made to the module file are picked up
# without restarting the kernel. The bare reload() call is the last expression,
# so the reloaded module object is displayed as the cell output.
import importlib
importlib.reload(pp1st_pipeline)

<module 'pp1st_pipeline' from '/Data/kimhae/pp_1st/pp1st_pipeline.py'>

# modeling

In [23]:
# Load the training table; the CSV's first column is used as the row index.
train_csv_path = './data/dataset_le.csv'
data_train = pd.read_csv(train_csv_path, index_col=0)

In [24]:
# Remove 'bldng_ar_prc' by rebinding instead of mutating in place, and tolerate
# the column already being absent, so re-running this cell is a no-op rather
# than a KeyError.
data_train = data_train.drop(columns='bldng_ar_prc', errors='ignore')

In [35]:
# 5-fold stratified cross-validation of a depth-20 XGBoost classifier.
# NOTE(review): this cell previously read from `dataset`, a name no visible
# cell defines (it fails on a fresh kernel). `data_train`, the frame loaded in
# this section, is assumed to be the intended input -- confirm.
CV_SEED = 42  # fixed so the shuffled fold assignment (and the scores) are reproducible

xgb = XGBClassifier(max_depth=20)
stf = StratifiedKFold(n_splits=5, shuffle=True, random_state=CV_SEED)
# Column 0 is assumed to be the target 'fr_yn'; every later column is a feature.
cv_score = cross_val_score(
    xgb,
    X=data_train.iloc[:, 1:],
    y=data_train['fr_yn'],
    cv=stf,
)
print(cv_score)

[0.80514988 0.80630284 0.811299   0.80399693 0.80769231]


In [47]:
# Fit the classifier on the whole training frame (features = all columns after
# the first, target = 'fr_yn').
# NOTE(review): `dataset` is not defined by any visible cell; `data_train` is
# assumed to be the frame that was meant here -- confirm.
xgb.fit(data_train.iloc[:, 1:], y=data_train['fr_yn'])

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=20, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

# validation

In [4]:
# Load the validation CSV with pandas' default parsing.
val_csv_path = '../raw/PJT002_validation.csv'
data_val = pd.read_csv(val_csv_path)

In [51]:
# Display the (rows, columns) shape of the validation frame.
data_val.shape

(6898, 180)

In [15]:
# Shape of the subset of validation rows whose 'fr_yn' label equals 'Y'.
label_is_y = data_val['fr_yn'] == 'Y'
data_val.loc[label_is_y].shape

(1272, 196)

In [26]:
# Run the project's Pipeline_var step on the validation frame.
# NOTE(review): Pipeline_var lives in pp1st_pipeline and is not visible here;
# presumably it derives/selects the feature columns -- confirm against the module.
data_var = pp1st_pipeline.Pipeline_var(data_val)

In [27]:
# Missing-value count per column, largest first.
null_counts = data_var.isnull().sum()
null_counts.sort_values(ascending=False)

bldng_ar_prc            3086
면적대비예상채수                1259
한채당가용면적                 1259
bldng_us                1113
bldng_archtctr          1099
화재시점나이                  1091
예상평균층당면적                 925
한채당평균지하층수                925
ttl_dwn_flr              925
bldng_us_clssfctn        823
한채당평균지상층수                795
ttl_grnd_flr             795
lnd_us_sttn_nm           163
rgnl_ar_nm                91
wnd_drctn                 12
wnd_spd                    4
hmdt                       3
tmprtr                     1
계절평균온도차                    1
fr_mn_cnt                  1
jmk                        0
차량통행가능여부                   0
lnd_ar                     0
bldng_ar                   0
bldng_cnt                  0
ttl_ar                     0
mlt_us_yn                  0
fr_sttn_dstnc              0
no_tbc_zn_dstnc            0
반대풍향여부                     0
season                     0
예상채수초과                     0
dt_of_fr_mth               0
dt_of_fr_hr                0
bldng_cnt_in_5

In [28]:
# Validation features without the 'bldng_ar_prc' column (axis=1 -> columns).
data_val = data_var.drop('bldng_ar_prc', axis=1)

In [29]:
# Keep only the rows that have no missing value in any column.
data_val = data_val.dropna(axis=0, how='any')

In [21]:
# Compare the validation frame before and after dropping 'bldng_ar_prc' and the
# incomplete rows.
# NOTE(review): this cell previously printed `dataset` / `dataset_`, which no
# visible cell defines. The recorded output (one column fewer, fewer rows)
# matches the data_var -> data_val steps, so those names are assumed -- confirm.
print(data_var.shape)
print(data_val.shape)

(6898, 44)
(4272, 43)


In [36]:
# Missing-value count per column of the cleaned validation frame, largest first.
# NOTE(review): `dataset_` is not defined by any visible cell; `data_val` (the
# frame produced by the drop/dropna cells) is assumed to be intended -- confirm.
data_val.isnull().sum().sort_values(ascending=False)

차량통행가능여부                0
tmprtr                  0
fr_wthr_fclt_dstnc      0
fr_sttn_dstnc           0
lnd_us_sttn_nm          0
rgnl_ar_nm              0
jmk                     0
hmdt                    0
wnd_drctn               0
wnd_spd                 0
bldng_us_clssfctn       0
mlt_us_yn               0
ttl_dwn_flr             0
ttl_grnd_flr            0
lnd_ar                  0
ttl_ar                  0
bldng_ar                0
bldng_cnt               0
bldng_archtctr          0
bldng_us                0
fr_mn_cnt               0
cctv_dstnc              0
측면도로크기                  0
한채당가용면적                 0
반대풍향여부                  0
계절평균온도차                 0
season                  0
예상평균층당면적                0
한채당평균지하층수               0
한채당평균지상층수               0
예상채수초과                  0
면적대비예상채수                0
화재시점나이                  0
fr_wthr_fclt_in_100m    0
dt_of_fr_mth            0
dt_of_fr_hr             0
bldng_cnt_in_50m        0
no_tbc_zn_dstnc         0
ahsm_dstnc  

In [30]:
# Run the project's Pipeline_le step on the cleaned validation frame.
# NOTE(review): Pipeline_le is defined in pp1st_pipeline and not visible here;
# presumably it label-encodes categorical columns -- confirm against the module.
# NOTE(review): data_val is rebound from itself, so re-running this cell feeds
# already-processed data back into Pipeline_le.
data_val = pp1st_pipeline.Pipeline_le(data_val)

In [58]:
# Predict labels for the validation rows (features = all columns after the first).
# NOTE(review): `dataset_` is not defined by any visible cell; `data_val` is
# assumed to be the processed validation frame that was meant -- confirm.
valid_yhat = xgb.predict(data_val.iloc[:, 1:])

In [60]:
# F1 score of the validation predictions against the true 'fr_yn' labels.
# NOTE(review): `dataset_` is not defined by any visible cell; `data_val` is
# assumed to be the frame the predictions were made on -- confirm.
f1_score(data_val['fr_yn'], valid_yhat)

0.4666666666666666

In [61]:
# Confusion matrix (rows = true label, columns = predicted label) for the
# validation predictions.
# NOTE(review): `dataset_` is not defined by any visible cell; `data_val` is
# assumed to be the frame the predictions were made on -- confirm.
confusion_matrix(data_val['fr_yn'], valid_yhat)

array([[2685,  679],
       [ 425,  483]])

# concat version: refit on training + validation data combined

In [31]:
# Stack the training and validation frames row-wise into one frame.
frames_to_stack = [data_train, data_val]
data_concat = pd.concat(frames_to_stack, axis='index')

In [33]:
# Fresh, unfitted depth-20 classifier (rebinds the name `xgb`).
xgb_params = {'max_depth': 20}
xgb = XGBClassifier(**xgb_params)

In [35]:
# Fit on the combined frame: features are every column after the first,
# the target is 'fr_yn'.
features_all = data_concat.iloc[:, 1:]
target_all = data_concat['fr_yn']
xgb.fit(features_all, y=target_all)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=20, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [13]:
# Per-column missing-value counts in ascending order, skipping the first 15
# entries of the sorted result.
val_null_counts = data_val.isnull().sum().sort_values()
val_null_counts.iloc[15:]

fr_yn                              0
bldng_cnt                          0
bldng_ar                           0
ttl_ar                             0
lnd_ar                             0
hm_cnt                             1
fr_mn_cnt                          1
tmprtr                             1
hmdt                               3
wnd_spd                            4
wnd_drctn                         12
rgnl_ar_nm                        91
rgnl_ar_nm2                       91
rd_sd_nm                          91
lnd_us_sttn_nm                   163
ttl_grnd_flr                     795
ttl_dwn_flr                      925
ele_engry_us_201708             1091
gas_engry_us_201707             1091
ele_engry_us_201707             1091
gas_engry_us_201708             1091
gas_engry_us_201709             1091
gas_engry_us_201401             1091
gas_engry_us_201710             1091
ele_engry_us_201710             1091
                                ... 
bldng_us_clssfctn               1612
b

# test

In [6]:
# Load the test CSV with pandas' default parsing.
test_csv_path = '../raw/PJT002_test.csv'
test = pd.read_csv(test_csv_path)

In [13]:
# Display the (rows, columns) shape of the test frame.
test.shape

(2957, 180)

In [13]:
# Shape of the subset of test rows whose 'fr_yn' value equals 'N'.
label_is_n = test['fr_yn'] == 'N'
test.loc[label_is_n].shape

(0, 180)

In [14]:
# Per-column missing-value counts in ascending order, skipping the first 15
# entries of the sorted result.
test_null_counts = test.isnull().sum().sort_values()
test_null_counts.iloc[15:]

ttl_ar                             0
bldng_ar                           0
bldng_cnt                          0
no_tbc_zn_dstnc                    0
bldng_cnt_in_50m                   0
sft_emrgnc_bll_dstnc               0
tmprtr                             1
wnd_spd                            1
hmdt                               2
wnd_drctn                          4
rgnl_ar_nm                        41
rgnl_ar_nm2                       41
rd_sd_nm                          41
lnd_us_sttn_nm                    73
ttl_grnd_flr                     343
ttl_dwn_flr                      399
ele_engry_us_201711              460
gas_engry_us_201711              460
ele_engry_us_201710              460
gas_engry_us_201710              460
ele_engry_us_201709              460
gas_engry_us_201709              460
ele_engry_us_201708              460
gas_engry_us_201708              460
ele_engry_us_201707              460
                                ... 
bldng_ar_prc                    1344
d