### 전처리

In [1]:
import pymongo as mg
import pandas as pd

In [2]:
client = mg.MongoClient(host='mongodb://localhost:27017')

In [3]:
database = client['db_NHIS']
collection = database['NSC2_BND_M20']

In [4]:
cursor = collection.find({})
list_BND_M20 = list(cursor)

In [5]:
df_BND_M20 = pd.DataFrame(list_BND_M20)
df_BND_M20[:2]

Unnamed: 0,_id,RN_INDI,BTH_YYYY,DTH_YYYYMM,COD1,COD2,EDC_ADD_RT,ED_RC_TOT_AMT,SICK_SYM1
0,64ed6a2652642d1f5783b3b2,294364,1921LE,200707.0,T08-T14,W00-W19,0.15,10590,L028
1,64ed6a2652642d1f5783b3b3,294364,1921LE,200707.0,T08-T14,W00-W19,0.15,16520,J030


In [6]:
df_BND_M20['SICK_SYM1'].value_counts()

J209     8244
I10      4629
I109     3275
F_       3203
N_       3177
         ... 
H700        1
E058        1
M5397       1
T18         1
P289        1
Name: SICK_SYM1, Length: 3414, dtype: int64

In [7]:
len('F_'), len('J209'), 'J209'[:3], str('T18')[:3]

(2, 4, 'J20', 'T18')

In [8]:
def convertSICK_SYM(sick_sym) :
    if len(sick_sym) < 3:
        return None
    else :
        return sick_sym[:3]

In [9]:
convertSICK_SYM('J209'), convertSICK_SYM('F_')

('J20', None)

In [10]:
df_BND_M20['SICK_SYM1_3'] = df_BND_M20['SICK_SYM1'].apply(convertSICK_SYM)

In [11]:
df_BND_M20['SICK_SYM1_3'].value_counts()

J20    8917
I10    8427
J03    4187
M54    3904
J06    3460
       ... 
G97       1
K01       1
S28       1
I72       1
P28       1
Name: SICK_SYM1_3, Length: 747, dtype: int64

In [12]:
df_BND_M20_dropna = df_BND_M20.dropna()
df_BND_M20_dropna.isnull().sum()

_id              0
RN_INDI          0
BTH_YYYY         0
DTH_YYYYMM       0
COD1             0
COD2             0
EDC_ADD_RT       0
ED_RC_TOT_AMT    0
SICK_SYM1        0
SICK_SYM1_3      0
dtype: int64

In [13]:
df_BND_M20_dropna[:2]

Unnamed: 0,_id,RN_INDI,BTH_YYYY,DTH_YYYYMM,COD1,COD2,EDC_ADD_RT,ED_RC_TOT_AMT,SICK_SYM1,SICK_SYM1_3
0,64ed6a2652642d1f5783b3b2,294364,1921LE,200707.0,T08-T14,W00-W19,0.15,10590,L028,L02
1,64ed6a2652642d1f5783b3b3,294364,1921LE,200707.0,T08-T14,W00-W19,0.15,16520,J030,J03


In [14]:
target_list = df_BND_M20_dropna['SICK_SYM1_3'].value_counts().index.to_list()

In [15]:
target_list.index('L02'), target_list.index('J03'), 

(88, 4)

In [16]:
def applyIndexNumber(sick_sym_3):
    indexNumber = target_list.index(sick_sym_3)
    return indexNumber

In [29]:
df_BND_M20_dropna.loc['SICK_SYM1_3_target'] = df_BND_M20_dropna['SICK_SYM1_3'].apply(applyIndexNumber)
df_BND_M20_dropna[:2]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_BND_M20_dropna.loc['SICK_SYM1_3_target'] = df_BND_M20_dropna['SICK_SYM1_3'].apply(applyIndexNumber)


Unnamed: 0,_id,RN_INDI,BTH_YYYY,DTH_YYYYMM,COD1,COD2,EDC_ADD_RT,ED_RC_TOT_AMT,SICK_SYM1,SICK_SYM1_3,SICK_SYM1_3_target
0,64ed6a2652642d1f5783b3b2,294364.0,1921LE,200707.0,T08-T14,W00-W19,0.15,10590.0,L028,L02,88.0
1,64ed6a2652642d1f5783b3b3,294364.0,1921LE,200707.0,T08-T14,W00-W19,0.15,16520.0,J030,J03,4.0


### 정형화

In [18]:
df_BND_M20_dropna.columns

Index(['_id', 'RN_INDI', 'BTH_YYYY', 'DTH_YYYYMM', 'COD1', 'COD2',
       'EDC_ADD_RT', 'ED_RC_TOT_AMT', 'SICK_SYM1', 'SICK_SYM1_3',
       'SICK_SYM1_3_target'],
      dtype='object')

In [19]:
target = df_BND_M20_dropna['SICK_SYM1_3_target']
features = df_BND_M20_dropna[['EDC_ADD_RT', 'ED_RC_TOT_AMT',]]
target.shape, features.shape

((897,), (897, 2))

In [20]:
from sklearn.model_selection import train_test_split
features_train, features_test, target_train, target_test = train_test_split(features, target, random_state=111)
features_train.shape, target_train.shape, features_test.shape, target_test.shape

((672, 2), (672,), (225, 2), (225,))

#### 모델학습 

In [21]:
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

In [22]:
model = DecisionTreeClassifier()

In [23]:
from sklearn.model_selection import GridSearchCV

In [24]:
hyper_params = {'min_samples_leaf' : range(2,5)
               , 'max_depth' : range(2,5)
               , 'min_samples_split' : range(2,5)}

##### 평가 score : 분류-정확도, 예측-R squre

In [25]:
from sklearn.metrics import make_scorer, f1_score

In [27]:
scoring_method = make_scorer(f1_score)

In [28]:
grid_search = GridSearchCV(model, param_grid=hyper_params, cv=2
                           , verbose=1, scoring=scoring_method)

In [70]:
# grid_search.fit(features_train, target_train)
grid_search.fit(features, target)

Fitting 2 folds for each of 27 candidates, totalling 54 fits


Traceback (most recent call last):
  File "C:\Users\05-16\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
TypeError: f1_score() takes 2 positional arguments but 3 were given

Traceback (most recent call last):
  File "C:\Users\05-16\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
TypeError: f1_score() takes 2 positional arguments but 3 were given

Traceback (most recent call last):
  File "C:\Users\05-16\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
TypeError: f1_score() takes 2 positional arguments but 3 were given

Traceback (most recent call last):
  File "C:\Users\05-16\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 761, in _score
    scores = scorer(estimator, X_test, y_test)
TypeError: f1_sco

GridSearchCV(cv=2, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': range(2, 5),
                         'min_samples_leaf': range(2, 5),
                         'min_samples_split': range(2, 5)},
             scoring=<function f1_score at 0x000002684B8E2670>, verbose=1)

In [71]:
grid_search.best_estimator_

DecisionTreeClassifier(max_depth=2, min_samples_leaf=2)

In [72]:
grid_search.best_score_, grid_search.best_params_

(nan, {'max_depth': 2, 'min_samples_leaf': 2, 'min_samples_split': 2})

In [52]:
best_model = grid_search.best_estimator_
best_model

DecisionTreeClassifier(max_depth=9, min_samples_leaf=7, min_samples_split=5)

In [53]:
target_test_predict = best_model.predict(features_test)
target_test_predict

array([ 0,  0,  8,  0,  5,  6,  6,  3, 55,  3, 11,  1, 21,  1,  1,  0,  3,
        7,  5,  3,  1,  1, 16,  8, 16,  0,  3,  0,  0,  3,  1, 42,  3,  0,
       21, 42,  0,  2, 55, 21,  0,  7,  0,  1, 59,  0,  2, 37, 59,  0,  3,
        5,  2,  1,  1, 59, 55,  3, 42,  3,  9,  6,  4, 18,  0,  0,  1, 59,
        7,  7,  1,  7,  0,  0,  1, 11,  3,  7, 11, 11,  7,  1,  3,  0, 27,
       21,  1, 37,  1,  2, 55,  6, 11, 55,  3,  0,  4,  8,  0,  1,  0,  1,
        3,  0,  8,  2,  3,  5,  0,  3,  0, 32,  7,  4,  7, 18, 18,  6, 37,
        0,  3, 42,  6,  0, 18,  3,  5,  0, 15,  8,  3,  1,  2,  0,  1, 55,
       42,  0,  0, 18,  1, 42, 55,  3,  1,  3,  6,  2,  0,  7,  0,  0, 42,
        7,  2,  3,  3,  8,  8,  7,  3,  3,  0,  0,  2,  5,  6,  2,  0,  3,
        5,  1,  9,  1, 16,  0,  3,  1,  2,  3,  2, 16, 27,  6,  1,  3,  0,
        0, 15,  0,  1, 19,  0,  2,  1,  0,  8,  6,  6, 11, 16, 55,  0,  0,
       37,  0, 55,  3,  3,  0,  7,  8,  3,  0,  6,  0,  0,  2,  0,  1,  7,
       32,  3,  0,  0], d

In [54]:
from sklearn.metrics import classification_report

In [55]:
print(classification_report(target_test, target_test_predict))

              precision    recall  f1-score   support

           0       0.12      0.27      0.16        22
           1       0.14      0.27      0.19        15
           2       0.43      0.50      0.46        12
           3       0.00      0.00      0.00         7
           4       0.33      0.09      0.14        11
           5       0.86      0.67      0.75         9
           6       0.00      0.00      0.00         6
           7       0.43      1.00      0.60         6
           8       0.11      0.17      0.13         6
           9       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         4
          11       0.17      0.20      0.18         5
          12       0.00      0.00      0.00         4
          13       0.00      0.00      0.00         1
          14       0.00      0.00      0.00         5
          15       0.00      0.00      0.00         0
          16       0.60      1.00      0.75         3
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
