In [1]:
import codecs
from bs4 import BeautifulSoup
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random, sys
import pandas as pd

Using TensorFlow backend.


In [2]:
# 30대 질병데이터 읽어오기 
df = pd.read_table('yang_30.txt', sep='\n', header = None)
df.columns = ['text']
df.head() 

Unnamed: 0,text
0,S OL766 KG504 SF108 WR405 ET321 RD201 IO789 O...
1,S FG308 PO276 FG371 KL365 QW987 LO103 LO103 P...
2,S WD675 RE385 PK765 GH791 KG341 ZA107 WR896 O...
3,S RE385 QO233 WY390 GH700 ZA107 KO109 WO378 O...
4,S QE931 QO102 GH791 SD398 ER389 QE277 OP345 G...


In [3]:
# 질병-가격 기록읽어오기 
dp_code = pd.read_csv('d_code.csv')
d_code = list(dp_code.d_code)
dp_code.head() 

Unnamed: 0,d_code,price
0,KP345,71100
1,OK163,123000
2,QW987,3500
3,ER203,38800
4,OP167,68900


In [4]:
# 질병평균 가격표
dp_code_dic = dict(zip(dp_code.d_code , dp_code.price))
dp_code_dic

{'KP345': 71100,
 'OK163': 123000,
 'QW987': 3500,
 'ER203': 38800,
 'OP167': 68900,
 'ER897': 33900,
 'PO203': 4500,
 'PK765': 6400,
 'IO789': 45900,
 'UY503': 3500,
 'ER300': 3500,
 'TY211': 14500,
 'KL365': 11800,
 'QA107': 28300,
 'ET320': 22400,
 'GH700': 3500,
 'KG907': 20100,
 'RD201': 48500,
 'BN377': 21600,
 'SF108': 3500,
 'KG504': 3500,
 'WD234': 44100,
 'QO233': 59000,
 'HD405': 3500,
 'HK504': 56000,
 'OL344': 60000,
 'PL221': 65100,
 'WY390': 3500,
 'WD100': 95900,
 'QY204': 17900,
 'UY201': 111400,
 'QE277': 46400,
 'TR675': 63400,
 'FG371': 13200,
 'KO109': 108700,
 'JV766': 107400,
 'WO208': 3500,
 'RE397': 3500,
 'CV305': 88500,
 'ZA780': 32800,
 'SD308': 3500,
 'WR405': 6300,
 'LO103': 3500,
 'WR117': 52500,
 'ET650': 79700,
 'WO900': 16400,
 'PL250': 3500,
 'RE567': 79100,
 'PK244': 61600,
 'UY980': 11000,
 'KG766': 3800,
 'OK211': 3500,
 'KP843': 25100,
 'TR508': 3500,
 'TR109': 3500,
 'HK377': 14700,
 'GH433': 36300,
 'QO349': 82000,
 'WY107': 3500,
 'WD599': 6570

In [5]:
all_text = ''
for str in df.text:
        all_text = all_text + ' ' + str

text = all_text
print('훈련 데이터: ', len(text))

훈련 데이터:  946418


In [6]:
# 질병사전 구축하기 
text_split = text.split(" ")
chars = sorted(list(set(text_split)))
print('질병코드의 종류:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))  
indices_char = dict((i, c) for i, c in enumerate(chars)) 

질병코드의 종류: 103


In [7]:
char_indices

{'': 0,
 'BN377': 1,
 'CV305': 2,
 'CV751': 3,
 'E': 4,
 'ER119': 5,
 'ER203': 6,
 'ER300': 7,
 'ER389': 8,
 'ER897': 9,
 'ET320': 10,
 'ET321': 11,
 'ET650': 12,
 'FG308': 13,
 'FG371': 14,
 'FG391': 15,
 'GH211': 16,
 'GH433': 17,
 'GH700': 18,
 'GH791': 19,
 'HD405': 20,
 'HD433': 21,
 'HK377': 22,
 'HK504': 23,
 'IO789': 24,
 'JV109': 25,
 'JV765': 26,
 'JV766': 27,
 'KG341': 28,
 'KG504': 29,
 'KG766': 30,
 'KG907': 31,
 'KL365': 32,
 'KO109': 33,
 'KP321': 34,
 'KP345': 35,
 'KP843': 36,
 'LO103': 37,
 'LO976': 38,
 'OK163': 39,
 'OK211': 40,
 'OL344': 41,
 'OL766': 42,
 'OP167': 43,
 'OP345': 44,
 'OP350': 45,
 'PK213': 46,
 'PK244': 47,
 'PK765': 48,
 'PL221': 49,
 'PL250': 50,
 'PL890': 51,
 'PO203': 52,
 'PO276': 53,
 'QA107': 54,
 'QE277': 55,
 'QE931': 56,
 'QO102': 57,
 'QO233': 58,
 'QO304': 59,
 'QO349': 60,
 'QO743': 61,
 'QW987': 62,
 'QY204': 63,
 'RD201': 64,
 'RE385': 65,
 'RE397': 66,
 'RE567': 67,
 'RE577': 68,
 'S': 69,
 'SD308': 70,
 'SD388': 71,
 'SD398': 72,
 

In [8]:
indices_char

{0: '',
 1: 'BN377',
 2: 'CV305',
 3: 'CV751',
 4: 'E',
 5: 'ER119',
 6: 'ER203',
 7: 'ER300',
 8: 'ER389',
 9: 'ER897',
 10: 'ET320',
 11: 'ET321',
 12: 'ET650',
 13: 'FG308',
 14: 'FG371',
 15: 'FG391',
 16: 'GH211',
 17: 'GH433',
 18: 'GH700',
 19: 'GH791',
 20: 'HD405',
 21: 'HD433',
 22: 'HK377',
 23: 'HK504',
 24: 'IO789',
 25: 'JV109',
 26: 'JV765',
 27: 'JV766',
 28: 'KG341',
 29: 'KG504',
 30: 'KG766',
 31: 'KG907',
 32: 'KL365',
 33: 'KO109',
 34: 'KP321',
 35: 'KP345',
 36: 'KP843',
 37: 'LO103',
 38: 'LO976',
 39: 'OK163',
 40: 'OK211',
 41: 'OL344',
 42: 'OL766',
 43: 'OP167',
 44: 'OP345',
 45: 'OP350',
 46: 'PK213',
 47: 'PK244',
 48: 'PK765',
 49: 'PL221',
 50: 'PL250',
 51: 'PL890',
 52: 'PO203',
 53: 'PO276',
 54: 'QA107',
 55: 'QE277',
 56: 'QE931',
 57: 'QO102',
 58: 'QO233',
 59: 'QO304',
 60: 'QO349',
 61: 'QO743',
 62: 'QW987',
 63: 'QY204',
 64: 'RD201',
 65: 'RE385',
 66: 'RE397',
 67: 'RE567',
 68: 'RE577',
 69: 'S',
 70: 'SD308',
 71: 'SD388',
 72: 'SD398',
 

In [9]:
# 전체 훈련 데이터를 쪼개어서 문장구조로 만들기
maxlen = 5
step = 1
sentences = []
next_chars = []

for i in range(0, len(text_split) - maxlen, step):
    sentences.append(" ".join(text_split[i: i + maxlen]))
    next_chars.append(text_split[i + maxlen])
    
print('학습할 질병패턴의 수:', len(sentences))
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)

for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence.split(" ")):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

학습할 질병패턴의 수: 179399


In [10]:
# 모델 구축하기(LSTM)
print('LSTM 모델을 만들어 본다....')
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))
optimizer = RMSprop(lr=0.015)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

LSTM 모델을 만들어 본다....


In [11]:
# 후보추출
def sample(preds, temp=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temp
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [12]:
# 질병코드 단어생성하기 
generated_all = []
for iteration in range(1, 8):
    print()
    print('반복번호 =', iteration)
    model.fit(X, y, batch_size=128, epochs=1)
    
    # 옵션을 다르게 하여 다양한 방법으로 생성하기
    for diversity in [1.4 , 1.6 , 1.8 , 2.0 , 2.2 , 2.4 , 2.6]:
        print()
        print('추출옵션 : ', diversity)
        generated = ''
        #sentence = text[start_index: start_index + maxlen]
        #sentence = "OL766 KG504 SF108"
        sentence = "S FG391"
        generated += sentence
        print('질병시작패턴 = "' + sentence + '"')
        sys.stdout.write(generated)
        
        # 질병코드 자동생성
        for i in range(35):      #문장의 평균 단어의 수(1년동안 질병의 코드의 수) 
            x = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence.split(" ")):
                x[0, t, char_indices[char]] = 1.
                
            # 다음에 올 질병코드를 예측하기
            preds = model.predict(x, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]
            
            # 30대 예상질병코드 출력하기
            generated += " " + next_char
            sentence = " ".join(sentence.split(" ")[1:]) + " " + next_char
            sys.stdout.write(" " + next_char)
            sys.stdout.flush()
            
            if next_char == "E":
                generated_all.append(generated)
                break
        print()


반복번호 = 1
Epoch 1/1

추출옵션 :  1.4
질병시작패턴 = "S FG391"
S FG391 HD433 QA107 KG341 WR789 QO233 KP843 ZA780 QA107 WR321 E

추출옵션 :  1.6
질병시작패턴 = "S FG391"
S FG391 WY390 WD100 WD675 QW987 KP321 WR117 QO349 PL890 WO900 WR896 QO743 WD675 E

추출옵션 :  1.8
질병시작패턴 = "S FG391"
S FG391 QA107 TY901 OP350 WR321 HD433 ZA780 LO103 HD405 E

추출옵션 :  2.0
질병시작패턴 = "S FG391"
S FG391 WO188 JV765 QO743 KP345 WO188 TY901 LO976 TY211 OK211 JV765 KG504 WY504 JV766 FG308 WR789 KP843 KG766 JV765 QO349 PK213 TY341 ET320 WR321 ET320 UY980 WR405 PL890 PK213 QE277 LO103 E

추출옵션 :  2.2
질병시작패턴 = "S FG391"
S FG391 PL250 RE577 QE277 OK211 SD398 WR405 WR405 OK211 KP345 QO349 CV751 SD388 PO203 ER203 KP843 TY211 KP321 ER300 KP321 CV305 WR405 ER300 OK211 KP345 RE385 QE277 CV305 ER897 OL766 BN377 OK211 KP345 ER897 WD675 FG391

추출옵션 :  2.4
질병시작패턴 = "S FG391"
S FG391 GH791 UY109 LO103 LO103 TY341 PO203 WR896 SF108 WY390 ER203 ET321 PK213 HD405 SD398 GH791 QE931 E

추출옵션 :  2.6
질병시작패턴 = "S FG391"
S FG391 GH433 WO378 GH433 HD433 SD308 

S FG391 PO276 WO900 KG907 KG504 KG341 ER203 WD100 KG766 HK504 WR117 BN377 QY204 JV765 QE931 HD405 OL766 BN377 PO276 QO743 QO233 GH433 WY504 QO233 HK504 E

반복번호 = 7
Epoch 1/1

추출옵션 :  1.4
질병시작패턴 = "S FG391"
S FG391 WO208 KG907 OK211 OP167 WO208 WO900 ET650 KG766 KG504 SD398 HD405 QA107 TY341 OK163 ER897 GH700 UY980 JV766 ER203 OK211 E

추출옵션 :  1.6
질병시작패턴 = "S FG391"
S FG391 E

추출옵션 :  1.8
질병시작패턴 = "S FG391"
S FG391 KG766 WD234 RD201 LO976 TR508 PL250 QO743 JV109 FG371 PL250 SD308 SD388 KG341 ER119 E

추출옵션 :  2.0
질병시작패턴 = "S FG391"
S FG391 PK765 UY503 CV305 GH211 HK504 RD201 HK504 RD201 TR675 WR896 TY211 UY112 OP345 PL250 QO233 KG766 OP167 UY201 WR117 LO103 SD398 WD234 WR117 CV305 UY109 WY107 QO743 KG907 RE385 OP167 PK244 RE567 TY341 WY390 TR675

추출옵션 :  2.2
질병시작패턴 = "S FG391"
S FG391 WR789 QW987 OP350 KL365 E

추출옵션 :  2.4
질병시작패턴 = "S FG391"
S FG391 QW987 WD675 CV305 KG504 ET650 LO103 PO276 WO208 UY109 TR675 ER203 BN377 PL221 WR405 RD201 WD100 WD599 JV766 WD675 JV109 QY204 WD675 KG907 ZA

In [13]:
avg_price_1year = 0
sum_price = 0 
for i , sent in enumerate(generated_all):
    print(i,sent)
    for str in sent.split(" ")[1:-1]:
        sum_price += dp_code_dic[str]
        
avg_price_1year = round(sum_price / (i+1) , -2) 

print('-'*100)
print('이 분은 앞으로 1년간 총', avg_price_1year, '원의 병원비가 들 예정입니다.')
print('-'*100)

0 S FG391 HD433 QA107 KG341 WR789 QO233 KP843 ZA780 QA107 WR321 E
1 S FG391 WY390 WD100 WD675 QW987 KP321 WR117 QO349 PL890 WO900 WR896 QO743 WD675 E
2 S FG391 QA107 TY901 OP350 WR321 HD433 ZA780 LO103 HD405 E
3 S FG391 WO188 JV765 QO743 KP345 WO188 TY901 LO976 TY211 OK211 JV765 KG504 WY504 JV766 FG308 WR789 KP843 KG766 JV765 QO349 PK213 TY341 ET320 WR321 ET320 UY980 WR405 PL890 PK213 QE277 LO103 E
4 S FG391 GH791 UY109 LO103 LO103 TY341 PO203 WR896 SF108 WY390 ER203 ET321 PK213 HD405 SD398 GH791 QE931 E
5 S FG391 WD599 SD398 KP345 PO276 KP321 OP167 E
6 S FG391 TR675 LO103 QY204 WD599 OP345 WD599 ET320 HD405 KG504 HK504 UY109 BN377 SF108 JV765 RD201 TY211 KG341 E
7 S FG391 JV766 OL766 WD599 QO349 OP350 SD388 QO233 QE931 UY201 TR109 LO103 HD405 LO103 PL890 TY901 OL766 PO203 RD201 KG341 FG371 TR109 WO900 ZA780 ZA780 ZA780 OL766 KG907 KG341 RD201 E
8 S FG391 HK377 QE277 QY204 PK244 RD201 TY211 ZA780 WD599 OK211 HD433 OL766 QO102 PL250 E
9 S FG391 OK211 WY390 WO378 OK211 TR508 WY504 E
10 S