In [1]:
import pandas as pd
import numpy as np

In [2]:
# Why did reading the original file fail? It is an encoding problem.
# In a Linux shell, `file -i <filename>` can be used to inspect the file.
# The bare string below is not assigned to anything; it only records the iconv
# command that converts diagnosis.data from UTF-16LE to UTF-8 and saves the
# result as a new file (diagnosis2.data). The notebook echoes it as cell output.
'''
 리눅스에서 iconv -c -f utf-16le -t utf-8 diagnosis.data > diagnosis2.data  : 인코딩문자 변경해서 새로운 파일로 저장
'''

# Alternatively, passing the `encoding` argument to read_csv also solves this.

'\n 리눅스에서 iconv -c -f utf-16le -t utf-8 diagnosis.data > diagnosis2.data  : 인코딩문자 변경해서 새로운 파일로 저장\n'

In [4]:
# Path of the tab-separated data file to load.
datapath = '../data/diagnosis2.data'

# Column labels are supplied explicitly through `names=`.
column_names = [
    'temperature', 'nausea', 'lumbar_pain', 'urine_pushing',
    'micturition_pains', 'burning_of_urethra', 'inflammation', 'nephritis',
]

original_data = pd.read_csv(datapath, sep='\t', names=column_names)

# (rows, columns) of the loaded frame.
original_data.shape

(120, 8)

In [5]:
original_data.columns

# The column labels are the ones passed through `names=` to read_csv.
# NOTE(review): the original note here said the first row gets treated as the
# column header -- that presumably describes what happens when `names=` is omitted.

Index(['temperature', 'nausea', 'lumbar_pain', 'urine_pushing',
       'micturition_pains', 'burning_of_urethra', 'inflammation', 'nephritis'],
      dtype='object')

In [6]:
# Preview the first rows of the loaded data.
original_data.head()

Unnamed: 0,temperature,nausea,lumbar_pain,urine_pushing,micturition_pains,burning_of_urethra,inflammation,nephritis
0,355,no,yes,no,no,no,no,no
1,359,no,no,yes,yes,yes,yes,no
2,359,no,yes,no,no,no,no,no
3,360,no,no,yes,yes,yes,yes,no
4,360,no,yes,no,no,no,no,no


In [7]:
        # temperature을 복제본을 떠서 apply를 호출하는 것 -> 그래서 temperature에 재할당시켜줘야함
# Temperatures are stored with a decimal comma (e.g. "35,5"). Swap the comma for
# a dot and convert to float so the column is numeric instead of text.
# `.str.replace` returns a new Series, so the result is assigned back.
# The leading `.astype(str)` keeps this cell safe to re-run once the column
# has already been converted to float.
original_data['temperature'] = (
    original_data['temperature']
    .astype(str)
    .str.replace(',', '.', regex=False)
    .astype(float)
)
original_data.head()

Unnamed: 0,temperature,nausea,lumbar_pain,urine_pushing,micturition_pains,burning_of_urethra,inflammation,nephritis
0,35.5,no,yes,no,no,no,no,no
1,35.9,no,no,yes,yes,yes,yes,no
2,35.9,no,yes,no,no,no,no,no
3,36.0,no,no,yes,yes,yes,yes,no
4,36.0,no,yes,no,no,no,no,no


In [8]:
from sklearn import preprocessing

# LabelEncoder turns categorical values into integers. It is used instead of a
# one-hot encoder because the column only takes two values.
# `fit` learns which values exist (the vocabulary) and returns the encoder.
le_nausea = preprocessing.LabelEncoder().fit(original_data['nausea'])

# The learned classes; a label's integer code is its index in this array.
print(le_nausea.classes_)

# Categories -> integer codes.
sample_codes = le_nausea.transform(['no', 'yes', 'yes'])
print(sample_codes)

# Integer codes -> the original categories.
sample_labels = le_nausea.inverse_transform([0, 0, 1])
print(sample_labels)

['no' 'yes']
[0 1 1]
['no' 'no' 'yes']


In [9]:
# Build the encoded frame as a new object so `original_data` is left untouched;
# at this point only the nausea column is replaced by its integer codes.
nausea_codes = le_nausea.transform(original_data['nausea'])
dicted_data = original_data.assign(nausea=nausea_codes)

dicted_data.head()

Unnamed: 0,temperature,nausea,lumbar_pain,urine_pushing,micturition_pains,burning_of_urethra,inflammation,nephritis
0,35.5,0,yes,no,no,no,no,no
1,35.9,0,no,yes,yes,yes,yes,no
2,35.9,0,yes,no,no,no,no,no
3,36.0,0,no,yes,yes,yes,yes,no
4,36.0,0,yes,no,no,no,no,no


In [10]:
# Ideally every feature gets its own encoder, so one LabelEncoder is fitted
# per column. These are the categorical columns not encoded yet.
binary_columns = ['lumbar_pain', 'urine_pushing', 'micturition_pains',
                  'burning_of_urethra', 'inflammation', 'nephritis']

# Encoders keyed by column name, so each column can be decoded again later.
les = {'nausea': le_nausea}

# The loop variable is deliberately not named `x`: notebook loop variables stay
# in the global namespace, and `x` is the name used for the feature frame
# elsewhere in this notebook.
for column in binary_columns:
    les[column] = preprocessing.LabelEncoder()
    # Always encode from the untouched original strings, so re-running is safe.
    dicted_data[column] = les[column].fit_transform(original_data[column])

dicted_data.head()

Unnamed: 0,temperature,nausea,lumbar_pain,urine_pushing,micturition_pains,burning_of_urethra,inflammation,nephritis
0,35.5,0,1,0,0,0,0,0
1,35.9,0,0,1,1,1,1,0
2,35.9,0,1,0,0,0,0,0
3,36.0,0,0,1,1,1,1,0
4,36.0,0,1,0,0,0,0,0


In [11]:
# Show the encoders dictionary (column name -> LabelEncoder).
print(les)

{'nausea': LabelEncoder(), 'lumbar_pain': LabelEncoder(), 'urine_pushing': LabelEncoder(), 'micturition_pains': LabelEncoder(), 'burning_of_urethra': LabelEncoder(), 'inflammation': LabelEncoder(), 'nephritis': LabelEncoder()}


In [12]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree

# Use temperature as the only feature.
features = ['temperature']
x = dicted_data[features]

# Label to predict.
y = dicted_data['inflammation']

# min_samples_split is the minimum number of samples a node needs before it
# may be split: nodes with fewer than 20 samples become terminal nodes.
# This limits how deep the tree can grow.
model = DecisionTreeClassifier(random_state=99, min_samples_split=20)

In [13]:
# y is a pandas Series (a single column).
y.head()

0    0
1    1
2    0
3    1
4    0
Name: inflammation, dtype: int64

In [14]:
# x is a pandas DataFrame (selected with a list of column names).
x.head() 

Unnamed: 0,temperature
0,35.5
1,35.9
2,35.9
3,36.0
4,36.0


In [15]:
# Fit the tree: x holds the features, y holds the labels.
model.fit(x,y)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=20,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=99, splitter='best')

In [17]:
# Visualize the fitted decision tree.
# With filled=True, the purer a node is, the darker it is colored.
# Requires the `graphviz` Python package; the recorded run of this cell failed
# with ModuleNotFoundError because it was not installed.
from graphviz import Source
from sklearn.tree import export_graphviz
from IPython.display import SVG, display

# `class_names` expects one name per target class, in ascending class order --
# not one entry per sample. Decode the classes the model actually learned
# (integer codes) back to their original string labels.
inflammation_labels = les['inflammation'].inverse_transform(model.classes_).tolist()
print(inflammation_labels)

# Export the tree as DOT text (out_file=None returns it as a string).
dot_source = export_graphviz(model, out_file=None,
                             feature_names=features,
                             class_names=inflammation_labels,
                             filled=True)
graph = Source(dot_source)

# Render the DOT source to SVG and show it inline.
display(SVG(graph.pipe(format='svg')))

ModuleNotFoundError: No module named 'graphviz'

In [25]:
# Predicted class for every row of x.
pred_y = model.predict(x)
# Per-class probability behind each prediction.
pred_y_prob = model.predict_proba(x)

# Print the shape of both result arrays.
for result in (pred_y, pred_y_prob):
    print(result.shape)

(120,)
(120, 2)


In [None]:
# Each probability row has two elements because there are two labels: every element is the predicted probability of the corresponding label.

In [28]:
# Mean accuracy of the model's predictions for x compared against y.
mean_accuracy = model.score(x,y)
print(mean_accuracy)

0.7083333333333334


In [29]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree

# Use temperature plus the encoded symptom columns as features.
features = [
    'temperature', 'nausea', 'lumbar_pain',
    'urine_pushing', 'micturition_pains', 'burning_of_urethra',
]
x = dicted_data[features]

# Label to predict.
y = dicted_data['inflammation']

# min_samples_split is the minimum number of samples a node needs before it
# may be split: nodes with fewer than 20 samples become terminal nodes.
# This limits how deep the tree can grow.
model = DecisionTreeClassifier(random_state=99, min_samples_split=20)

In [30]:
# Fit on x, y and report the mean accuracy on that same data (training accuracy).
model.fit(x,y)
mean_accuracy = model.score(x,y)
print(mean_accuracy)

1.0


In [32]:
# Measure how much each feature contributes to the fitted tree.
# The feature names are printed first so they can be compared with the importances.
print(features)
print(model.feature_importances_)

['temperature', 'nausea', 'lumbar_pain', 'urine_pushing', 'micturition_pains', 'burning_of_urethra']
[0.22586919 0.         0.         0.48360656 0.29052425 0.        ]


In [34]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import tree
from sklearn.model_selection import train_test_split

features = ['temperature','nausea','lumbar_pain','urine_pushing','micturition_pains','burning_of_urethra']

# Split into train/test sets (test_size=0.1 means 90% train / 10% test).
# random_state pins the shuffle so the split -- and every accuracy computed
# from it -- is identical on each run; 99 matches the seed used for the models.
train_d, test_d = train_test_split(dicted_data, test_size=0.1, random_state=99)

# Features and label for training.
train_y = train_d['inflammation']
train_x = train_d[features]

# Features and label held out for evaluation.
test_y = test_d['inflammation']
test_x = test_d[features]

In [35]:
# Fit on the training split only (`fit` returns the fitted estimator).
model = DecisionTreeClassifier(min_samples_split=20, random_state=99).fit(train_x, train_y)

# Mean accuracy on the training split, then on the held-out test split.
mean_accuracy_for_train = model.score(train_x, train_y)
mean_accuracy_for_test = model.score(test_x, test_y)

print(mean_accuracy_for_train)
print(mean_accuracy_for_test)

0.9259259259259259
0.8333333333333334


In [36]:
# Lower min_samples_split to 2 so nodes may keep splitting.
model = DecisionTreeClassifier(random_state=99, min_samples_split=2)
model = model.fit(train_x, train_y)

# Mean accuracy on the training split and on the held-out test split.
mean_accuracy_for_train = model.score(train_x, train_y)
mean_accuracy_for_test = model.score(test_x, test_y)

# One value per line: train first, then test.
print(mean_accuracy_for_train, mean_accuracy_for_test, sep='\n')

1.0
1.0


In [None]:
import pickle
modelpath = './model/decisionTree.model'
with open(modelpath, 'wb') as f:
    