# 検診データによる肝疾患判定

In [1]:
import pandas as pd
import numpy as np

train = pd.read_csv("./train.csv")
test = pd.read_csv("./test2.csv")
sample = pd.read_csv("./sample_submit.csv",header=None)
print("Data Shapes")
print("Train:",train.shape, "Test:",test.shape, "Sample:",sample.shape)

Data Shapes
Train: (891, 12) Test: (383, 11) Sample: (382, 2)


### データセットの調査

In [2]:
print(train.isnull().sum())
print()
print(test.isnull().sum())

id          0
Age         0
Gender      0
T_Bil       0
D_Bil       0
ALP         0
ALT_GPT     0
AST_GOT     0
TP          0
Alb         0
AG_ratio    4
disease     0
dtype: int64

id          0
Age         0
Gender      0
T_Bil       0
D_Bil       0
ALP         0
ALT_GPT     0
AST_GOT     0
TP          0
Alb         0
AG_ratio    0
dtype: int64


In [3]:
#欠損データを平均値で補填
train['AG_ratio'] = train['AG_ratio'].fillna(train['AG_ratio'].mean())
print(train.isnull().sum())

id          0
Age         0
Gender      0
T_Bil       0
D_Bil       0
ALP         0
ALT_GPT     0
AST_GOT     0
TP          0
Alb         0
AG_ratio    0
disease     0
dtype: int64


In [4]:
#性別をOne-Hot表現に変更する
train = pd.get_dummies(train)
test = pd.get_dummies(test)

In [5]:
train.head()

Unnamed: 0,id,Age,T_Bil,D_Bil,ALP,ALT_GPT,AST_GOT,TP,Alb,AG_ratio,disease,Gender_Female,Gender_Male
0,0,60,2.9,1.3,170.9,42.1,37.1,5.5,2.9,1.01,1,0,1
1,1,28,0.7,0.1,158.8,26.0,23.9,6.4,3.7,1.36,0,1,0
2,2,60,23.1,12.5,962.0,53.0,40.9,6.8,3.3,0.96,1,0,1
3,3,20,1.0,0.5,415.9,33.9,39.0,7.0,3.8,1.31,0,0,1
4,4,44,0.6,0.3,152.9,40.9,42.0,4.5,2.1,1.04,0,1,0


In [6]:
test.head()

Unnamed: 0,id,Age,T_Bil,D_Bil,ALP,ALT_GPT,AST_GOT,TP,Alb,AG_ratio,Gender_Female,Gender_Male
0,890,61,1.3,0.2,69.0,26.0,28.0,7.0,4.5,1.8,0,1
1,891,65,0.7,0.2,162.0,24.0,20.0,6.4,3.3,0.93,1,0
2,892,46,1.2,0.3,265.1,40.0,28.0,7.9,3.8,0.94,0,1
3,893,26,0.7,0.1,243.1,21.1,22.9,5.3,2.2,0.6,0,1
4,894,38,3.5,1.7,253.0,80.0,406.0,6.8,3.7,1.33,0,1


### 目的変数の抽出

In [7]:
y = train["disease"]
print(y)

0      1
1      0
2      1
3      0
4      0
      ..
886    0
887    1
888    1
889    1
890    0
Name: disease, Length: 891, dtype: int64


In [8]:
#ONE-HOTに変換
import keras
y = keras.utils.to_categorical(y,2)
y

Using TensorFlow backend.


array([[0., 1.],
       [1., 0.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [1., 0.]], dtype=float32)

In [9]:
train = train.drop(columns=["disease"])
train.head()

Unnamed: 0,id,Age,T_Bil,D_Bil,ALP,ALT_GPT,AST_GOT,TP,Alb,AG_ratio,Gender_Female,Gender_Male
0,0,60,2.9,1.3,170.9,42.1,37.1,5.5,2.9,1.01,0,1
1,1,28,0.7,0.1,158.8,26.0,23.9,6.4,3.7,1.36,1,0
2,2,60,23.1,12.5,962.0,53.0,40.9,6.8,3.3,0.96,0,1
3,3,20,1.0,0.5,415.9,33.9,39.0,7.0,3.8,1.31,0,1
4,4,44,0.6,0.3,152.9,40.9,42.0,4.5,2.1,1.04,1,0


In [10]:
train_data = train.values
train_labels = y
test_data = test.values


### 正規化

In [11]:
mean = train_data.mean(axis=0)
std = train_data.std(axis=0)
train_data = (train_data - mean) / std
test_data = (test_data - mean) / std
train_data

array([[-1.73010796,  1.00133319,  0.01873085, ...,  0.04028582,
        -0.5501196 ,  0.5501196 ],
       [-1.72622007, -0.92200364, -0.42848248, ...,  1.20154234,
         1.81778652, -1.81778652],
       [-1.72233219,  1.00133319,  4.12496239, ..., -0.12560797,
        -0.5501196 ,  0.5501196 ],
       ...,
       [ 1.72233219,  0.64070754,  4.20627391, ..., -0.52375306,
         1.81778652, -1.81778652],
       [ 1.72622007, -1.16242075, -0.32684309, ..., -0.42421679,
        -0.5501196 ,  0.5501196 ],
       [ 1.73010796,  0.82102036, -0.18454794, ..., -0.19196548,
        -0.5501196 ,  0.5501196 ]])

### モデルの構築

In [12]:
from keras.models import Sequential
from keras.layers import Dense

model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(train_data.shape[1],)))
model.add(Dense(64, activation='relu'))
model.add(Dense(2, activation='softmax'))

#model.compile(optimizer='adam', 
#              loss='mse', 
#              metrics=['mae'])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [13]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 64)                832       
_________________________________________________________________
dense_2 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 130       
Total params: 5,122
Trainable params: 5,122
Non-trainable params: 0
_________________________________________________________________


### 学習

In [14]:
history = model.fit(train_data, 
                    train_labels,
                    batch_size=1,
                    epochs=50,
                    verbose=1, 
                    validation_split=0.2)


Train on 712 samples, validate on 179 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [15]:
score = model.evaluate(train_data, train_labels, verbose=2)
print()
print('Test loss:', score[0])
print('Test accuracy:', score[1])


Test loss: 0.2331070599598783
Test accuracy: 0.953984260559082


### 予測

In [16]:
test_predictions = model.predict(test_data)

In [17]:
print(test_predictions.shape)

(383, 2)


In [22]:
#最大値のインデックス
k = np.argmax(test_predictions,axis=1)
#先頭データが自身のデータ
print(k[0])
if k[0] == 0:
    print('正常です')
else:
    print('肝機能障害です')

0
正常です
