# 당뇨병 예측 모델링 (LogisticRegression)

In [3]:
from numpy.random import RandomState
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the diabetes dataset from the CSV file.
data = pd.read_csv('diabetes.csv')

# todo 1. Use the 6 features most highly correlated with diabetes.
features = [
    'Pregnancies',
    'Glucose',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]

# Separate the target column from the predictor columns.
y = data['Outcome']
x = data[features]

# Hold out 20% of the rows for testing; the seed is fixed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# todo 2. Build the LogisticRegression model (line 16; parameters are free to choose).
# NOTE(review): the features are fed in unscaled; if the solver emits a
# ConvergenceWarning, consider scaling the inputs or raising max_iter - confirm.
model = LogisticRegression(random_state=1)

# Train the model on the training split.
model.fit(x_train, y_train)

# Predict labels for the held-out test set.
predictions = model.predict(x_test)

# Compute the accuracy once and reuse the stored value when printing,
# instead of calling accuracy_score a second time.
accuracy = accuracy_score(y_test, predictions)
print('Accuracy:', accuracy)

Accuracy: 0.7597402597402597


# 당뇨병 예측 모델링 (ANN)

In [4]:
#  라이브러리 불러오기
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [5]:
# Load the data.
data = pd.read_csv('diabetes.csv')

# Feature columns used as inputs to the network.
features = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
]

# Separate the target column from the predictor columns.
y = data['Outcome']
x = data[features]

In [6]:
# Split the data: 20% is held out for testing, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [8]:
# todo 3. Make sure the data is scaled (line 2).
# Fit the scaler on the training split only, then apply the same fitted
# transformation to both the training and the test split.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [9]:
# todo 4. Set the input layer to an appropriate size (line 4).
# todo 5. Make the output layer produce a value suited to binary classification (line 7).
# The input width is taken from the training matrix instead of being
# hard-coded, so it stays in sync with the `features` list if that changes.
model = tf.keras.Sequential([  # define the ANN model
    tf.keras.Input(shape=(x_train.shape[1],)),      # input layer: one value per feature column
    tf.keras.layers.Dense(16, activation='relu'),   # hidden layer 1
    tf.keras.layers.Dense(8, activation='relu'),    # hidden layer 2
    tf.keras.layers.Dense(1, activation='sigmoid')  # output layer: single sigmoid unit
])

In [10]:
# todo 6. Referring to p. 33 of the course notes, specify the loss best suited to this model (line 4).
# Compile the model: Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [12]:
# todo 7. Stop training once val_loss has not improved for 10 consecutive epochs (line 5, line 6).
from tensorflow.keras.callbacks import EarlyStopping

# Define the EarlyStopping callback: it watches val_loss with a patience of
# 10 and is configured to restore the best weights.
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

In [13]:
# todo 8. Set the number of epochs to 30 and show per-epoch training progress (line 4, line 7).
# Train the model. 20% of the training data is set aside for validation, which
# provides the val_loss that the EarlyStopping callback monitors.
history = model.fit(
    x_train, y_train,
    epochs=30,
    batch_size=16,
    validation_split=0.2,
    # The callback only takes effect when it is handed to fit(); without this
    # argument the `early_stop` object defined earlier is never consulted.
    callbacks=[early_stop],
    verbose=1
)

Epoch 1/30
31/31 ━━━━━━━━━━━━━━━━━━━━ 2s 10ms/step - accuracy: 0.6710 - loss: 0.6610 - val_accuracy: 0.6098 - val_loss: 0.6729
Epoch 2/30
31/31 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - accuracy: 0.6680 - loss: 0.6369 - val_accuracy: 0.6260 - val_loss: 0.6368
Epoch 3/30
31/31 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - accuracy: 0.6884 - loss: 0.5935 - val_accuracy: 0.6504 - val_loss: 0.6063
Epoch 4/30
31/31 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - accuracy: 0.7008 - loss: 0.5568 - val_accuracy: 0.6585 - val_loss: 0.5804
Epoch 5/30
31/31 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - accuracy: 0.7098 - loss: 0.5433 - val_accuracy: 0.7073 - val_loss: 0.5554
Epoch 6/30
31/31 ━━━━━━━━━━━━━━━━━━━━ 0s 4ms/step - accuracy: 0.6990 - loss: 0.5386 - val_accuracy: 0.7561 - val_loss: 0.5362
Epoch 7/30
31/31 ━━━━━━━━━

In [14]:
# Predict on the test set and compute the evaluation metrics.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# The model outputs sigmoid probabilities; threshold at 0.5 to get 0/1 labels.
y_pred_prob = model.predict(x_test)
y_pred = (y_pred_prob > 0.5).astype(int)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)  # sensitivity
f1 = f1_score(y_test, y_pred)

# Emit one metric per line, each formatted to four decimal places.
print(
    f'Accuracy: {accuracy:.4f}',
    f'Precision: {precision:.4f}',
    f'Recall (Sensitivity): {recall:.4f}',
    f'F1 Score: {f1:.4f}',
    sep='\n',
)

5/5 ━━━━━━━━━━━━━━━━━━━━ 0s 13ms/step
Accuracy: 0.7662
Precision: 0.6792
Recall (Sensitivity): 0.6545
F1 Score: 0.6667
