In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Location of the diabetes CSV (a Colab-style path; presumably under a mounted Google Drive — confirm)
file_path = "/content/drive/MyDrive/Colab Notebooks/diabetes.csv"

# Load the CSV into a DataFrame
df = pd.read_csv(file_path)

# Quick sanity check: print the first five rows
print(df.head(n=5))


   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [6]:
# Inspect the column labels of the loaded DataFrame.
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [7]:
y = df["Outcome"]                  # target: the "Outcome" label column
X = df.drop(columns=["Outcome"])   # features: every remaining column

In [8]:
# Hold out 20% of the rows as a test set. The split is stratified on the
# target and uses a fixed seed so it is repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [9]:
# -----------------------------
# 2) Model setup
# -----------------------------
# The two tree-based models use a fixed seed (42).
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42, n_estimators=200)

# Logistic regression is allowed up to 500 solver iterations.
lr = LogisticRegression(max_iter=500)



In [15]:
# -----------------------------

# 3) Model training

# -----------------------------

# Fit each classifier on the training split (X_train, y_train).
dt.fit(X_train, y_train)

rf.fit(X_train, y_train)

# Final expression of the cell: its return value is what the notebook would display.
lr.fit(X_train, y_train)




In [19]:
# -----------------------------
# 4) Model evaluation
# -----------------------------
# Predict once per model and reuse the result for scoring. The original
# called dt.predict(X_test) twice (once inline for the accuracy and once
# for dt_y_pred). dt_y_pred keeps its name because a later cell displays it.
dt_y_pred = dt.predict(X_test)
rf_y_pred = rf.predict(X_test)
lr_y_pred = lr.predict(X_test)

# Accuracy on the held-out test split for each model.
dt_acc = accuracy_score(y_test, dt_y_pred)
rf_acc = accuracy_score(y_test, rf_y_pred)
lr_acc = accuracy_score(y_test, lr_y_pred)

print("=== Test Accuracy ===")
print(f"Decision Tree : {dt_acc:.4f}")
print(f"Random Forest : {rf_acc:.4f}")
print(f"Logistic Reg. : {lr_acc:.4f}")

=== Test Accuracy ===
Decision Tree : 0.7273
Random Forest : 0.7468
Logistic Reg. : 0.7143


In [21]:
# Class labels predicted for the test data by the trained decision-tree model
dt_y_pred

array([0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0])

In [16]:
# Ground-truth labels of the test split (the actual outcomes, not model predictions)
y_test

Unnamed: 0,Outcome
44,0
672,0
700,0
630,1
81,0
...,...
32,0
637,0
593,0
425,1
