In [2]:
# ------------------------------------
# Example 1: medical data (deriving a BMI feature)
# ------------------------------------
import pandas as pd

# Small hand-written sample; "target" flags diabetes status.
data = dict(
    height_cm=[170, 160, 180, 175, 165],
    weight_kg=[70, 60, 90, 80, 55],
    glucose=[110, 95, 150, 130, 85],
    family_history=[1, 0, 1, 1, 0],
    target=[0, 0, 1, 1, 0],
)

# Domain knowledge: BMI = weight (kg) / height (m)^2.
# Height is stored in centimetres, so it is divided by 100 before squaring.
df = pd.DataFrame(data).assign(
    BMI=lambda frame: frame["weight_kg"] / (frame["height_cm"] / 100).pow(2)
)
df

# Here BMI is used as an important independent variable (feature)
# for a diabetes prediction model.



Unnamed: 0,height_cm,weight_kg,glucose,family_history,target,BMI
0,170,70,110,1,0,24.221453
1,160,60,95,0,0,23.4375
2,180,90,150,1,1,27.777778
3,175,80,130,1,1,26.122449
4,165,55,85,0,0,20.20202


In [3]:
# ------------------------------------
# Medical data: comparing models without vs. with the BMI feature
# ------------------------------------
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score


def evaluate_logreg(X, y, title):
    """Fit logistic regression on a stratified 70/30 split and print test metrics.

    Both feature sets go through this one routine so they are split, trained
    and scored in exactly the same way.

    Args:
        X: Feature columns to train on.
        y: Label column; also used to stratify the split.
        title: Heading printed above the metric lines.

    Returns:
        dict with the fitted ``model``, the split pieces (``X_train``,
        ``X_test``, ``y_train``, ``y_test``), the test-set ``pred`` labels,
        ``proba`` (second column of ``predict_proba``), and the ``accuracy``,
        ``f1`` and ``auc`` scores that were printed.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    model = LogisticRegression(max_iter=300)
    model.fit(X_train, y_train)

    pred = model.predict(X_test)
    # Second column = second entry of model.classes_ (label 1 for a 0/1 target).
    proba = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    auc = roc_auc_score(y_test, proba)

    print(title)
    print("accuracy_score :", accuracy)
    print("F1-score:", f1)
    print("AUC:", auc)

    return {
        "model": model,
        "X_train": X_train,
        "X_test": X_test,
        "y_train": y_train,
        "y_test": y_test,
        "pred": pred,
        "proba": proba,
        "accuracy": accuracy,
        "f1": f1,
        "auc": auc,
    }


train = pd.read_csv("diabetes_sample.csv")

# Target
y = train["target"]

# 1) Baseline (without BMI)
X_base = train[["glucose", "family_history"]]
result_base = evaluate_logreg(X_base, y, "Baseline (BMI 제외)")
model_base = result_base["model"]
pred_base = result_base["pred"]
proba_base = result_base["proba"]
print("--------------------------")

# 2) With BMI
X_bmi = train[["glucose", "family_history", "BMI"]]
result_bmi = evaluate_logreg(X_bmi, y, "BMI 포함")
model_bmi = result_bmi["model"]
pred_bmi = result_bmi["pred"]
proba_bmi = result_bmi["proba"]

# Names this cell has always defined, kept so any cell that refers to them
# keeps working. They hold the values of the with-BMI run.
proba_bmi_base = proba_bmi
acc_bmi = result_bmi["accuracy"]
X_train, X_test = result_bmi["X_train"], result_bmi["X_test"]
y_train, y_test = result_bmi["y_train"], result_bmi["y_test"]


Baseline (BMI 제외)
accuracy_score : 0.68
F1-score: 0.7303370786516854
AUC: 0.7486373546511628
--------------------------
BMI 포함
accuracy_score : 0.8933333333333333
F1-score: 0.9058823529411765
AUC: 0.9665697674418604


In [None]:
# Example 2: financial data (DTI: debt-to-income ratio)
data = dict(
    annual_income=[50000, 40000, 80000],  # annual income
    loan_amount=[20000, 25000, 30000],    # loan amount
    credit_score=[720, 650, 800],         # credit score
)

# Domain knowledge: DTI (Debt-to-Income Ratio) = loan amount / annual income
df = pd.DataFrame(data).assign(
    DTI=lambda frame: frame["loan_amount"].div(frame["annual_income"])
)
df

# A higher DTI can be read as a higher risk of default (failure to repay).


In [4]:
# ------------------------------------
# Financial data: comparison before/after adding DTI
# ------------------------------------
# default: whether repayment failed (0 = normal, 1 = default)

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


# Load the data
train = pd.read_csv("finance_sample.csv")

# Target
y = train["default"]

# Both experiments use the same split settings (70/30, stratified on y).
split_options = dict(test_size=0.3, random_state=42, stratify=y)

# 1) Baseline (without DTI)
base_columns = ["annual_income", "loan_amount", "credit_score"]
X_base = train[base_columns]

X_train, X_test, y_train, y_test = train_test_split(X_base, y, **split_options)

# Standardize the features, then fit logistic regression.
model_base = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=300)),
]).fit(X_train, y_train)

# Test-set predictions; proba keeps the second predict_proba column.
pred_base = model_base.predict(X_test)
proba_base = model_base.predict_proba(X_test)[:, 1]

accuracy_base = accuracy_score(y_test, pred_base)
f1_base = f1_score(y_test, pred_base)
auc_base = roc_auc_score(y_test, proba_base)

print("Baseline (DTI 제외)")
print("accuracy_score :", accuracy_base)
print("F1-score:", f1_base)
print("AUC:", auc_base)
print("--------------------------")

# 2) With DTI: same columns plus the derived ratio, same scaled pipeline
dti_columns = [*base_columns, "DTI"]
X_dti = train[dti_columns]

X_train, X_test, y_train, y_test = train_test_split(X_dti, y, **split_options)

model_dti = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=300)),
]).fit(X_train, y_train)

pred_dti = model_dti.predict(X_test)
proba_dti = model_dti.predict_proba(X_test)[:, 1]

accuracy_dti = accuracy_score(y_test, pred_dti)
f1_dti = f1_score(y_test, pred_dti)
auc_dti = roc_auc_score(y_test, proba_dti)

print("With DTI (DTI 포함)")
print("accuracy_score :", accuracy_dti)
print("F1-score:", f1_dti)
print("AUC:", auc_dti)



Baseline (DTI 제외)
accuracy_score : 0.9888888888888889
F1-score: 0.9942857142857143
AUC: 0.9954285714285714
--------------------------
With DTI (DTI 포함)
accuracy_score : 0.9888888888888889
F1-score: 0.9942857142857143
AUC: 0.9977142857142858
