<a href="https://colab.research.google.com/github/yunju-1118/EWHA/blob/main/Logistic_Regression_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
hotel = pd.read_csv('/content/drive/MyDrive/Hotel_Reservations.csv')

**기존 전처리**

In [None]:
for c in ["arrival_year","arrival_month","arrival_date"]:
    hotel[c] = pd.to_numeric(hotel[c], errors="coerce")

ym = pd.to_datetime(
    hotel["arrival_year"].astype("Int64").astype(str) + "-" +
    hotel["arrival_month"].astype("Int64").astype(str).str.zfill(2) + "-01",
    errors="coerce"
)
last_day = (ym + pd.offsets.MonthEnd(0)).dt.day

invalid_month = ~hotel["arrival_month"].between(1, 12)
invalid_day_low = hotel["arrival_date"] < 1
invalid_day_high = hotel["arrival_date"] > last_day
invalid_yearmonth = ym.isna()
bad_mask = invalid_month | invalid_day_low | invalid_day_high | invalid_yearmonth

ok = ~bad_mask
hotel.loc[ok, "arrival_date_full"] = pd.to_datetime({
    "year":  hotel.loc[ok, "arrival_year"],
    "month": hotel.loc[ok, "arrival_month"],
    "day":   hotel.loc[ok, "arrival_date"],
}, errors="coerce")

hotel.loc[bad_mask, "arrival_date_full"] = pd.NaT

problem_preview = hotel.loc[bad_mask, ["arrival_year","arrival_month","arrival_date"]].head(10)
print(problem_preview)
print("문제 행 수:", bad_mask.sum(), " / 전체:", len(hotel))

      arrival_year  arrival_month  arrival_date
2626          2018              2            29
3677          2018              2            29
5600          2018              2            29
6343          2018              2            29
7648          2018              2            29
8000          2018              2            29
8989          2018              2            29
9153          2018              2            29
9245          2018              2            29
9664          2018              2            29
문제 행 수: 37  / 전체: 36275


In [None]:
for c in ["arrival_year", "arrival_month", "arrival_date"]:
    hotel[c] = pd.to_numeric(hotel[c], errors="coerce")

hotel["arrival_date_full"] = pd.to_datetime({
    "year": hotel["arrival_year"],
    "month": hotel["arrival_month"],
    "day": hotel["arrival_date"]
}, errors="coerce")

hotel = hotel.dropna(subset=["arrival_date_full"]).reset_index(drop=True)

print("남은 행 개수:", len(hotel))

남은 행 개수: 36238


In [None]:
from sklearn.preprocessing import LabelEncoder

## datetime 하나로 합치는 코드
tmp = hotel.rename(columns={
    "arrival_year": "year",
    "arrival_month": "month",
    "arrival_date": "day"
})

hotel["arrival_date_full"] = pd.to_datetime(tmp[["year","month","day"]])

In [None]:
## 범주형 변수 처리하는 코드 (binary였던 거는 그대로 냅뒀어용)
cat_cols = ["type_of_meal_plan", "room_type_reserved", "market_segment_type"]
le_dict = {}
for col in cat_cols:
    le = LabelEncoder()
    hotel[col] = le.fit_transform(hotel[col])
    le_dict[col] = le


## 변수 drop
hotel = hotel.drop(columns=["Booking_ID","arrival_year", "arrival_date"])

In [None]:
## 파생 변수
## 총 숙박일수
hotel["total_nights"] = hotel["no_of_weekend_nights"] + hotel["no_of_week_nights"]

## 리드타임 그룹화 -- 구간은 일주일, 한달, 세달, 반년, 일년, 일년 이상으로 나눴습니다!
hotel["lead_time_group"] = pd.cut(hotel["lead_time"],
                                  bins=[-1,7,30,90,180,365,800],
                                  labels=["<1w","1w-1m","1-3m","3-6m","6-12m",">1y"])

## 평균 1박당 가격 (NaN은 0으로)
hotel["avg_price_per_night"] = hotel["avg_price_per_room"] / hotel["total_nights"].replace(0, np.nan)
hotel["avg_price_per_night"] = hotel["avg_price_per_night"].fillna(0)

## 동반 인원
hotel["total_guests"] = hotel["no_of_adults"] + hotel["no_of_children"]

## 자녀
hotel["has_children"] = (hotel["no_of_children"] > 0).astype(int)

## 취소 비율
hotel["cancel_ratio"] = hotel["no_of_previous_cancellations"] / (
    hotel["no_of_previous_cancellations"] + hotel["no_of_previous_bookings_not_canceled"] + 1
)

## 이전 예약 경험
hotel["has_prev_booking"] = (
    (hotel["no_of_previous_cancellations"] + hotel["no_of_previous_bookings_not_canceled"]) > 0
).astype(int)

## 성수기
hotel["is_peak"] = hotel["arrival_month"].isin([6,7,8,12]).astype(int)

## 식사 플랜
hotel["no_meal_plan"] = (hotel["type_of_meal_plan"] == le_dict["type_of_meal_plan"].transform(["Not Selected"])[0]).astype(int)

**2차 전처리**

(sin,cos 처리), 요일 관련 변수 추가

In [None]:
if "arrival_date_full" in hotel.columns:
    hotel["arrival_date_full"] = pd.to_datetime(hotel["arrival_date_full"], errors="coerce")

    hotel["arrival_day_of_week"] = hotel["arrival_date_full"].dt.dayofweek

In [None]:
hotel["month_sin"] = np.sin(2 * np.pi * hotel["arrival_month"] / 12)
hotel["month_cos"] = np.cos(2 * np.pi * hotel["arrival_month"] / 12)

hotel["dow_sin"] = np.sin(2 * np.pi * hotel["arrival_day_of_week"] / 7)
hotel["dow_cos"] = np.cos(2 * np.pi * hotel["arrival_day_of_week"] / 7)

가격 관련 추가 변수
- 1인당 가격, 성수기일 경우의 방 가격

In [None]:
hotel["price_per_guest"] = hotel["avg_price_per_room"] / (hotel["total_guests"].replace(0, np.nan))
hotel["price_per_guest"] = hotel["price_per_guest"].fillna(0)

hotel["peak_price"] = hotel["avg_price_per_room"] * hotel["is_peak"]

lead_time 비선형성 + month랑 곱해서 시즌 별 lead_Time 패턴 고려

In [None]:
hotel["log_lead_time"] = np.log1p(hotel["lead_time"])

hotel["lead_x_month"] = hotel["lead_time"] * hotel["arrival_month"]

취소 경험 여부

In [None]:
hotel["ever_canceled"] = (hotel["no_of_previous_cancellations"] > 0).astype(int)

**기존 전처리(KNN)**

In [None]:
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

def transform_with_knn_env(
    X_train: pd.DataFrame, y_train: pd.Series,
    X_test: pd.DataFrame,
    env_features = ["type_of_meal_plan", "required_car_parking_space",
                    "room_type_reserved", "no_of_special_requests"],
    k: int = 10
):
    ## 이웃 취소율
    knn_cls = KNeighborsClassifier(n_neighbors=k)
    knn_cls.fit(X_train[env_features], y_train)
    tr_env_cancel = knn_cls.predict_proba(X_train[env_features])[:, 1]
    te_env_cancel = knn_cls.predict_proba(X_test[env_features])[:, 1]

    ## 이웃 평균 1박 가격
    knn_reg = KNeighborsRegressor(n_neighbors=k)
    knn_reg.fit(X_train[env_features], X_train["avg_price_per_night"])
    tr_env_price = knn_reg.predict(X_train[env_features])
    te_env_price = knn_reg.predict(X_test[env_features])

    X_train_knn = X_train.copy()
    X_test_knn  = X_test.copy()

    X_train_knn["env_cancel_rate"] = tr_env_cancel
    X_test_knn["env_cancel_rate"]  = te_env_cancel
    X_train_knn["env_avg_price"]   = tr_env_price
    X_test_knn["env_avg_price"]    = te_env_price

    return X_train_knn, X_test_knn

**Logistic Regression 전처리**

In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.model_selection import train_test_split


df_lr = hotel.copy()

## 이상치 제거
df_lr = df_lr[df_lr["avg_price_per_room"] < 500]

## 로그 변환
for col in ["lead_time", "no_of_previous_cancellations", "no_of_previous_bookings_not_canceled", "no_of_children"]:
    df_lr[col] = np.log1p(df_lr[col])

X_lr = df_lr.drop(columns=["booking_status","arrival_date_full"])
y_lr = (df_lr["booking_status"] == "Canceled").astype(int)

X_tr, X_te, y_tr, y_te = train_test_split(X_lr, y_lr, test_size=0.2, random_state=42)

## KNN 적용
X_tr, X_te = transform_with_knn_env(
    X_train=X_tr, y_train=y_tr, X_test=X_te,
    env_features=["type_of_meal_plan","required_car_parking_space","room_type_reserved","no_of_special_requests"],
    k=10
)


## 스케일링
scaler = StandardScaler()
num_cols = [
    "no_of_adults","no_of_children","no_of_weekend_nights","no_of_week_nights",
    "lead_time","avg_price_per_room","total_nights","avg_price_per_night",
    "total_guests","cancel_ratio","no_of_special_requests",
]

for c in ["env_cancel_rate", "env_avg_price"]:
    if c in X_tr.columns:
        num_cols.append(c)

num_cols = [c for c in num_cols if c in X_tr.columns]

X_tr[num_cols] = scaler.fit_transform(X_tr[num_cols])
X_te[num_cols] = scaler.transform(X_te[num_cols])

In [None]:
X_lr_train, X_lr_test, y_lr_train, y_lr_test = X_tr, X_te, y_tr, y_te

**모델 적합**

1) 규제 강도

- c가 유의미한지 확인

c = 0.5

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

categorical_features = ['lead_time_group']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)

log_reg = LogisticRegression(
    C=0.5,
    penalty='l2',
    solver='lbfgs',
    max_iter=1000,
    random_state=42
)

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', log_reg)])

# 모델 학습
pipeline.fit(X_tr, y_tr)

# 예측
y_pred = pipeline.predict(X_te)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

print("Accuracy:", accuracy_score(y_te, y_pred))

# 예측
y_pred = pipeline.predict(X_te)
y_proba = pipeline.predict_proba(X_te)[:, 1]  # 클래스 1에 대한 확률

# 1. F1 점수
f1 = f1_score(y_te, y_pred)
print("F1-score:", f1)

# 2. AUC score
auc = roc_auc_score(y_te, y_proba)
print("AUC:", auc)

Accuracy: 0.8119481236203091
F1-score: 0.6939142151358635
AUC: 0.8618031678884964


전처리 변경 후, 모든 지표 0.01 좋아짐..

c = 1

In [None]:
log_reg = LogisticRegression(
    C=1,
    penalty='l2',
    solver='lbfgs',
    max_iter=1000,
    random_state=42
)

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', log_reg)])

# 모델 학습
pipeline.fit(X_tr, y_tr)

# 예측
y_pred = pipeline.predict(X_te)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
print("Accuracy:", accuracy_score(y_te, y_pred))

# 예측
y_pred = pipeline.predict(X_te)
y_proba = pipeline.predict_proba(X_te)[:, 1]  # 클래스 1에 대한 확률

# 1. F1 점수
f1 = f1_score(y_te, y_pred)
print("F1-score:", f1)

# 2. AUC score
auc = roc_auc_score(y_te, y_proba)
print("AUC:", auc)

Accuracy: 0.8134657836644592
F1-score: 0.6943942133815552
AUC: 0.861556075622657


전처리 변경 후, 모든 지표 0.01 좋아짐..

-> 규제 강도를 조절하는 하이퍼 파라미터 c는 유의미하지 않음.

2. 규제 종류

- l1 규제

In [None]:
log_reg = LogisticRegression(
    C=1,
    penalty='l1',
    solver='liblinear',
    max_iter=1000,
    random_state=42
)

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', log_reg)])

# 모델 학습
pipeline.fit(X_tr, y_tr)

# 예측
y_pred = pipeline.predict(X_te)

In [None]:
print("Accuracy:", accuracy_score(y_te, y_pred))

# 예측
y_pred = pipeline.predict(X_te)
y_proba = pipeline.predict_proba(X_te)[:, 1]  # 클래스 1에 대한 확률

# 1. F1 점수
f1 = f1_score(y_te, y_pred)
print("F1-score:", f1)

# 2. AUC score
auc = roc_auc_score(y_te, y_proba)
print("AUC:", auc)

Accuracy: 0.8115342163355408
F1-score: 0.6926192619261926
AUC: 0.8628021488054504


전체적으로 0.01 좋아짐

- l2 규제

In [None]:
log_reg = LogisticRegression(
    C=1,
    penalty='l2',
    solver='lbfgs',
    max_iter=1000,
    random_state=42
)

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', log_reg)])

# 모델 학습
pipeline.fit(X_tr, y_tr)

# 예측
y_pred = pipeline.predict(X_te)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
print("Accuracy:", accuracy_score(y_te, y_pred))

# 예측
y_pred = pipeline.predict(X_te)
y_proba = pipeline.predict_proba(X_te)[:, 1]  # 클래스 1에 대한 확률

# 1. F1 점수
f1 = f1_score(y_te, y_pred)
print("F1-score:", f1)

# 2. AUC score
auc = roc_auc_score(y_te, y_proba)
print("AUC:", auc)

Accuracy: 0.8134657836644592
F1-score: 0.6943942133815552
AUC: 0.861556075622657


- ElasticNet

ElasticNet 1 -> l1_ratio = 0.5

In [None]:
log_reg = LogisticRegression(
    C=1,
    penalty='elasticnet',
    solver='saga',
    max_iter=1000,
    random_state=42,
    l1_ratio=0.5
)

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', log_reg)])

# 모델 학습
pipeline.fit(X_tr, y_tr)

# 예측
y_pred = pipeline.predict(X_te)



In [None]:
print("Accuracy:", accuracy_score(y_te, y_pred))

# 예측
y_pred = pipeline.predict(X_te)
y_proba = pipeline.predict_proba(X_te)[:, 1]  # 클래스 1에 대한 확률

# 1. F1 점수
f1 = f1_score(y_te, y_pred)
print("F1-score:", f1)

# 2. AUC score
auc = roc_auc_score(y_te, y_proba)
print("AUC:", auc)

Accuracy: 0.7719370860927153
F1-score: 0.5738592420726991
AUC: 0.8171126837360487


ElasticNet에서는 오히려 약 0.03씩 성능 감소

ElasticNet 2 -> l1_ratio = 0.2

In [None]:
log_reg = LogisticRegression(
    C=1,
    penalty='elasticnet',
    solver='saga',
    max_iter=1000,
    random_state=42,
    l1_ratio=0.2
)

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', log_reg)])

# 모델 학습
pipeline.fit(X_tr, y_tr)

# 예측
y_pred = pipeline.predict(X_te)



In [None]:
print("Accuracy:", accuracy_score(y_te, y_pred))

# 예측
y_pred = pipeline.predict(X_te)
y_proba = pipeline.predict_proba(X_te)[:, 1]  # 클래스 1에 대한 확률

# 1. F1 점수
f1 = f1_score(y_te, y_pred)
print("F1-score:", f1)

# 2. AUC score
auc = roc_auc_score(y_te, y_proba)
print("AUC:", auc)

Accuracy: 0.7719370860927153
F1-score: 0.5738592420726991
AUC: 0.8171176014243006


l1_ratio 조절해도 성능은 약 0.03 감소했음.

l1_ratio도 유의미하지 않음.

- None (규제 x)

In [None]:
log_reg = LogisticRegression(
    C=1,
    penalty=None,
    solver='lbfgs',
    max_iter=1000,
    random_state=42
)

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', log_reg)])

# 모델 학습
pipeline.fit(X_tr, y_tr)

# 예측
y_pred = pipeline.predict(X_te)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
print("Accuracy:", accuracy_score(y_te, y_pred))

# 예측
y_pred = pipeline.predict(X_te)
y_proba = pipeline.predict_proba(X_te)[:, 1]  # 클래스 1에 대한 확률

# 1. F1 점수
f1 = f1_score(y_te, y_pred)
print("F1-score:", f1)

# 2. AUC score
auc = roc_auc_score(y_te, y_proba)
print("AUC:", auc)

Accuracy: 0.8102924944812362
F1-score: 0.6923249049004252
AUC: 0.8615740208710141


비슷

-> 규제 종류를 바꾸는 것도 유의미하지 않음.

3. 반복 횟수 조절

비슷하니 l1 규제 기준

- max_iter = 100

In [None]:
log_reg = LogisticRegression(
    C=1,
    penalty='l1',
    solver='liblinear',
    max_iter=100,
    random_state=42
)

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', log_reg)])

# 모델 학습
pipeline.fit(X_tr, y_tr)

# 예측
y_pred = pipeline.predict(X_te)

In [None]:
print("Accuracy:", accuracy_score(y_te, y_pred))

# 예측
y_pred = pipeline.predict(X_te)
y_proba = pipeline.predict_proba(X_te)[:, 1]  # 클래스 1에 대한 확률

# 1. F1 점수
f1 = f1_score(y_te, y_pred)
print("F1-score:", f1)

# 2. AUC score
auc = roc_auc_score(y_te, y_proba)
print("AUC:", auc)

Accuracy: 0.8115342163355408
F1-score: 0.6926192619261926
AUC: 0.8628021488054504


- max_iter = 500

In [None]:
log_reg = LogisticRegression(
    C=1,
    penalty='l1',
    solver='liblinear',
    max_iter=500,
    random_state=42
)

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', log_reg)])

# 모델 학습
pipeline.fit(X_tr, y_tr)

# 예측
y_pred = pipeline.predict(X_te)

In [None]:
print("Accuracy:", accuracy_score(y_te, y_pred))

# 예측
y_pred = pipeline.predict(X_te)
y_proba = pipeline.predict_proba(X_te)[:, 1]  # 클래스 1에 대한 확률

# 1. F1 점수
f1 = f1_score(y_te, y_pred)
print("F1-score:", f1)

# 2. AUC score
auc = roc_auc_score(y_te, y_proba)
print("AUC:", auc)

Accuracy: 0.8115342163355408
F1-score: 0.6926192619261926
AUC: 0.8628021488054504


- max_iter = 1000

(앞에서 한 모델, 결과는 비슷)

규제 강도, 규제 방법, 반복 횟수 모두 유의미하지 않다.

Logistic Regression은 유의미하지 않은 것 같다.