In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Apply matplotlib's "ggplot" style sheet to figures created after this point.
plt.style.use("ggplot")

## 데이터 준비

In [36]:
# -------------------------------------------------
# Load the data
# -------------------------------------------------
# "datetime" is parsed while reading so the .dt accessor can be used below.
df = pd.read_csv("../data/bike_sharing_demand.csv", parse_dates=["datetime"])

# -------------------------------------------------
# Add derived columns
# -------------------------------------------------
# Each name below is both the .dt attribute that is read and the column that
# is written: year, month, day, hour, day of week (Monday: 0, Sunday: 6).
timestamp = df["datetime"].dt
for part in ("year", "month", "day", "hour", "dayofweek"):
    df[part] = getattr(timestamp, part)

# Preview the first rows, including the new columns.
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,day,hour,dayofweek
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,2011,1,1,0,5
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,2011,1,1,1,5
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,2011,1,1,2,5
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,2011,1,1,3,5
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,2011,1,1,4,5


## 변수선택

In [37]:
# --------------------------------------------
# Feature selection
#   - "count" is the prediction target, so it is kept out of X.
#   - "casual" and "registered" are not independent variables (in the
#     preview rows they add up to "count"), so they are not selected.
#   - "datetime" is not selected; its parts were extracted as separate columns.
#   - "day" and "temp" are excluded as well; the notebook does not record
#     the reason (NOTE(review): confirm this is intentional).
# --------------------------------------------
excluded_columns = ["datetime", "casual", "registered", "count", "day", "temp"]

# .copy() detaches X from df so later edits to one do not touch the other.
X = df.drop(columns=excluded_columns).copy()
y = df["count"]

## 훈련세트 테스트세트 분할

In [38]:
# Split into training and test sets.
# No test_size is passed, so scikit-learn's default proportion applies;
# random_state=42 makes the shuffle repeatable.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Print the four shapes in the order: X_train, X_test, y_train, y_test.
print(*(part.shape for part in split_parts))

(8164, 11) (2722, 11) (8164,) (2722,)


## 전처리 파이프라인 구성
* 파이프라인 
    * 데이터 처리 과정을 순차적으로 연결하여 자동화하는 도구
* ColumnTransformer
    * 열별 전처리 파이프라인 구성

In [39]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# -----------------------------
# Columns grouped by the preprocessing they are meant to receive
#   - categorical_features : categorical variables intended for one-hot encoding
#   - numerical_features   : numeric variables to standardise (mean 0, std 1)
# -----------------------------
categorical_features = ["season", "weather", "year", "month", "hour", "dayofweek"]
numerical_features = ["atemp", "humidity", "windspeed"]

# -----------------------------
# ColumnTransformer steps
#   - "cat" : categorical -> OneHotEncoder   (currently DISABLED, see below)
#   - "num" : numeric     -> StandardScaler
#
# NOTE: the "cat" entry is commented out, so categorical_features is unused
# and those columns fall under remainder="passthrough", i.e. they reach the
# model as their raw integer codes. Uncomment the line to one-hot encode them.
# -----------------------------
column_steps = [
    # ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ("num", StandardScaler(), numerical_features),
]

preprocessor = ColumnTransformer(
    transformers=column_steps,
    # "passthrough" keeps columns not listed in any step unchanged;
    # switch to "drop" to discard them instead.
    remainder="passthrough",
)

# LinearRegression

In [40]:
# --------------------------------------------
# Imports used by this cell
# --------------------------------------------
from sklearn.pipeline import Pipeline       # chains preprocessing + model into one estimator
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_error, mean_squared_error

# --------------------------------------------
# Define the pipeline: shared preprocessor -> linear regression
# --------------------------------------------
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
]
pipeline = Pipeline(steps=pipeline_steps)

# --------------------------------------------
# Train on the training set, then predict the test set
# (fit returns the pipeline itself, so the calls can be chained)
# --------------------------------------------
pred = pipeline.fit(X_train, y_train).predict(X_test)

# --------------------------------------------
# Evaluate on the test set
# --------------------------------------------
r2, rmse, mae, mse = (
    metric(y_test, pred)
    for metric in (r2_score, root_mean_squared_error, mean_absolute_error, mean_squared_error)
)

print(f"R2: {r2:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"MAE: {mae:.3f}")
print(f"MSE: {mse:.3f}")

# pipeline.score is reported for both sets so train and test can be compared.
print(f"Train Score: {pipeline.score(X_train, y_train):.3f}")
print(f"Test Score: {pipeline.score(X_test, y_test):.3f}")

R2: 0.387
RMSE: 141.809
MAE: 105.524
MSE: 20109.880
Train Score: 0.390
Test Score: 0.387


  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_
  return X @ coef_ + self.intercept_


# KNeighborsRegressor

In [41]:

from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_error, mean_squared_error

# --------------------------------------------
# Define the pipeline: shared preprocessor -> k-nearest-neighbours regressor
# (KNeighborsRegressor is created with its default settings)
# --------------------------------------------
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', KNeighborsRegressor()),
]
pipeline = Pipeline(steps=pipeline_steps)

# --------------------------------------------
# Train on the training set, then predict the test set
# (fit returns the pipeline itself, so the calls can be chained)
# --------------------------------------------
pred = pipeline.fit(X_train, y_train).predict(X_test)

# --------------------------------------------
# Evaluate on the test set
# --------------------------------------------
r2, rmse, mae, mse = (
    metric(y_test, pred)
    for metric in (r2_score, root_mean_squared_error, mean_absolute_error, mean_squared_error)
)

print(f"R2: {r2:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"MAE: {mae:.3f}")
print(f"MSE: {mse:.3f}")

# pipeline.score is reported for both sets so train and test can be compared.
print(f"Train Score: {pipeline.score(X_train, y_train):.3f}")
print(f"Test Score: {pipeline.score(X_test, y_test):.3f}")


R2: 0.882
RMSE: 62.239
MAE: 39.310
MSE: 3873.669
Train Score: 0.928
Test Score: 0.882


# DecisionTreeRegressor

In [42]:

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_error, mean_squared_error

# --------------------------------------------
# Define the pipeline: shared preprocessor -> decision tree regressor
#
# random_state is fixed (same seed as the train/test split) because the tree
# permutes features at each split; without a seed, equally good splits can be
# chosen differently from run to run and the scores below would not be
# reproducible on "Restart & Run All".
# --------------------------------------------
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor(random_state=42)),
]
pipeline = Pipeline(steps=pipeline_steps)

# --------------------------------------------
# Train on the training set, then predict the test set
# (fit returns the pipeline itself, so the calls can be chained)
# --------------------------------------------
pred = pipeline.fit(X_train, y_train).predict(X_test)

# --------------------------------------------
# Evaluate on the test set
# --------------------------------------------
r2, rmse, mae, mse = (
    metric(y_test, pred)
    for metric in (r2_score, root_mean_squared_error, mean_absolute_error, mean_squared_error)
)

print(f"R2: {r2:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"MAE: {mae:.3f}")
print(f"MSE: {mse:.3f}")

# Train vs. test score: a large gap indicates overfitting, which is worth
# watching here because the tree has no depth limit.
print(f"Train Score: {pipeline.score(X_train, y_train):.3f}")
print(f"Test Score: {pipeline.score(X_test, y_test):.3f}")


R2: 0.905
RMSE: 55.704
MAE: 33.801
MSE: 3102.893
Train Score: 1.000
Test Score: 0.905


# RandomForestRegressor

In [43]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, root_mean_squared_error, mean_absolute_error, mean_squared_error

# --------------------------------------------
# Define the pipeline: shared preprocessor -> random forest regressor
#
# random_state is fixed (same seed as the train/test split) because the forest
# draws bootstrap samples and feature subsets at random; without a seed the
# scores below, and the model that is saved afterwards, would differ on every
# run.
# --------------------------------------------
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
]
pipeline = Pipeline(steps=pipeline_steps)

# --------------------------------------------
# Train on the training set, then predict the test set
# (fit returns the pipeline itself, so the calls can be chained)
# --------------------------------------------
pred = pipeline.fit(X_train, y_train).predict(X_test)

# --------------------------------------------
# Evaluate on the test set
# --------------------------------------------
r2, rmse, mae, mse = (
    metric(y_test, pred)
    for metric in (r2_score, root_mean_squared_error, mean_absolute_error, mean_squared_error)
)

print(f"R2: {r2:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"MAE: {mae:.3f}")
print(f"MSE: {mse:.3f}")

# pipeline.score is reported for both sets so train and test can be compared.
print(f"Train Score: {pipeline.score(X_train, y_train):.3f}")
print(f"Test Score: {pipeline.score(X_test, y_test):.3f}")

R2: 0.955
RMSE: 38.467
MAE: 24.200
MSE: 1479.685
Train Score: 0.993
Test Score: 0.955


# 모델 활용


In [44]:
import joblib
from pathlib import Path

# ----------------------
# Save the pipeline
# ----------------------
# `pipeline` is whichever pipeline was fitted most recently; the whole object
# (preprocessing + regressor) is stored as a single artifact.
model_path = "../models/bike_rent_pipe.pkl"

# joblib.dump does not create missing folders, so make sure the target
# directory exists first (no-op when it is already there).
Path(model_path).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(pipeline, model_path)

['../models/bike_rent_pipe.pkl']

In [45]:
# ------------------
# Load the saved model
# ------------------
# Reads back the pipeline written by the save cell above.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
loaded_pipe = joblib.load("../models/bike_rent_pipe.pkl")

In [46]:
# ------------------
# Predict
# ------------------
# One hand-written observation to score with the reloaded pipeline.
#   - "day" is not among the training features (it is dropped during feature
#     selection); the recorded run predicted successfully with it present, so
#     it appears to be ignored.
#   - NOTE(review): year=2025 is likely outside the years seen in training
#     (the data preview starts in 2011) - confirm this is intended.
new_row = dict(
    season=2, holiday=0, workingday=1, weather=1,
    atemp=20.5, humidity=55, windspeed=0.12,
    year=2025, month=5, day=1, hour=17, dayofweek=3,
)

new_df = pd.DataFrame([new_row])

# The pipeline applies its fitted preprocessing and then the regressor.
# As the preprocessor is written, the one-hot step is commented out, so only
# the numeric columns are scaled and the rest are passed through unchanged.
predictions = loaded_pipe.predict(new_df)
count_pred = predictions[0]
print(f"예상 대여 수: {count_pred:.0f}대")

예상 대여 수: 581대
