# 1. dataset 

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time 
import optuna
import joblib
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import font_manager

# sklearn 관련
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.inspection import permutation_importance
import statsmodels.api as sm 
from time import time

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer


from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# lightgbm 관련
from lightgbm import LGBMRegressor
from lightgbm import plot_importance

In [3]:
# Use a Korean-capable font so Hangul labels render in matplotlib figures.
# NOTE(review): this is a macOS system font path; confirm it exists on the machine running the notebook.
font_path = '/System/Library/Fonts/AppleSDGothicNeo.ttc'
font_props = font_manager.FontProperties(fname=font_path)
font_name = font_props.get_name()
plt.rcParams['font.family'] = font_name

In [4]:
# Load the combined dataset (the CSV is read with the cp949 encoding).
data = pd.read_csv('combined_data.csv', encoding = "cp949")

# Optional: restrict the analysis to a single district.
#data = data[data['pick_rgn2_nm']=='강남구']
print(data.shape)

# Preview the first rows. Only a cell's final expression is rendered, so the
# preview has to come last; placed mid-cell its result was silently discarded.
data.head()

(130875, 13)


In [5]:
# Check for null values. DataFrame.info() writes its report to stdout itself and
# returns None, so wrapping it in print() only appends a stray "None" line.
data.info()

# Check for outliers via summary statistics of the numeric columns.
print(data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130875 entries, 0 to 130874
Data columns (total 13 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   pick_rgn2_nm   130875 non-null  object 
 1   rider_cnt      130875 non-null  float64
 2   datetime       130875 non-null  object 
 3   hour_reg       130875 non-null  int64  
 4   reg_date       130875 non-null  object 
 5   day_of_reg     130875 non-null  object 
 6   is_holiday     130875 non-null  int64  
 7   is_rain        130875 non-null  int64  
 8   rider_cnt_w_1  130875 non-null  float64
 9   rider_cnt_w_2  130875 non-null  float64
 10  rider_cnt_w_3  130875 non-null  float64
 11  rider_cnt_w_4  130875 non-null  float64
 12  order_cnt_w_1  130875 non-null  int64  
dtypes: float64(5), int64(4), object(4)
memory usage: 13.0+ MB
None
           rider_cnt      hour_reg     is_holiday        is_rain   
count  130875.000000  130875.00000  130875.000000  130875.000000  \
mean      

In [6]:
# Parse both timestamp columns into datetime64, then order the rows by datetime.
for ts_col in ("datetime", "reg_date"):
    data[ts_col] = pd.to_datetime(data[ts_col])

data = data.sort_values("datetime")

In [None]:
# Number of missing values per column (rendered as the cell's output).
# data = data.dropna(subset=['rider_cnt_w_4'])
#print(data.shape)
data.isnull().sum()

In [7]:
# Store the discrete descriptors (weekday, district, hour, holiday flag, rain flag)
# as pandas categoricals.
categorical_cols = ['day_of_reg', 'pick_rgn2_nm', 'hour_reg', 'is_holiday', 'is_rain']
data = data.astype({name: 'category' for name in categorical_cols})

print(data.dtypes)


pick_rgn2_nm           category
rider_cnt               float64
datetime         datetime64[ns]
hour_reg               category
reg_date         datetime64[ns]
day_of_reg             category
is_holiday             category
is_rain                category
rider_cnt_w_1           float64
rider_cnt_w_2           float64
rider_cnt_w_3           float64
rider_cnt_w_4           float64
order_cnt_w_1             int64
dtype: object


# 2. 데이터 전처리

In [None]:
# numeric 변수 scale 
# scaler = StandardScaler()  #평균 0 , 분산 1로 조정
# #scaler = MinMaxScaler()

# num_vars = ['rider_cnt_2', 'rider_cnt_w_1', 'rider_cnt_w_2', 'rider_cnt_w_3',
#             'rider_cnt_w_4', 'order_cnt_w_1', 'order_cnt_w_2', 'order_cnt_w_3',
#             'order_cnt_w_4']
# data[num_vars] = scaler.fit_transform(data[num_vars])

# print(df.head(3))

## 2-1. one-hot-encoding

In [None]:
# ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
# X = np.array(ct.fit_transform(X))

In [8]:
# One-hot encode the categorical columns; every other column is carried over as-is.
#df = data.drop(columns = ['rider_cnt_w_3','rider_cnt_w_4'])
var = ['day_of_reg','pick_rgn2_nm', 'hour_reg', 'is_holiday', 'is_rain']
sort_keys = ["reg_date", "pick_rgn2_nm", "hour_reg", "day_of_reg"]
data = data.sort_values(by=sort_keys)

encoder = OneHotEncoder()
encoded = encoder.fit_transform(data[var]).toarray()
onehot = pd.DataFrame(
    encoded,
    columns=encoder.get_feature_names_out(var),
    index=data.index,  # keep the row labels of `data` so the concat below aligns row-for-row
)

# Dummy columns first, followed by all non-encoded columns.
df = pd.concat([onehot, data.drop(columns=var)], axis=1)
#print(df.head(3))
print(df.columns)

Index(['day_of_reg_금요일', 'day_of_reg_목요일', 'day_of_reg_수요일', 'day_of_reg_월요일',
       'day_of_reg_일요일', 'day_of_reg_토요일', 'day_of_reg_화요일',
       'pick_rgn2_nm_강남구', 'pick_rgn2_nm_강동구', 'pick_rgn2_nm_강북구',
       'pick_rgn2_nm_강서구', 'pick_rgn2_nm_관악구', 'pick_rgn2_nm_광진구',
       'pick_rgn2_nm_구로구', 'pick_rgn2_nm_금천구', 'pick_rgn2_nm_노원구',
       'pick_rgn2_nm_도봉구', 'pick_rgn2_nm_동대문구', 'pick_rgn2_nm_동작구',
       'pick_rgn2_nm_마포구', 'pick_rgn2_nm_서대문구', 'pick_rgn2_nm_서초구',
       'pick_rgn2_nm_성동구', 'pick_rgn2_nm_성북구', 'pick_rgn2_nm_송파구',
       'pick_rgn2_nm_양천구', 'pick_rgn2_nm_영등포구', 'pick_rgn2_nm_용산구',
       'pick_rgn2_nm_은평구', 'pick_rgn2_nm_종로구', 'pick_rgn2_nm_중구',
       'pick_rgn2_nm_중랑구', 'hour_reg_9', 'hour_reg_10', 'hour_reg_11',
       'hour_reg_12', 'hour_reg_13', 'hour_reg_14', 'hour_reg_15',
       'hour_reg_16', 'hour_reg_17', 'hour_reg_18', 'hour_reg_19',
       'hour_reg_20', 'hour_reg_21', 'hour_reg_22', 'hour_reg_23',
       'is_holiday_0', 'is_holiday_1', 'is_rain_0'

# 3. train, test set split

In [9]:
# Chronological hold-out split on reg_date: rows before the cutoff are used for
# training, rows on or after it form the test period.
# A single cutoff with complementary comparisons (< and >=) leaves no gap between
# the two sets, even if reg_date ever carries a time-of-day component.
split_date = pd.Timestamp('2023-04-01')
df_train = df[df["reg_date"] < split_date]
df_test = df[df["reg_date"] >= split_date]
assert len(df_train) + len(df_test) == len(df), "rows lost or duplicated by the date split"

# Date range actually covered by each split (first/last training day, first/last test day).
print(df_train['reg_date'].min())
print(df_train['reg_date'].max())
print(df_test['reg_date'].min())
print(df_test['reg_date'].max())

# The raw timestamp columns are dropped before modeling.
df_train = df_train.drop(columns = ['datetime', 'reg_date'])
df_test = df_test.drop(columns = ['datetime', 'reg_date'])
print(df_train.shape, df_test.shape)

2022-07-01 00:00:00
2023-03-31 00:00:00
2023-04-01 00:00:00
2023-06-14 00:00:00
(102750, 57) (28125, 57)


In [10]:
# Separate the target from the features for both splits.
# The target is selected by name: rider_cnt is not the last column of the frame,
# so positional slicing (iloc[:, -1]) would pick a different column.
target_col = 'rider_cnt'

y_train = df_train[target_col]
X_train = df_train.drop(columns=[target_col])

y_test = df_test[target_col]
X_test = df_test.drop(columns=[target_col])

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(102750, 56) (102750,) (28125, 56) (28125,)


In [11]:
# Sanity check: list the feature columns that will be passed to the models.
print(X_train.columns)

Index(['day_of_reg_금요일', 'day_of_reg_목요일', 'day_of_reg_수요일', 'day_of_reg_월요일',
       'day_of_reg_일요일', 'day_of_reg_토요일', 'day_of_reg_화요일',
       'pick_rgn2_nm_강남구', 'pick_rgn2_nm_강동구', 'pick_rgn2_nm_강북구',
       'pick_rgn2_nm_강서구', 'pick_rgn2_nm_관악구', 'pick_rgn2_nm_광진구',
       'pick_rgn2_nm_구로구', 'pick_rgn2_nm_금천구', 'pick_rgn2_nm_노원구',
       'pick_rgn2_nm_도봉구', 'pick_rgn2_nm_동대문구', 'pick_rgn2_nm_동작구',
       'pick_rgn2_nm_마포구', 'pick_rgn2_nm_서대문구', 'pick_rgn2_nm_서초구',
       'pick_rgn2_nm_성동구', 'pick_rgn2_nm_성북구', 'pick_rgn2_nm_송파구',
       'pick_rgn2_nm_양천구', 'pick_rgn2_nm_영등포구', 'pick_rgn2_nm_용산구',
       'pick_rgn2_nm_은평구', 'pick_rgn2_nm_종로구', 'pick_rgn2_nm_중구',
       'pick_rgn2_nm_중랑구', 'hour_reg_9', 'hour_reg_10', 'hour_reg_11',
       'hour_reg_12', 'hour_reg_13', 'hour_reg_14', 'hour_reg_15',
       'hour_reg_16', 'hour_reg_17', 'hour_reg_18', 'hour_reg_19',
       'hour_reg_20', 'hour_reg_21', 'hour_reg_22', 'hour_reg_23',
       'is_holiday_0', 'is_holiday_1', 'is_rain_0'

### numeric_scale 

In [None]:
# # 입력 변수 
# numeric_cols = ['rider_cnt_w_1', 'rider_cnt_w_2', 'rider_cnt_w_3',
#                 'rider_cnt_w_4', 'order_cnt_w_1', 'order_cnt_w_2', 'order_cnt_w_3',
#                 'order_cnt_w_4']

# # scaler 
# scaler_X = StandardScaler()

# # X_train, X_test
# X_train_scaled = scaler_X.fit_transform(X_train[numeric_cols])
# X_test_scaled = scaler_X.transform(X_test[numeric_cols])

# # 스케일링된 결과를 DataFrame으로 변환
# X_train_scaled = pd.DataFrame(X_train_scaled, columns=numeric_cols, index = X_train.index)
# X_test_scaled = pd.DataFrame(X_test_scaled, columns=numeric_cols, index = X_test.index)

# # 원래의 범주형 변수들을 선택
# categorical_cols = [col for col in X_train.columns if col not in numeric_cols]
# X_train_cat = X_train[categorical_cols]
# X_test_cat = X_test[categorical_cols]

# # 스케일링된 DataFrame과 범주형 변수들을 병합
# X_train_final = pd.concat([X_train_scaled, X_train_cat], axis=1)
# X_test_final = pd.concat([X_test_scaled, X_test_cat], axis=1)



In [None]:
# 예측값
# y_train, y_test
# scaler_y = StandardScaler()
# y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1))
# y_test_scaled = scaler_y.transform(y_test.values.reshape(-1, 1))
# y_train_scaled = y_train_scaled.ravel()
# y_test_scaled=  y_test_scaled.ravel()

# print(y_train_scaled.shape)
# print(y_test_scaled.shape)

# 4. modeling

In [13]:
def MAPE(y_test, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Both inputs are flattened to 1-D float arrays before they are compared, so a
    column-shaped prediction array of shape (n, 1) is compared element-wise
    instead of being broadcast against the targets into an (n, n) matrix, and
    pandas inputs are compared by position rather than by index label.

    Targets equal to zero are left out of the average because their percentage
    error is undefined (division by zero).

    Parameters
    ----------
    y_test : array-like
        Actual values.
    y_pred : array-like or scalar
        Predicted values. Must hold as many elements as ``y_test``, or a single
        value that is compared against every target.

    Returns
    -------
    float
        Mean of ``|y_test - y_pred| / |y_test|`` over the non-zero targets,
        multiplied by 100; ``nan`` when there is no non-zero target.

    Raises
    ------
    ValueError
        If ``y_pred`` holds neither one element nor as many as ``y_test``.
    """
    actual = np.asarray(y_test, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    if predicted.size == 1:
        # A single predicted value is compared against every target.
        predicted = np.broadcast_to(predicted, actual.shape)
    elif predicted.size != actual.size:
        raise ValueError(
            f"y_test and y_pred must have the same number of elements "
            f"(got {actual.size} and {predicted.size})"
        )

    nonzero = actual != 0
    if not nonzero.any():
        return float('nan')

    pct_error = np.abs((actual[nonzero] - predicted[nonzero]) / actual[nonzero])
    return np.mean(pct_error) * 100

In [20]:
# Feed-forward regression network trained on the encoded feature matrix.
# Input width = number of feature columns in X_train.
n_features = X_train.shape[1]

# Architecture: two hidden ReLU layers followed by a single linear output unit
# (no activation on the output because this is a regression problem).
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(n_features,)))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))

# Compile with the Adam optimizer and mean squared error loss.
model.compile(optimizer='adam', loss='mean_squared_error')

# Train; validation_split=0.2 reserves 20% of the training rows for validation.
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2)

# Predict on the test set. The single-unit output layer yields a column-shaped
# array, so flatten it to 1-D: element-wise arithmetic against y_test (e.g. a
# percentage error) would otherwise broadcast instead of pairing values up.
y_pred = model.predict(X_test).ravel()

# Evaluation metrics on the test set.
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
mae_test = mean_absolute_error(y_test, y_pred)
r2_test = r2_score(y_test, y_pred)
#mape_test = MAPE(y_test, y_pred)

print('Test RMSE:', rmse_test)
print('Test MAE:', mae_test)
print('Test R2:', r2_test)
#print('Test MAPE:', mape_test)

# Build the results table from exactly the rows that were scored. X_test still
# carries the row labels of `data`, so selecting by X_test.index pairs every
# prediction with its own row. Re-deriving the test rows by filtering on a
# different date column and re-sorting only matches as long as both orderings
# happen to coincide.
test_set = data.loc[X_test.index]
assert len(test_set) == len(y_pred), "number of predictions does not match number of test rows"

result_test = pd.DataFrame({'datetime': test_set["reg_date"],
                                'pick_rgn2_nm': test_set["pick_rgn2_nm"], 'hour_reg': test_set["hour_reg"], 
                                'is_rain': test_set["is_rain"], 'day_of_reg': test_set["day_of_reg"], "is_holiday" : test_set["is_holiday"],
                                'y_test': test_set["rider_cnt"]})

result_test['y_pred'] = y_pred
result_test.to_csv('prediction_results_test_set_DL.csv', index=False, encoding="cp949")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

### stacking model 

In [23]:
from mlxtend.regressor import StackingCVRegressor
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score

# Fixed seed so the stochastic estimators and the CV shuffling give repeatable results.
seed = 42

# Base regressors
lasso = Lasso()
lr = LinearRegression()
rf = RandomForestRegressor(random_state=seed)

# Meta regressor, fitted on the base regressors' out-of-fold predictions
gbr = GradientBoostingRegressor(random_state=seed)

# Stacking model; use_features_in_secondary=True also passes the original
# features to the meta regressor alongside the base predictions.
stack = StackingCVRegressor(regressors=(lasso, lr, rf),
                            meta_regressor=gbr,
                            use_features_in_secondary=True,
                            random_state=seed)

# Train the stacking regressor
stack.fit(X_train.values, y_train.values)

# Predict on the test set
y_pred = stack.predict(X_test.values)

# Build the results table from exactly the rows that were scored. X_test still
# carries the row labels of `data`, so selecting by X_test.index pairs every
# prediction with its own row instead of relying on a second filter + sort
# reproducing the same ordering.
test_set = data.loc[X_test.index]
assert len(test_set) == len(y_pred), "number of predictions does not match number of test rows"

result_test = pd.DataFrame({'datetime': test_set["reg_date"],
                                'pick_rgn2_nm': test_set["pick_rgn2_nm"], 'hour_reg': test_set["hour_reg"], 
                                'is_rain': test_set["is_rain"], 'day_of_reg': test_set["day_of_reg"], "is_holiday" : test_set["is_holiday"],
                                'y_test': test_set["rider_cnt"]})

result_test['y_pred'] = y_pred
# Written to its own file: reusing the neural-network cell's file name would
# overwrite that model's saved predictions.
result_test.to_csv('prediction_results_test_set_stack.csv', index=False, encoding="cp949")

# Evaluate the model
print('Stack Test RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))
print('Stack Test MAE:', mean_absolute_error(y_test, y_pred))


Stack Test RMSE: 35.973767133283005
Stack Test MAE: 23.65340981122298
