# 1. dataset 

In [52]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time 
import optuna
import joblib
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import font_manager

# sklearn 관련
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.inspection import permutation_importance
import statsmodels.api as sm 
from time import time

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer


# 전처리 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE

# lightgbm 관련
from lightgbm import LGBMRegressor
from lightgbm import plot_importance

In [None]:
# Korean font setup so that Hangul labels (e.g. the day_of_reg category names)
# render correctly in matplotlib figures.
# The path below is a macOS system font location; when the file cannot be
# loaded (other OS, font removed) keep matplotlib's default font instead of
# aborting the whole notebook run.
font_path = '/System/Library/Fonts/AppleSDGothicNeo.ttc'
try:
    font_name = font_manager.FontProperties(fname=font_path).get_name()
except (OSError, RuntimeError) as err:
    # FileNotFoundError is a subclass of OSError.
    font_name = None
    print(f'Korean font not loaded from {font_path} ({err}); keeping the default matplotlib font.')
else:
    plt.rc('font', family=font_name)

In [62]:
# Load the daily combined dataset (read with the cp949 codec).
data = pd.read_csv('combined_data_day.csv', encoding="cp949")
print(data.shape)  # (rows, columns)

# Preview the first rows; kept as the cell's last expression so that the
# notebook actually renders it (a bare expression mid-cell is discarded).
data.head()

(356, 15)


In [63]:
# Column dtypes and non-null counts.
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() emits a stray "None" line.
data.info()

# Summary statistics of the numeric columns, for a quick outlier check.
print(data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 356 entries, 0 to 355
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   reg_date       356 non-null    object 
 1   holiday_yn     356 non-null    object 
 2   rider_cnt      356 non-null    int64  
 3   day_of_reg     356 non-null    object 
 4   rain_c         356 non-null    float64
 5   snow_c         356 non-null    float64
 6   is_rain        356 non-null    int64  
 7   rain_group     356 non-null    object 
 8   month          356 non-null    int64  
 9   rider_cnt_w_1  356 non-null    int64  
 10  rider_cnt_w_2  356 non-null    int64  
 11  rider_cnt_w_3  356 non-null    int64  
 12  rider_cnt_w_4  356 non-null    int64  
 13  order_cnt_w_1  356 non-null    int64  
 14  group_s        356 non-null    object 
dtypes: float64(2), int64(8), object(5)
memory usage: 41.8+ KB
None
         rider_cnt      rain_c      snow_c     is_rain       month   
count    

In [64]:
# Parse reg_date into datetimes and put the rows in chronological order.
data = (
    data
    .assign(reg_date=pd.to_datetime(data["reg_date"]))
    .sort_values("reg_date")
)

In [65]:
# Remove the month, rain_c and snow_c columns; they are not used downstream.
unused_cols = ['month', 'rain_c', 'snow_c']
data = data.drop(unused_cols, axis=1)

In [66]:
# Number of missing values per column.
# (Dropping rows with a missing rider_cnt_w_4 is intentionally left disabled.)
data.isnull().sum()

reg_date         0
holiday_yn       0
rider_cnt        0
day_of_reg       0
is_rain          0
rain_group       0
rider_cnt_w_1    0
rider_cnt_w_2    0
rider_cnt_w_3    0
rider_cnt_w_4    0
order_cnt_w_1    0
group_s          0
dtype: int64

In [67]:
# Cast the categorical features to pandas' category dtype in a single call.
categorical_cols = ['day_of_reg', 'is_rain', 'holiday_yn', 'rain_group', 'group_s']
data = data.astype({name: 'category' for name in categorical_cols})

print(data.dtypes)


reg_date         datetime64[ns]
holiday_yn             category
rider_cnt                 int64
day_of_reg             category
is_rain                category
rain_group             category
rider_cnt_w_1             int64
rider_cnt_w_2             int64
rider_cnt_w_3             int64
rider_cnt_w_4             int64
order_cnt_w_1             int64
group_s                category
dtype: object


# 2. 데이터 전처리

## 2-1. one-hot-encoding

In [68]:
# One-hot encode the categorical columns and put the dummy columns in front of
# the remaining (date / numeric) columns.
var = ['day_of_reg', 'is_rain', 'holiday_yn', 'rain_group', 'group_s']
encoder = OneHotEncoder()

# fit_transform returns a sparse matrix; densify it before wrapping in a frame
# that shares data's index so the concat below lines up row by row.
encoded = encoder.fit_transform(data[var]).toarray()
onehot = pd.DataFrame(
    encoded,
    index=data.index,
    columns=encoder.get_feature_names_out(var),
)

df = pd.concat([onehot, data.drop(columns=var)], axis=1)

print(df.columns)

Index(['day_of_reg_금요일', 'day_of_reg_목요일', 'day_of_reg_수요일', 'day_of_reg_월요일',
       'day_of_reg_일요일', 'day_of_reg_토요일', 'day_of_reg_화요일', 'is_rain_0',
       'is_rain_1', 'holiday_yn_N', 'holiday_yn_Y', 'rain_group_no',
       'rain_group_normal', 'rain_group_strong', 'rain_group_very_strong',
       'rain_group_weak', 'group_s_A', 'group_s_B', 'group_s_C', 'group_s_D',
       'group_s_G', 'group_s_H', 'reg_date', 'rider_cnt', 'rider_cnt_w_1',
       'rider_cnt_w_2', 'rider_cnt_w_3', 'rider_cnt_w_4', 'order_cnt_w_1'],
      dtype='object')


# 3. train, test set split

In [69]:
# Chronological hold-out split (by calendar date rather than by a row ratio):
# everything before TEST_START is training data, everything from TEST_START
# onwards is test data. A single cutoff with complementary comparisons
# guarantees no row can fall into a gap between the two sets, even if
# reg_date ever carries a time-of-day component.
TEST_START = pd.Timestamp('2023-04-01')

df_train = df[df["reg_date"] < TEST_START]
df_test = df[df["reg_date"] >= TEST_START]

# Every row must land in exactly one of the two splits.
assert len(df_train) + len(df_test) == len(df), "rows lost in the train/test split"

# reg_date was only needed to define the split; it is not a model feature.
df_train = df_train.drop(columns=['reg_date'])
df_test = df_test.drop(columns=['reg_date'])
print(df_train.shape, df_test.shape)

(289, 28) (67, 28)


In [70]:
# Separate the target (rider_cnt) from the feature columns in both splits.
target = 'rider_cnt'

X_train, y_train = df_train.drop(columns=target), df_train[target]
X_test, y_test = df_test.drop(columns=target), df_test[target]

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(289, 27) (289,) (67, 27) (67,)


In [71]:
# List the feature columns of the training matrix (target and reg_date already removed).
print(X_train.columns)

Index(['day_of_reg_금요일', 'day_of_reg_목요일', 'day_of_reg_수요일', 'day_of_reg_월요일',
       'day_of_reg_일요일', 'day_of_reg_토요일', 'day_of_reg_화요일', 'is_rain_0',
       'is_rain_1', 'holiday_yn_N', 'holiday_yn_Y', 'rain_group_no',
       'rain_group_normal', 'rain_group_strong', 'rain_group_very_strong',
       'rain_group_weak', 'group_s_A', 'group_s_B', 'group_s_C', 'group_s_D',
       'group_s_G', 'group_s_H', 'rider_cnt_w_1', 'rider_cnt_w_2',
       'rider_cnt_w_3', 'rider_cnt_w_4', 'order_cnt_w_1'],
      dtype='object')


### numeric_scale 

In [None]:
# Disabled cell: standard-scaling of the numeric features. Nothing here runs.
# NOTE(review): numeric_cols below names order_cnt_w_2 .. order_cnt_w_4, which
# are not among the columns reported by data.info(); update the list before
# re-enabling this cell.
# # Input variables
# numeric_cols = ['rider_cnt_w_1', 'rider_cnt_w_2', 'rider_cnt_w_3',
#                 'rider_cnt_w_4', 'order_cnt_w_1', 'order_cnt_w_2', 'order_cnt_w_3',
#                 'order_cnt_w_4']

# # scaler 
# scaler_X = StandardScaler()

# # X_train, X_test (scaler is fitted on the training split only)
# X_train_scaled = scaler_X.fit_transform(X_train[numeric_cols])
# X_test_scaled = scaler_X.transform(X_test[numeric_cols])

# # Convert the scaled results back into DataFrames
# X_train_scaled = pd.DataFrame(X_train_scaled, columns=numeric_cols, index = X_train.index)
# X_test_scaled = pd.DataFrame(X_test_scaled, columns=numeric_cols, index = X_test.index)

# # Select the original categorical variables
# categorical_cols = [col for col in X_train.columns if col not in numeric_cols]
# X_train_cat = X_train[categorical_cols]
# X_test_cat = X_test[categorical_cols]

# # Merge the scaled DataFrame with the categorical variables
# X_train_final = pd.concat([X_train_scaled, X_train_cat], axis=1)
# X_test_final = pd.concat([X_test_scaled, X_test_cat], axis=1)



In [None]:
# Disabled cell: standard-scaling of the target values. Nothing here runs.
# Target values
# y_train, y_test
# scaler_y = StandardScaler()
# y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1))
# y_test_scaled = scaler_y.transform(y_test.values.reshape(-1, 1))
# y_train_scaled = y_train_scaled.ravel()
# y_test_scaled=  y_test_scaled.ravel()

# print(y_train_scaled.shape)
# print(y_test_scaled.shape)

# 4. regression - benchmark model

In [72]:
# statsmodels' OLS does not add an intercept by itself, so prepend a constant column.
X_train_lm = sm.add_constant(X_train)

# Fit the benchmark regression on the training split and print the
# coefficient table together with the fit statistics.
# NOTE(review): the recorded summary reports "Df Model: 19" although X_train
# has 27 columns, which suggests the full one-hot dummies are collinear with
# the constant - confirm before interpreting individual coefficients.
lr_1 = sm.OLS(endog=y_train, exog=X_train_lm).fit()
print(lr_1.summary())

                            OLS Regression Results                            
Dep. Variable:              rider_cnt   R-squared:                       0.499
Model:                            OLS   Adj. R-squared:                  0.463
Method:                 Least Squares   F-statistic:                     14.09
Date:                Wed, 07 Jun 2023   Prob (F-statistic):           1.81e-30
Time:                        14:37:24   Log-Likelihood:                -2334.1
No. Observations:                 289   AIC:                             4708.
Df Residuals:                     269   BIC:                             4782.
Df Model:                          19                                         
Covariance Type:            nonrobust                                         
                             coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                   7716

In [None]:
# Fit a plain linear regression and compute its residuals on the test split.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)
residuals = y_test - y_pred

# One scatter plot per feature: test-set residual against that feature's values.
for column in X_test.columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(x=X_test[column], y=residuals, ax=ax)
    ax.set(title=f'Residuals vs. {column}', xlabel=column, ylabel='Residuals')
    plt.show()


## 4-2. train, test set 적용 

In [76]:

# Stage 1: ordinary linear regression on every feature.
model1 = LinearRegression().fit(X_train, y_train)

# Stage-1 predictions and the errors they leave on each split.
y_pred_train, y_pred_test = model1.predict(X_train), model1.predict(X_test)
residuals_train, residuals_test = y_train - y_pred_train, y_test - y_pred_test

# Stage 2: a second linear regression that tries to explain the stage-1
# training residuals from the rain_group and holiday_yn dummies only.
# Despite their names, the *_day_of_reg frames contain no day_of_reg columns
# (an earlier variant that included them was disabled).
# NOTE(review): these inputs are a subset of the stage-1 inputs, so the
# in-sample least-squares residuals should be (nearly) uncorrelated with
# them - confirm the correction captures more than numerical noise.
residual_feature_keys = ('rain_group', 'holiday_yn')
X_train_day_of_reg = pd.concat(
    [X_train.filter(like=key) for key in residual_feature_keys], axis=1
)
X_test_day_of_reg = pd.concat(
    [X_test.filter(like=key) for key in residual_feature_keys], axis=1
)

model2 = LinearRegression().fit(X_train_day_of_reg, residuals_train)

# Predicted residuals for the test period, added back onto the stage-1 forecast.
residuals_pred_test = model2.predict(X_test_day_of_reg)
corrected_y_pred = y_pred_test + residuals_pred_test

# Test-set MAE / RMSE of the plain stage-1 forecast ...
mae1 = mean_absolute_error(y_test, y_pred_test)
rmse1 = np.sqrt(mean_squared_error(y_test, y_pred_test))

# ... and of the residual-corrected forecast.
mae_corrected = mean_absolute_error(y_test, corrected_y_pred)
rmse_corrected = np.sqrt(mean_squared_error(y_test, corrected_y_pred))

# Report both side by side.
print(f'First Model: MAE = {mae1}, RMSE = {rmse1}')
print(f'Corrected Model: MAE = {mae_corrected}, RMSE = {rmse_corrected}')



First Model: MAE = 599.519028916407, RMSE = 773.1127412293107
Corrected Model: MAE = 595.8772378716309, RMSE = 768.9215003366924
67


In [81]:
# Original (pre-encoding) rows for exactly the observations that were predicted.
# Selecting them through X_test's index, instead of re-applying a hard-coded
# date filter, keeps this table tied to whatever split produced the predictions.
test_set = data.loc[X_test.index]

# The prediction arrays carry no index and are attached by position, so make
# sure they have one value per selected row.
assert len(test_set) == len(y_pred_test) == len(corrected_y_pred), \
    "predictions do not line up with the test rows"

# Descriptive columns, the observed target, and both forecasts
# (y_pred = stage-1 model, corrected_y_pred = stage-1 + predicted residual).
result_test = test_set[
    ['reg_date', 'rain_group', 'day_of_reg', 'group_s', 'holiday_yn']
].assign(
    y_test=test_set['rider_cnt'],
    corrected_y_pred=corrected_y_pred,
    y_pred=y_pred_test,
)

# Written as cp949, the same codec the input CSV is read with.
result_test.to_csv('LM_prediction_day.csv', index=False, encoding="cp949")