# 1. dataset 

In [108]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time 
import joblib

# sklearn 관련
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
import ast 
import statsmodels.api as sm 
from time import time

# 전처리 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# lightgbm 관련
from lightgbm import LGBMRegressor
from lightgbm import plot_importance

In [109]:
# Load the combined training dataset; the CSV file is cp949-encoded.
raw_csv_path = 'combined_data.csv'
data = pd.read_csv(raw_csv_path, encoding="cp949")
data.head()  # evaluated but not displayed: it is not the last expression of the cell
print(data.shape)  # recorded run: (186000, 27)

(186000, 27)


In [110]:
# Checking for null values and dtypes.
# DataFrame.info() writes its report straight to stdout and returns None, so it is
# called bare; wrapping it in print() would append a stray "None" line to the output.
data.info()

# Checking for outliers via the summary statistics of the numeric columns.
print(data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186000 entries, 0 to 185999
Data columns (total 27 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   pick_rgn2_nm   186000 non-null  object 
 1   rider_cnt      186000 non-null  int64  
 2   order_cnt      186000 non-null  int64  
 3   datetime       186000 non-null  object 
 4   hour_reg       186000 non-null  int64  
 5   reg_date       186000 non-null  object 
 6   day_of_reg     186000 non-null  object 
 7   temp_c         186000 non-null  float64
 8   rain_c         186000 non-null  float64
 9   snow_c         186000 non-null  float64
 10  is_rain        186000 non-null  int64  
 11  month          186000 non-null  int64  
 12  week           186000 non-null  int64  
 13  is_holiday     186000 non-null  int64  
 14  q1             186000 non-null  float64
 15  q3             186000 non-null  float64
 16  IQR1.5         186000 non-null  float64
 17  outlier        186000 non-nul

In [111]:
# Parse both timestamp columns (read from the CSV as strings) and order the rows
# chronologically by the hourly timestamp.
data = data.assign(
    datetime=pd.to_datetime(data["datetime"]),
    reg_date=pd.to_datetime(data["reg_date"]),
).sort_values(by="datetime")

In [112]:
# Columns removed before modelling; the remaining columns are the categorical
# descriptors, the target (rider_cnt_2) and the weekly lag features.
dropped_cols = [
    'rider_cnt', 'order_cnt',
    'temp_c', 'rain_c', 'snow_c',
    'q1', 'q3', 'IQR1.5', 'outlier',
]
data = data.drop(columns=dropped_cols)
print(data.head())

   pick_rgn2_nm            datetime  hour_reg   reg_date day_of_reg  is_rain   
0           강남구 2022-01-01 09:00:00         9 2022-01-01        토요일        0  \
24          중랑구 2022-01-01 09:00:00         9 2022-01-01        토요일        0   
23           중구 2022-01-01 09:00:00         9 2022-01-01        토요일        0   
22          종로구 2022-01-01 09:00:00         9 2022-01-01        토요일        0   
21          은평구 2022-01-01 09:00:00         9 2022-01-01        토요일        0   

    month  week  is_holiday  rider_cnt_2  rider_cnt_w_1  rider_cnt_w_2   
0       1     1           1        215.0            NaN            NaN  \
24      1     1           1         25.0            NaN            NaN   
23      1     1           1         21.0            NaN            NaN   
22      1     1           1         23.0            NaN            NaN   
21      1     1           1         53.0            NaN            NaN   

    rider_cnt_w_3  rider_cnt_w_4  order_cnt_w_1  order_cnt_w_2  order_cnt_

In [113]:
# Discard rows that have no value for the 4-weeks-back rider count.
required_lag = 'rider_cnt_w_4'
data = data.dropna(subset=[required_lag])
data.isna().sum()  # evaluated but not displayed: it is not the last expression of the cell
print(data.shape)  # recorded run: (165125, 18)

(165125, 18)


In [114]:
# Columns handled as categorical variables (they are one-hot encoded further down).
categorical_cols = ['pick_rgn2_nm', 'hour_reg', 'day_of_reg', 'is_rain', 'month', 'week', 'is_holiday']
data = data.astype({name: 'category' for name in categorical_cols})

print(data.dtypes)


pick_rgn2_nm           category
datetime         datetime64[ns]
hour_reg               category
reg_date         datetime64[ns]
day_of_reg             category
is_rain                category
month                  category
week                   category
is_holiday             category
rider_cnt_2             float64
rider_cnt_w_1           float64
rider_cnt_w_2           float64
rider_cnt_w_3           float64
rider_cnt_w_4           float64
order_cnt_w_1           float64
order_cnt_w_2           float64
order_cnt_w_3           float64
order_cnt_w_4           float64
dtype: object


# 2. 데이터 전처리 

## 2-1. one-hot-encoding

In [None]:
# ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])], remainder='passthrough')
# X = np.array(ct.fit_transform(X))

In [115]:
# One-hot encode the categorical columns and put the dummy columns in front of the
# remaining (timestamp, target and lag) columns.
var = ['pick_rgn2_nm', 'hour_reg','day_of_reg', 'is_rain', 'month','week','is_holiday']

encoder = OneHotEncoder()
encoded_matrix = encoder.fit_transform(data[var]).toarray()  # dense 0/1 matrix
dummy_names = encoder.get_feature_names_out(var)

# Reuse data's index so that the concat below lines the rows up by label.
onehot = pd.DataFrame(encoded_matrix, columns=dummy_names, index=data.index)
remaining = data.drop(columns=var)
df = pd.concat([onehot, remaining], axis=1)
print(df.columns)

Index(['pick_rgn2_nm_강남구', 'pick_rgn2_nm_강동구', 'pick_rgn2_nm_강북구',
       'pick_rgn2_nm_강서구', 'pick_rgn2_nm_관악구', 'pick_rgn2_nm_광진구',
       'pick_rgn2_nm_구로구', 'pick_rgn2_nm_금천구', 'pick_rgn2_nm_노원구',
       'pick_rgn2_nm_도봉구', 'pick_rgn2_nm_동대문구', 'pick_rgn2_nm_동작구',
       'pick_rgn2_nm_마포구', 'pick_rgn2_nm_서대문구', 'pick_rgn2_nm_서초구',
       'pick_rgn2_nm_성동구', 'pick_rgn2_nm_성북구', 'pick_rgn2_nm_송파구',
       'pick_rgn2_nm_양천구', 'pick_rgn2_nm_영등포구', 'pick_rgn2_nm_용산구',
       'pick_rgn2_nm_은평구', 'pick_rgn2_nm_종로구', 'pick_rgn2_nm_중구',
       'pick_rgn2_nm_중랑구', 'hour_reg_9', 'hour_reg_10', 'hour_reg_11',
       'hour_reg_12', 'hour_reg_13', 'hour_reg_14', 'hour_reg_15',
       'hour_reg_16', 'hour_reg_17', 'hour_reg_18', 'hour_reg_19',
       'hour_reg_20', 'hour_reg_21', 'hour_reg_22', 'hour_reg_23',
       'day_of_reg_금요일', 'day_of_reg_목요일', 'day_of_reg_수요일', 'day_of_reg_월요일',
       'day_of_reg_일요일', 'day_of_reg_토요일', 'day_of_reg_화요일', 'is_rain_0',
       'is_rain_1', 'month_1', 'month

## 2-2. numeric variable scale 

In [91]:
# Scaling of the numeric variables.
# Only the scaler object is created here; the lines that would apply it to the
# numeric columns are commented out below, so df is left unscaled by this cell.
scaler = StandardScaler()  # rescales to mean 0 and variance 1
#scaler = MinMaxScaler()

# num_vars = ['rider_cnt_2', 'rider_cnt_w_1', 'rider_cnt_w_2', 'rider_cnt_w_3',
#             'rider_cnt_w_4', 'order_cnt_w_1', 'order_cnt_w_2', 'order_cnt_w_3',
#             'order_cnt_w_4']
# df[num_vars] = scaler.fit_transform(df[num_vars])

# print(df.head(3))


# 3. train/test set split

In [116]:
# Time-based hold-out: rows up to the end-of-2022 cut-off are used for training,
# rows from 2023-01-01 onwards for testing.
# NOTE(review): '2022-12-31' is interpreted as 2022-12-31 00:00:00, so rows stamped
# later on 2022-12-31 satisfy neither mask and end up in no split (the recorded run
# keeps 116050 + 48700 = 164750 of the 165125 rows). `< '2023-01-01'` would include
# them, but the same cut-off is repeated where train_set / test_set are built for the
# prediction export and both places have to change together, so the behaviour is
# kept as-is here.
in_train = df["datetime"] <= '2022-12-31'
in_test = df["datetime"] >= '2023-01-01'

# The timestamp columns are only needed for the split, not as model features.
timestamp_cols = ['datetime', 'reg_date']
df_train = df.loc[in_train].drop(columns=timestamp_cols)
df_test = df.loc[in_test].drop(columns=timestamp_cols)

print(df_train.shape, df_test.shape)  # recorded run: 116,050 train rows, 48,700 test rows, 77 columns

(116050, 77) (48700, 77)


In [117]:
# Separate the target column from the feature columns for both splits.
target_col = 'rider_cnt_2'

X_train, y_train = df_train.drop(columns=[target_col]), df_train[target_col]
X_test, y_test = df_test.drop(columns=[target_col]), df_test[target_col]

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(116050, 76) (116050,) (48700, 76) (48700,)


In [118]:
# Show the feature columns, in order, that the models below are fitted on.
print(X_train.columns)

Index(['pick_rgn2_nm_강남구', 'pick_rgn2_nm_강동구', 'pick_rgn2_nm_강북구',
       'pick_rgn2_nm_강서구', 'pick_rgn2_nm_관악구', 'pick_rgn2_nm_광진구',
       'pick_rgn2_nm_구로구', 'pick_rgn2_nm_금천구', 'pick_rgn2_nm_노원구',
       'pick_rgn2_nm_도봉구', 'pick_rgn2_nm_동대문구', 'pick_rgn2_nm_동작구',
       'pick_rgn2_nm_마포구', 'pick_rgn2_nm_서대문구', 'pick_rgn2_nm_서초구',
       'pick_rgn2_nm_성동구', 'pick_rgn2_nm_성북구', 'pick_rgn2_nm_송파구',
       'pick_rgn2_nm_양천구', 'pick_rgn2_nm_영등포구', 'pick_rgn2_nm_용산구',
       'pick_rgn2_nm_은평구', 'pick_rgn2_nm_종로구', 'pick_rgn2_nm_중구',
       'pick_rgn2_nm_중랑구', 'hour_reg_9', 'hour_reg_10', 'hour_reg_11',
       'hour_reg_12', 'hour_reg_13', 'hour_reg_14', 'hour_reg_15',
       'hour_reg_16', 'hour_reg_17', 'hour_reg_18', 'hour_reg_19',
       'hour_reg_20', 'hour_reg_21', 'hour_reg_22', 'hour_reg_23',
       'day_of_reg_금요일', 'day_of_reg_목요일', 'day_of_reg_수요일', 'day_of_reg_월요일',
       'day_of_reg_일요일', 'day_of_reg_토요일', 'day_of_reg_화요일', 'is_rain_0',
       'is_rain_1', 'month_1', 'month

# 3. regression - benchmark model

In [None]:
# Benchmark model: ordinary least squares fitted with statsmodels so that the full
# coefficient table is available.
# NOTE(review): every categorical variable is encoded with all of its levels
# (OneHotEncoder without `drop`), so together with the added constant the design
# matrix is perfectly collinear; treat individual coefficients with care.
X_train_lm = sm.add_constant(X_train)
ols_model = sm.OLS(endog=y_train, exog=X_train_lm)
lr_1 = ols_model.fit()

print(lr_1.summary())

# 4. Machine Learning Modeling

## 4-1. 하이퍼파라미터 튜닝 - Grid Search 

### a. LightGBM model 

In [None]:
# Hyper-parameter search for LightGBM over two grids: one without and one with
# num_leaves. Both MSE and MAE are scored; the best candidate is chosen (and refit)
# on MSE.
classifier = LGBMRegressor()  # despite the variable name this is a regressor

parameters = [
    {
        'learning_rate': [0.1, 0.05, 0.01, 0.005],
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7],
    },
    {
        'learning_rate': [0.15, 0.125, 0.1, 0.075],
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7],
        'num_leaves': [16, 32, 64],
    },
]

grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=parameters,
    scoring=['neg_mean_squared_error', 'neg_mean_absolute_error'],
    cv=10,
    n_jobs=-1,
    refit='neg_mean_squared_error',
)
grid_search.fit(X_train, y_train)

# Scores are stored negated (higher is better), hence the sign flips below.
cv_table = grid_search.cv_results_
winner = grid_search.best_index_
best_rmse = np.sqrt(-1 * cv_table['mean_test_neg_mean_squared_error'][winner])
best_mae = -1 * cv_table['mean_test_neg_mean_absolute_error'][winner]
best_parameters = grid_search.best_params_
print("Best RMSE: {:.2f}".format(best_rmse))
print("Best MAE: {:.2f}".format(best_mae))
print("Best Parameters:", best_parameters)

# Recorded result of an earlier run:
# best rmse : 24.87
# best mae : 16.59
# Best Parameters: {'learning_rate': 0.15, 'max_depth': 7, 'n_estimators': 50, 'num_leaves': 64}

### b. ridge regression

In [None]:
# Ridge regression: 10-fold grid search over the regularisation strength alpha.
ridge = Ridge()
ridge_param_grid = {'alpha': [0.1, 1.0, 10.0]}
ridge_grid_search = GridSearchCV(
    estimator=ridge,
    param_grid=ridge_param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
)
ridge_grid_search.fit(X_train, y_train)

# best_score_ is the negated mean MSE of the winning candidate.
ridge_best_rmse = np.sqrt(-ridge_grid_search.best_score_)
print("Ridge Best RMSE: {:.2f}".format(ridge_best_rmse))
print("Ridge Best Parameters: ", ridge_grid_search.best_params_)

# Recorded result of an earlier run:
# Ridge Best RMSE: 23.72
# Ridge Best Parameters:  {'alpha': 1.0}

### c. Lasso regression

In [None]:
# Lasso regression: 10-fold grid search over alpha, with max_iter raised to 10000.
lasso = Lasso(max_iter=10000)
lasso_param_grid = {'alpha': [0.1, 1.0, 10.0]}
lasso_grid_search = GridSearchCV(
    estimator=lasso,
    param_grid=lasso_param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
)
lasso_grid_search.fit(X_train, y_train)

# best_score_ is the negated mean MSE of the winning candidate.
lasso_best_rmse = np.sqrt(-lasso_grid_search.best_score_)
print("Lasso Best RMSE: {:.2f}".format(lasso_best_rmse))
print("Lasso Best Parameters: ", lasso_grid_search.best_params_)

# Recorded result of an earlier run:
# Lasso Best RMSE: 24.19
# Lasso Best Parameters:  {'alpha': 0.1}

### d. Support vector regressor

In [None]:
# SVR
# svr = SVR()
# svr_param_grid = {'kernel': ['rbf', 'linear', 'poly', 'sigmoid'], 'C': [0.1, 1.0, 10.0], 'gamma': ['scale', 'auto']}
# svr_grid_search = GridSearchCV(estimator=svr, param_grid=svr_param_grid, scoring='neg_mean_squared_error', cv=10, n_jobs=-1)
# svr_grid_search.fit(df_X, df_y)
# print("SVR Best RMSE: {:.2f}".format(np.sqrt(-svr_grid_search.best_score_)))
# print("SVR Best Parameters: ", svr_grid_search.best_params_)

### e. Random Forest Regressor

In [None]:
# Random forest: 10-fold grid search over forest size, tree depth and the minimum
# number of samples needed to split a node. random_state is fixed for repeatability.
rfr = RandomForestRegressor(random_state=0)
rfr_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
}
rfr_grid_search = GridSearchCV(
    estimator=rfr,
    param_grid=rfr_param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
)
rfr_grid_search.fit(X_train, y_train)

# best_score_ is the negated mean MSE of the winning candidate.
rfr_best_rmse = np.sqrt(-rfr_grid_search.best_score_)
print("Random Forest Regressor Best RMSE: {:.2f}".format(rfr_best_rmse))
print("Random Forest Regressor Best Parameters: ", rfr_grid_search.best_params_)

# Recorded result of an earlier run:
# Random Forest Regressor Best RMSE: 25.89
# Random Forest Regressor Best Parameters:  {'max_depth': 7, 'min_samples_split': 5, 'n_estimators': 200}


### f. Decision Tree Regressor

In [None]:

# Decision tree: 10-fold grid search over tree depth and the minimum number of
# samples needed to split a node. random_state is fixed for repeatability.
dtr = DecisionTreeRegressor(random_state=0)
dtr_param_grid = {
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
}
dtr_grid_search = GridSearchCV(
    estimator=dtr,
    param_grid=dtr_param_grid,
    scoring='neg_mean_squared_error',
    cv=10,
    n_jobs=-1,
)
dtr_grid_search.fit(X_train, y_train)

# best_score_ is the negated mean MSE of the winning candidate.
dtr_best_rmse = np.sqrt(-dtr_grid_search.best_score_)
print("Decision Tree Regressor Best RMSE: {:.2f}".format(dtr_best_rmse))
print("Decision Tree Regressor Best Parameters: ", dtr_grid_search.best_params_)

# Recorded result of an earlier run:
# Decision Tree Regressor Best RMSE: 27.85
# Decision Tree Regressor Best Parameters:  {'max_depth': 7, 'min_samples_split': 2}

## 4-2. train, test set 적용 

### a. train, test rmse, mae

In [None]:
# train_set = data[data["datetime"] <= '2022-12-31']
# test_set = data[data["datetime"] >= '2023-01-01']

# def execute_pipeline(X_train, y_train, X_test, y_test):
#     regressors = [
#        LinearRegression(),
#         Ridge(alpha = 1.0),
#         Lasso(alpha = 0.1, max_iter = 10000),
#         LGBMRegressor(learning_rate=0.15, max_depth=7, n_estimators=200, num_leaves=64),
#         RandomForestRegressor(random_state=0, max_depth =  7, min_samples_split = 5, n_estimators = 200),
#         DecisionTreeRegressor(random_state=0, max_depth = 7, min_samples_split =  2)
#     ]
    
#     result_train = pd.DataFrame({'datetime': train_set["datetime"],
#                               'pick_rgn2_nm': train_set["pick_rgn2_nm"], 'hour_reg': train_set["hour_reg"],
#                               'day_of_reg': train_set["day_of_reg"], 'is_rain': train_set["is_rain"],
#                               'is_holiday': train_set["is_holiday"], 'y_test': y_train})
    
#     result_test = pd.DataFrame({'datetime': test_set["datetime"],
#                               'pick_rgn2_nm': test_set["pick_rgn2_nm"], 'hour_reg': test_set["hour_reg"],
#                               'day_of_reg': test_set["day_of_reg"], 'is_rain': test_set["is_rain"],
#                               'is_holiday': test_set["is_holiday"], 'y_test': y_test})
    
#     scores = {}
#     for reg in regressors:
#         reg_name = reg.__class__.__name__
#         cv_scores = cross_val_score(reg, X_train, y_train, cv = 10, scoring='neg_root_mean_squared_error')
#         mean_rmse = -1.0 * np.mean(cv_scores)
#         std_rmse = np.std(cv_scores)
        
#         reg.fit(X_train, y_train)
#         y_pred_train = reg.predict(X_train)
#         y_pred_test = reg.predict(X_test)
        
#         rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
#         mae_train = mean_absolute_error(y_train, y_pred_train)
#         rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
#         mae_test = mean_absolute_error(y_test, y_pred_test)
        
#         # 모델 저장
#         with open(f'model_{reg.__class__.__name__}.pkl', 'wb') as f:
#             pickle.dump(reg, f)

#         scores[reg_name] = {
#             'CV RMSE Mean': mean_rmse,
#             'CV RMSE Std': std_rmse,
#             'Train RMSE': rmse_train,
#             'Train MAE': mae_train,
#             'Test RMSE': rmse_test,
#             'Test MAE': mae_test
#         }
#         result_train[f'y_pred_train_{reg.__class__.__name__}'] = y_pred_train
#         result_test[f'y_pred_test_{reg.__class__.__name__}'] = y_pred_test
        
#     scores_df = pd.DataFrame(scores).transpose()
    
#     #train, test 예측치 저장 
#     result_train.to_csv('prediction_results_train_set.csv', index=False, encoding="cp949")
#     result_test.to_csv('prediction_results_test_set.csv', index=False, encoding="cp949")
    
#     return scores_df

# # Example usage
# scores_df = execute_pipeline(X_train, y_train, X_test, y_test)
# print(scores_df)


In [120]:
def MAPE(y_test, y_pred):
    """Mean absolute percentage error, expressed in percent.

    Parameters
    ----------
    y_test : array-like
        Observed values (Series, ndarray or plain sequence).
    y_pred : array-like
        Predicted values, in the same order as ``y_test``.

    Returns
    -------
    float
        ``mean(|y_test - y_pred| / |y_test|) * 100`` over the observations whose
        observed value is non-zero, or ``nan`` when there is no such observation.

    Notes
    -----
    Observations with an observed value of exactly 0 are excluded, because their
    percentage error is undefined and would otherwise turn the whole mean into
    ``inf``/``nan``. Both inputs are converted to plain arrays first, so they are
    always compared position by position (never re-aligned on a pandas index).
    """
    actual = np.asarray(y_test, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    defined = actual != 0
    if not defined.any():
        return np.nan

    relative_error = np.abs((actual[defined] - predicted[defined]) / actual[defined])
    return np.mean(relative_error) * 100

In [124]:
# Descriptive rows (timestamp, district, calendar flags) for each split. The cut-offs
# mirror the ones used to build df_train / df_test so that the indexes match
# y_train / y_test.
# NOTE(review): '2022-12-31' means 2022-12-31 00:00:00, so later rows of that day
# fall in neither split; change this together with the df_train / df_test cut-off
# if that day should be part of the training data.
train_set = data[data["datetime"] <= '2022-12-31']
test_set = data[data["datetime"] >= '2023-01-01']

# Descriptive columns written next to the predictions in the exported CSV files.
_META_COLS = ['datetime', 'pick_rgn2_nm', 'hour_reg', 'day_of_reg', 'is_rain', 'is_holiday']


def _prediction_frame(meta, y_true):
    """Return the metadata rows matching ``y_true`` plus the observed target.

    When ``y_true`` is a Series its index selects (and orders) the rows of ``meta``,
    so the frame has exactly the row order of the feature matrix and predictions can
    be attached positionally. A label missing from ``meta`` raises ``KeyError``
    instead of silently producing a misaligned frame.
    """
    if isinstance(y_true, pd.Series):
        frame = meta.loc[y_true.index, _META_COLS].copy()
    else:
        frame = meta[_META_COLS].copy()
    # The column is called 'y_test' in both exported files (also for the training
    # split); the name is kept so existing consumers of the CSVs keep working.
    frame['y_test'] = y_true
    return frame


def execute_pipeline(X_train, y_train, X_test, y_test, cv=None):
    """Fit every candidate regressor, score it, and export models and predictions.

    Parameters
    ----------
    X_train, y_train : training features and target.
    X_test, y_test : hold-out features and target.
    cv : int or cross-validation splitter, optional
        When given, each model is additionally cross-validated on the training data
        and the mean/std of RMSE, MAE and R2 are added to the score table. The
        default (None) skips this step: the cross-validation results are not part
        of the default score table, so running it would only cost time.

    Returns
    -------
    pandas.DataFrame
        One row per model with train/test RMSE, MAE, MAPE and test R2 (plus the
        CV columns when ``cv`` is given).

    Side effects
    ------------
    Writes ``model_<ClassName>.joblib`` for every fitted model and the two files
    ``prediction_results_train_set.csv`` / ``prediction_results_test_set.csv``
    (cp949-encoded). Reads the module-level ``train_set`` / ``test_set`` frames.
    """
    # NOTE(review): the LightGBM grid-search result recorded in this notebook lists
    # n_estimators=50 as best, while 200 is used here - confirm this is intended.
    regressors = [
        LinearRegression(),
        Ridge(alpha=1.0),
        Lasso(alpha=0.1, max_iter=5000),
        LGBMRegressor(learning_rate=0.15, max_depth=7, n_estimators=200, num_leaves=64),
        RandomForestRegressor(random_state=0, max_depth=7, min_samples_split=5, n_estimators=200),
        DecisionTreeRegressor(random_state=0, max_depth=7, min_samples_split=2)
    ]

    result_train = _prediction_frame(train_set, y_train)
    result_test = _prediction_frame(test_set, y_test)

    cv_scoring = {
        'rmse': 'neg_root_mean_squared_error',
        'mae': 'neg_mean_absolute_error',
        'r2': 'r2'
    }

    scores = {}
    for reg in regressors:
        reg_name = reg.__class__.__name__
        model_scores = {}

        if cv is not None:
            cv_results = cross_validate(reg, X_train, y_train, cv=cv, scoring=cv_scoring)
            # Error scorers are negated by scikit-learn, hence the sign flip.
            model_scores.update({
                'CV RMSE Mean': -1.0 * np.mean(cv_results['test_rmse']),
                'CV RMSE Std': np.std(cv_results['test_rmse']),
                'CV MAE Mean': -1.0 * np.mean(cv_results['test_mae']),
                'CV MAE Std': np.std(cv_results['test_mae']),
                'CV R2 Mean': np.mean(cv_results['test_r2']),
                'CV R2 Std': np.std(cv_results['test_r2']),
            })

        reg.fit(X_train, y_train)
        y_pred_train = reg.predict(X_train)
        y_pred_test = reg.predict(X_test)

        model_scores.update({
            'Train RMSE': np.sqrt(mean_squared_error(y_train, y_pred_train)),
            'Train MAE': mean_absolute_error(y_train, y_pred_train),
            'Train MAPE': MAPE(y_train, y_pred_train),
            'Test RMSE': np.sqrt(mean_squared_error(y_test, y_pred_test)),
            'Test MAE': mean_absolute_error(y_test, y_pred_test),
            'Test MAPE': MAPE(y_test, y_pred_test),
            'Test R2': r2_score(y_test, y_pred_test),
        })
        scores[reg_name] = model_scores

        # Persist the fitted model so it can be reloaded for later predictions.
        model_file = f'model_{reg_name}.joblib'
        joblib.dump(reg, model_file)

        # Predictions are plain arrays in the row order of X_train / X_test, which
        # is the row order of the result frames built above.
        result_train[f'y_pred_train_{reg_name}'] = y_pred_train
        result_test[f'y_pred_test_{reg_name}'] = y_pred_test

    scores_df = pd.DataFrame(scores).transpose()

    # Export the train and test predictions.
    result_train.to_csv('prediction_results_train_set.csv', index=False, encoding="cp949")
    result_test.to_csv('prediction_results_test_set.csv', index=False, encoding="cp949")

    return scores_df

# usage
scores_df = execute_pipeline(X_train, y_train, X_test, y_test)
print(scores_df)


                       Train RMSE  Train MAE  Train MAPE  Test RMSE   
LinearRegression        23.641602  16.015795    9.452132  22.412126  \
Ridge                   23.640647  16.014546    9.453001  22.407407   
Lasso                   24.120089  16.054495    9.187980  22.835155   
LGBMRegressor           15.935964  11.480302    6.741311  22.795690   
RandomForestRegressor   23.592423  15.806745    8.755942  21.843440   
DecisionTreeRegressor   25.254577  16.895315    9.340174  23.951169   

                        Test MAE  Test MAPE   Test R2  
LinearRegression       16.016783  10.524194  0.979136  
Ridge                  16.009787  10.523413  0.979145  
Lasso                  15.989692   9.936235  0.978341  
LGBMRegressor          15.501312   8.641876  0.978416  
RandomForestRegressor  14.731479   8.427777  0.980182  
DecisionTreeRegressor  16.015110   9.085781  0.976172  


## 다음주 6일 예측하기

In [98]:
# Load the feature rows to predict; the CSV file is cp949-encoded.
# NOTE(review): this rebinds `data`, which held the training frame until now, so
# earlier cells that read `data` must not be re-run after this point without
# reloading the training CSV first.
predict_csv_path = 'predict_data.csv'
data = pd.read_csv(predict_csv_path, encoding="cp949")
data.head()  # evaluated but not displayed: it is not the last expression of the cell
print(data.shape)  # recorded run: (2250, 16)

(2250, 16)


In [99]:
# DataFrame.info() writes its report straight to stdout and returns None, so it is
# called bare; wrapping it in print() appends a stray "None" line to the output.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2250 entries, 0 to 2249
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   reg_date       2250 non-null   object
 1   hour_reg       2250 non-null   int64 
 2   day_of_reg     2250 non-null   object
 3   pick_rgn2_nm   2250 non-null   object
 4   is_rain        2250 non-null   int64 
 5   month          2250 non-null   int64 
 6   week           2250 non-null   int64 
 7   is_holiday     2250 non-null   int64 
 8   rider_cnt_w_2  2250 non-null   int64 
 9   rider_cnt_w_3  2250 non-null   int64 
 10  rider_cnt_w_4  2250 non-null   int64 
 11  order_cnt_w_2  2250 non-null   int64 
 12  order_cnt_w_3  2250 non-null   int64 
 13  order_cnt_w_4  2250 non-null   int64 
 14  rider_cnt_w_1  2250 non-null   int64 
 15  order_cnt_w_1  2250 non-null   int64 
dtypes: int64(13), object(3)
memory usage: 281.4+ KB
None


In [100]:
# Parse the date column (read from the CSV as strings) and order the rows by date.
data = data.assign(reg_date=pd.to_datetime(data["reg_date"])).sort_values(by="reg_date")

In [101]:
# Cast the same columns to categorical as was done for the training data.
categorical_cols = ['pick_rgn2_nm', 'hour_reg', 'day_of_reg', 'is_rain', 'month', 'week', 'is_holiday']
data = data.astype({name: 'category' for name in categorical_cols})

print(data.dtypes)

reg_date         datetime64[ns]
hour_reg               category
day_of_reg             category
pick_rgn2_nm           category
is_rain                category
month                  category
week                   category
is_holiday             category
rider_cnt_w_2             int64
rider_cnt_w_3             int64
rider_cnt_w_4             int64
order_cnt_w_2             int64
order_cnt_w_3             int64
order_cnt_w_4             int64
rider_cnt_w_1             int64
order_cnt_w_1             int64
dtype: object


In [102]:
# one-hot-encoding

# One-hot encode the categorical columns of the prediction data.
# NOTE(review): a new encoder is fitted on the prediction data only (and rebinds
# `encoder`), so it produces dummy columns just for the levels present in this file;
# levels seen only during training are added as all-zero columns in a later cell.
var = ['pick_rgn2_nm', 'hour_reg','day_of_reg', 'is_rain', 'month','week','is_holiday']

data = data.sort_values(by="reg_date")

encoder = OneHotEncoder()
encoded_matrix = encoder.fit_transform(data[var]).toarray()  # dense 0/1 matrix
dummy_names = encoder.get_feature_names_out(var)

# Reuse data's index so that the concat below lines the rows up by label.
onehot = pd.DataFrame(encoded_matrix, columns=dummy_names, index=data.index)
non_categorical = data.drop(columns=['reg_date'] + var)
X_test = pd.concat([onehot, non_categorical], axis=1)
print(X_test.columns)


Index(['pick_rgn2_nm_강남구', 'pick_rgn2_nm_강동구', 'pick_rgn2_nm_강북구',
       'pick_rgn2_nm_강서구', 'pick_rgn2_nm_관악구', 'pick_rgn2_nm_광진구',
       'pick_rgn2_nm_구로구', 'pick_rgn2_nm_금천구', 'pick_rgn2_nm_노원구',
       'pick_rgn2_nm_도봉구', 'pick_rgn2_nm_동대문구', 'pick_rgn2_nm_동작구',
       'pick_rgn2_nm_마포구', 'pick_rgn2_nm_서대문구', 'pick_rgn2_nm_서초구',
       'pick_rgn2_nm_성동구', 'pick_rgn2_nm_성북구', 'pick_rgn2_nm_송파구',
       'pick_rgn2_nm_양천구', 'pick_rgn2_nm_영등포구', 'pick_rgn2_nm_용산구',
       'pick_rgn2_nm_은평구', 'pick_rgn2_nm_종로구', 'pick_rgn2_nm_중구',
       'pick_rgn2_nm_중랑구', 'hour_reg_9', 'hour_reg_10', 'hour_reg_11',
       'hour_reg_12', 'hour_reg_13', 'hour_reg_14', 'hour_reg_15',
       'hour_reg_16', 'hour_reg_17', 'hour_reg_18', 'hour_reg_19',
       'hour_reg_20', 'hour_reg_21', 'hour_reg_22', 'hour_reg_23',
       'day_of_reg_금요일', 'day_of_reg_목요일', 'day_of_reg_수요일', 'day_of_reg_일요일',
       'day_of_reg_토요일', 'day_of_reg_화요일', 'is_rain_0', 'is_rain_1', 'month_5',
       'week_2', 'is_holiday_0'

### 존재하지 않는 변수 추가하기 (month,week)

In [103]:
# Add the dummy columns the models were trained on but that this prediction file
# does not contain (category levels that do not occur in the period to predict).
# The list is derived from the training columns instead of being typed out by hand,
# so it stays correct for any prediction period, and re-running the cell cannot
# create duplicate columns.
absent_cols = [name for name in X_train.columns if name not in X_test.columns]

# Only one-hot columns may be filled with zeros; a missing numeric feature is an
# input error and must not be silently replaced by 0.
dummy_prefixes = tuple(f'{name}_' for name in var)
absent_numeric = [name for name in absent_cols if not name.startswith(dummy_prefixes)]
if absent_numeric:
    raise ValueError(f'prediction data lacks required feature columns: {absent_numeric}')
new_variables = absent_cols

# All-zero frame for the absent levels, on the same rows as X_test.
new_data = pd.DataFrame(0, columns=new_variables, index= X_test.index)

# Append the new columns to the existing X_test frame.
X_test = pd.concat([X_test, new_data], axis=1)

# Check the result.
#print(X_test.head())
print(X_test.columns)

Index(['pick_rgn2_nm_강남구', 'pick_rgn2_nm_강동구', 'pick_rgn2_nm_강북구',
       'pick_rgn2_nm_강서구', 'pick_rgn2_nm_관악구', 'pick_rgn2_nm_광진구',
       'pick_rgn2_nm_구로구', 'pick_rgn2_nm_금천구', 'pick_rgn2_nm_노원구',
       'pick_rgn2_nm_도봉구', 'pick_rgn2_nm_동대문구', 'pick_rgn2_nm_동작구',
       'pick_rgn2_nm_마포구', 'pick_rgn2_nm_서대문구', 'pick_rgn2_nm_서초구',
       'pick_rgn2_nm_성동구', 'pick_rgn2_nm_성북구', 'pick_rgn2_nm_송파구',
       'pick_rgn2_nm_양천구', 'pick_rgn2_nm_영등포구', 'pick_rgn2_nm_용산구',
       'pick_rgn2_nm_은평구', 'pick_rgn2_nm_종로구', 'pick_rgn2_nm_중구',
       'pick_rgn2_nm_중랑구', 'hour_reg_9', 'hour_reg_10', 'hour_reg_11',
       'hour_reg_12', 'hour_reg_13', 'hour_reg_14', 'hour_reg_15',
       'hour_reg_16', 'hour_reg_17', 'hour_reg_18', 'hour_reg_19',
       'hour_reg_20', 'hour_reg_21', 'hour_reg_22', 'hour_reg_23',
       'day_of_reg_금요일', 'day_of_reg_목요일', 'day_of_reg_수요일', 'day_of_reg_일요일',
       'day_of_reg_토요일', 'day_of_reg_화요일', 'is_rain_0', 'is_rain_1', 'month_5',
       'week_2', 'is_holiday_0'

### train/test set split 

In [104]:
# Confirm the prediction matrix has the same number of feature columns as the training matrix.
print(X_test.shape) # recorded run: (2250, 76)

(2250, 76)


In [105]:
# Count missing values per feature column of the training matrix.
# NOTE(review): this cell sits in the prediction section, so X_test may have been the intended frame - confirm.
X_train.isna().sum()

pick_rgn2_nm_강남구    0
pick_rgn2_nm_강동구    0
pick_rgn2_nm_강북구    0
pick_rgn2_nm_강서구    0
pick_rgn2_nm_관악구    0
                   ..
rider_cnt_w_4       0
order_cnt_w_1       0
order_cnt_w_2       0
order_cnt_w_3       0
order_cnt_w_4       0
Length: 76, dtype: int64

In [106]:
# Put the prediction features in exactly the column order the models were fitted on.
# The order is taken from the training matrix rather than from a hand-maintained
# copy of its 76 column names, which would silently go stale whenever the features
# change.
desired_order = list(X_train.columns)

# Fail with a readable message if a training column is still absent.
unavailable = [name for name in desired_order if name not in X_test.columns]
if unavailable:
    raise KeyError(f'X_test lacks columns required by the trained models: {unavailable}')

# Selecting by list also drops any column the models were not trained on.
X_test = X_test[desired_order]

In [107]:

def get_predict(X_test):
    """Predict with the saved models and export the predictions to CSV.

    Parameters
    ----------
    X_test : pandas.DataFrame
        Feature matrix whose columns match the ones the models were fitted on.

    Returns
    -------
    pandas.DataFrame
        The descriptive columns of the module-level ``data`` frame for the rows of
        ``X_test`` plus one prediction column per model.

    Side effects
    ------------
    Loads three ``model_*.joblib`` files from the working directory and writes
    ``prediction_results_latest6days.csv`` (cp949-encoded).
    """
    # Output column -> saved model file.
    # joblib files are pickle-based: only load model files produced by this project.
    model_files = {
        'y_pred_linear': 'model_LinearRegression.joblib',
        'y_pred_LGBM': 'model_LGBMRegressor.joblib',
        'y_pred_RF': 'model_RandomForestRegressor.joblib',
    }
    # Load every model up front so a missing file fails before any prediction runs.
    models = {column: joblib.load(path) for column, path in model_files.items()}

    meta_cols = ['reg_date', 'pick_rgn2_nm', 'hour_reg', 'day_of_reg', 'is_rain', 'is_holiday']
    if isinstance(X_test, pd.DataFrame):
        # Select the metadata by X_test's index so each prediction is stored next to
        # the row it was computed from, whatever the current row order of `data` is.
        result_df = data.loc[X_test.index, meta_cols].copy()
    else:
        result_df = data[meta_cols].copy()

    for column, model in models.items():
        result_df[column] = model.predict(X_test)

    result_df.to_csv('prediction_results_latest6days.csv', index=False, encoding="cp949")
    return result_df


latest_predictions = get_predict(X_test)