## 최신 6일 라이더 운행 인원 예측

### 1. import new_dataset 

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from time import time
from datetime import datetime, timedelta 
import joblib
import json
import requests 

# sklearn 관련
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
import ast 
import statsmodels.api as sm 
from pandas import json_normalize

# 전처리 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# lightgbm 관련
from lightgbm import LGBMRegressor
from lightgbm import plot_importance

###### data download : https://redash.woowa.in/queries/26484/source?p_dateFrom=2023-01-01&p_dateTo=d_yesterday 
###### weather download : https://data.kma.go.kr/data/grnd/selectAsosRltmList.do?pgmNo=36

In [5]:
new_data = pd.read_excel("/Users/yj.noh/Desktop/predict_data.xlsx")
new_data = new_data.rename(columns={'라이더수': 'rider_cnt', '주문수': 'order_cnt'})

new_data['reg_date'] = pd.to_datetime(new_data['reg_date']).dt.date
new_data = new_data[(new_data['pick_rgn1_nm'] == '서울특별시') & (new_data['reg_date'] < new_data['reg_date'].max())]

print(new_data.shape) 

(62884, 7)


In [6]:
#print(new_data['pick_rgn2_nm'].value_counts())
print(new_data['hour_reg'].value_counts())
print(new_data['reg_date'].min())
print(new_data['reg_date'].max())

hour_reg
9     3625
17    3625
23    3625
22    3625
21    3625
20    3625
19    3625
10    3625
18    3625
16    3625
15    3625
14    3625
13    3625
12    3625
11    3625
0     3600
1     3600
2     1309
Name: count, dtype: int64
2023-01-01
2023-05-25


In [7]:
# 9~ 23시만 
new_data = new_data[new_data['hour_reg'].isin([9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23])]
print(new_data['hour_reg'].value_counts())
print(new_data.shape)  # 54000,7
print(new_data['reg_date'].min())
print(new_data['reg_date'].max())

hour_reg
9     3625
10    3625
11    3625
12    3625
13    3625
14    3625
15    3625
16    3625
17    3625
18    3625
19    3625
20    3625
21    3625
22    3625
23    3625
Name: count, dtype: int64
(54375, 7)
2023-01-01
2023-05-25


### 2. 파생변수 생성 - month, week, day_of_reg2, day_of_reg

In [8]:
new_data['reg_date'] = pd.to_datetime(new_data['reg_date'])
new_data['month'] = new_data['reg_date'].dt.month
new_data['week'] = ((new_data['reg_date'].dt.day - 1) // 7) + 1
new_data['day_of_reg'] = new_data['reg_date'].dt.strftime('%a')
new_data['day_of_reg'] = new_data['day_of_reg'].replace({'Sat': '토요일', 'Sun': '일요일', 'Fri': '금요일', 'Mon': '월요일', 'Tue': '화요일', 'Wed': '수요일', 'Thu': '목요일'})
print(new_data['day_of_reg'].value_counts())

day_of_reg
일요일    7875
월요일    7875
화요일    7875
수요일    7875
목요일    7875
금요일    7500
토요일    7500
Name: count, dtype: int64


### 3. reg_date +7, 변수명 변경 

In [9]:
new_data['reg_date'] = new_data['reg_date'] + pd.DateOffset(days=7)

new_data['rider_cnt_w_1'] = new_data['rider_cnt']
new_data['order_cnt_w_1'] = new_data['order_cnt']

print(new_data['reg_date'].min()) #2023-01-08
print(new_data['reg_date'].max()) #2023-05-31



2023-01-08 00:00:00
2023-06-01 00:00:00


### 4. 파생변수 생성 - is_holiday 

In [None]:
# #공휴일 가져오기 
# import json
# from pandas import json_normalize
# from urllib.parse import unquote

# today = datetime.today().strftime('%Y%m%d')
# today_year = datetime.today().year

# key = unquote('uw7Y8kgQ1Fqg6z9GjnSN8jJ8S%2FuV%2Bl%2B8PnHC4By9xwtYF5ZAExmO2Ip1mJcQC3HDTDcKmIxB9rBBuI0gDaIYVA%3D%3D')
# url = "https://apis.data.go.kr/B090041/openapi/service/SpcdeInfoService/getRestDeInfo?&solYear="+ str(today_year)+ '&ServiceKey=' + str(key) 

# response = requests.get(url)

# if response.status_code == 200 :
#     json_ob = json.loads(response.text)
#     holidays_new_data = json_ob['response']['body']['items']['item']
#     new_dataframe = json_normalize(holidays_new_data)

# holiday_list = new_dataframe['locdate'].tolist()
# print(holiday_list)


In [10]:
holiday_list = [datetime.strptime(date, '%Y-%m-%d').date() for date in ['2022-01-01', '2022-01-31', '2022-02-01', '2022-03-01', '2022-03-09', '2022-05-05', '2022-05-08', '2022-06-01', '2022-06-06', '2022-08-15', 
                '2022-09-09', '2022-09-10', '2022-09-11', '2022-09-12', '2022-10-03', '2022-10-09', '2022-10-10', '2022-12-25', '2023-01-01', '2023-01-21', 
                '2023-01-22', '2023-01-23', '2023-01-24', '2023-03-01', '2023-05-01', '2023-05-05', '2023-05-27', '2023-05-29', '2023-06-06', '2023-08-15', 
                '2023-09-28', '2023-09-29', '2023-09-30', '2023-10-03', '2023-10-09', '2023-12-25']]

new_data['is_holiday2'] = new_data.apply(lambda row: 1 if (row['reg_date'].date() in holiday_list) or (row['day_of_reg'] in ['토요일', '일요일']) else 0, axis=1)
print(new_data['is_holiday2'].value_counts())  # 17625 

is_holiday2
0    36750
1    17625
Name: count, dtype: int64


### 5. 파생변수 생성 - is_rain 

In [11]:
new_data['reg_date'] = pd.to_datetime(new_data['reg_date']).dt.date

weather = pd.read_csv("/Users/yj.noh/Desktop/weather_2023.csv", encoding='cp949')

weather = weather.rename(columns={'기온(°C)': 'temp_c', '강수량(mm)': 'rain_c', '적설(cm)': 'snow_c', '일시': 'date'})

weather['date_2'] = pd.to_datetime(weather['date']).dt.date
weather['hour'] = pd.to_datetime(weather['date']).dt.hour

weather = weather[weather['hour'].isin([9,10,11,12,13,14,15,16,17,18,19,20,21,22,23])]

new_data = pd.merge(new_data, weather[['date_2', 'hour','rain_c', 'snow_c']], left_on=['reg_date', 'hour_reg'], right_on=['date_2', 'hour'], how='left')

new_data['rain_c'] = new_data['rain_c'].fillna(0)
new_data['snow_c'] = new_data['snow_c'].fillna(0)

new_data['is_rain'] = np.where((new_data['rain_c'] > 0) | (new_data['snow_c'] > 0), 1, 0)

print(new_data['is_rain'].value_counts()) 

is_rain
0    51700
1     2675
Name: count, dtype: int64


In [12]:
# 날짜만 입력 
# new_data['is_rain'] = new_data['reg_date'].apply(lambda x: 1 if x.date() == datetime.strptime('2023-05-18', '%Y-%m-%d').date() else 0)
# print(new_data['is_rain'].value_counts()) #375

rain_list = [datetime.strptime(date, '%Y-%m-%d').date() for date in ['2023-05-27', '2023-05-28', '2023-05-29','2023-05-30','2023-05-31']]
#new_data['is_rain'] = new_data.apply(lambda row: 1 if (row['reg_date'].date() in rain_list) else 0, axis =1)

new_data.loc[new_data['reg_date'].isin(rain_list), 'is_rain'] = 1
print(new_data['is_rain'].value_counts()) # 49450, 4550



is_rain
0    49825
1     4550
Name: count, dtype: int64


In [13]:
print(new_data.shape) #54000
print(new_data['reg_date'].min())
print(new_data['reg_date'].max())

#print(new_data.isna().sum())

(54375, 17)
2023-01-08
2023-06-01


### 6. 파생변수 생성 -  w-1,w-2,w-3,w-4 동일요일, 동일 시간대 주문수, 라이더 수 

In [14]:
new_data = new_data.sort_values(['reg_date', 'pick_rgn2_nm'])
new_data['rider_cnt_w_2'] = new_data.groupby(['pick_rgn2_nm', 'day_of_reg', 'hour_reg','is_rain'])['rider_cnt'].shift(1)
new_data['rider_cnt_w_3'] = new_data.groupby(['pick_rgn2_nm', 'day_of_reg', 'hour_reg','is_rain'])['rider_cnt'].shift(2)
new_data['rider_cnt_w_4'] = new_data.groupby(['pick_rgn2_nm', 'day_of_reg', 'hour_reg','is_rain'])['rider_cnt'].shift(3)
new_data['order_cnt_w_2'] = new_data.groupby(['pick_rgn2_nm', 'day_of_reg', 'hour_reg','is_rain'])['order_cnt'].shift(1)
new_data['order_cnt_w_3'] = new_data.groupby(['pick_rgn2_nm', 'day_of_reg', 'hour_reg','is_rain'])['order_cnt'].shift(2)
new_data['order_cnt_w_4'] = new_data.groupby(['pick_rgn2_nm', 'day_of_reg', 'hour_reg','is_rain'])['order_cnt'].shift(3)
print(new_data.isna().sum())

reg_date             0
hour_reg             0
day_of_reg           0
pick_rgn1_nm         0
pick_rgn2_nm         0
rider_cnt            0
order_cnt            0
month                0
week                 0
rider_cnt_w_1        0
order_cnt_w_1        0
is_holiday2          0
date_2            2625
hour              2625
rain_c               0
snow_c               0
is_rain              0
rider_cnt_w_2     5250
rider_cnt_w_3     9300
rider_cnt_w_4    12250
order_cnt_w_2     5250
order_cnt_w_3     9300
order_cnt_w_4    12250
dtype: int64


### 7. 파생변수 생성 group_s 

In [16]:
def group_assignment(row):
    if row['day_of_reg'] in ['월요일','화요일','수요일','목요일','금요일']:
        if row['is_holiday2'] == 0 and row['is_rain'] == 0:
            return 'A'
        elif row['is_holiday2'] == 0 and row['is_rain'] == 1:
            return 'B'
        elif row['is_holiday2'] == 1 and row['is_rain'] == 0:
            return 'C'
        elif row['is_holiday2'] == 1 and row['is_rain'] == 1:
            return 'D'
    elif row['day_of_reg'] in ['토요일','일요일']:
        if row['is_holiday2'] == 0 and row['is_rain'] == 0:
            return 'E'
        elif row['is_holiday2'] == 0 and row['is_rain'] == 1:
            return 'F'
        elif row['is_holiday2'] == 1 and row['is_rain'] == 0:
            return 'G'
        elif row['is_holiday2'] == 1 and row['is_rain'] == 1:
            return 'H'

new_data['group_s'] = new_data.apply(group_assignment, axis=1)


In [17]:
tomorrow = (datetime.now() + timedelta(days=1)).date()

#new_data = new_data.drop(['pick_rgn1_nm', 'rider_cnt', 'order_cnt', 'date_2', 'hour'], axis=1)

new_data = new_data[new_data['reg_date'].apply(lambda x: x) >= tomorrow] # 내일날짜부터 예측되게끔. 
new_data.to_csv('predict_new_data.csv', encoding='cp949', index=False)

print(new_data.shape)  # (2,250, num_columns)
#print(new_data.info())
print(new_data['reg_date'].min()) 

(2250, 24)
2023-05-27


In [19]:
# NA 채우기
new_data.loc[new_data['rider_cnt_w_2'].isna(), 'rider_cnt_w_2'] = new_data['rider_cnt_w_1']
new_data.loc[new_data['rider_cnt_w_3'].isna(), 'rider_cnt_w_3'] = new_data['rider_cnt_w_2']
new_data.loc[new_data['rider_cnt_w_4'].isna(), 'rider_cnt_w_4'] = new_data['rider_cnt_w_3']

new_data.loc[new_data['order_cnt_w_2'].isna(), 'order_cnt_w_2'] = new_data['order_cnt_w_1']
new_data.loc[new_data['order_cnt_w_3'].isna(), 'order_cnt_w_3'] = new_data['order_cnt_w_2']
new_data.loc[new_data['order_cnt_w_4'].isna(), 'order_cnt_w_4'] = new_data['order_cnt_w_3']

print(new_data.isna().sum())


reg_date            0
hour_reg            0
day_of_reg          0
pick_rgn1_nm        0
pick_rgn2_nm        0
rider_cnt           0
order_cnt           0
month               0
week                0
rider_cnt_w_1       0
order_cnt_w_1       0
is_holiday2         0
date_2           2250
hour             2250
rain_c              0
snow_c              0
is_rain             0
rider_cnt_w_2       0
rider_cnt_w_3       0
rider_cnt_w_4       0
order_cnt_w_2       0
order_cnt_w_3       0
order_cnt_w_4       0
group_s             0
dtype: int64


### 7. 데이터 전처리 

In [20]:
# category  - pick_rgn2_nm, hour_reg, day_of_reg, is_rain, month, week, is_holiday
for col in ['pick_rgn2_nm', 'hour_reg', 'day_of_reg', 'is_rain', 'month', 'week', 'is_holiday2', 'group_s'] : 
   #new_data[col] = new_data[col].astype('category')
   new_data.loc[:, col] = new_data.loc[:, col].astype('category')

print(new_data.dtypes)

reg_date           object
hour_reg         category
day_of_reg         object
pick_rgn1_nm       object
pick_rgn2_nm       object
rider_cnt           int64
order_cnt           int64
month            category
week             category
rider_cnt_w_1       int64
order_cnt_w_1       int64
is_holiday2      category
date_2             object
hour              float64
rain_c            float64
snow_c            float64
is_rain          category
rider_cnt_w_2     float64
rider_cnt_w_3     float64
rider_cnt_w_4     float64
order_cnt_w_2     float64
order_cnt_w_3     float64
order_cnt_w_4     float64
group_s            object
dtype: object


In [21]:
# one-hot-encoding

new_data = new_data.drop(['rain_c','snow_c', 'date_2','hour'], axis=1)

var = ['pick_rgn2_nm', 'hour_reg','day_of_reg', 'is_rain', 'month','week','is_holiday2', 'group_s']

encode_data = new_data.sort_values(by="reg_date")
X_test = encode_data.drop(columns = ['reg_date'])

encoder = OneHotEncoder()
onehot = pd.DataFrame(encoder.fit_transform(encode_data[var]).toarray(), columns=encoder.get_feature_names_out(var), index = encode_data.index)
X_test = pd.concat([onehot, X_test.drop(columns=var)], axis=1)
#print(X_test.head(3))
print(X_test.columns)

Index(['pick_rgn2_nm_강남구', 'pick_rgn2_nm_강동구', 'pick_rgn2_nm_강북구',
       'pick_rgn2_nm_강서구', 'pick_rgn2_nm_관악구', 'pick_rgn2_nm_광진구',
       'pick_rgn2_nm_구로구', 'pick_rgn2_nm_금천구', 'pick_rgn2_nm_노원구',
       'pick_rgn2_nm_도봉구', 'pick_rgn2_nm_동대문구', 'pick_rgn2_nm_동작구',
       'pick_rgn2_nm_마포구', 'pick_rgn2_nm_서대문구', 'pick_rgn2_nm_서초구',
       'pick_rgn2_nm_성동구', 'pick_rgn2_nm_성북구', 'pick_rgn2_nm_송파구',
       'pick_rgn2_nm_양천구', 'pick_rgn2_nm_영등포구', 'pick_rgn2_nm_용산구',
       'pick_rgn2_nm_은평구', 'pick_rgn2_nm_종로구', 'pick_rgn2_nm_중구',
       'pick_rgn2_nm_중랑구', 'hour_reg_9', 'hour_reg_10', 'hour_reg_11',
       'hour_reg_12', 'hour_reg_13', 'hour_reg_14', 'hour_reg_15',
       'hour_reg_16', 'hour_reg_17', 'hour_reg_18', 'hour_reg_19',
       'hour_reg_20', 'hour_reg_21', 'hour_reg_22', 'hour_reg_23',
       'day_of_reg_목요일', 'day_of_reg_수요일', 'day_of_reg_월요일', 'day_of_reg_일요일',
       'day_of_reg_토요일', 'day_of_reg_화요일', 'is_rain_0', 'is_rain_1', 'month_5',
       'week_3', 'week_4', 'is_

### 8. 존재하지 않는 변수 추가하기

In [22]:
desired_order = ['pick_rgn2_nm_강남구', 'pick_rgn2_nm_강동구', 'pick_rgn2_nm_강북구',
       'pick_rgn2_nm_강서구', 'pick_rgn2_nm_관악구', 'pick_rgn2_nm_광진구',
       'pick_rgn2_nm_구로구', 'pick_rgn2_nm_금천구', 'pick_rgn2_nm_노원구',
       'pick_rgn2_nm_도봉구', 'pick_rgn2_nm_동대문구', 'pick_rgn2_nm_동작구',
       'pick_rgn2_nm_마포구', 'pick_rgn2_nm_서대문구', 'pick_rgn2_nm_서초구',
       'pick_rgn2_nm_성동구', 'pick_rgn2_nm_성북구', 'pick_rgn2_nm_송파구',
       'pick_rgn2_nm_양천구', 'pick_rgn2_nm_영등포구', 'pick_rgn2_nm_용산구',
       'pick_rgn2_nm_은평구', 'pick_rgn2_nm_종로구', 'pick_rgn2_nm_중구',
       'pick_rgn2_nm_중랑구', 'hour_reg_9', 'hour_reg_10', 'hour_reg_11',
       'hour_reg_12', 'hour_reg_13', 'hour_reg_14', 'hour_reg_15',
       'hour_reg_16', 'hour_reg_17', 'hour_reg_18', 'hour_reg_19',
       'hour_reg_20', 'hour_reg_21', 'hour_reg_22', 'hour_reg_23', 'month_1',
       'month_2', 'month_3', 'month_4', 'month_5', 'month_6', 'month_7',
       'month_8', 'month_9', 'month_10', 'month_11', 'month_12', 'week_1',
       'week_2', 'week_3', 'week_4', 'week_5', 'group_s_A', 'group_s_B',
       'group_s_C', 'group_s_D', 'group_s_E', 'group_s_F', 'group_s_G',
       'group_s_H', 'is_rain_0', 'is_rain_1', 'is_holiday2_0', 'is_holiday2_1',
       'day_of_reg_금요일', 'day_of_reg_목요일', 'day_of_reg_수요일', 'day_of_reg_월요일',
       'day_of_reg_일요일', 'day_of_reg_토요일', 'day_of_reg_화요일', 'rider_cnt_w_1',
       'rider_cnt_w_2', 'rider_cnt_w_3', 'rider_cnt_w_4', 'order_cnt_w_1',
       'order_cnt_w_2', 'order_cnt_w_3', 'order_cnt_w_4']

new_variables = [col for col in desired_order if col not in X_test.columns]

# 새로운 변수를 포함한 빈 DataFrame 생성
encode_data = pd.DataFrame(0, columns=new_variables, index= X_test.index)

# 기존 x_test DataFrame과 새로운 변수를 포함한 DataFrame을 병합
X_test = pd.concat([X_test, encode_data], axis=1)
X_test = X_test[desired_order]

# 결과 확인
#print(X_test.head())
print(X_test.columns)

Index(['pick_rgn2_nm_강남구', 'pick_rgn2_nm_강동구', 'pick_rgn2_nm_강북구',
       'pick_rgn2_nm_강서구', 'pick_rgn2_nm_관악구', 'pick_rgn2_nm_광진구',
       'pick_rgn2_nm_구로구', 'pick_rgn2_nm_금천구', 'pick_rgn2_nm_노원구',
       'pick_rgn2_nm_도봉구', 'pick_rgn2_nm_동대문구', 'pick_rgn2_nm_동작구',
       'pick_rgn2_nm_마포구', 'pick_rgn2_nm_서대문구', 'pick_rgn2_nm_서초구',
       'pick_rgn2_nm_성동구', 'pick_rgn2_nm_성북구', 'pick_rgn2_nm_송파구',
       'pick_rgn2_nm_양천구', 'pick_rgn2_nm_영등포구', 'pick_rgn2_nm_용산구',
       'pick_rgn2_nm_은평구', 'pick_rgn2_nm_종로구', 'pick_rgn2_nm_중구',
       'pick_rgn2_nm_중랑구', 'hour_reg_9', 'hour_reg_10', 'hour_reg_11',
       'hour_reg_12', 'hour_reg_13', 'hour_reg_14', 'hour_reg_15',
       'hour_reg_16', 'hour_reg_17', 'hour_reg_18', 'hour_reg_19',
       'hour_reg_20', 'hour_reg_21', 'hour_reg_22', 'hour_reg_23', 'month_1',
       'month_2', 'month_3', 'month_4', 'month_5', 'month_6', 'month_7',
       'month_8', 'month_9', 'month_10', 'month_11', 'month_12', 'week_1',
       'week_2', 'week_3', 'wee

### 9. train, test set split 

In [23]:
print(X_test.shape) #2250,84
X_test.isna().sum()

(2250, 84)


pick_rgn2_nm_강남구    0
pick_rgn2_nm_강동구    0
pick_rgn2_nm_강북구    0
pick_rgn2_nm_강서구    0
pick_rgn2_nm_관악구    0
                   ..
rider_cnt_w_4       0
order_cnt_w_1       0
order_cnt_w_2       0
order_cnt_w_3       0
order_cnt_w_4       0
Length: 84, dtype: int64

### 10. model 적용 

In [24]:

def get_predict(X_test):

    Lasso_model = joblib.load('model_Lasso.joblib')
    LGBM_model = joblib.load('model_LGBMRegressor.joblib')
    RF_model = joblib.load('model_RandomForestRegressor.joblib')

    result_df = pd.DataFrame({'reg_date': new_data["reg_date"],
                              'pick_rgn2_nm': new_data["pick_rgn2_nm"], 'hour_reg': new_data["hour_reg"],
                              'day_of_reg': new_data["day_of_reg"], 'is_rain': new_data["is_rain"],
                              'is_holiday': new_data["is_holiday2"]})

    y_pred_Lasso = Lasso_model.predict(X_test)
    y_pred_LGBM = LGBM_model.predict(X_test)
    y_pred_RF = RF_model.predict(X_test)
    
    result_df['y_pred_Lasso'] = y_pred_Lasso
    result_df['y_pred_LGBM'] = y_pred_LGBM
    result_df['y_pred_RF'] = y_pred_RF
    
    result_df['y_pred_avg'] = (result_df['y_pred_Lasso'] + result_df['y_pred_LGBM'] + result_df['y_pred_RF']) / 3 #3개 모델 예측값 


    result_df.to_csv('prediction_results_latest6days.csv', index=False, encoding="cp949")


get_predict(X_test)

### 11. 보정계수 적용 - rgn2, is_rain, is_holiday 

In [None]:
buffer_df = pd.read_csv("prediction_results_test_set.csv", encoding = "cp949")
buffer_df.head()

In [None]:
buffer_df['buffer_value'] = buffer_df['y_test'] / buffer_df['y_pred_test_avg']
buffer_df.head()