## 최신 6일 라이더 운행 인원 예측

### 1. import new_dataset 

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime
import joblib
import json
import requests 

# sklearn 관련
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
import ast 
import statsmodels.api as sm 
from time import time
from pandas import json_normalize

# 전처리 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# lightgbm 관련
from lightgbm import LGBMRegressor
from lightgbm import plot_importance

In [55]:
# Load the raw export and give the two Korean count columns English names.
new_data = pd.read_excel("/Users/yj.noh/Desktop/predict_data.xlsx").rename(
    columns={'라이더수': 'rider_cnt', '주문수': 'order_cnt'}
)
new_data['reg_date'] = pd.to_datetime(new_data['reg_date'])

# Keep Seoul rows only, and drop every row dated on the latest date in the file.
in_seoul = new_data['pick_rgn1_nm'] == '서울특별시'
before_latest_date = new_data['reg_date'] < new_data['reg_date'].max()
new_data = new_data[in_seoul & before_latest_date]
print(new_data.shape)  # (11675, 7) on the recorded run

(11675, 7)


In [56]:
# Number of rows per district (pick_rgn2_nm).
district_counts = new_data['pick_rgn2_nm'].value_counts()
print(district_counts)

pick_rgn2_nm
강남구     481
송파구     479
관악구     479
강서구     474
영등포구    473
마포구     472
은평구     468
강동구     468
서초구     468
중구      466
성북구     466
광진구     466
양천구     466
강북구     466
성동구     465
노원구     465
동작구     464
서대문구    463
용산구     463
동대문구    462
금천구     462
구로구     461
중랑구     461
도봉구     459
종로구     458
Name: count, dtype: int64


In [57]:
# Restrict the data to the hour slots 9 through 23, then show rows per hour.
service_hours = list(range(9, 24))
in_service_hours = new_data['hour_reg'].isin(service_hours)
new_data = new_data[in_service_hours]
print(new_data['hour_reg'].value_counts())

hour_reg
9     675
10    675
11    675
12    675
13    675
14    675
15    675
16    675
17    675
18    675
19    675
20    675
21    675
22    675
23    675
Name: count, dtype: int64


In [58]:
# Shape after the hour filter; (10125, 7) on the recorded run.
n_rows, n_cols = new_data.shape
print((n_rows, n_cols))

(10125, 7)


### 2. 파생변수 생성 - month, week, day_of_reg2, day_of_reg

In [59]:
# Calendar features derived from reg_date.
reg_dt = new_data['reg_date'].dt
new_data['month'] = reg_dt.month
new_data['week'] = (reg_dt.day - 1) // 7 + 1  # week of month: days 1-7 -> 1, 8-14 -> 2, ...
new_data['day_of_reg2'] = reg_dt.strftime('%a')  # abbreviated weekday name

# Collapse the seven weekday names into three groups: Mon-Thu, Fri, weekend.
day_group = {weekday: '월목' for weekday in ('Mon', 'Tue', 'Wed', 'Thu')}
day_group.update({'Fri': '금', 'Sat': '주말', 'Sun': '주말'})
new_data['day_of_reg'] = new_data['day_of_reg2'].replace(day_group)

for day_col in ('day_of_reg2', 'day_of_reg'):
    print(new_data[day_col].value_counts())

day_of_reg2
Tue    1500
Wed    1500
Thu    1500
Fri    1500
Sat    1500
Sun    1500
Mon    1125
Name: count, dtype: int64
day_of_reg
월목    5625
주말    3000
금     1500
Name: count, dtype: int64


### 3. 파생변수 생성 -  w-1,w-2,w-3,w-4 동일요일, 동일 시간대 주문수, 라이더 수 

In [60]:
# Lagged counts within each (district, weekday, hour) series.
# Rows are ordered by date first so that shift(k) picks the k-th previous row
# of the same series. The columns are named w_2..w_4 rather than w_1..w_3.
new_data = new_data.sort_values(['reg_date', 'pick_rgn2_nm'])
series_keys = ['pick_rgn2_nm', 'day_of_reg2', 'hour_reg']
for metric in ('rider_cnt', 'order_cnt'):
    for rows_back in (1, 2, 3):
        lag_col = f'{metric}_w_{rows_back + 1}'
        new_data[lag_col] = new_data.groupby(series_keys)[metric].shift(rows_back)

In [61]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
new_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10125 entries, 54 to 50552
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   reg_date       10125 non-null  datetime64[ns]
 1   hour_reg       10125 non-null  int64         
 2   day_of_reg     10125 non-null  object        
 3   pick_rgn1_nm   10125 non-null  object        
 4   pick_rgn2_nm   10125 non-null  object        
 5   rider_cnt      10125 non-null  int64         
 6   order_cnt      10125 non-null  int64         
 7   month          10125 non-null  int32         
 8   week           10125 non-null  int32         
 9   day_of_reg2    10125 non-null  object        
 10  rider_cnt_w_2  7500 non-null   float64       
 11  rider_cnt_w_3  4875 non-null   float64       
 12  rider_cnt_w_4  2250 non-null   float64       
 13  order_cnt_w_2  7500 non-null   float64       
 14  order_cnt_w_3  4875 non-null   float64       
 15  order_cnt_w_4  2250 non

### 4. reg_date +7, 변수명 변경 

In [62]:
# Move every row seven days forward and expose its own counts as the w_1
# features of that shifted date.
one_week = pd.DateOffset(days=7)
new_data = new_data.assign(
    reg_date=new_data['reg_date'] + one_week,
    rider_cnt_w_1=new_data['rider_cnt'],
    order_cnt_w_1=new_data['order_cnt'],
)

# Keep only rows on or after the prediction start date.
on_or_after_start = new_data['reg_date'] >= '2023-05-16'
new_data = new_data[on_or_after_start]
print(new_data.shape)  # (2250, 18) on the recorded run

(2250, 18)


### 5. 파생변수 생성 - is_holiday (API 받아서)

In [None]:
# Fetch this year's public holidays from the data.go.kr SpcdeInfoService API.

today = datetime.today().strftime('%Y%m%d')
today_year = datetime.today().year

# NOTE(review): the service key is committed in plain text; consider loading it
# from an environment variable or a secret store instead.
key = 'uw7Y8kgQ1Fqg6z9GjnSN8jJ8S%2FuV%2Bl%2B8PnHC4By9xwtYF5ZAExmO2Ip1mJcQC3HDTDcKmIxB9rBBuI0gDaIYVA%3D%3D'
# The host is apis.data.go.kr; "apis.new_data.go.kr" is not a valid host name.
url = "https://apis.data.go.kr/B090041/openapi/service/SpcdeInfoService/getRestDeInfo?_type=json&numOfRows=50&solYear="+ str(today_year)+ '&ServiceKey=' + str(key)

# A timeout keeps the cell from hanging forever on an unresponsive server.
response = requests.get(url, timeout=10)

# Fail here with a clear message; otherwise a failed request would surface
# later as an undefined (or stale) `new_dataframe`.
if response.status_code != 200:
    raise RuntimeError(f"Holiday API request failed with HTTP status {response.status_code}")

json_ob = json.loads(response.text)
holidays_new_data = json_ob['response']['body']['items']['item']
new_dataframe = json_normalize(holidays_new_data)

holiday_list = new_dataframe['locdate'].tolist()
print(holiday_list)

In [64]:
# Public holidays for 2022-2023 as ISO date strings.
holiday_list = ['2022-01-01', '2022-01-31', '2022-02-01', '2022-03-01', '2022-03-09', '2022-05-05', '2022-05-08', '2022-06-01', '2022-06-06', '2022-08-15', 
                '2022-09-09', '2022-09-10', '2022-09-11', '2022-09-12', '2022-10-03', '2022-10-09', '2022-10-10', '2022-12-25', '2023-01-01', '2023-01-21', 
                '2023-01-22', '2023-01-23', '2023-01-24', '2023-03-01', '2023-05-01', '2023-05-05', '2023-05-27', '2023-05-29', '2023-06-06', '2023-08-15', 
                '2023-09-28', '2023-09-29', '2023-09-30', '2023-10-03', '2023-10-09', '2023-12-25']

# holiday_list holds strings, so it is parsed to timestamps before matching.
# Testing `some_date in holiday_list` compares a date object with strings and
# can never be true, which would leave the holiday flag at 0 for every row.
holiday_dates = pd.to_datetime(holiday_list)
on_public_holiday = new_data['reg_date'].dt.normalize().isin(holiday_dates)
on_weekend = new_data['day_of_reg2'].isin(['Sat', 'Sun'])

# is_holiday: public holiday OR weekend. is_holiday2: public holiday only.
new_data['is_holiday'] = (on_public_holiday | on_weekend).astype('int64')
new_data['is_holiday2'] = on_public_holiday.astype('int64')

print(new_data['is_holiday'].value_counts())  # 1500 / 750 on the recorded run
print(new_data['is_holiday2'].value_counts())  # all 0 on the recorded run (no listed holiday in the predicted dates)


is_holiday
0    1500
1     750
Name: count, dtype: int64
is_holiday2
0    2250
Name: count, dtype: int64


### 6. 파생변수 생성 - is_rain (추후 API로 해결)

In [None]:
# Hour-level rain input (disabled; the date-only version is used instead).
# Enter the dates and the hours for which rain is forecast.
# NOTE(review): this would fail if re-enabled as written. The file imports the
# class with `from datetime import datetime`, so `datetime.time(9, 0, 0)` and
# `datetime.datetime.strptime(...)` do not resolve to the intended functions.
# Also, the printed reg_date values carry a 00:00:00 time, so matching on
# `row['reg_date'].time()` presumably needs `hour_reg` instead - confirm.
# specific_dates = ['2023-05-16', '2023-05-22']
# specific_times = [datetime.time(9, 0, 0), datetime.time(11, 0, 0), datetime.time(15, 0, 0), datetime.time(17, 0, 0)]

# new_data['is_rain'] = new_data.apply(lambda row: 1 if (row['reg_date'].date() in [datetime.datetime.strptime(date, '%Y-%m-%d').date() for date in specific_dates] and row['reg_date'].time() in specific_times) else 0, axis=1)

# print(new_data['is_rain'].value_counts())


In [71]:
# Date-only rain input: flag every row whose date equals the rainy day.
rain_day = datetime.strptime('2023-05-18', '%Y-%m-%d').date()


def _rain_flag(stamp):
    """Return 1 when the timestamp falls on rain_day, otherwise 0."""
    return int(stamp.date() == rain_day)


new_data['is_rain'] = new_data['reg_date'].apply(_rain_flag)

print(new_data['is_rain'].value_counts())  # 375 flagged rows on the recorded run

is_rain
0    1875
1     375
Name: count, dtype: int64


In [67]:
# Shape after adding the holiday and rain flags; (2250, 21) on the recorded run.
row_count, col_count = new_data.shape
print((row_count, col_count))

(2250, 21)


In [72]:
# First and last date covered by the prediction rows.
reg_dates = new_data['reg_date']
for date_bound in (reg_dates.min(), reg_dates.max()):
    print(date_bound)

2023-05-16 00:00:00
2023-05-21 00:00:00


In [73]:
# Remove the region-level-1 label and the raw counts, then save a cp949-encoded
# snapshot of the prediction input.
dropped_cols = ['pick_rgn1_nm', 'rider_cnt', 'order_cnt']
new_data = new_data.drop(columns=dropped_cols)
new_data.to_csv('predict_new_data.csv', index=False, encoding='cp949')

### 7. 데이터 전처리 

In [74]:
# Cast the discrete features to the pandas 'category' dtype:
# pick_rgn2_nm, hour_reg, day_of_reg, is_rain, month, week, is_holiday2.
categorical_cols = ['pick_rgn2_nm', 'hour_reg', 'day_of_reg', 'is_rain', 'month', 'week', 'is_holiday2']
new_data = new_data.astype({name: 'category' for name in categorical_cols})

print(new_data.dtypes)

reg_date         datetime64[ns]
hour_reg               category
day_of_reg             category
pick_rgn2_nm           category
month                  category
week                   category
day_of_reg2              object
rider_cnt_w_2           float64
rider_cnt_w_3           float64
rider_cnt_w_4           float64
order_cnt_w_2           float64
order_cnt_w_3           float64
order_cnt_w_4           float64
rider_cnt_w_1             int64
order_cnt_w_1             int64
is_holiday                int64
is_holiday2            category
is_rain                category
dtype: object


In [75]:
# One-hot encode the categorical features and put the dummy columns in front
# of the remaining (non-encoded) columns.

var = ['pick_rgn2_nm', 'hour_reg','day_of_reg', 'is_rain', 'month','week','is_holiday2']

encode_data = new_data.sort_values(by="reg_date")

# The encoder is fitted on the prediction rows themselves, so it only yields
# dummy columns for category values that occur in this data.
encoder = OneHotEncoder()
dummy_matrix = encoder.fit_transform(encode_data[var]).toarray()
onehot = pd.DataFrame(
    dummy_matrix,
    columns=encoder.get_feature_names_out(var),
    index=encode_data.index,
)

# Everything except reg_date and the encoded source columns is carried over.
passthrough = encode_data.drop(columns=['reg_date'] + var)
X_test = pd.concat([onehot, passthrough], axis=1)
print(X_test.columns)

Index(['pick_rgn2_nm_강남구', 'pick_rgn2_nm_강동구', 'pick_rgn2_nm_강북구',
       'pick_rgn2_nm_강서구', 'pick_rgn2_nm_관악구', 'pick_rgn2_nm_광진구',
       'pick_rgn2_nm_구로구', 'pick_rgn2_nm_금천구', 'pick_rgn2_nm_노원구',
       'pick_rgn2_nm_도봉구', 'pick_rgn2_nm_동대문구', 'pick_rgn2_nm_동작구',
       'pick_rgn2_nm_마포구', 'pick_rgn2_nm_서대문구', 'pick_rgn2_nm_서초구',
       'pick_rgn2_nm_성동구', 'pick_rgn2_nm_성북구', 'pick_rgn2_nm_송파구',
       'pick_rgn2_nm_양천구', 'pick_rgn2_nm_영등포구', 'pick_rgn2_nm_용산구',
       'pick_rgn2_nm_은평구', 'pick_rgn2_nm_종로구', 'pick_rgn2_nm_중구',
       'pick_rgn2_nm_중랑구', 'hour_reg_9', 'hour_reg_10', 'hour_reg_11',
       'hour_reg_12', 'hour_reg_13', 'hour_reg_14', 'hour_reg_15',
       'hour_reg_16', 'hour_reg_17', 'hour_reg_18', 'hour_reg_19',
       'hour_reg_20', 'hour_reg_21', 'hour_reg_22', 'hour_reg_23',
       'day_of_reg_금', 'day_of_reg_월목', 'day_of_reg_주말', 'is_rain_0',
       'is_rain_1', 'month_5', 'week_2', 'is_holiday2_0', 'day_of_reg2',
       'rider_cnt_w_2', 'rider_cnt_w_3', 'rider

### 8. 존재하지 않는 변수 추가하기

In [76]:
# Dummy columns that must exist in X_test whichever values occur in this run.
expected_dummies = (
    ['is_rain_0', 'is_rain_1']
    + [f'month_{m}' for m in range(1, 13)]
    + [f'week_{w}' for w in range(1, 6)]
    + ['is_holiday2_0', 'is_holiday2_1']
)

# Add only the dummies the encoder did not produce. A fixed list would create
# duplicate columns (or leave some missing) whenever the data covers a
# different month or week, contains a holiday, or has no rainy day.
new_variables = [name for name in expected_dummies if name not in X_test.columns]

# All-zero frame for the missing dummies, aligned to X_test's rows.
encode_data = pd.DataFrame(0, columns=new_variables, index= X_test.index)

# Append the zero-filled dummies to X_test.
X_test = pd.concat([X_test, encode_data], axis=1)

# Check the result.
print(X_test.columns)

Index(['pick_rgn2_nm_강남구', 'pick_rgn2_nm_강동구', 'pick_rgn2_nm_강북구',
       'pick_rgn2_nm_강서구', 'pick_rgn2_nm_관악구', 'pick_rgn2_nm_광진구',
       'pick_rgn2_nm_구로구', 'pick_rgn2_nm_금천구', 'pick_rgn2_nm_노원구',
       'pick_rgn2_nm_도봉구', 'pick_rgn2_nm_동대문구', 'pick_rgn2_nm_동작구',
       'pick_rgn2_nm_마포구', 'pick_rgn2_nm_서대문구', 'pick_rgn2_nm_서초구',
       'pick_rgn2_nm_성동구', 'pick_rgn2_nm_성북구', 'pick_rgn2_nm_송파구',
       'pick_rgn2_nm_양천구', 'pick_rgn2_nm_영등포구', 'pick_rgn2_nm_용산구',
       'pick_rgn2_nm_은평구', 'pick_rgn2_nm_종로구', 'pick_rgn2_nm_중구',
       'pick_rgn2_nm_중랑구', 'hour_reg_9', 'hour_reg_10', 'hour_reg_11',
       'hour_reg_12', 'hour_reg_13', 'hour_reg_14', 'hour_reg_15',
       'hour_reg_16', 'hour_reg_17', 'hour_reg_18', 'hour_reg_19',
       'hour_reg_20', 'hour_reg_21', 'hour_reg_22', 'hour_reg_23',
       'day_of_reg_금', 'day_of_reg_월목', 'day_of_reg_주말', 'is_rain_0',
       'is_rain_1', 'month_5', 'week_2', 'is_holiday2_0', 'day_of_reg2',
       'rider_cnt_w_2', 'rider_cnt_w_3', 'rider

### 9. X_test 확인 및 변수 순서 정렬

In [78]:
# Shape of the feature frame, (2250, 74) on the recorded run, followed by the
# per-column count of missing values shown as the cell result.
print(X_test.shape)
missing_per_column = X_test.isna().sum()
missing_per_column

(2250, 74)


pick_rgn2_nm_강남구    0
pick_rgn2_nm_강동구    0
pick_rgn2_nm_강북구    0
pick_rgn2_nm_강서구    0
pick_rgn2_nm_관악구    0
                   ..
week_1              0
week_3              0
week_4              0
week_5              0
is_holiday2_1       0
Length: 74, dtype: int64

In [79]:
# Column order for the feature frame: district dummies, hour dummies,
# day-group and rain dummies, month and week dummies, holiday dummies, then
# the lagged rider and order counts.
districts = [
    '강남구', '강동구', '강북구', '강서구', '관악구', '광진구', '구로구', '금천구', '노원구',
    '도봉구', '동대문구', '동작구', '마포구', '서대문구', '서초구', '성동구', '성북구', '송파구',
    '양천구', '영등포구', '용산구', '은평구', '종로구', '중구', '중랑구',
]
desired_order = (
    [f'pick_rgn2_nm_{district}' for district in districts]
    + [f'hour_reg_{hour}' for hour in range(9, 24)]
    + ['day_of_reg_금', 'day_of_reg_월목', 'day_of_reg_주말']
    + ['is_rain_0', 'is_rain_1']
    + [f'month_{month_no}' for month_no in range(1, 13)]
    + [f'week_{week_no}' for week_no in range(1, 6)]
    + ['is_holiday2_0', 'is_holiday2_1']
    + [f'{metric}_w_{lag}' for metric in ('rider_cnt', 'order_cnt') for lag in range(1, 5)]
)

# Select the columns named in desired_order, in that order. Columns of X_test
# that are not listed there are dropped by this selection, and a listed column
# that is absent from X_test raises KeyError.
# NOTE(review): presumably this is the column order the saved models were
# trained with - confirm against the training notebook.
X_test = X_test[desired_order]

### 10. model 적용 

In [82]:

def get_predict(X_test):
    """Predict with the three saved models and write the results to CSV.

    Loads the Lasso, LGBMRegressor and RandomForestRegressor models from the
    ``model_*.joblib`` files in the working directory, predicts on ``X_test``,
    averages the three predictions, and writes them to
    ``prediction_results_latest6days.csv`` (cp949-encoded) together with the
    identifying columns read from the module-level ``new_data`` frame.

    Args:
        X_test: Feature frame passed to each model's ``predict``. Its index
            labels must be unique and refer to rows of ``new_data``.

    Returns:
        None. The only output is the CSV file.
    """
    Lasso_model = joblib.load('model_Lasso.joblib')
    LGBM_model = joblib.load('model_LGBMRegressor.joblib')
    RF_model = joblib.load('model_RandomForestRegressor.joblib')

    result_df = pd.DataFrame({'reg_date': new_data["reg_date"],
                              'pick_rgn2_nm': new_data["pick_rgn2_nm"], 'hour_reg': new_data["hour_reg"],
                              'day_of_reg': new_data["day_of_reg"], 'is_rain': new_data["is_rain"],
                              'is_holiday': new_data["is_holiday2"]})

    # X_test is built from a re-sorted copy of new_data, so its row order is
    # not guaranteed to match new_data's. Each prediction vector is therefore
    # labelled with X_test's index and aligned by label, instead of being
    # attached positionally, so every prediction lands on its own row.
    fitted_models = {
        'y_pred_Lasso': Lasso_model,
        'y_pred_LGBM': LGBM_model,
        'y_pred_RF': RF_model,
    }
    for pred_col, model in fitted_models.items():
        result_df[pred_col] = pd.Series(model.predict(X_test), index=X_test.index)

    # Simple average of the three model predictions.
    result_df['y_pred_avg'] = (result_df['y_pred_Lasso'] + result_df['y_pred_LGBM'] + result_df['y_pred_RF']) / 3

    result_df.to_csv('prediction_results_latest6days.csv', index=False, encoding="cp949")


get_predict(X_test)

### 11. 보정계수 적용 - rgn2, is_rain, is_holiday 

In [86]:
# Load the saved test-set predictions (cp949-encoded) and preview five rows.
buffer_path = "prediction_results_test_set.csv"
buffer_df = pd.read_csv(buffer_path, encoding="cp949")
buffer_df.head()

Unnamed: 0,datetime,pick_rgn2_nm,hour_reg,is_rain,day_of_reg,is_holiday,y_test,y_pred_test_LinearRegression,y_pred_test_Ridge,y_pred_test_Lasso,y_pred_test_LGBMRegressor,y_pred_test_RandomForestRegressor,y_pred_test_DecisionTreeRegressor,Average Prediction
0,2023-01-01 09:00:00,서초구,9,0,주말,1,71.0,93.092379,92.958571,92.494702,73.848144,82.477112,81.943897,82.939986
1,2023-01-01 09:00:00,성동구,9,0,주말,1,37.0,51.329414,51.264454,55.904069,48.885704,51.179571,52.176028,51.989781
2,2023-01-01 09:00:00,성북구,9,0,주말,1,50.0,70.875048,70.73649,70.450702,61.640803,64.114463,64.951275,65.401989
3,2023-01-01 09:00:00,송파구,9,0,주말,1,124.0,160.912889,160.663204,158.944535,154.45378,159.02163,161.572281,157.473315
4,2023-01-01 09:00:00,양천구,9,0,주말,1,56.0,58.335773,58.243148,61.113861,50.560986,52.604213,52.176028,54.759687


In [91]:
# The loaded CSV names the averaged prediction 'Average Prediction'; create
# the 'y_pred_test_avg' column from it when it is not already present, so this
# cell also works when the notebook is run from top to bottom.
if 'y_pred_test_avg' not in buffer_df.columns:
    buffer_df['y_pred_test_avg'] = buffer_df['Average Prediction']

# Correction ratio: observed value divided by the averaged prediction.
buffer_df['buffer_value'] = buffer_df['y_test'] / buffer_df['y_pred_test_avg']
buffer_df.head()

Unnamed: 0,datetime,pick_rgn2_nm,hour_reg,is_rain,day_of_reg,is_holiday,y_test,y_pred_test_LinearRegression,y_pred_test_Ridge,y_pred_test_Lasso,y_pred_test_LGBMRegressor,y_pred_test_RandomForestRegressor,y_pred_test_DecisionTreeRegressor,Average Prediction,y_pred_test_avg,buffer_value
0,2023-01-01 09:00:00,서초구,9,0,주말,1,71.0,93.092379,92.958571,92.494702,73.848144,82.477112,81.943897,82.939986,82.939986,0.856041
1,2023-01-01 09:00:00,성동구,9,0,주말,1,37.0,51.329414,51.264454,55.904069,48.885704,51.179571,52.176028,51.989781,51.989781,0.711678
2,2023-01-01 09:00:00,성북구,9,0,주말,1,50.0,70.875048,70.73649,70.450702,61.640803,64.114463,64.951275,65.401989,65.401989,0.764503
3,2023-01-01 09:00:00,송파구,9,0,주말,1,124.0,160.912889,160.663204,158.944535,154.45378,159.02163,161.572281,157.473315,157.473315,0.787435
4,2023-01-01 09:00:00,양천구,9,0,주말,1,56.0,58.335773,58.243148,61.113861,50.560986,52.604213,52.176028,54.759687,54.759687,1.02265
