## 최신 6일 라이더 운행 인원 예측

### 1. import new_dataset 

In [37]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from time import time
from datetime import datetime, timedelta 
import joblib
import json
import requests 

# sklearn 관련
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
import ast 
import statsmodels.api as sm 
from scipy import stats
from pandas import json_normalize
from datetime import datetime, date

# 전처리 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# lightgbm 관련
from lightgbm import LGBMRegressor
from lightgbm import plot_importance

In [38]:
new_data = pd.read_excel("/Users/yj.noh/Desktop/seoul/seoul_day_new_data_20230613.xlsx")
new_data = new_data.rename(columns={'라이더수': 'rider_cnt', '주문수': 'order_cnt'})

new_data['reg_date'] = pd.to_datetime(new_data['reg_date']).dt.date
new_data = new_data[(new_data['pick_rgn1_nm'] == '서울특별시') & (new_data['reg_date'] < new_data['reg_date'].max())]

print(new_data.shape) 

(377, 5)


In [39]:
#print(new_data['pick_rgn2_nm'].value_counts())

print(new_data['reg_date'].min()) #2022-06-01
print(new_data['reg_date'].max()) #2023-06-12

2022-06-01
2023-06-12


In [40]:
new_data.isna().sum()

reg_date        0
holiday_yn      0
pick_rgn1_nm    0
rider_cnt       0
order_cnt       0
dtype: int64

### 2. 파생변수 생성 - month, week, day_of_reg2, day_of_reg

In [41]:
new_data['reg_date'] = pd.to_datetime(new_data['reg_date'])

new_data['day_of_reg'] = new_data['reg_date'].dt.strftime('%a')
new_data['day_of_reg'] = new_data['day_of_reg'].replace({'Sat': '토요일', 'Sun': '일요일', 'Fri': '금요일', 'Mon': '월요일', 'Tue': '화요일', 'Wed': '수요일', 'Thu': '목요일'})
print(new_data['day_of_reg'].value_counts())

day_of_reg
수요일    54
목요일    54
금요일    54
토요일    54
일요일    54
월요일    54
화요일    53
Name: count, dtype: int64


### 3. reg_date +7, 변수명 변경 

In [42]:
new_data['reg_date'] = new_data['reg_date'] + pd.DateOffset(days=7)

new_data['rider_cnt_w_1'] = new_data['rider_cnt']
new_data['order_cnt_w_1'] = new_data['order_cnt']

print(new_data['reg_date'].min()) #2022-06-08
print(new_data['reg_date'].max()) #2023-06-19



2022-06-08 00:00:00
2023-06-19 00:00:00


### 4. 파생변수 생성 - is_holiday 

In [43]:
# #공휴일 가져오기 
# import json
# from pandas import json_normalize
# from urllib.parse import unquote

# today = datetime.today().strftime('%Y%m%d')
# today_year = datetime.today().year

# key = unquote('uw7Y8kgQ1Fqg6z9GjnSN8jJ8S%2FuV%2Bl%2B8PnHC4By9xwtYF5ZAExmO2Ip1mJcQC3HDTDcKmIxB9rBBuI0gDaIYVA%3D%3D')
# url = "https://apis.data.go.kr/B090041/openapi/service/SpcdeInfoService/getRestDeInfo?&solYear="+ str(today_year)+ '&ServiceKey=' + str(key) 

# response = requests.get(url)

# if response.status_code == 200 :
#     json_ob = json.loads(response.text)
#     holidays_new_data = json_ob['response']['body']['items']['item']
#     new_dataframe = json_normalize(holidays_new_data)

# holiday_list = new_dataframe['locdate'].tolist()
# print(holiday_list)


In [44]:
holiday_list = [datetime.strptime(date, '%Y-%m-%d').date() for date in ['2022-01-01', '2022-01-31', '2022-02-01', '2022-03-01', '2022-03-09', '2022-05-05', '2022-05-08', '2022-06-01', '2022-06-06', '2022-08-15', 
                '2022-09-09', '2022-09-10', '2022-09-11', '2022-09-12', '2022-10-03', '2022-10-09', '2022-10-10', '2022-12-25', '2023-01-01', '2023-01-21', 
                '2023-01-22', '2023-01-23', '2023-01-24', '2023-03-01', '2023-05-01', '2023-05-05', '2023-05-27', '2023-05-29', '2023-06-06', '2023-08-15', 
                '2023-09-28', '2023-09-29', '2023-09-30', '2023-10-03', '2023-10-09', '2023-12-25']]

new_data['is_holiday'] = new_data.apply(lambda row: 1 if (row['reg_date'].date() in holiday_list) or (row['day_of_reg'] in ['토요일', '일요일']) else 0, axis=1)
print(new_data['is_holiday'].value_counts())  

is_holiday
0    257
1    120
Name: count, dtype: int64


### 4. 파생변수 생성 - is_rain 

In [45]:
new_data['reg_date'] = pd.to_datetime(new_data['reg_date']).dt.date

weather = pd.read_csv("/Users/yj.noh/Desktop/seoul/seoul_day_weather_20230613.csv", encoding='cp949')

weather = weather.rename(columns={'기온(°C)': 'temp_c', '일강수량(mm)': 'rain_c', '일 최심적설(cm)': 'snow_c', '일시': 'date'})

weather['date_2'] = pd.to_datetime(weather['date']).dt.date

new_data = pd.merge(new_data, weather[['date_2', 'rain_c', 'snow_c']], left_on=['reg_date'], right_on=['date_2'], how='left')

new_data['rain_c'] = new_data['rain_c'].fillna(0)
new_data['snow_c'] = new_data['snow_c'].fillna(0)

new_data['is_rain'] = np.where((new_data['rain_c'] > 0) | (new_data['snow_c'] > 0), 1, 0)

print(new_data['is_rain'].value_counts()) 

is_rain
0    258
1    119
Name: count, dtype: int64


In [46]:
# 모두 비 안온다고 가정 
rain_list = [datetime.strptime(date, '%Y-%m-%d').date() for date in ['2023-06-14', '2023-06-15', '2023-06-16','2023-06-17', '2023-06-18', '2023-06-19']]

new_data.loc[new_data['reg_date'].isin(rain_list), 'is_rain'] = 1

print(new_data['is_rain'].value_counts()) 



is_rain
0    252
1    125
Name: count, dtype: int64


In [47]:
print(new_data.shape) 
print(new_data['reg_date'].min())
print(new_data['reg_date'].max())

#print(new_data.isna().sum())

(377, 13)
2022-06-08
2023-06-19


### rain_group 추가

In [48]:
# rain = [datetime.strptime(date, '%Y-%m-%d').date() for date in ['2023-06-08', '2023-06-09', '2023-06-10','2023-06-11', '2023-06-12', '2023-06-13']]
# new_data.loc[new_data['reg_date'].isin(rain), 'rain_group'] = 'weak'

# # def assign_rain_group(rain_c):
# #     if rain_c <= 0:
# #         return "no"
# #     elif 0 < rain_c and rain_c < 3.0:
# #         return "weak"
# #     elif 3.0 <= rain_c and rain_c < 15:
# #         return "normal"
# #     elif 15 <= rain_c and rain_c < 30:
# #         return "strong"
# #     elif 30 <= rain_c:
# #         return "very_strong"


# print(new_data['rain_group'].value_counts())


### 5. 파생변수 생성 -  w-1,w-2,w-3,w-4 동일요일, 동일 시간대 주문수, 라이더 수 

In [49]:
new_data = new_data.sort_values(['reg_date'])
new_data['rider_cnt_w_2'] = new_data.groupby(['day_of_reg', 'is_rain'])['rider_cnt'].shift(1)
new_data['rider_cnt_w_3'] = new_data.groupby(['day_of_reg', 'is_rain'])['rider_cnt'].shift(2)
new_data['rider_cnt_w_4'] = new_data.groupby(['day_of_reg', 'is_rain'])['rider_cnt'].shift(3)

print(new_data.isna().sum())

reg_date          0
holiday_yn        0
pick_rgn1_nm      0
rider_cnt         0
order_cnt         0
day_of_reg        0
rider_cnt_w_1     0
order_cnt_w_1     0
is_holiday        0
date_2            7
rain_c            0
snow_c            0
is_rain           0
rider_cnt_w_2    14
rider_cnt_w_3    28
rider_cnt_w_4    42
dtype: int64


In [50]:
# 결측치 애우기 

new_data = new_data.sort_values(by=['day_of_reg'])

new_data['rider_cnt_w_2'].fillna(new_data['rider_cnt_w_1'], inplace=True)
new_data['rider_cnt_w_3'].fillna(new_data['rider_cnt_w_2'], inplace=True)
new_data['rider_cnt_w_4'].fillna(new_data['rider_cnt_w_3'], inplace=True)

print(new_data.isna().sum())



reg_date         0
holiday_yn       0
pick_rgn1_nm     0
rider_cnt        0
order_cnt        0
day_of_reg       0
rider_cnt_w_1    0
order_cnt_w_1    0
is_holiday       0
date_2           7
rain_c           0
snow_c           0
is_rain          0
rider_cnt_w_2    0
rider_cnt_w_3    0
rider_cnt_w_4    0
dtype: int64


### 6. 파생변수 생성 group_s 

In [51]:
# def group_assignment(row):
#     if row['day_of_reg'] in ['월요일','화요일','수요일','목요일','금요일']:
#         if row['holiday_yn'] == 0 and row['is_rain'] == 0:
#             return 'A'
#         elif row['holiday_yn'] == 0 and row['is_rain'] == 1:
#             return 'B'
#         elif row['holiday_yn'] == 1 and row['is_rain'] == 0:
#             return 'C'
#         elif row['holiday_yn'] == 1 and row['is_rain'] == 1:
#             return 'D'
#     elif row['day_of_reg'] in ['토요일','일요일']:
#         if row['holiday_yn'] == 0 and row['is_rain'] == 0:
#             return 'E'
#         elif row['holiday_yn'] == 0 and row['is_rain'] == 1:
#             return 'F'
 
# new_data['group_s'] = new_data.apply(group_assignment, axis=1)
# print(new_data['group_s'].value_counts())

In [52]:
tomorrow = (datetime.now() + timedelta(days=1)).date()
new_data = new_data[new_data['reg_date'].apply(lambda x: x) >= tomorrow] # 내일날짜부터 예측되게끔. 
new_data.to_csv('predict_new_data.csv', encoding='cp949', index=False)

# new_data['reg_date'] = pd.to_datetime(new_data['reg_date']).dt.date
# new_data = new_data[new_data['reg_date'] >= date(2023, 6, 8)]
# new_data = new_data[new_data['reg_date'] <= date(2023, 6, 13)]

print(new_data.shape)  # (2,250, num_columns)
#print(new_data.info())
print(new_data['reg_date'].min())  #2023-06-14

(5, 16)
2023-06-15


### 7. 데이터 전처리 

In [53]:
# category 
for col in ['day_of_reg',  'is_holiday', 'is_rain'] : 
   new_data.loc[:, col] = new_data.loc[:, col].astype('category')

print(new_data.dtypes)

reg_date           object
holiday_yn         object
pick_rgn1_nm       object
rider_cnt           int64
order_cnt           int64
day_of_reg         object
rider_cnt_w_1       int64
order_cnt_w_1       int64
is_holiday       category
date_2             object
rain_c            float64
snow_c            float64
is_rain          category
rider_cnt_w_2     float64
rider_cnt_w_3     float64
rider_cnt_w_4     float64
dtype: object


In [54]:
# one-hot-encoding

new_data = new_data.drop(['holiday_yn','pick_rgn1_nm','rain_c','snow_c', 'rider_cnt','order_cnt','date_2'], axis=1)

var = ['day_of_reg', 'is_holiday', 'is_rain']

new_data = new_data.sort_values(by="reg_date")
encode_data = new_data.sort_values(by="reg_date")

X_test = encode_data.drop(columns = ['reg_date'])

encoder = OneHotEncoder()
onehot = pd.DataFrame(encoder.fit_transform(encode_data[var]).toarray(), columns=encoder.get_feature_names_out(var), index = encode_data.index)
X_test = pd.concat([onehot, X_test.drop(columns=var)], axis=1)

print(X_test.columns)

Index(['day_of_reg_금요일', 'day_of_reg_목요일', 'day_of_reg_월요일', 'day_of_reg_일요일',
       'day_of_reg_토요일', 'is_holiday_0', 'is_holiday_1', 'is_rain_1',
       'rider_cnt_w_1', 'order_cnt_w_1', 'rider_cnt_w_2', 'rider_cnt_w_3',
       'rider_cnt_w_4'],
      dtype='object')


### 8. 존재하지 않는 변수 추가하기

In [55]:
desired_order = ['day_of_reg_금요일', 'day_of_reg_목요일', 'day_of_reg_수요일', 'day_of_reg_월요일',
       'day_of_reg_일요일', 'day_of_reg_토요일', 'day_of_reg_화요일', 'is_rain_0',
       'is_rain_1', 'is_holiday_0', 'is_holiday_1', 'rider_cnt_w_1',
       'rider_cnt_w_2', 'rider_cnt_w_3', 'rider_cnt_w_4', 'order_cnt_w_1']


new_variables = [col for col in desired_order if col not in X_test.columns]

# 새로운 변수를 포함한 빈 DataFrame 생성
encode_data = pd.DataFrame(0, columns=new_variables, index= X_test.index)


# 기존 x_test DataFrame과 새로운 변수를 포함한 DataFrame을 병합
X_test = pd.concat([X_test, encode_data], axis=1)
X_test = X_test[desired_order]

# 결과 확인
#print(X_test.head())
print(X_test.columns)

Index(['day_of_reg_금요일', 'day_of_reg_목요일', 'day_of_reg_수요일', 'day_of_reg_월요일',
       'day_of_reg_일요일', 'day_of_reg_토요일', 'day_of_reg_화요일', 'is_rain_0',
       'is_rain_1', 'is_holiday_0', 'is_holiday_1', 'rider_cnt_w_1',
       'rider_cnt_w_2', 'rider_cnt_w_3', 'rider_cnt_w_4', 'order_cnt_w_1'],
      dtype='object')


### 9. train, test set split 

In [56]:
print(X_test.shape) 
X_test.isna().sum()

(5, 16)


day_of_reg_금요일    0
day_of_reg_목요일    0
day_of_reg_수요일    0
day_of_reg_월요일    0
day_of_reg_일요일    0
day_of_reg_토요일    0
day_of_reg_화요일    0
is_rain_0         0
is_rain_1         0
is_holiday_0      0
is_holiday_1      0
rider_cnt_w_1     0
rider_cnt_w_2     0
rider_cnt_w_3     0
rider_cnt_w_4     0
order_cnt_w_1     0
dtype: int64

### 10. model 적용 

In [57]:
def get_predict(X_test):
    
    LM_model = joblib.load('model_LinearRegression.joblib')
    LGBM_model = joblib.load('model_LGBMRegressor.joblib')
    RF_model = joblib.load('model_RandomForestRegressor.joblib')

    
    result_df = pd.DataFrame({'reg_date': new_data["reg_date"],
                              'day_of_reg': new_data["day_of_reg"], 'is_rain': new_data["is_rain"],
                              'is_holiday': new_data["is_holiday"]})


    y_pred_LM = LM_model.predict(X_test)
    y_pred_LGBM = LGBM_model.predict(X_test)
    y_pred_RF = RF_model.predict(X_test)
    
     # Confidence intervals
    alpha = 0.1  # 90% confidence interval
    z_score = stats.norm.ppf(1 - alpha / 2)  # Z-score for given alpha
    std_Lasso = np.std(y_pred_LM)
    std_LGBM = np.std(y_pred_LGBM)
    std_RF = np.std(y_pred_RF)
    n_samples = X_test.shape[0]

    ci_LM = z_score * std_Lasso / np.sqrt(n_samples)
    ci_LGBM = z_score * std_LGBM / np.sqrt(n_samples)
    ci_RF = z_score * std_RF / np.sqrt(n_samples)
    
    result_df['y_pred_LM'] = y_pred_LM
    result_df['y_pred_LGBM'] = y_pred_LGBM
    result_df['y_pred_RF'] = y_pred_RF
    result_df['y_pred'] = (result_df['y_pred_LM'] + result_df['y_pred_LGBM'] + result_df['y_pred_RF']) / 3 #3개 모델 예측값 
    
    # Add confidence intervals to the dataframe
    result_df['ci_LM'] = ci_LM
    result_df['ci_LGBM'] = ci_LGBM
    result_df['ci_RF'] = ci_RF
    
    result_df.to_csv('prediction_results_day_latest6days_weather_O.csv', index=False, encoding="cp949")


get_predict(X_test)