## 최신 6일 라이더 운행 인원 예측

### 1. import new_dataset 

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from time import time
from datetime import datetime, timedelta 
import joblib
import json
import requests 

# sklearn 관련
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
import ast 
import statsmodels.api as sm 
from pandas import json_normalize

# 전처리 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# lightgbm 관련
from lightgbm import LGBMRegressor
from lightgbm import plot_importance

###### data download : https://redash.woowa.in/queries/26484/source?p_dateFrom=2023-01-01&p_dateTo=d_yesterday 
###### weather download : https://data.kma.go.kr/data/grnd/selectAsosRltmList.do?pgmNo=36

In [None]:
# Load the raw export and give the Korean count columns English names.
new_data = pd.read_excel("/Users/yj.noh/Desktop/new_data.xlsx").rename(
    columns={'라이더수': 'rider_cnt', '주문수': 'order_cnt'}
)

# Work with plain dates (no time component).
new_data['reg_date'] = pd.to_datetime(new_data['reg_date']).dt.date

# Keep Seoul rows only and drop the most recent date in the file.
is_seoul = new_data['pick_rgn1_nm'] == '서울특별시'
before_last_day = new_data['reg_date'] < new_data['reg_date'].max()
new_data = new_data[is_seoul & before_last_day]

print(new_data.shape) # 67395

In [None]:
# Sanity check: rows per hour and the covered date range.
#print(new_data['pick_rgn2_nm'].value_counts())
hour_counts = new_data['hour_reg'].value_counts()
print(hour_counts)

date_column = new_data['reg_date']
print(date_column.min())
print(date_column.max())

In [None]:
# Keep only the 09:00-23:00 hour slots.
operating_hours = list(range(9, 24))
in_operating_hours = new_data['hour_reg'].isin(operating_hours)
new_data = new_data[in_operating_hours]

print(new_data['hour_reg'].value_counts())
print(new_data.shape)  # 58125

date_column = new_data['reg_date']
print(date_column.min())
print(date_column.max())

In [None]:
# Number of missing values per column.
new_data.isnull().sum()

### 2. 파생변수 생성 - week, day_of_reg

In [None]:
new_data['reg_date'] = pd.to_datetime(new_data['reg_date'])
#new_data['month'] = new_data['reg_date'].dt.month

# Week-of-month index: days 1-7 -> 1, 8-14 -> 2, ..., 29-31 -> 5.
new_data['week'] = ((new_data['reg_date'].dt.day - 1) // 7) + 1

# Map the numeric weekday (Monday=0 ... Sunday=6) straight to the Korean day
# name. This avoids strftime('%a'), whose output depends on the process
# locale and would silently bypass an English-abbreviation lookup.
korean_day_names = {
    0: '월요일',
    1: '화요일',
    2: '수요일',
    3: '목요일',
    4: '금요일',
    5: '토요일',
    6: '일요일',
}
new_data['day_of_reg'] = new_data['reg_date'].dt.dayofweek.map(korean_day_names)
print(new_data['day_of_reg'].value_counts())

### 3. reg_date +7, 변수명 변경 

In [None]:
# Move every observation one week forward so that each row describes the
# target date, with the observed counts becoming the "one week earlier" lags.
one_week = pd.DateOffset(days=7)
new_data['reg_date'] = new_data['reg_date'] + one_week

for observed in ('rider_cnt', 'order_cnt'):
    new_data[f'{observed}_w_1'] = new_data[observed]

shifted_dates = new_data['reg_date']
print(shifted_dates.min()) #2023-01-08
print(shifted_dates.max()) #2023-06-11



### 4. 파생변수 생성 - holiday_yn (공휴일·주말 여부)

In [None]:
# #공휴일 가져오기 
# import json
# from pandas import json_normalize
# from urllib.parse import unquote

# today = datetime.today().strftime('%Y%m%d')
# today_year = datetime.today().year

# key = unquote('uw7Y8kgQ1Fqg6z9GjnSN8jJ8S%2FuV%2Bl%2B8PnHC4By9xwtYF5ZAExmO2Ip1mJcQC3HDTDcKmIxB9rBBuI0gDaIYVA%3D%3D')
# url = "https://apis.data.go.kr/B090041/openapi/service/SpcdeInfoService/getRestDeInfo?&solYear="+ str(today_year)+ '&ServiceKey=' + str(key) 

# response = requests.get(url)

# if response.status_code == 200 :
#     json_ob = json.loads(response.text)
#     holidays_new_data = json_ob['response']['body']['items']['item']
#     new_dataframe = json_normalize(holidays_new_data)

# holiday_list = new_dataframe['locdate'].tolist()
# print(holiday_list)


In [None]:
# Public holidays for 2022-2023, maintained by hand.
holiday_list = [datetime.strptime(date, '%Y-%m-%d').date() for date in ['2022-01-01', '2022-01-31', '2022-02-01', '2022-03-01', '2022-03-09', '2022-05-05', '2022-05-08', '2022-06-01', '2022-06-06', '2022-08-15', 
                '2022-09-09', '2022-09-10', '2022-09-11', '2022-09-12', '2022-10-03', '2022-10-09', '2022-10-10', '2022-12-25', '2023-01-01', '2023-01-21', 
                '2023-01-22', '2023-01-23', '2023-01-24', '2023-03-01', '2023-05-01', '2023-05-05', '2023-05-27', '2023-05-29', '2023-06-06', '2023-08-15', 
                '2023-09-28', '2023-09-29', '2023-09-30', '2023-10-03', '2023-10-09', '2023-12-25']]

# holiday_yn is 1 for listed public holidays and for every Saturday/Sunday.
# Vectorized masks replace the row-wise apply: same result, no per-row list
# scan, and it also works when the frame is empty.
is_public_holiday = new_data['reg_date'].dt.date.isin(holiday_list)
is_weekend = new_data['day_of_reg'].isin(['토요일', '일요일'])
new_data['holiday_yn'] = np.where(is_public_holiday | is_weekend, 1, 0)
print(new_data['holiday_yn'].value_counts())  

### 4-2. 파생변수 생성 - is_rain

In [None]:
# Back to plain dates so reg_date can be matched against the weather dates.
new_data['reg_date'] = pd.to_datetime(new_data['reg_date']).dt.date

# Hourly weather observations; rename the Korean headers used below.
weather = pd.read_csv("/Users/yj.noh/Desktop/weather_2023.csv", encoding='cp949').rename(
    columns={'기온(°C)': 'temp_c', '강수량(mm)': 'rain_c', '적설(cm)': 'snow_c', '일시': 'date'}
)

# Split the observation timestamp into a date and an hour for the join.
observed_at = pd.to_datetime(weather['date'])
weather['date_2'] = observed_at.dt.date
weather['hour'] = observed_at.dt.hour

weather = weather[weather['hour'].isin(list(range(9, 24)))]

# Left join: rows without a matching observation keep NaN and are treated
# as "no precipitation" below.
new_data = new_data.merge(
    weather[['date_2', 'hour','rain_c', 'snow_c']],
    how='left',
    left_on=['reg_date', 'hour_reg'],
    right_on=['date_2', 'hour'],
)

for precipitation in ('rain_c', 'snow_c'):
    new_data[precipitation] = new_data[precipitation].fillna(0)

# is_rain is 1 when either rain or snow was recorded for that hour.
has_precipitation = new_data['rain_c'].gt(0) | new_data['snow_c'].gt(0)
new_data['is_rain'] = np.where(has_precipitation, 1, 0)

print(new_data['is_rain'].value_counts()) 

In [None]:
# Alternative (disabled): flag a single date only.
# new_data['is_rain'] = new_data['reg_date'].apply(lambda x: 1 if x.date() == datetime.strptime('2023-05-18', '%Y-%m-%d').date() else 0)
# print(new_data['is_rain'].value_counts()) #375

# Dates entered by hand as rainy; every hour of these dates gets is_rain = 1.
rain_dates = ['2023-06-09']
rain_list = [datetime.strptime(day, '%Y-%m-%d').date() for day in rain_dates]

on_rain_date = new_data['reg_date'].isin(rain_list)
new_data.loc[on_rain_date, 'is_rain'] = 1

print(new_data['is_rain'].value_counts()) # 



In [None]:
# Size and date range after the weather join.
print(new_data.shape) 

date_column = new_data['reg_date']
print(date_column.min())
print(date_column.max())

#print(new_data.isna().sum())

### rain_group 추가

In [None]:
# Dates entered by hand as dry; their rows get rain_group = 'no'.
# Rows on any other date keep a missing rain_group.
# NOTE(review): 2023-06-09 is also listed in rain_list (is_rain = 1) - confirm
# which of the two is intended for that date.
dry_dates = ['2023-06-06', '2023-06-07', '2023-06-08','2023-06-09', '2023-06-10', '2023-06-11']
not_rain = [datetime.strptime(day, '%Y-%m-%d').date() for day in dry_dates]

on_dry_date = new_data['reg_date'].isin(not_rain)
new_data.loc[on_dry_date, 'rain_group'] = 'no'

# Disabled reference: thresholds for deriving rain_group from rain_c.
# def assign_rain_group(rain_c):
#     if rain_c <= 0:
#         return "no"
#     elif 0 < rain_c and rain_c < 3.0:
#         return "weak"
#     elif 3.0 <= rain_c and rain_c < 15:
#         return "normal"
#     elif 15 <= rain_c and rain_c < 30:
#         return "strong"
#     elif 30 <= rain_c:
#         return "very_strong"


print(new_data['rain_group'].value_counts())


### 5. 파생변수 생성 -  w-1,w-2,w-3,w-4 동일요일, 동일 시간대 주문수, 라이더 수 

In [None]:
# Order rows chronologically so that shifting inside a group looks backwards
# in time.
new_data = new_data.sort_values(['reg_date', 'pick_rgn2_nm'])

# A "slot" is the same district, weekday, hour and rain flag. The previous
# 1/2/3 rows of the same slot become the w_2 / w_3 / w_4 lag columns.
slot_keys = ['pick_rgn2_nm', 'day_of_reg', 'hour_reg','is_rain']
for lag_suffix in (2, 3, 4):
    new_data[f'rider_cnt_w_{lag_suffix}'] = (
        new_data.groupby(slot_keys)['rider_cnt'].shift(lag_suffix - 1)
    )

print(new_data.isna().sum())

In [None]:
# Fill missing lag values.

new_data = new_data.sort_values(by=['pick_rgn2_nm', 'day_of_reg', 'hour_reg'])

# Each lag falls back to the next more recent value. The order matters:
# w_2 is filled from the already-filled w_1, w_3 from the filled w_2, etc.
#
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on new_data[col]: that form is chained assignment,
# which warns in recent pandas and leaves the frame unchanged under
# Copy-on-Write.
lag_fallbacks = [
    ('rider_cnt_w_1', 'rider_cnt'),
    ('rider_cnt_w_2', 'rider_cnt_w_1'),
    ('rider_cnt_w_3', 'rider_cnt_w_2'),
    ('rider_cnt_w_4', 'rider_cnt_w_3'),
]
for lag_column, fallback_column in lag_fallbacks:
    new_data[lag_column] = new_data[lag_column].fillna(new_data[fallback_column])

print(new_data.isna().sum())



### 6. 파생변수 생성 group_s 

In [None]:
def group_assignment(row):
    """Return the day-type group label for one row.

    Weekdays (Mon-Fri) are split by holiday flag and rain flag:
        A: not a holiday, no rain      B: not a holiday, rain
        C: holiday, no rain            D: holiday, rain
    Weekends (Sat/Sun) are split by rain flag only:
        E: no rain                     F: rain

    The weekend branch ignores ``holiday_yn`` on purpose: in this notebook
    ``holiday_yn`` is set to 1 for every Saturday and Sunday, so requiring
    ``holiday_yn == 0`` made the E/F labels unreachable and every weekend row
    came back as None.
    NOTE(review): confirm that E/F were defined this way when the models were
    trained.

    Args:
        row: mapping-like object with 'day_of_reg', 'holiday_yn' and 'is_rain'.

    Returns:
        One of 'A'..'F', or None when the day name or the flags are not
        recognised.
    """
    day = row['day_of_reg']

    if day in ['월요일','화요일','수요일','목요일','금요일']:
        holiday = row['holiday_yn']
        rain = row['is_rain']
        if holiday == 0 and rain == 0:
            return 'A'
        if holiday == 0 and rain == 1:
            return 'B'
        if holiday == 1 and rain == 0:
            return 'C'
        if holiday == 1 and rain == 1:
            return 'D'
    elif day in ['토요일','일요일']:
        rain = row['is_rain']
        if rain == 0:
            return 'E'
        if rain == 1:
            return 'F'

    return None
 
# Label every row with its day-type group, then show the label distribution.
group_labels = new_data.apply(group_assignment, axis=1)
new_data['group_s'] = group_labels
print(new_data['group_s'].value_counts())

In [None]:
# Forecasts start from tomorrow (local clock of the machine running this).
tomorrow = (datetime.now() + timedelta(days=1)).date()

# reg_date already holds date objects, so it can be compared directly; the
# former identity apply(lambda x: x) did nothing.
new_data = new_data[new_data['reg_date'] >= tomorrow]
new_data.to_csv('predict_new_data.csv', encoding='cp949', index=False)

print(new_data.shape)  # (2,250, num_columns)
#print(new_data.info())
print(new_data['reg_date'].min()) 

### 7. 데이터 전처리 

In [None]:
# Categorical features: pick_rgn2_nm, hour_reg, day_of_reg, rain_group, week,
# holiday_yn, group_s.
#
# Assign through new_data[col] rather than new_data.loc[:, col]: on
# pandas >= 2.0 the .loc form writes the values into the existing column in
# place and keeps its old dtype, so the cast to category would be lost.
for col in ['pick_rgn2_nm', 'hour_reg', 'day_of_reg', 'rain_group', 'week', 'holiday_yn', 'group_s']:
    new_data[col] = new_data[col].astype('category')

print(new_data.dtypes)

In [None]:
# One-hot encoding of the categorical features.

# Helper columns from the weather join are no longer needed.
new_data = new_data.drop(columns=['rain_c','snow_c', 'date_2','hour','is_rain'])

var = ['pick_rgn2_nm', 'hour_reg','day_of_reg', 'rain_group','week','holiday_yn', 'group_s']

encode_data = new_data.sort_values(by="reg_date")

# Fit the encoder on the prediction rows and keep the original row index so
# the encoded block lines up with the remaining columns.
encoder = OneHotEncoder()
encoded_values = encoder.fit_transform(encode_data[var]).toarray()
onehot = pd.DataFrame(
    encoded_values,
    columns=encoder.get_feature_names_out(var),
    index = encode_data.index,
)

# Everything except reg_date and the raw categorical columns is appended
# after the one-hot columns.
remaining_columns = encode_data.drop(columns = ['reg_date']).drop(columns=var)
X_test = pd.concat([onehot, remaining_columns], axis=1)

print(X_test.columns)

### 8. 존재하지 않는 변수 추가하기

In [None]:
# Feature columns, in the order X_test is handed to the saved models.
# NOTE(review): presumably copied from the training notebook - confirm it
# still matches the features the saved models were fitted on.
desired_order = ['day_of_reg_금요일', 'day_of_reg_목요일', 'day_of_reg_수요일', 'day_of_reg_월요일',
       'day_of_reg_일요일', 'day_of_reg_토요일', 'day_of_reg_화요일',
       'pick_rgn2_nm_강남구', 'pick_rgn2_nm_강동구', 'pick_rgn2_nm_강북구',
       'pick_rgn2_nm_강서구', 'pick_rgn2_nm_관악구', 'pick_rgn2_nm_광진구',
       'pick_rgn2_nm_구로구', 'pick_rgn2_nm_금천구', 'pick_rgn2_nm_노원구',
       'pick_rgn2_nm_도봉구', 'pick_rgn2_nm_동대문구', 'pick_rgn2_nm_동작구',
       'pick_rgn2_nm_마포구', 'pick_rgn2_nm_서대문구', 'pick_rgn2_nm_서초구',
       'pick_rgn2_nm_성동구', 'pick_rgn2_nm_성북구', 'pick_rgn2_nm_송파구',
       'pick_rgn2_nm_양천구', 'pick_rgn2_nm_영등포구', 'pick_rgn2_nm_용산구',
       'pick_rgn2_nm_은평구', 'pick_rgn2_nm_종로구', 'pick_rgn2_nm_중구',
       'pick_rgn2_nm_중랑구', 'hour_reg_9', 'hour_reg_10', 'hour_reg_11',
       'hour_reg_12', 'hour_reg_13', 'hour_reg_14', 'hour_reg_15',
       'hour_reg_16', 'hour_reg_17', 'hour_reg_18', 'hour_reg_19',
       'hour_reg_20', 'hour_reg_21', 'hour_reg_22', 'hour_reg_23', 'group_s_A',
       'group_s_B', 'group_s_C', 'group_s_D', 'group_s_E', 'group_s_F',
       'is_crm_0', 'is_crm_1', 'rain_group_no', 'rain_group_normal',
       'rain_group_strong', 'rain_group_very_strong', 'rain_group_weak',
       'holiday_yn_N', 'holiday_yn_Y', 'rider_cnt_w_1', 'rider_cnt_w_2',
       'rider_cnt_w_3', 'rider_cnt_w_4', 'order_cnt_w_1']

# holiday_yn holds 0/1 here, so the encoder names its columns holiday_yn_0 and
# holiday_yn_1, while the expected names are holiday_yn_N / holiday_yn_Y.
# Without this rename both expected columns were zero-filled below and the
# holiday signal never reached the models.
# NOTE(review): assumes 1 corresponds to 'Y' and 0 to 'N' - confirm against
# the training data.
X_test = X_test.rename(columns={'holiday_yn_0': 'holiday_yn_N', 'holiday_yn_1': 'holiday_yn_Y'})

# Expected features that the encoded data does not contain (categories absent
# from this batch, or features this notebook never builds such as is_crm_*).
new_variables = [col for col in desired_order if col not in X_test.columns]

# These are filled with 0. List them so that an unexpected gap is visible
# instead of being padded silently.
print('zero-filled columns:', new_variables)

# All-zero frame for the missing features, aligned on the X_test index.
encode_data = pd.DataFrame(0, columns=new_variables, index= X_test.index)

# Append the missing features, then keep only the expected columns in the
# expected order (any extra column is dropped here).
X_test = pd.concat([X_test, encode_data], axis=1)
X_test = X_test[desired_order]

# Check the result.
#print(X_test.head())
print(X_test.columns)

### 9. 예측 입력 데이터(X_test) 확인 - shape, 결측치

In [None]:
# Final check before scoring: matrix size and missing values per column.
print(X_test.shape) 
X_test.isnull().sum()

### 10. model 적용 

In [None]:
def get_predict(X_test):
    """Score X_test with the three saved models and write the results to CSV.

    Loads 'model_Lasso.joblib', 'model_LGBMRegressor.joblib' and
    'model_RandomForestRegressor.joblib' from the working directory, predicts
    with each of them, adds two averaged columns and writes everything,
    together with identifying columns taken from the module-level
    ``new_data``, to 'prediction_results_latest6days.csv' (cp949).

    Args:
        X_test: feature frame passed to each model's ``predict``. Its index
            labels must identify the matching rows of ``new_data``.

    Returns:
        None. The output is the CSV file.

    Raises:
        KeyError: if an index label of X_test is not present in ``new_data``.
    """
    model_files = {
        'y_pred_Lasso': 'model_Lasso.joblib',
        'y_pred_LGBM': 'model_LGBMRegressor.joblib',
        'y_pred_RF': 'model_RandomForestRegressor.joblib',
    }
    id_columns = ['reg_date', 'pick_rgn2_nm', 'hour_reg',
                  'day_of_reg', 'rain_group', 'holiday_yn']

    # X_test is built from a date-sorted copy, whereas new_data keeps a
    # different row order. predict() returns plain arrays in X_test order, so
    # the identifying columns are selected by X_test's index labels; otherwise
    # each prediction would be written next to the wrong date/district/hour.
    result_df = new_data.loc[X_test.index, id_columns].copy()

    for column, model_path in model_files.items():
        result_df[column] = joblib.load(model_path).predict(X_test)

    # Mean of all three models.
    result_df['y_pred_avg_three'] = (
        result_df['y_pred_Lasso'] + result_df['y_pred_LGBM'] + result_df['y_pred_RF']
    ) / 3
    # Mean of the two tree-based models only.
    result_df['y_pred_avg_lgbm_rf'] = (result_df['y_pred_LGBM'] + result_df['y_pred_RF']) / 2

    result_df.to_csv('prediction_results_latest6days.csv', index=False, encoding="cp949")


# Score the aligned feature matrix; get_predict writes the results to CSV.
get_predict(X_test)