In [None]:
import pandas as pd
import numpy as np
import requests 
from datetime import datetime, date


In [None]:
# 1. Load the yearly training extracts and stack them into a single frame
data1 = pd.read_excel("/Users/yj.noh/Desktop/train_data_2022.xlsx")
data2 = pd.read_excel("/Users/yj.noh/Desktop/train_data_2023.xlsx")

# Stack both years, then give the Korean-named count columns English names
data = (
    pd.concat([data1, data2])
    .rename(columns={'라이더수': 'rider_cnt', '주문수': 'order_cnt'})
)

print(data.shape) #807,452

In [None]:
# 2. Keep Seoul rows only, hours 9-23
data['reg_date'] = pd.to_datetime(data['reg_date'])

# Region filter first: the cutoff date below is the latest date among Seoul rows
data = data.loc[data['pick_rgn1_nm'].eq('서울특별시')]

# Drop the most recent date (presumably an incomplete day -- TODO confirm)
# and keep only the 09:00-23:00 hours
latest_reg_date = data['reg_date'].max()
data = data.loc[data['reg_date'].lt(latest_reg_date) & data['hour_reg'].isin(range(9, 24))]

print(data.shape) #189,750

In [None]:
# Build a full timestamp column: registration date plus the hour as an offset
data['datetime'] = data['reg_date'].add(pd.to_timedelta(data['hour_reg'], unit='h'))

print(data['datetime'].min(), data['datetime'].max())

In [None]:
# Derive calendar features from the timestamp and date columns
data['reg_date'] = pd.to_datetime(data['reg_date'])

# Hour and plain date re-derived from the combined timestamp
data['hour_reg2'] = data['datetime'].dt.hour
data['reg_date2'] = data['datetime'].dt.date
data['month'] = data['reg_date'].dt.month

# Week of month, 1-based: days 1-7 -> 1, 8-14 -> 2, ..., 29-31 -> 5.
# Subtracting 1 before the floor division keeps day 7 in week 1
# (plain `day // 7 + 1` would push days 7, 14, 21, 28 into the next week).
data['week'] = (data['reg_date'].dt.day - 1) // 7 + 1
print(data.head())

In [None]:
# Add the numeric weekday and its Korean day name
data['reg_date'] = pd.to_datetime(data['reg_date'])

# Lookup from pandas weekday number (Monday=0 ... Sunday=6) to the Korean name
weekday_dict = dict(enumerate(['월요일', '화요일', '수요일', '목요일', '금요일', '토요일', '일요일']))

data['weekday'] = data['reg_date'].dt.weekday
data['day_of_reg'] = data['weekday'].map(weekday_dict)

print(data)

In [None]:
# Replace the original date/hour columns with the versions derived from
# 'datetime', and drop the region column.
data = data.drop(columns=['reg_date', 'hour_reg', 'pick_rgn1_nm'])
data = data.rename(columns={"hour_reg2": "hour_reg", "reg_date2": "reg_date"})

print(data.dtypes)

In [None]:
# Weather: read the yearly CSV exports (cp949-encoded) and stack them
weather1 = pd.read_csv("/Users/yj.noh/Desktop/weather_2022.csv", encoding='cp949')
weather2 = pd.read_csv("/Users/yj.noh/Desktop/weather_2023.csv", encoding='cp949')

weather = (
    pd.concat([weather1, weather2])
    # Korean column headers -> short English names
    .rename(columns={"기온(°C)": "temp_c", "강수량(mm)": "rain_c", "적설(cm)": "snow_c", "일시": "date"})
    # Split the observation timestamp into a plain date and an hour (the join keys)
    .assign(
        date_2=lambda w: pd.to_datetime(w['date']).dt.date,
        hour=lambda w: pd.to_datetime(w['date']).dt.hour,
    )
)

print(weather.head())

In [None]:
# Keep only weather observations for hours 9-23 (same window as the order data)
weather = weather[weather['hour'].isin(range(9, 24))]

# Left-join hourly weather onto the order data by (date, hour);
# rows without a matching observation get NaN weather values.
combined_data = pd.merge(
    data,
    weather[["date_2", "hour", "temp_c", "rain_c", "snow_c"]],
    left_on=["reg_date", "hour_reg"],
    right_on=["date_2", "hour"],
    how='left',
)

# Treat missing rain/snow readings as zero.
# Assign the result back explicitly: `df[col].fillna(..., inplace=True)` is
# chained assignment on a temporary object and does not update the frame
# under pandas Copy-on-Write.
combined_data[['rain_c', 'snow_c']] = combined_data[['rain_c', 'snow_c']].fillna(0)
print(combined_data.isnull().sum())

In [None]:
# Feature: is_rain = 1 when any rain or snow was recorded for the hour, else 0.
# Vectorized comparison instead of a per-row apply; the resulting values are identical.
combined_data['is_rain'] = (
    (combined_data['rain_c'] > 0) | (combined_data['snow_c'] > 0)
).astype(int)
print(combined_data['is_rain'].value_counts()) # 0: 172,200 1: 17550

print(combined_data['datetime'].min()) # "2022-01-01 09:00:00"
print(combined_data['datetime'].max()) #  2023-05-21 23:00:00 


In [None]:
# Holiday flag: 1 for listed public holidays and for weekends, else 0
holiday_list = pd.to_datetime(['2022-01-01', '2022-01-31', '2022-02-01', '2022-03-01', '2022-03-09', '2022-05-05', '2022-05-08', '2022-06-01', '2022-06-06', '2022-08-15', 
                '2022-09-09', '2022-09-10', '2022-09-11', '2022-09-12', '2022-10-03', '2022-10-09', '2022-10-10', '2022-12-25', '2023-01-01', '2023-01-21', 
                '2023-01-22', '2023-01-23', '2023-01-24', '2023-03-01', '2023-05-01', '2023-05-05', '2023-05-27', '2023-05-29', '2023-06-06', '2023-08-15', 
                '2023-09-28', '2023-09-29', '2023-09-30', '2023-10-03', '2023-10-09', '2023-12-25'])

# reg_date holds plain date objects at this point; convert to datetime64 so it
# can be matched against the DatetimeIndex of holidays.
combined_data['reg_date'] = pd.to_datetime(combined_data['reg_date'])

# Vectorized membership tests instead of a per-row apply:
# a row is a holiday if its date is listed OR it falls on Saturday/Sunday.
combined_data['is_holiday'] = (
    combined_data['reg_date'].isin(holiday_list)
    | combined_data['day_of_reg'].isin(['토요일', '일요일'])
).astype(int)

print(combined_data['is_holiday'].value_counts()) # 61125

In [None]:
# 이상치(outlier) 여부 파악 
def calculate_quantiles(group):
    """Flag rider-count outliers within one group using the 1.5 * IQR rule.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows of a single group. Must contain the columns 'rider_cnt' and
        'is_rain'.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left unmodified) with the same rows and
        index plus four columns:
        'q1' and 'q3' (25th / 75th percentile of 'rider_cnt' in the group),
        'IQR1.5' (1.5 * (q3 - q1)), and 'outlier' (1 when is_rain == 0 and
        rider_cnt lies below q1 - IQR1.5 or above q3 + IQR1.5, otherwise 0).
    """
    q1 = group['rider_cnt'].quantile(0.25)
    q3 = group['rider_cnt'].quantile(0.75)
    IQR1_5 = 1.5 * (q3 - q1)

    # Only non-rain rows are ever flagged; rain rows always get outlier = 0.
    outside_fence = (group['rider_cnt'] < (q1 - IQR1_5)) | (group['rider_cnt'] > (q3 + IQR1_5))
    outlier = np.where((group['is_rain'] == 0) & outside_fence, 1, 0)

    # assign() returns a new DataFrame: mutating the object handed in by
    # groupby.apply is not supported by pandas.
    return group.assign(**{'q1': q1, 'q3': q3, 'IQR1.5': IQR1_5, 'outlier': outlier})

# Compute quartiles and outlier flags per (pick_rgn2_nm, day_of_reg, hour_reg, is_rain) group.
# group_keys=False keeps the original row index: with the pandas >= 2.0 default
# (group_keys=True) the group keys are prepended as index levels, which then
# clash with the identically named columns in any later groupby on them.
combined_data = combined_data.groupby(
    ['pick_rgn2_nm', 'day_of_reg', 'hour_reg', 'is_rain'],
    group_keys=False,
).apply(calculate_quantiles)

print(combined_data['outlier'].value_counts()) #7513

In [None]:
# Replace flagged outliers with the median rider count of their
# (pick_rgn2_nm, day_of_reg, hour_reg) group; all other rows keep rider_cnt.
# The previous lambda tested `combined_data['outlier'] == 1` (a whole Series)
# in a boolean context, which raises "truth value of a Series is ambiguous".

# Per-row group median, aligned to combined_data's index
# (the median is taken over every row of the group, flagged rows included).
group_median = (
    combined_data
    .groupby(['pick_rgn2_nm', 'day_of_reg', 'hour_reg'])['rider_cnt']
    .transform('median')
)

# where() keeps the original value when the condition holds, and takes the
# group median otherwise (i.e. for rows with outlier == 1).
combined_data['rider_cnt_2'] = combined_data['rider_cnt'].where(
    combined_data['outlier'] != 1, group_median
)
