# 자전거 대여 데이터

- 2011년부터 2012년까지 2년간의 자전거 대여 데이터
- 캐피털 바이크셰어 회사가 공개한 운행 기록에 다양한 외부 소스에서 얻은 당시 날씨 정보를 조합
- 한 시간 간격으로 기록됨
- 훈련 데이터 : 매달 1일부터 19일 까지의 기록
- 테스트 데이터 : 매달 20일부터 월말까지의 기록
- 피처
    - datetime : 기록 일시(1시간 간격)
    - season : 계절
        - 1 : 봄(1분기)
        - 2 : 여름(2분기)
        - 3 : 가을(3분기)
        - 4 : 겨울(4분기)
        - 공식 문서에는 계절로 설명하고 있지만 실제로는 분기로 나누어져 있음
    - holiday : 공휴일 여부(0 : 공휴일 아님, 1: 공휴일)
    - workingday : 근무일 여부(0 : 근무일 아님, 1 : 근무일)
        - 주말과 공휴일이 아니면 근무일이라고 간주
    - weather : 날씨
        - 1 : 맑음
        - 2 : 옅은 안개, 약간 흐림
        - 3 : 약간의 눈, 약간의 비와 천둥 번개, 흐림
        - 4 : 폭우와 천둥 번개, 눈과 짙은 안개
        - 숫자가 클수록 날씨가 안 좋음
    - temp : 실제 온도
    - atemp : 체감온도
    - humidity : 상대 습도
    - windspeed : 풍속
    - casual : 등록되지 않은 사용자(비회원) 수
    - registered : 등록된 사용자(회원) 수
    - count : 자전거 대여 수량
- 종속변수 : count
- 평가지표 : RMSLE(Root Mean Squared Logarithmic Error), 0.37

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_log_error
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression

In [4]:
def rmsle(y_true, y_pred, convertExp = True):
    '''
    실제 타깃값과 예측값을 인수로 전달하면 RMSLE 수치를 반환하는 함수
    convertExp : 입력 데이터를 지수변환할지 정하는 파라미터
    타깃값으로 log(count)를 사용한 경우에는 지수변환을 해줘야 함
    '''
    # 지수변환
    if convertExp:
        y_true = np.exp(y_true)
        y_pred = np.exp(y_pred)

    # 로그변환 후 결측값을 0으로 변환
    log_true = np.nan_to_num(np.log(y_true + 1))
    log_pred = np.nan_to_num(np.log(y_pred + 1))

    # RMSLE 계산
    output = np.sqrt(np.mean((log_true - log_pred)**2))

    return output

In [297]:
df = pd.read_csv("./data/bike/train.csv")
test_df = pd.read_csv("./data/bike/test.csv")

In [276]:
# season을 제거하고 월을 쓰기위한 데이터 변환
df["year"] = df['datetime'].apply(lambda x: int(x[:4]))
df["month"] = df['datetime'].apply(lambda x: int(x[5:7]))
df["time"] = df['datetime'].apply(lambda x: int(x[11:13]))

In [299]:
df[df["weather"] > 2]

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
18,2011-01-01 18:00:00,1,0,0,3,17.22,21.210,88,16.9979,9,26,35
19,2011-01-01 19:00:00,1,0,0,3,17.22,21.210,88,16.9979,6,31,37
29,2011-01-02 06:00:00,1,0,0,3,17.22,21.210,77,19.9995,0,2,2
31,2011-01-02 08:00:00,1,0,0,3,16.40,20.455,71,15.0013,0,8,8
37,2011-01-02 14:00:00,1,0,0,3,14.76,17.425,76,12.9980,4,55,59
...,...,...,...,...,...,...,...,...,...,...,...,...
10808,2012-12-16 18:00:00,4,0,0,3,15.58,19.695,82,7.0015,8,99,107
10818,2012-12-17 04:00:00,4,0,1,3,14.76,18.940,100,0.0000,2,3,5
10830,2012-12-17 16:00:00,4,0,1,3,16.40,20.455,94,16.9979,15,287,302
10837,2012-12-17 23:00:00,4,0,1,3,17.22,21.210,94,15.0013,6,41,47


In [277]:
use_df = df.drop("season", axis = 1)
onehot_month = pd.get_dummies(df["month"], dtype = int)
onehot_month.columns = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
onehot_time = pd.get_dummies(df["time"], dtype = int)
onehot_time.columns = ["00", "01", "02", "03", "04", "05", "06", "07", "08", "09", "10",
                        "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23"]


use_df = pd.concat([use_df, onehot_month, onehot_time], axis = 1).drop(["month", "time"], axis = 1)
use_df["atemp"] = use_df["temp"] - use_df["atemp"]

In [290]:
use_df.loc[:, "windspeed"] = np.round(df["windspeed"], 2)

In [300]:
x = use_df.drop(["datetime", "casual", "registered", "count", "humidity"], axis = 1)
y1 = use_df["casual"]
y2 = use_df["registered"]
y3 = use_df["count"]

In [301]:
x

Unnamed: 0,holiday,workingday,weather,temp,atemp,windspeed,year,Jan,Feb,Mar,...,14,15,16,17,18,19,20,21,22,23
0,0,0,1,9.84,-4.555,0.0,2011,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,9.02,-4.615,0.0,2011,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,9.02,-4.615,0.0,2011,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,9.84,-4.555,0.0,2011,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,9.84,-4.555,0.0,2011,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10881,0,1,1,15.58,-4.115,26.0,2012,0,0,0,...,0,0,0,0,0,1,0,0,0,0
10882,0,1,1,14.76,-2.665,15.0,2012,0,0,0,...,0,0,0,0,0,0,1,0,0,0
10883,0,1,1,13.94,-1.970,15.0,2012,0,0,0,...,0,0,0,0,0,0,0,1,0,0
10884,0,1,1,13.94,-3.485,6.0,2012,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [302]:
x_train, x_test, y_train, y_test = train_test_split(x, y3, test_size = 0.3, random_state = 0)

In [303]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor

In [304]:
rf = RandomForestRegressor(max_depth = 1000, min_samples_split = 2, random_state = 0)
et = ExtraTreesRegressor(max_depth = 200, min_samples_split = 5, random_state = 0)
gb = GradientBoostingRegressor(max_depth = 100, min_samples_split = 5, random_state = 0)

In [305]:
et.fit(x_train, y_train)
pred = et.predict(x_test)
rmsle(y_test, pred, False)

0.39275359623730166

In [306]:
gb.fit(x_train, y_train)
pred = gb.predict(x_test)
rmsle(y_test, pred, False)

  log_pred = np.nan_to_num(np.log(y_pred + 1))


0.471719828203441

In [273]:
x_train

Unnamed: 0,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,Jan,Feb,...,14,15,16,17,18,19,20,21,22,23
8620,0,1,1,32.80,-4.320,52,11.0014,2012,0,0,...,0,0,0,0,0,0,0,0,0,0
8985,0,1,1,32.80,-2.805,36,0.0000,2012,0,0,...,0,0,0,0,1,0,0,0,0,0
6171,0,1,1,4.92,-1.900,50,12.9980,2012,0,1,...,0,0,0,0,0,0,0,0,0,0
6473,0,1,1,11.48,-2.155,61,15.0013,2012,0,0,...,0,0,0,0,0,0,0,0,0,1
10877,0,1,1,17.22,-3.990,50,19.0012,2012,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4859,0,1,3,22.14,-3.620,60,15.0013,2011,0,0,...,0,0,0,0,0,0,0,0,0,0
3264,0,1,1,28.70,-3.875,65,15.0013,2011,0,0,...,0,0,0,0,0,0,0,0,1,0
9845,0,0,1,27.06,-4.000,39,27.9993,2012,0,0,...,1,0,0,0,0,0,0,0,0,0
10799,0,0,2,14.76,-2.665,87,8.9981,2012,0,0,...,0,0,0,0,0,0,0,0,0,0


In [251]:
x_test

Unnamed: 0,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,Jan,Feb,...,14,15,16,17,18,19,20,21,22,23
6638,0,1,1,23.78,-3.495,56,7.0015,2012,0,0,...,0,0,0,0,0,0,0,1,0,0
7975,0,1,2,27.06,-2.485,89,19.0012,2012,0,0,...,0,0,1,0,0,0,0,0,0,0
5915,0,1,1,18.86,-3.865,55,19.0012,2012,0,1,...,0,0,1,0,0,0,0,0,0,0
8050,0,1,1,28.70,-3.120,42,11.0014,2012,0,0,...,0,0,0,0,0,1,0,0,0,0
5894,0,1,1,22.14,-3.620,52,19.0012,2012,0,1,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5521,0,1,2,8.20,-3.165,59,6.0032,2012,1,0,...,0,0,0,0,0,0,0,0,0,0
10626,0,0,2,16.40,-4.055,94,7.0015,2012,0,0,...,0,0,0,0,0,0,0,0,0,0
8126,0,1,3,24.60,-4.190,78,12.9980,2012,0,0,...,0,0,0,0,0,0,0,0,0,1
1633,0,1,1,16.40,-4.055,71,8.9981,2011,0,0,...,0,0,0,0,0,0,0,0,0,1


In [307]:
# season을 제거하고 월을 쓰기위한 데이터 변환
test_df["year"] = test_df['datetime'].apply(lambda x: int(x[:4]))
test_df["month"] = test_df['datetime'].apply(lambda x: int(x[5:7]))
test_df["time"] = test_df['datetime'].apply(lambda x: int(x[11:13]))

In [313]:
use_test_df = test_df.drop("season", axis = 1)
#use_df = use_df.drop("holiday", axis = 1)
onehot_month = pd.get_dummies(test_df["month"], dtype = int)
onehot_month.columns = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
onehot_time = pd.get_dummies(test_df["time"], dtype = int)
onehot_time.columns = ["00", "01", "02", "03", "04", "05", "06", "07", "08", "09", "10",
                        "11", "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22", "23"]


use_test_df = pd.concat([use_test_df, onehot_month, onehot_time], axis = 1).drop(["month", "time"], axis = 1)
use_test_df["atemp"] = use_test_df["temp"] - use_test_df["atemp"]

In [314]:
x_train.columns

Index(['holiday', 'workingday', 'weather', 'temp', 'atemp', 'windspeed',
       'year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',
       'Oct', 'Nov', 'Dec', '00', '01', '02', '03', '04', '05', '06', '07',
       '08', '09', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19',
       '20', '21', '22', '23'],
      dtype='object')

In [316]:
use_test_df.drop("datetime", axis = 1).columns

Index(['holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity',
       'windspeed', 'year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul',
       'Aug', 'Sep', 'Oct', 'Nov', 'Dec', '00', '01', '02', '03', '04', '05',
       '06', '07', '08', '09', '10', '11', '12', '13', '14', '15', '16', '17',
       '18', '19', '20', '21', '22', '23'],
      dtype='object')

In [310]:
use_test_df = use_test_df.drop("datetime", axis = 1)

In [224]:
pred_test = et.predict(use_test_df)
pred_test = pd.DataFrame(pred_test, columns = ["count"])

In [225]:
pred_df = pd.concat([test_df["datetime"], pred_test], axis = 1)

In [226]:
pred_df

Unnamed: 0,datetime,count
0,2011-01-20 00:00:00,7.396667
1,2011-01-20 01:00:00,4.764167
2,2011-01-20 02:00:00,3.078333
3,2011-01-20 03:00:00,4.545000
4,2011-01-20 04:00:00,1.997500
...,...,...
6488,2012-12-31 19:00:00,404.665833
6489,2012-12-31 20:00:00,231.499167
6490,2012-12-31 21:00:00,183.310833
6491,2012-12-31 22:00:00,130.958333


In [227]:
pred_df.to_csv('sample.csv', index=False)

In [231]:
params = {"min_impurity_decrease" : [0.0001, 0.0002, 0.0003, 0.0004, 0.0005]}

In [233]:
params = {"min_impurity_decrease" : np.arange(0.0001, 0.001, 0.0001),
         "max_depth" : range(5, 20),
         "min_samples_split" : range(2, 100, 10)}

In [234]:
gs = GridSearchCV(ExtraTreesRegressor(random_state = 0), params)