In [756]:
import numpy as np
import pandas as pd

from sklearn.metrics import r2_score, mean_squared_error

In [757]:
train = pd.read_csv("/kaggle/input/bike-sharing-demand/train.csv")
test = pd.read_csv("/kaggle/input/bike-sharing-demand/test.csv")

In [758]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10886 entries, 0 to 10885
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    10886 non-null  object 
 1   season      10886 non-null  int64  
 2   holiday     10886 non-null  int64  
 3   workingday  10886 non-null  int64  
 4   weather     10886 non-null  int64  
 5   temp        10886 non-null  float64
 6   atemp       10886 non-null  float64
 7   humidity    10886 non-null  int64  
 8   windspeed   10886 non-null  float64
 9   casual      10886 non-null  int64  
 10  registered  10886 non-null  int64  
 11  count       10886 non-null  int64  
dtypes: float64(3), int64(8), object(1)
memory usage: 1020.7+ KB


In [759]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6493 entries, 0 to 6492
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    6493 non-null   object 
 1   season      6493 non-null   int64  
 2   holiday     6493 non-null   int64  
 3   workingday  6493 non-null   int64  
 4   weather     6493 non-null   int64  
 5   temp        6493 non-null   float64
 6   atemp       6493 non-null   float64
 7   humidity    6493 non-null   int64  
 8   windspeed   6493 non-null   float64
dtypes: float64(3), int64(5), object(1)
memory usage: 456.7+ KB


In [760]:
train.head(2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40


In [761]:
test.head(2)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
0,2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027
1,2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0


In [762]:
# 문자열 날짜를 datetime 형식으로 변환 
train['datetime'] = pd.to_datetime(train['datetime'])
test['datetime'] = pd.to_datetime(test['datetime'])

In [763]:
# train에는 있지만 test에는 없는 컬럼을 train에서 삭제
train.drop(['casual', 'registered'], axis=1, inplace=True)

In [764]:
# train 컬럼 별 고유 값 출력
for col in train.columns:
    print(f'unique values of {col}:', np.sort(train[col].unique()))

unique values of datetime: ['2011-01-01T00:00:00.000000000' '2011-01-01T01:00:00.000000000'
 '2011-01-01T02:00:00.000000000' ... '2012-12-19T21:00:00.000000000'
 '2012-12-19T22:00:00.000000000' '2012-12-19T23:00:00.000000000']
unique values of season: [1 2 3 4]
unique values of holiday: [0 1]
unique values of workingday: [0 1]
unique values of weather: [1 2 3 4]
unique values of temp: [ 0.82  1.64  2.46  3.28  4.1   4.92  5.74  6.56  7.38  8.2   9.02  9.84
 10.66 11.48 12.3  13.12 13.94 14.76 15.58 16.4  17.22 18.04 18.86 19.68
 20.5  21.32 22.14 22.96 23.78 24.6  25.42 26.24 27.06 27.88 28.7  29.52
 30.34 31.16 31.98 32.8  33.62 34.44 35.26 36.08 36.9  37.72 38.54 39.36
 41.  ]
unique values of atemp: [ 0.76   1.515  2.275  3.03   3.79   4.545  5.305  6.06   6.82   7.575
  8.335  9.09   9.85  10.605 11.365 12.12  12.88  13.635 14.395 15.15
 15.91  16.665 17.425 18.18  18.94  19.695 20.455 21.21  21.97  22.725
 23.485 24.24  25.    25.76  26.515 27.275 28.03  28.79  29.545 30.305
 31.0

In [765]:
# train 컬럼 별 고유 값 출력
for col in test.columns:
    print(f'unique values of {col}:', np.sort(test[col].unique()))

unique values of datetime: ['2011-01-20T00:00:00.000000000' '2011-01-20T01:00:00.000000000'
 '2011-01-20T02:00:00.000000000' ... '2012-12-31T21:00:00.000000000'
 '2012-12-31T22:00:00.000000000' '2012-12-31T23:00:00.000000000']
unique values of season: [1 2 3 4]
unique values of holiday: [0 1]
unique values of workingday: [0 1]
unique values of weather: [1 2 3 4]
unique values of temp: [ 0.82  1.64  2.46  3.28  4.1   4.92  5.74  6.56  7.38  8.2   9.02  9.84
 10.66 11.48 12.3  13.12 13.94 14.76 15.58 16.4  17.22 18.04 18.86 19.68
 20.5  21.32 22.14 22.96 23.78 24.6  25.42 26.24 27.06 27.88 28.7  29.52
 30.34 31.16 31.98 32.8  33.62 34.44 35.26 36.08 36.9  37.72 38.54 39.36
 40.18]
unique values of atemp: [ 0.     0.76   1.515  2.275  3.03   3.79   4.545  5.305  6.06   6.82
  7.575  8.335  9.09   9.85  10.605 11.365 12.12  12.88  13.635 14.395
 15.15  15.91  16.665 17.425 18.18  18.94  19.695 20.455 21.21  21.97
 22.725 23.485 24.24  25.    25.76  26.515 27.275 28.03  28.79  29.545
 30.30

# 데이터 요약
- datetime : datetime 날짜 데이터. year, month, day, weekday(요일), quarter(분기), week(주차) 파생변수 사용.
- season : 순서형 데이터. 계절의 주기적 특성 반영 위해 사인과 코사인 활용 인코딩
- holiday : 이진 데이터
- workingday : 이진 데이터
- temp : 연속형 데이터. 스케일링 필요
- atemp : 연속형 데이터. 스케일링 필요
- humidity : 이산형 데이터
- windspeed : 연속형 데이터. 스케일링 필요
- cocunt : 이산형 데이터. 타켓 데이터

In [766]:
# 결측값 확인
train.isnull().sum()

datetime      0
season        0
holiday       0
workingday    0
weather       0
temp          0
atemp         0
humidity      0
windspeed     0
count         0
dtype: int64

In [767]:
# 결측값 확인
test.isnull().sum()

datetime      0
season        0
holiday       0
workingday    0
weather       0
temp          0
atemp         0
humidity      0
windspeed     0
dtype: int64

In [768]:
# datetime 컬럼 사용하여 파생변수 year, month, day 생성
train['year'] = train['datetime'].dt.year
train['month'] = train['datetime'].dt.month
train['day'] = train['datetime'].dt.day
test['year'] = test['datetime'].dt.year
test['month'] = test['datetime'].dt.month
test['day'] = test['datetime'].dt.day

In [769]:
# datetime 컬럼 사용하여 파생변수 weekday, quarter, week 생성
train['weekday'] = train['datetime'].dt.weekday
train['quarter'] = train['datetime'].dt.quarter
train['week'] = train['datetime'].dt.isocalendar().week.astype(int)
test['weekday'] = test['datetime'].dt.weekday
test['quarter'] = test['datetime'].dt.quarter
test['week'] = test['datetime'].dt.isocalendar().week.astype(int)

In [770]:
train.drop('datetime', axis=1, inplace=True)
test.drop('datetime', axis=1, inplace=True)

In [771]:
# # 계절 컬럼 주기적 인코딩
train['season_sin'] = np.sin(2 * np.pi * train['season'] / 4)
train['season_cos'] = np.cos(2 * np.pi * train['season'] / 4)
test['season_sin'] = np.sin(2 * np.pi * test['season'] / 4)
test['season_cos'] = np.cos(2 * np.pi * test['season'] / 4)

In [772]:
X = train
y = X.pop('count')

In [773]:
train

Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,day,season_sin,season_cos
0,1,0,0,1,9.84,14.395,81,0.0000,2011,1,1,1.000000e+00,6.123234e-17
1,1,0,0,1,9.02,13.635,80,0.0000,2011,1,1,1.000000e+00,6.123234e-17
2,1,0,0,1,9.02,13.635,80,0.0000,2011,1,1,1.000000e+00,6.123234e-17
3,1,0,0,1,9.84,14.395,75,0.0000,2011,1,1,1.000000e+00,6.123234e-17
4,1,0,0,1,9.84,14.395,75,0.0000,2011,1,1,1.000000e+00,6.123234e-17
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10881,4,0,1,1,15.58,19.695,50,26.0027,2012,12,19,-2.449294e-16,1.000000e+00
10882,4,0,1,1,14.76,17.425,57,15.0013,2012,12,19,-2.449294e-16,1.000000e+00
10883,4,0,1,1,13.94,15.910,61,15.0013,2012,12,19,-2.449294e-16,1.000000e+00
10884,4,0,1,1,13.94,17.425,61,6.0032,2012,12,19,-2.449294e-16,1.000000e+00


In [774]:
# 연속형 데이터 스케일링
float_columns = ['temp', 'atemp', 'windspeed']
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
train[float_columns] = scaler.fit_transform(train[float_columns])
test[float_columns] = scaler.transform(test[float_columns])

In [775]:
from sklearn.model_selection import train_test_split
random_state = 42
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [776]:
random_state=42

## 랜덤포레스트

In [777]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=300, max_depth=30, random_state=42)
rf.fit(X_train, y_train)
pred = rf.predict(X_val)
print('r2 :', r2_score(y_val, pred))
print('rmse :', np.sqrt(mean_squared_error(y_val, pred)))

r2 : 0.47509184637175117
rmse : 131.62678208420337


## XGBoost 모델

In [778]:
from xgboost import XGBRegressor

xgb = XGBRegressor(n_estimators=500, learning_rate=0.01, max_depth=9, random_state=42)
xgb.fit(X_train, y_train)
pred = xgb.predict(X_val)
print('r2 :', r2_score(y_val, pred))
print('rmse :', np.sqrt(mean_squared_error(y_val, pred)))

r2 : 0.4756716795298196
rmse : 131.554062065683


## test 데이터 예측

In [779]:
# 예측
pred = rf.predict(test)
pred

array([ 87.21666667,  67.63111111,  67.63111111, ..., 206.06966667,
       146.22744444, 146.13077778])

In [780]:
# csv 파일 생성
test = pd.read_csv("/kaggle/input/bike-sharing-demand/test.csv")

submit = pd.DataFrame({
    'datetime': test['datetime'],
    'count': pred
})
submit.to_csv("submission.csv", index=False)

In [781]:
# 점검
print(pd.read_csv("submission.csv"))
print(submit.shape, test.shape)

                 datetime       count
0     2011-01-20 00:00:00   87.216667
1     2011-01-20 01:00:00   67.631111
2     2011-01-20 02:00:00   67.631111
3     2011-01-20 03:00:00   82.683611
4     2011-01-20 04:00:00   82.683611
...                   ...         ...
6488  2012-12-31 19:00:00  224.243056
6489  2012-12-31 20:00:00  224.243056
6490  2012-12-31 21:00:00  206.069667
6491  2012-12-31 22:00:00  146.227444
6492  2012-12-31 23:00:00  146.130778

[6493 rows x 2 columns]
(6493, 2) (6493, 9)
