In [1]:
import pandas as pd
import numpy as np

bike_weather = pd.read_csv('bike_weather.csv')
bike_weather

Unnamed: 0,Date_out,Time_out,Count,date,time,cum_precipitation,humidity,insolation,pressure,sea_lvl_pressure,sunshine,temp,wind,wind_direction
0,2019-10-03,1,64,2019-10-03,1,2.361667,93.846667,0.00,993.010000,1002.910000,0,20.016667,3.290000,178.788333
1,2019-10-03,2,73,2019-10-03,2,3.353333,93.453333,0.00,992.668333,1002.568333,0,19.908333,3.056667,333.400000
2,2019-10-03,3,78,2019-10-03,3,3.930000,91.686667,0.00,992.253333,1002.153333,0,19.923333,2.125000,330.110000
3,2019-10-03,4,57,2019-10-03,4,4.423333,93.061667,0.00,992.316667,1002.216667,0,19.928333,1.931667,251.535000
4,2019-10-03,5,43,2019-10-03,5,4.500000,95.028333,0.00,992.835000,1002.735000,0,19.871667,2.886667,236.116667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,2019-10-05,19,648,2019-10-05,19,0.600000,60.570000,8.24,1008.340000,1018.440000,8400,17.171667,4.085000,146.493333
65,2019-10-05,20,619,2019-10-05,20,0.600000,61.381667,8.24,1009.146667,1019.273333,8400,16.610000,3.631667,125.785000
66,2019-10-05,21,664,2019-10-05,21,0.600000,62.598333,8.24,1009.773333,1019.973333,8400,15.988333,3.686667,225.700000
67,2019-10-05,22,585,2019-10-05,22,0.600000,63.560000,8.24,1010.376667,1020.576667,8400,15.436667,3.680000,225.253333


In [2]:
from sklearn.model_selection import train_test_split

X = bike_weather[['cum_precipitation', 'humidity', 'temp', 'wind']]
y = bike_weather.Count

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 123)

In [3]:
import statsmodels.api as sm

X1 = sm.add_constant(X_train)
model = sm.OLS(y_train, X1)
result = model.fit()
print(result.summary())

                            OLS Regression Results                            
Dep. Variable:                  Count   R-squared:                       0.629
Model:                            OLS   Adj. R-squared:                  0.595
Method:                 Least Squares   F-statistic:                     18.23
Date:                Fri, 13 Jan 2023   Prob (F-statistic):           7.99e-09
Time:                        21:39:59   Log-Likelihood:                -312.08
No. Observations:                  48   AIC:                             634.2
Df Residuals:                      43   BIC:                             643.5
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
const              1765.8805    406.04

  x = pd.concat(x[::order], 1)


In [4]:
X1 = sm.add_constant(X_test)
pred = result.predict(X1)
pred

  x = pd.concat(x[::order], 1)


8     359.689227
37    776.467633
40    596.610515
56    446.904312
23    398.884587
53    215.006054
9     468.031981
43    373.272545
68    662.458239
1     155.301651
60    600.559534
67    658.713630
42    399.725912
63    689.399962
24    351.858574
58    607.149355
59    650.659018
55    409.889723
29    371.885866
6     205.195444
61    543.214005
dtype: float64

add_constant : statsmodels의 OLS 회귀함수의 특성상 절편에 해당하는 값을 처리하기 위해서 적용해주어야 하는 단계  
X1 = sm.add_constant(X_test) : X_test데이터를 앞서 실행한 OLS회귀식에 대입하여 예상되는 y값 산출, 결과를 pred에 담았음  

In [5]:
from sklearn import metrics

print('MAE : ', metrics.mean_absolute_error(y_test, pred))
print('MSE : ', metrics.mean_squared_error(y_test, pred))
print('RMSE : ', np.sqrt(metrics.mean_squared_error(y_test, pred)))
print('MAPE : ', np.mean(np.abs((y_test - pred) / y_test))*100)

MAE :  154.96781747396304
MSE :  40194.987815230634
RMSE :  200.48687691525006
MAPE :  37.74978631438895


In [6]:
bike_weather['Rain_YN'] = 'N'
bike_weather.loc[bike_weather.cum_precipitation > 0, 'Rain_YN'] = 'Y'
bike_weather

Unnamed: 0,Date_out,Time_out,Count,date,time,cum_precipitation,humidity,insolation,pressure,sea_lvl_pressure,sunshine,temp,wind,wind_direction,Rain_YN
0,2019-10-03,1,64,2019-10-03,1,2.361667,93.846667,0.00,993.010000,1002.910000,0,20.016667,3.290000,178.788333,Y
1,2019-10-03,2,73,2019-10-03,2,3.353333,93.453333,0.00,992.668333,1002.568333,0,19.908333,3.056667,333.400000,Y
2,2019-10-03,3,78,2019-10-03,3,3.930000,91.686667,0.00,992.253333,1002.153333,0,19.923333,2.125000,330.110000,Y
3,2019-10-03,4,57,2019-10-03,4,4.423333,93.061667,0.00,992.316667,1002.216667,0,19.928333,1.931667,251.535000,Y
4,2019-10-03,5,43,2019-10-03,5,4.500000,95.028333,0.00,992.835000,1002.735000,0,19.871667,2.886667,236.116667,Y
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,2019-10-05,19,648,2019-10-05,19,0.600000,60.570000,8.24,1008.340000,1018.440000,8400,17.171667,4.085000,146.493333,Y
65,2019-10-05,20,619,2019-10-05,20,0.600000,61.381667,8.24,1009.146667,1019.273333,8400,16.610000,3.631667,125.785000,Y
66,2019-10-05,21,664,2019-10-05,21,0.600000,62.598333,8.24,1009.773333,1019.973333,8400,15.988333,3.686667,225.700000,Y
67,2019-10-05,22,585,2019-10-05,22,0.600000,63.560000,8.24,1010.376667,1020.576667,8400,15.436667,3.680000,225.253333,Y


범주형 데이터를 독립변수로 사용하기  
: 범주형 데이터를 연속형 데이터로 변환해야 함
> one_hot_encoding  

N=0, Y=1이라는 식으로 데이터 변경

In [7]:
ohe = pd.get_dummies(bike_weather['Rain_YN'])
ohe

Unnamed: 0,N,Y
0,0,1
1,0,1
2,0,1
3,0,1
4,0,1
...,...,...
64,0,1
65,0,1
66,0,1
67,0,1


In [8]:
bike_weather = pd.concat([bike_weather, ohe], axis = 1, sort = False)
bike_weather.to_csv('bike_weather.csv', index = False, encoding = 'utf-8-sig')
bike_weather

Unnamed: 0,Date_out,Time_out,Count,date,time,cum_precipitation,humidity,insolation,pressure,sea_lvl_pressure,sunshine,temp,wind,wind_direction,Rain_YN,N,Y
0,2019-10-03,1,64,2019-10-03,1,2.361667,93.846667,0.00,993.010000,1002.910000,0,20.016667,3.290000,178.788333,Y,0,1
1,2019-10-03,2,73,2019-10-03,2,3.353333,93.453333,0.00,992.668333,1002.568333,0,19.908333,3.056667,333.400000,Y,0,1
2,2019-10-03,3,78,2019-10-03,3,3.930000,91.686667,0.00,992.253333,1002.153333,0,19.923333,2.125000,330.110000,Y,0,1
3,2019-10-03,4,57,2019-10-03,4,4.423333,93.061667,0.00,992.316667,1002.216667,0,19.928333,1.931667,251.535000,Y,0,1
4,2019-10-03,5,43,2019-10-03,5,4.500000,95.028333,0.00,992.835000,1002.735000,0,19.871667,2.886667,236.116667,Y,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,2019-10-05,19,648,2019-10-05,19,0.600000,60.570000,8.24,1008.340000,1018.440000,8400,17.171667,4.085000,146.493333,Y,0,1
65,2019-10-05,20,619,2019-10-05,20,0.600000,61.381667,8.24,1009.146667,1019.273333,8400,16.610000,3.631667,125.785000,Y,0,1
66,2019-10-05,21,664,2019-10-05,21,0.600000,62.598333,8.24,1009.773333,1019.973333,8400,15.988333,3.686667,225.700000,Y,0,1
67,2019-10-05,22,585,2019-10-05,22,0.600000,63.560000,8.24,1010.376667,1020.576667,8400,15.436667,3.680000,225.253333,Y,0,1


In [9]:
from sklearn.model_selection import train_test_split

X = bike_weather[['humidity', 'temp', 'N', 'Y']]
y = bike_weather.Count
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 123)

import statsmodels.api as sm

X1 = sm.add_constant(X_train)
model = sm.OLS(y_train, X1)
result = model.fit()
print(result.summary())

X1 = sm.add_constant(X_test)
pred = result.predict(X1)

from sklearn import metrics

print('MAE : ', metrics.mean_absolute_error(y_test, pred))
print('MSE : ', metrics.mean_squared_error(y_test, pred))
print('RMSE : ', np.sqrt(metrics.mean_squared_error(y_test, pred)))
print('MAPE : ', np.mean(np.abs((y_test - pred) / y_test)) * 100)

                            OLS Regression Results                            
Dep. Variable:                  Count   R-squared:                       0.638
Model:                            OLS   Adj. R-squared:                  0.614
Method:                 Least Squares   F-statistic:                     25.89
Date:                Fri, 13 Jan 2023   Prob (F-statistic):           8.34e-10
Time:                        21:53:00   Log-Likelihood:                -311.48
No. Observations:                  48   AIC:                             631.0
Df Residuals:                      44   BIC:                             638.4
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       1134.2944    251.529      4.510      0.0

  x = pd.concat(x[::order], 1)
  x = pd.concat(x[::order], 1)


Rain_YN_0과 Rain_YN_1을 둘 다 선택하지 않아도 되는 이유  
Rain_YN_0이 값이 0이면 Rain_YN_1은 당연히 값이 1이고
역도 성립하므로 두 컬럼이 사실 똑같은 의미를 전달하는 것  
> Drop Last