In [None]:
# !pip3 install pandas==1.2.4
# !pip3 install numpy==1.20.1
# !pip3 install statsmodels==0.12.2
# !pip3 install scikit-learn==0.24.1

# Contents

* 1. Read Train Datasets
* 2. Standardization
* 3. Transform NaN Values
* 4. Model Implementation
* 5. Model Saving

# 1. Read Train Datasets

* 전처리해 놓은 기상데이터 + 판매량 데이터를 load합니다.

In [18]:
import statsmodels.api as sm
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [19]:
# 기상 데이터 + 판매량 데이터 load
df = pd.read_csv('./wh_finfin.csv', encoding='utf-8')
df

Unnamed: 0,일시,지점,평균기온(°C),최저기온(°C),최저기온 시각(hhmi),최고기온(°C),최고기온 시각(hhmi),강수 계속시간(hr),10분 최다 강수량(mm),10분 최다강수량 시각(hhmi),...,0.5m 지중온도(°C),1.0m 지중온도(°C),1.5m 지중온도(°C),3.0m 지중온도(°C),5.0m 지중온도(°C),합계 대형증발량(mm),합계 소형증발량(mm),9-9강수(mm),안개 계속시간(hr),sale
0,2021-01-01,108.0,-4.2,-9.8,511.0,1.6,1447.0,,,,...,2.9,6.8,9.7,15.9,17.5,1.1,1.6,,,615110
1,2021-01-02,108.0,-5.0,-8.4,805.0,-1.4,1346.0,,,,...,2.6,6.6,9.6,15.8,17.5,1.4,2.0,,,296620
2,2021-01-04,108.0,-3.5,-8.4,656.0,0.3,1535.0,2.33,,,...,2.3,6.3,9.4,15.6,17.4,0.9,1.3,0.0,,259900
3,2021-01-05,108.0,-5.5,-9.9,2356.0,-2.1,1.0,5.42,,,...,2.2,6.2,9.3,15.5,17.3,1.2,1.7,,,248520
4,2021-01-06,108.0,-7.4,-12.0,702.0,-1.9,1547.0,5.33,,,...,2.0,6.0,9.2,15.5,17.3,1.2,1.8,2.3,,469800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,2021-11-18,108.0,11.9,7.7,25.0,16.2,1410.0,2.08,,,...,11.7,14.3,17.0,18.6,18.1,1.1,1.6,0.0,,142000
295,2021-11-19,108.0,12.9,10.2,2356.0,16.7,1425.0,,,,...,12.1,14.2,16.8,18.5,18.1,0.9,1.3,,,141500
296,2021-11-20,108.0,10.4,6.8,728.0,15.6,1446.0,,,,...,12.4,14.2,16.7,18.5,18.1,1.4,2.1,,,307500
297,2021-11-21,108.0,10.4,7.6,2358.0,13.1,1350.0,2.92,,,...,12.3,14.2,16.5,18.4,18.1,1.0,1.4,7.2,,202600


# 2. Standardization

* Standard Scaler로 column들의 값을 표준화합니다.

In [20]:
# dataframe채로 scaling하기 위해 '일시' column을 제외한다.
df.drop(['일시'], axis=1,inplace=True)
df

Unnamed: 0,지점,평균기온(°C),최저기온(°C),최저기온 시각(hhmi),최고기온(°C),최고기온 시각(hhmi),강수 계속시간(hr),10분 최다 강수량(mm),10분 최다강수량 시각(hhmi),1시간 최다강수량(mm),...,0.5m 지중온도(°C),1.0m 지중온도(°C),1.5m 지중온도(°C),3.0m 지중온도(°C),5.0m 지중온도(°C),합계 대형증발량(mm),합계 소형증발량(mm),9-9강수(mm),안개 계속시간(hr),sale
0,108.0,-4.2,-9.8,511.0,1.6,1447.0,,,,,...,2.9,6.8,9.7,15.9,17.5,1.1,1.6,,,615110
1,108.0,-5.0,-8.4,805.0,-1.4,1346.0,,,,,...,2.6,6.6,9.6,15.8,17.5,1.4,2.0,,,296620
2,108.0,-3.5,-8.4,656.0,0.3,1535.0,2.33,,,,...,2.3,6.3,9.4,15.6,17.4,0.9,1.3,0.0,,259900
3,108.0,-5.5,-9.9,2356.0,-2.1,1.0,5.42,,,,...,2.2,6.2,9.3,15.5,17.3,1.2,1.7,,,248520
4,108.0,-7.4,-12.0,702.0,-1.9,1547.0,5.33,,,,...,2.0,6.0,9.2,15.5,17.3,1.2,1.8,2.3,,469800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,108.0,11.9,7.7,25.0,16.2,1410.0,2.08,,,,...,11.7,14.3,17.0,18.6,18.1,1.1,1.6,0.0,,142000
295,108.0,12.9,10.2,2356.0,16.7,1425.0,,,,,...,12.1,14.2,16.8,18.5,18.1,0.9,1.3,,,141500
296,108.0,10.4,6.8,728.0,15.6,1446.0,,,,,...,12.4,14.2,16.7,18.5,18.1,1.4,2.1,,,307500
297,108.0,10.4,7.6,2358.0,13.1,1350.0,2.92,,,,...,12.3,14.2,16.5,18.4,18.1,1.0,1.4,7.2,,202600


In [21]:
# StandardScaler를 통해 값들을 정규화 해준다.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)
df

Unnamed: 0,지점,평균기온(°C),최저기온(°C),최저기온 시각(hhmi),최고기온(°C),최고기온 시각(hhmi),강수 계속시간(hr),10분 최다 강수량(mm),10분 최다강수량 시각(hhmi),1시간 최다강수량(mm),...,0.5m 지중온도(°C),1.0m 지중온도(°C),1.5m 지중온도(°C),3.0m 지중온도(°C),5.0m 지중온도(°C),합계 대형증발량(mm),합계 소형증발량(mm),9-9강수(mm),안개 계속시간(hr),sale
0,108.0,-4.2,-9.8,511.0,1.6,1447.0,,,,,...,2.9,6.8,9.7,15.9,17.5,1.1,1.6,,,615110
1,108.0,-5.0,-8.4,805.0,-1.4,1346.0,,,,,...,2.6,6.6,9.6,15.8,17.5,1.4,2.0,,,296620
2,108.0,-3.5,-8.4,656.0,0.3,1535.0,2.33,,,,...,2.3,6.3,9.4,15.6,17.4,0.9,1.3,0.0,,259900
3,108.0,-5.5,-9.9,2356.0,-2.1,1.0,5.42,,,,...,2.2,6.2,9.3,15.5,17.3,1.2,1.7,,,248520
4,108.0,-7.4,-12.0,702.0,-1.9,1547.0,5.33,,,,...,2.0,6.0,9.2,15.5,17.3,1.2,1.8,2.3,,469800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,108.0,11.9,7.7,25.0,16.2,1410.0,2.08,,,,...,11.7,14.3,17.0,18.6,18.1,1.1,1.6,0.0,,142000
295,108.0,12.9,10.2,2356.0,16.7,1425.0,,,,,...,12.1,14.2,16.8,18.5,18.1,0.9,1.3,,,141500
296,108.0,10.4,6.8,728.0,15.6,1446.0,,,,,...,12.4,14.2,16.7,18.5,18.1,1.4,2.1,,,307500
297,108.0,10.4,7.6,2358.0,13.1,1350.0,2.92,,,,...,12.3,14.2,16.5,18.4,18.1,1.0,1.4,7.2,,202600


# 3. Transform NaN Values

* NaN value들을 모두 0으로 변환합니다.

In [22]:
# 수집되지 않은 데이터 혹은 비가 오지 않는 날의 강수량 등은 NaN으로 표기되어 있음.
# 이 NaN 값들을 모두 0으로 변환해줌.
df.fillna(0)

Unnamed: 0,지점,평균기온(°C),최저기온(°C),최저기온 시각(hhmi),최고기온(°C),최고기온 시각(hhmi),강수 계속시간(hr),10분 최다 강수량(mm),10분 최다강수량 시각(hhmi),1시간 최다강수량(mm),...,0.5m 지중온도(°C),1.0m 지중온도(°C),1.5m 지중온도(°C),3.0m 지중온도(°C),5.0m 지중온도(°C),합계 대형증발량(mm),합계 소형증발량(mm),9-9강수(mm),안개 계속시간(hr),sale
0,108.0,-4.2,-9.8,511.0,1.6,1447.0,0.00,0.0,0.0,0.0,...,2.9,6.8,9.7,15.9,17.5,1.1,1.6,0.0,0.0,615110
1,108.0,-5.0,-8.4,805.0,-1.4,1346.0,0.00,0.0,0.0,0.0,...,2.6,6.6,9.6,15.8,17.5,1.4,2.0,0.0,0.0,296620
2,108.0,-3.5,-8.4,656.0,0.3,1535.0,2.33,0.0,0.0,0.0,...,2.3,6.3,9.4,15.6,17.4,0.9,1.3,0.0,0.0,259900
3,108.0,-5.5,-9.9,2356.0,-2.1,1.0,5.42,0.0,0.0,0.0,...,2.2,6.2,9.3,15.5,17.3,1.2,1.7,0.0,0.0,248520
4,108.0,-7.4,-12.0,702.0,-1.9,1547.0,5.33,0.0,0.0,0.0,...,2.0,6.0,9.2,15.5,17.3,1.2,1.8,2.3,0.0,469800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,108.0,11.9,7.7,25.0,16.2,1410.0,2.08,0.0,0.0,0.0,...,11.7,14.3,17.0,18.6,18.1,1.1,1.6,0.0,0.0,142000
295,108.0,12.9,10.2,2356.0,16.7,1425.0,0.00,0.0,0.0,0.0,...,12.1,14.2,16.8,18.5,18.1,0.9,1.3,0.0,0.0,141500
296,108.0,10.4,6.8,728.0,15.6,1446.0,0.00,0.0,0.0,0.0,...,12.4,14.2,16.7,18.5,18.1,1.4,2.1,0.0,0.0,307500
297,108.0,10.4,7.6,2358.0,13.1,1350.0,2.92,0.0,0.0,0.0,...,12.3,14.2,16.5,18.4,18.1,1.0,1.4,7.2,0.0,202600


In [25]:
df.columns

Index(['지점', '평균기온(°C)', '최저기온(°C)', '최저기온 시각(hhmi)', '최고기온(°C)',
       '최고기온 시각(hhmi)', '강수 계속시간(hr)', '10분 최다 강수량(mm)', '10분 최다강수량 시각(hhmi)',
       '1시간 최다강수량(mm)', '1시간 최다 강수량 시각(hhmi)', '일강수량(mm)', '최대 순간 풍속(m/s)',
       '최대 순간 풍속 풍향(16방위)', '최대 순간풍속 시각(hhmi)', '최대 풍속(m/s)', '최대 풍속 풍향(16방위)',
       '최대 풍속 시각(hhmi)', '평균 풍속(m/s)', '풍정합(100m)', '최다풍향(16방위)',
       '평균 이슬점온도(°C)', '최소 상대습도(%)', '최소 상대습도 시각(hhmi)', '평균 상대습도(%)',
       '평균 증기압(hPa)', '평균 현지기압(hPa)', '최고 해면기압(hPa)', '최고 해면기압 시각(hhmi)',
       '최저 해면기압(hPa)', '최저 해면기압 시각(hhmi)', '평균 해면기압(hPa)', '가조시간(hr)',
       '합계 일조시간(hr)', '1시간 최다일사 시각(hhmi)', '1시간 최다일사량(MJ/m2)', '합계 일사량(MJ/m2)',
       '일 최심신적설(cm)', '일 최심신적설 시각(hhmi)', '일 최심적설(cm)', '일 최심적설 시각(hhmi)',
       '합계 3시간 신적설(cm)', '평균 전운량(1/10)', '평균 중하층운량(1/10)', '평균 지면온도(°C)',
       '최저 초상온도(°C)', '평균 5cm 지중온도(°C)', '평균 10cm 지중온도(°C)',
       '평균 20cm 지중온도(°C)', '평균 30cm 지중온도(°C)', '0.5m 지중온도(°C)',
       '1.0m 지중온도(°C)', '1.5m 지중온도(°C)', '3.0m 지중온도(°C)', '5.0m

# 4. Model Implementation

* 일부 변수들을 독립변수로 사용해 Linear Regression을 시행합니다.
* Linear Regression은 statsmodels의 sm.OLS()함수를 사용합니다.
* Adjusted R-Squared와 각 변수들의 P-value를 중점적으로 확인합니다.

In [26]:
# model의 종속변수 : sale
# model의 독립변수 : 최저기온, 최대 순간 풍속, 평균 상대습도(%), 합계 일조시간(hr)
model = sm.OLS(df['sale'],
               df[['최저기온(°C)', '최대 순간 풍속(m/s)',
                  '평균 상대습도(%)', '합계 일조시간(hr)']])
results = model.fit()
results.summary()
# model의 설명력을 나타내는 Adjusted R-Squared의 값은 0.79로 높게 나왔다.
# 각 변수들의 p value 또한 매우 작은 수준(0.05 이하)로 나타났기 때문에 변수들이 통계적으로 유의미하다는 가설은 기각되지 않는다.

0,1,2,3
Dep. Variable:,sale,R-squared (uncentered):,0.793
Model:,OLS,Adj. R-squared (uncentered):,0.79
Method:,Least Squares,F-statistic:,282.8
Date:,"Sat, 09 Apr 2022",Prob (F-statistic):,1.32e-99
Time:,01:57:05,Log-Likelihood:,-3787.8
No. Observations:,299,AIC:,7584.0
Df Residuals:,295,BIC:,7598.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
최저기온(°C),-2928.1511,499.735,-5.859,0.000,-3911.650,-1944.653
최대 순간 풍속(m/s),5061.2040,1470.621,3.442,0.001,2166.965,7955.443
평균 상대습도(%),1779.7369,216.901,8.205,0.000,1352.868,2206.606
합계 일조시간(hr),3075.2834,1020.240,3.014,0.003,1067.412,5083.155

0,1,2,3
Omnibus:,96.359,Durbin-Watson:,1.318
Prob(Omnibus):,0.0,Jarque-Bera (JB):,389.316
Skew:,1.315,Prob(JB):,2.89e-85
Kurtosis:,7.932,Cond. No.,24.4


# 5. Model Saving

* 모델 배포를 위해 pickle 라이브러리를 사용해 pkl 파일로 모델을 저장합니다.

In [24]:
import pickle

# pickle 패키지를 이용해 모델 저장
pickle.dump(model, open('./ols_model.pkl', 'wb'))