In [1]:
import numpy as np
import pandas as pd 

In [2]:
from sklearn import datasets, tree

In [3]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [4]:
featuresData = pd.read_csv("../dataset./feature_regression_example.csv")

In [5]:
featuresData.head()

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1,1225,Y,1,Y,0.209442
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2,968,N,4,Y,0.209442
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201503,2015,3,1209,N,4,Y,0.208155
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201504,2015,4,1810,Y,2,Y,0.208155
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201505,2015,5,1773,N,4,Y,0.208155


데이터 타입을 모델정의서와 같이 설정하며  
홀리데이 및 프로모션 여부 컬럼에 대해서 Y→1, N→0 컬럼을  
추가로 생성하세요 

In [6]:
featuresData.dtypes

REGIONID         object
PRODUCTGROUP     object
PRODUCT          object
ITEM             object
YEARWEEK          int64
YEAR              int64
WEEK              int64
QTY               int64
HOLIDAY          object
HCLUS             int64
PROMOTION        object
PRO_PERCENT     float64
dtype: object

In [7]:
featuresData["HD_YN"] = np.where(featuresData["HOLIDAY"] == "Y", 1, np.where(featuresData["HOLIDAY"]=="N",0,featuresData["HOLIDAY"]))

In [8]:
featuresData.head()

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,HD_YN
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1,1225,Y,1,Y,0.209442,1
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2,968,N,4,Y,0.209442,0
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201503,2015,3,1209,N,4,Y,0.208155,0
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201504,2015,4,1810,Y,2,Y,0.208155,1
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201505,2015,5,1773,N,4,Y,0.208155,0


In [9]:
featuresData["PM_YN"] = np.where(featuresData["PROMOTION"] == "Y", 1, 0)

In [10]:
featuresData.head()

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,HD_YN,PM_YN
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1,1225,Y,1,Y,0.209442,1,1
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2,968,N,4,Y,0.209442,0,1
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201503,2015,3,1209,N,4,Y,0.208155,0,1
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201504,2015,4,1810,Y,2,Y,0.208155,1,1
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201505,2015,5,1773,N,4,Y,0.208155,0,1


데이터 범위를 YEARWEEK 기준  
201501 ~ 201652 구간만 조회하여  
FeaturesData 변수에 저장하세요.

In [11]:
featuresData2 = featuresData[(featuresData.YEARWEEK.astype(int)>=201501) & (featuresData.YEARWEEK.astype(int)<=201652)]

In [12]:
featuresData2

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,HD_YN,PM_YN
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1,1225,Y,1,Y,0.209442,1,1
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2,968,N,4,Y,0.209442,0,1
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201503,2015,3,1209,N,4,Y,0.208155,0,1
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201504,2015,4,1810,Y,2,Y,0.208155,1,1
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201505,2015,5,1773,N,4,Y,0.208155,0,1
5,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201506,2015,6,867,N,4,Y,0.208155,0,1
6,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201507,2015,7,1187,N,4,Y,0.208155,0,1
7,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201508,2015,8,970,Y,1,Y,0.208155,1,1
8,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201509,2015,9,542,N,4,Y,0.208155,0,1
9,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201510,2015,10,350,N,4,N,0.000000,0,0


## 01 상관관계 확인

In [13]:
corrDf = featuresData2.corr()
corrDf

Unnamed: 0,YEARWEEK,YEAR,WEEK,QTY,HCLUS,PRO_PERCENT,PM_YN
YEARWEEK,1.0,0.956598,0.275593,0.112267,-0.071586,0.404889,0.205916
YEAR,0.956598,1.0,-0.016493,0.028931,0.028593,0.321193,0.195931
WEEK,0.275593,-0.016493,1.0,0.289766,-0.339943,0.329705,0.060206
QTY,0.112267,0.028931,0.289766,1.0,-0.53723,0.700195,0.612451
HCLUS,-0.071586,0.028593,-0.339943,-0.53723,1.0,-0.545619,-0.374072
PRO_PERCENT,0.404889,0.321193,0.329705,0.700195,-0.545619,1.0,0.898554
PM_YN,0.205916,0.195931,0.060206,0.612451,-0.374072,0.898554,1.0


In [14]:
# 인덱스컬럼 초기화
corrResult = corrDf.reset_index()
corrResult

Unnamed: 0,index,YEARWEEK,YEAR,WEEK,QTY,HCLUS,PRO_PERCENT,PM_YN
0,YEARWEEK,1.0,0.956598,0.275593,0.112267,-0.071586,0.404889,0.205916
1,YEAR,0.956598,1.0,-0.016493,0.028931,0.028593,0.321193,0.195931
2,WEEK,0.275593,-0.016493,1.0,0.289766,-0.339943,0.329705,0.060206
3,QTY,0.112267,0.028931,0.289766,1.0,-0.53723,0.700195,0.612451
4,HCLUS,-0.071586,0.028593,-0.339943,-0.53723,1.0,-0.545619,-0.374072
5,PRO_PERCENT,0.404889,0.321193,0.329705,0.700195,-0.545619,1.0,0.898554
6,PM_YN,0.205916,0.195931,0.060206,0.612451,-0.374072,0.898554,1.0


In [15]:
# 상관관계 계수 비율 선정
featuresStd = 0.5

# feature 선택
features = corrResult[(corrResult.QTY != 1)\
                      &(abs(corrResult.QTY)>featuresStd)]["index"].tolist()
features

['HCLUS', 'PRO_PERCENT', 'PM_YN']

In [16]:
label = ["QTY"]
label

['QTY']

## 02 특성 선정

In [17]:
yearweekStd = 201630

In [18]:
# 학습하려면 1,2번 사용이 적합

trainingData_features = featuresData2[featuresData2.YEARWEEK <= yearweekStd][features]
trainingData_label = featuresData2[featuresData2.YEARWEEK <= yearweekStd][label]
testData_features = featuresData2[featuresData2.YEARWEEK > yearweekStd][features]
testData_label = featuresData2[featuresData2.YEARWEEK > yearweekStd]

In [19]:
testData_label = testData_label.reset_index(drop=True)
testData_label

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,HD_YN,PM_YN
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201631,2016,31,1522,N,4,Y,0.280258,0,1
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201632,2016,32,2100,N,4,Y,0.280258,0,1
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201633,2016,33,43,N,4,N,0.0,0,0
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201634,2016,34,1700,Y,1,Y,0.308584,1,1
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201635,2016,35,1514,Y,1,Y,0.308584,1,1
5,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201636,2016,36,1501,Y,1,Y,0.308584,1,1
6,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201637,2016,37,1491,N,4,Y,0.308584,0,1
7,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201638,2016,38,806,N,4,Y,0.308584,0,1
8,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201639,2016,39,2111,N,4,Y,0.280258,0,1
9,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201640,2016,40,2400,N,4,Y,0.280258,0,1


In [20]:
trainingData_label

Unnamed: 0,QTY
0,1225
1,968
2,1209
3,1810
4,1773
5,867
6,1187
7,970
8,542
9,350


## 03 모델 적용

In [21]:
# Define Model
model_method = tree.DecisionTreeClassifier()

In [22]:
# Learning (학습지표 & 결과값)
model = model_method.fit(trainingData_features,trainingData_label)

model

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

## 04 예측

In [23]:
# 예측
predict = model.predict(testData_features)
predict

array([ 973,  973,   16,  968,  968,  968,  973,  973,  973,  973, 1172,
        973,   16, 2033, 2033, 2033, 2033, 2033, 2033, 2033, 2033, 2033],
      dtype=int64)

In [24]:
predictData = pd.DataFrame(predict, columns=["PREDICT"])

predictData

Unnamed: 0,PREDICT
0,973
1,973
2,16
3,968
4,968
5,968
6,973
7,973
8,973
9,973


In [25]:
testData = pd.concat([testData_label, predictData], axis=1)

In [26]:
testData

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,HD_YN,PM_YN,PREDICT
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201631,2016,31,1522,N,4,Y,0.280258,0,1,973
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201632,2016,32,2100,N,4,Y,0.280258,0,1,973
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201633,2016,33,43,N,4,N,0.0,0,0,16
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201634,2016,34,1700,Y,1,Y,0.308584,1,1,968
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201635,2016,35,1514,Y,1,Y,0.308584,1,1,968
5,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201636,2016,36,1501,Y,1,Y,0.308584,1,1,968
6,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201637,2016,37,1491,N,4,Y,0.308584,0,1,973
7,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201638,2016,38,806,N,4,Y,0.308584,0,1,973
8,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201639,2016,39,2111,N,4,Y,0.280258,0,1,973
9,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201640,2016,40,2400,N,4,Y,0.280258,0,1,973


## 결과 검증 (정확도 측정)

In [27]:
### MAE
# 실제값과 예측값의 평균오차
mean_absolute_error(testData['QTY'], testData['PREDICT'])

### RMSE (MSE)
# 예측값 오차를 제곱 -> 평균 -> 루트
# 예측값 에러 잡기.
# MAE와 RMSE값 차이가 심하면, 안정적 모델이 아님. 쓸모없는 모델. useless
mean_squared_error(testData['QTY'], testData['PREDICT'])

### R2
r2_score(testData['QTY'], testData['PREDICT']) 

-0.2162570590202968

In [31]:
mean_absolute_error(testData['QTY'], testData['PREDICT'])

617.3636363636364

In [32]:
mean_squared_error(testData['QTY'], testData['PREDICT'])

567875.9090909091

In [33]:
r2_score(testData['QTY'], testData['PREDICT']) 

-0.2162570590202968