## Decision Tree Example

##### 라이브러리 선언

In [1]:
import numpy as np
import pandas as pd

In [2]:
# 모델 라이브러리 선언
from sklearn import datasets, tree

In [3]:
# 모델 정확도 라이브러리 선언
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [4]:
# Read the CSV file into a DataFrame and preview its first rows.
featuresData = pd.read_csv("../dataset2/feature_regression_example.csv")
featuresData.head()

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1,1225,Y,1,Y,0.209442
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2,968,N,4,Y,0.209442
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201503,2015,3,1209,N,4,Y,0.208155
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201504,2015,4,1810,Y,2,Y,0.208155
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201505,2015,5,1773,N,4,Y,0.208155


##### [실습]  ----------------------------------------------------------------------------------------------------------------------------------------------------
YEARWEEK, YEAR, WEEK를 int 타입으로 설정하고,  
HOLIDAY 및 PROMOTION 컬럼의 값을 Y→1, N→0으로 변환한 컬럼을 추가로 생성하세요.  
HO_YN 컬럼: HOLIDAY(Y) -> 1, HOLIDAY(N) -> 0  
PRO_YN 컬럼: PROMOTION(Y) -> 1, PROMOTION(N) -> 0  

In [5]:
 # Inspect the dtype of every column (YEARWEEK, YEAR and WEEK are reported as int64 below).
 featuresData.dtypes

REGIONID         object
PRODUCTGROUP     object
PRODUCT          object
ITEM             object
YEARWEEK          int64
YEAR              int64
WEEK              int64
QTY               int64
HOLIDAY          object
HCLUS             int64
PROMOTION        object
PRO_PERCENT     float64
dtype: object

In [6]:
# Binary flag columns: 1 where the source column equals 'Y', otherwise 0.
featuresData['HO_YN'] = (featuresData['HOLIDAY'] == 'Y').astype(int)
featuresData['PRO_YN'] = (featuresData['PROMOTION'] == 'Y').astype(int)

In [7]:
# Preview the first rows, now including the HO_YN and PRO_YN flag columns.
featuresData.head()

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,HO_YN,PRO_YN
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1,1225,Y,1,Y,0.209442,1,1
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2,968,N,4,Y,0.209442,0,1
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201503,2015,3,1209,N,4,Y,0.208155,0,1
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201504,2015,4,1810,Y,2,Y,0.208155,1,1
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201505,2015,5,1773,N,4,Y,0.208155,0,1


In [8]:
# One-hot encode PROMOTION; the generated columns carry the "PROMO" prefix.
pd.get_dummies(featuresData["PROMOTION"], prefix="PROMO")

Unnamed: 0,PROMO_N,PROMO_Y
0,0,1
1,0,1
2,0,1
3,0,1
4,0,1
5,0,1
6,0,1
7,0,1
8,0,1
9,1,0


##### 다른 풀이방식 1-----------------------------------------------------------------------------------------------------------------------------------------

In [9]:
# Attach the PROMOTION one-hot columns column-wise and preview the first two rows.
pd.concat(
    objs=[featuresData, pd.get_dummies(featuresData["PROMOTION"], prefix="PROMO")],
    axis=1,
).head(2)

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,HO_YN,PRO_YN,PROMO_N,PROMO_Y
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1,1225,Y,1,Y,0.209442,1,1,0,1
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2,968,N,4,Y,0.209442,0,1,0,1


##### 다른 풀이방식 2  -----------------------------------------------------------------------------------------------------------------------------------------
: 딥러닝에 자주 사용

In [10]:
from sklearn.preprocessing import LabelEncoder

# One encoder per column, so each keeps the classes of the column it was
# fitted on and can later decode that column with inverse_transform.
le_pro = LabelEncoder()
le_holy = LabelEncoder()

featuresData["PRO_ENCO"] = le_pro.fit_transform(featuresData.PROMOTION)
# HOLIDAY gets its own encoder so that le_pro stays fitted on PROMOTION.
featuresData["HOLY_ENCO"] = le_holy.fit_transform(featuresData.HOLIDAY)

In [13]:
# Display the frame with the PRO_ENCO and HOLY_ENCO columns added.
featuresData

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,HO_YN,PRO_YN,PRO_ENCO,HOLY_ENCO
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1,1225,Y,1,Y,0.209442,1,1,1,1
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2,968,N,4,Y,0.209442,0,1,1,0
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201503,2015,3,1209,N,4,Y,0.208155,0,1,1,0
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201504,2015,4,1810,Y,2,Y,0.208155,1,1,1,1
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201505,2015,5,1773,N,4,Y,0.208155,0,1,1,0
5,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201506,2015,6,867,N,4,Y,0.208155,0,1,1,0
6,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201507,2015,7,1187,N,4,Y,0.208155,0,1,1,0
7,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201508,2015,8,970,Y,1,Y,0.208155,1,1,1,1
8,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201509,2015,9,542,N,4,Y,0.208155,0,1,1,0
9,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201510,2015,10,350,N,4,N,0.000000,0,0,0,0


In [42]:
# Decode the integer labels back to their original string values.
# NOTE(review): HOLY_ENCO is decoded with le_pro rather than le_holy; this yields
# the right 'Y'/'N' values only while le_pro's fitted classes match HOLIDAY's — confirm.
le_pro.inverse_transform(featuresData["HOLY_ENCO"])

  if diff:


array(['Y', 'N', 'N', 'Y', 'N', 'N', 'N', 'Y', 'N', 'N', 'N', 'N', 'N',
       'N', 'N', 'N', 'N', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'N', 'Y', 'Y',
       'Y', 'Y', 'Y', 'N', 'N', 'N', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'N',
       'N', 'N', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'Y',
       'Y', 'N', 'N', 'Y', 'N', 'N', 'N', 'Y', 'N', 'N', 'N', 'N', 'N',
       'N', 'N', 'N', 'N', 'N', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'N', 'Y',
       'Y', 'Y', 'Y', 'Y', 'N', 'N', 'N', 'N', 'Y', 'Y', 'Y', 'N', 'N',
       'N', 'N', 'Y', 'N', 'N', 'Y', 'Y', 'Y', 'Y', 'Y', 'N', 'N', 'N',
       'Y', 'N', 'N'], dtype=object)

##### 다른 풀이방식 3 (dictionary mapping)  ----------------------------------------------------------------------------------------------------------
: 커스터마이징 가능

In [15]:
# Explicit lookup table from category value to numeric code.
binarymap = {
    'Y': 1,
    "N": 0,
}

# Series.map translates each value through the table.
featuresData["PRO_DICT"] = featuresData["PROMOTION"].map(binarymap)
featuresData["HOLY_DICT"] = featuresData["HOLIDAY"].map(binarymap)
featuresData.head(3)

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,HO_YN,PRO_YN,PRO_ENCO,HOLY_ENCO,PRO_DICT,HOLY_DICT
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1,1225,Y,1,Y,0.209442,1,1,1,1,1,1
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2,968,N,4,Y,0.209442,0,1,1,0,1,0
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201503,2015,3,1209,N,4,Y,0.208155,0,1,1,0,1,0


##### [실습] ----------------------------------------------------------------------------------------------------------------------------------------------------  
데이터 범위를 YEARWEEK 기준  
201501 ~ 201652 구간만 조회하여  
featuresData2 변수에 저장하세요.  

In [16]:
# Keep only the rows whose YEARWEEK lies in the inclusive range 201501..201652.
featuresData2 = featuresData[featuresData["YEARWEEK"].between(201501, 201652)]

In [17]:
# Row count after the YEARWEEK range filter.
len(featuresData2)

105

In [18]:
# Row count of the unfiltered frame, for comparison.
len(featuresData)

107

## 2. 특성선정 및 데이터 분리

In [19]:
# Pairwise correlation matrix of the numeric columns only. The string columns
# (REGIONID, PRODUCT, HOLIDAY, ...) are excluded explicitly because recent
# pandas versions raise on non-numeric data instead of silently skipping it.
corrdf = featuresData.select_dtypes(include="number").corr()

In [20]:
# Threshold on the absolute correlation with QTY used to filter corrdf below.
corrstd = 0.5

In [21]:
# A negative correlation means the column moves inversely to the sales quantity
# (QTY); a positive one means it moves with it. Keep the rows whose absolute
# correlation with QTY exceeds corrstd.
corrdf.loc[corrdf["QTY"].abs() > corrstd]

Unnamed: 0,YEARWEEK,YEAR,WEEK,QTY,HCLUS,PRO_PERCENT,HO_YN,PRO_YN,PRO_ENCO,HOLY_ENCO,PRO_DICT,HOLY_DICT
QTY,0.037392,-0.048803,0.307541,1.0,-0.54492,0.712772,0.514813,0.630081,0.630081,0.514813,0.630081,0.514813
HCLUS,-0.030681,0.067443,-0.349205,-0.54492,1.0,-0.552991,-0.974902,-0.386926,-0.386926,-0.974902,-0.386926,-0.974902
PRO_PERCENT,0.30032,0.208435,0.347462,0.712772,-0.552991,1.0,0.496585,0.903477,0.903477,0.496585,0.903477,0.496585
HO_YN,0.009395,-0.070803,0.284231,0.514813,-0.974902,0.496585,1.0,0.378861,0.378861,1.0,0.378861,1.0
PRO_YN,0.108551,0.085606,0.089293,0.630081,-0.386926,0.903477,0.378861,1.0,1.0,0.378861,1.0,0.378861
PRO_ENCO,0.108551,0.085606,0.089293,0.630081,-0.386926,0.903477,0.378861,1.0,1.0,0.378861,1.0,0.378861
HOLY_ENCO,0.009395,-0.070803,0.284231,0.514813,-0.974902,0.496585,1.0,0.378861,0.378861,1.0,0.378861,1.0
PRO_DICT,0.108551,0.085606,0.089293,0.630081,-0.386926,0.903477,0.378861,1.0,1.0,0.378861,1.0,0.378861
HOLY_DICT,0.009395,-0.070803,0.284231,0.514813,-0.974902,0.496585,1.0,0.378861,0.378861,1.0,0.378861,1.0


In [22]:
# Names of the columns that pass the correlation threshold (QTY itself included).
corrdf.loc[corrdf["QTY"].abs() > corrstd].index.tolist()

['QTY',
 'HCLUS',
 'PRO_PERCENT',
 'HO_YN',
 'PRO_YN',
 'PRO_ENCO',
 'HOLY_ENCO',
 'PRO_DICT',
 'HOLY_DICT']

##### [실습] ----------------------------------------------------------------------------------------------------------------------------------------------------  
YEARWEEK 기준으로 오름차순 정렬한 후,
전체 데이터 개수 * 0.8 위치의 인덱스를 기준으로
데이터를 분리하세요.

In [23]:
# Order the rows by ascending YEARWEEK and display the sorted frame.
featuresData = featuresData.sort_values(by="YEARWEEK", ascending=True)
featuresData

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,HO_YN,PRO_YN,PRO_ENCO,HOLY_ENCO,PRO_DICT,HOLY_DICT
0,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201501,2015,1,1225,Y,1,Y,0.209442,1,1,1,1,1,1
1,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201502,2015,2,968,N,4,Y,0.209442,0,1,1,0,1,0
2,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201503,2015,3,1209,N,4,Y,0.208155,0,1,1,0,1,0
3,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201504,2015,4,1810,Y,2,Y,0.208155,1,1,1,1,1,1
4,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201505,2015,5,1773,N,4,Y,0.208155,0,1,1,0,1,0
5,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201506,2015,6,867,N,4,Y,0.208155,0,1,1,0,1,0
6,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201507,2015,7,1187,N,4,Y,0.208155,0,1,1,0,1,0
7,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201508,2015,8,970,Y,1,Y,0.208155,1,1,1,1,1,1
8,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201509,2015,9,542,N,4,Y,0.208155,0,1,1,0,1,0
9,SEOUL_BANK_001,PG02,PRODUCT0010,ITEM0115,201510,2015,10,350,N,4,N,0.000000,0,0,0,0,0,0


In [24]:
# Look up a YEARWEEK value by position (here: the third row).
featuresData["YEARWEEK"].tolist()[2]

201503

In [52]:
# Positional split point: 80% of the row count, rounded to an integer.
indexStd = round(0.8 * len(featuresData))
# YEARWEEK value of the row sitting at the split position.
yearweekStd = featuresData["YEARWEEK"].tolist()[indexStd]

yearweekStd

201634

In [75]:
# Select the feature columns and the label column.

# Why a 0.5 threshold? With a cutoff around 0.7-0.8 or higher almost nothing
# qualifies as a feature (only PRO_PERCENT exceeds 0.7 in the table above),
# so 0.5 is used for now.
featuresStd = 0.5

label = ['QTY']
# Exclude the label by name rather than by testing its self-correlation
# against 1: that test relies on exact float equality and would also drop any
# feature that happens to be perfectly correlated with QTY.
features = corrdf[(abs(corrdf.QTY) > featuresStd)
                  & ~corrdf.index.isin(label)].index.tolist()

In [76]:
# Display the selected feature column names.
features

['HCLUS',
 'PRO_PERCENT',
 'HO_YN',
 'PRO_YN',
 'PRO_ENCO',
 'HOLY_ENCO',
 'PRO_DICT',
 'HOLY_DICT']

In [77]:
# Training set: the rows located before the split position indexStd in the
# YEARWEEK-sorted frame, i.e. the data preceding the reference year-week.
trainingData_features = featuresData[features].iloc[:indexStd]
trainingData_label = featuresData[label].iloc[:indexStd]
len(trainingData_features)

86

In [102]:
# Test set: the rows from the split position indexStd onward in the
# YEARWEEK-sorted frame, i.e. the data from the reference year-week on.
testData_features = featuresData[features].iloc[indexStd:]
testData_label = featuresData[label].iloc[indexStd:]
len(testData_features)

21

## 3. 모델 선언 및 학습

In [103]:
# Define the model: a decision-tree regressor with a fixed random_state of 1.
model_method = tree.DecisionTreeRegressor(random_state=1)

In [104]:
# Train on the training features and label; the value returned by fit() is kept as `model`.
model = model_method.fit(trainingData_features, trainingData_label)

In [105]:
# Display the fitted estimator and its hyper-parameters.
model

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1, splitter='best')

In [106]:
# Predict on the test features and wrap the resulting array in a DataFrame.
predict = pd.DataFrame(model.predict(testData_features))

In [107]:
# Replace the label frame's index with a fresh 0..n-1 range (old index dropped)
# so it can be placed side by side with the predictions later.
testData_label = testData_label.reset_index(drop=True)

In [116]:
# Display predict with a reset index; the result is not assigned, so predict itself is unchanged.
predict.reset_index(drop=True)

Unnamed: 0,0
0,1434.6
1,1434.6
2,1434.6
3,1434.6
4,1434.6
5,1708.75
6,1708.75
7,2620.428571
8,1708.75
9,336.727273


In [124]:
# Rename the single prediction column to PREDICT.
predict.columns = ["PREDICT"]

In [125]:
# Display the predictions under their new column name.
predict

Unnamed: 0,PREDICT
0,1434.6
1,1434.6
2,1434.6
3,1434.6
4,1434.6
5,1708.75
6,1708.75
7,2620.428571
8,1708.75
9,336.727273


In [126]:
# Show the actual QTY next to the predicted value, aligned on the row index.
pd.concat(objs=[testData_label, predict], axis="columns")

Unnamed: 0,QTY,PREDICT
0,1700,1434.6
1,1514,1434.6
2,1501,1434.6
3,1491,1434.6
4,806,1434.6
5,2111,1708.75
6,2400,1708.75
7,2010,2620.428571
8,1900,1708.75
9,141,336.727273
