In [1]:
import numpy as np
import pandas as pd

from sklearn import datasets, tree

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

## 지역, 상품, 아이템별 14별 146개 아이템 추출

In [3]:
selloutData = pd.read_csv("../dataset2/kopo_decision_tree_all_new.csv")

In [2]:
groupKey = ["REGIONID","PRODUCTGROUP","PRODUCT","ITEM"]

In [8]:
groupData = selloutData.groupby(groupKey)["YEARWEEK"].agg(["size"]).reset_index()

In [9]:
groupData.rename(columns={"size":"KNOB"}, inplace=True)

In [13]:
groupData.head()

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,KNOB
0,A01,PG01,P01,ITEM001,3
1,A01,PG01,P01,ITEM002,2
2,A01,PG01,P01,ITEM003,1
3,A01,PG01,P01,ITEM004,7
4,A01,PG01,P01,ITEM005,8


In [12]:
mergedData = pd.merge(left=selloutData,
                        right=groupData,
                        on=groupKey,
                        how="left")

mergedData.head()

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,KNOB
0,A01,PG01,P01,ITEM001,201538,2015,38,1,N,4,N,0.0,3
1,A01,PG01,P01,ITEM001,201548,2015,48,1,Y,0,N,0.0,3
2,A01,PG01,P01,ITEM001,201549,2015,49,2,Y,0,N,0.0,3
3,A01,PG01,P01,ITEM002,201526,2015,26,1,Y,1,N,0.0,2
4,A01,PG01,P01,ITEM002,201532,2015,32,1,N,4,N,0.0,2


In [16]:
maxKnob = mergedData.KNOB.max()
maxKnob

146

In [19]:
refinedData = mergedData[mergedData.KNOB >= maxKnob]
len(refinedData)

7592

In [24]:
sortKey = ["REGIONID","PRODUCTGROUP","PRODUCT","ITEM","YEARWEEK"]

In [26]:
sortedData = refinedData.sort_values(sortKey).reset_index(drop=True)
sortedData.head()

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,KNOB
0,A01,PG02,P03,ITEM043,201501,2015,1,87,Y,1,Y,0.19759,146
1,A01,PG02,P03,ITEM043,201502,2015,2,60,N,4,Y,0.19759,146
2,A01,PG02,P03,ITEM043,201503,2015,3,51,N,4,N,0.0,146
3,A01,PG02,P03,ITEM043,201504,2015,4,37,Y,2,N,0.0,146
4,A01,PG02,P03,ITEM043,201505,2015,5,136,N,4,Y,0.201205,146


In [27]:
sortedData.YEARWEEK.min()

201501

In [29]:
sortedData.describe()

Unnamed: 0,YEARWEEK,YEAR,WEEK,QTY,HCLUS,PRO_PERCENT,KNOB
count,7592.0,7592.0,7592.0,7592.0,7592.0,7592.0,7592.0
mean,201616.917808,2015.917808,25.136986,173.690595,2.80137,0.242486,146.0
std,78.836685,0.798226,14.534164,338.459991,1.533384,0.116065,0.0
min,201501.0,2015.0,1.0,1.0,0.0,0.0,146.0
25%,201537.0,2015.0,13.0,18.0,1.0,0.180645,146.0
50%,201620.5,2016.0,25.0,49.0,4.0,0.249953,146.0
75%,201705.0,2017.0,37.0,175.0,4.0,0.318242,146.0
max,201741.0,2017.0,53.0,5798.0,4.0,0.550644,146.0


In [46]:
stdYearweek = 201701

In [30]:
groupKey

['REGIONID', 'PRODUCTGROUP', 'PRODUCT', 'ITEM']

In [32]:
groupData2 = sortedData.groupby(groupKey)

In [33]:
groupData2.get_group(list(groupData2.groups)[0])

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,KNOB
0,A01,PG02,P03,ITEM043,201501,2015,1,87,Y,1,Y,0.197590,146
1,A01,PG02,P03,ITEM043,201502,2015,2,60,N,4,Y,0.197590,146
2,A01,PG02,P03,ITEM043,201503,2015,3,51,N,4,N,0.000000,146
3,A01,PG02,P03,ITEM043,201504,2015,4,37,Y,2,N,0.000000,146
4,A01,PG02,P03,ITEM043,201505,2015,5,136,N,4,Y,0.201205,146
5,A01,PG02,P03,ITEM043,201506,2015,6,117,N,4,Y,0.201205,146
6,A01,PG02,P03,ITEM043,201507,2015,7,106,N,4,Y,0.201205,146
7,A01,PG02,P03,ITEM043,201508,2015,8,56,Y,1,N,0.000000,146
8,A01,PG02,P03,ITEM043,201509,2015,9,70,N,4,N,0.000000,146
9,A01,PG02,P03,ITEM043,201510,2015,10,167,N,4,Y,0.201205,146


In [55]:
def dtPredict(onegroup):
    onegroup = groupData2.get_group(list(groupData2.groups)[2])
    eachgroup = onegroup.reset_index(drop=True)
    
    corrdf = eachgroup.corr()
    corrStd = 0.5
    features = list(corrdf[ (abs(corrdf["QTY"] ) >= corrStd) & (corrdf["QTY"] != 1) ].index)
    label = ["QTY"]
    
    testData_all = eachgroup[ eachgroup.YEARWEEK >= stdYearweek ]
    testDataSize = len(testData_all)
    
    if testDataSize != 0:

        trainingData_features = eachgroup[ eachgroup.YEARWEEK < stdYearweek ][features]
        trainingData_label = eachgroup[ eachgroup.YEARWEEK < stdYearweek ][label]
        testData_features = eachgroup[ eachgroup.YEARWEEK >= stdYearweek ][features]
        testData_label = eachgroup[ eachgroup.YEARWEEK >= stdYearweek ][label]


        from sklearn import tree
        model_method = tree.DecisionTreeRegressor(random_state=32)
        model = model_method.fit(trainingData_features, trainingData_label)

        testData_all["PREDICT"] = model.predict(testData_features)

        return testData_all

In [None]:
predictDF = sortedData.groupby(groupKey).apply(dtPredict)

In [34]:
onegroup = groupData2.get_group(list(groupData2.groups)[2])
eachgroup = onegroup.reset_index(drop=True)

##### 1. 특성 선별

In [44]:
corrdf = eachgroup.corr()
corrStd = 0.5
features = list(corrdf[ (abs(corrdf["QTY"] ) >= corrStd) & (corrdf["QTY"] != 1) ].index)
label = ["QTY"]

In [45]:
print(features)
print(label)

['HCLUS']
['QTY']


##### 2. 데이터 분리(훈련/테스트)

In [48]:
trainingData_features = eachgroup[ eachgroup.YEARWEEK < stdYearweek ][features]
trainingData_label = eachgroup[ eachgroup.YEARWEEK < stdYearweek ][label]
testData_features = eachgroup[ eachgroup.YEARWEEK >= stdYearweek ][features]
testData_label = eachgroup[ eachgroup.YEARWEEK >= stdYearweek ][label]
testData_all = eachgroup[ eachgroup.YEARWEEK >= stdYearweek ]

##### 3. 모델 선언

In [49]:
from sklearn import tree

In [50]:
model_method = tree.DecisionTreeRegressor(random_state=32)

##### 4. 훈련 (훈련데이터 특성/답지 활용)

In [51]:
model = model_method.fit(trainingData_features, trainingData_label)

##### 5. 예측 (테스트 데이터의 feature)

In [52]:
model.predict(testData_features)

array([29.        , 19.88888889, 23.75      , 19.88888889, 19.88888889,
       19.88888889, 19.88888889, 29.        , 19.88888889, 19.88888889,
       19.88888889, 19.88888889, 19.88888889, 19.88888889, 19.88888889,
       19.88888889, 19.88888889, 19.88888889, 19.88888889, 29.        ,
       29.        , 29.        , 19.88888889, 19.88888889, 29.        ,
       29.        , 29.        , 29.        , 29.        , 19.88888889,
       19.88888889, 19.88888889, 19.88888889, 29.        , 29.        ,
       29.        , 19.88888889, 19.88888889, 19.88888889, 19.88888889,
       23.75      ])

In [53]:
testData_all["PREDICT"] = model.predict(testData_features)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [54]:
testData_all

Unnamed: 0,REGIONID,PRODUCTGROUP,PRODUCT,ITEM,YEARWEEK,YEAR,WEEK,QTY,HOLIDAY,HCLUS,PROMOTION,PRO_PERCENT,KNOB,PREDICT
105,A01,PG02,P03,ITEM051,201701,2017,1,17,Y,1,Y,0.180645,146,29.0
106,A01,PG02,P03,ITEM051,201702,2017,2,21,N,4,Y,0.180645,146,19.888889
107,A01,PG02,P03,ITEM051,201703,2017,3,18,Y,2,Y,0.180645,146,23.75
108,A01,PG02,P03,ITEM051,201704,2017,4,32,N,4,Y,0.180645,146,19.888889
109,A01,PG02,P03,ITEM051,201705,2017,5,23,N,4,Y,0.145161,146,19.888889
110,A01,PG02,P03,ITEM051,201706,2017,6,25,N,4,Y,0.180645,146,19.888889
111,A01,PG02,P03,ITEM051,201707,2017,7,25,N,4,Y,0.180645,146,19.888889
112,A01,PG02,P03,ITEM051,201708,2017,8,39,Y,1,Y,0.180645,146,29.0
113,A01,PG02,P03,ITEM051,201709,2017,9,22,N,4,Y,0.180645,146,19.888889
114,A01,PG02,P03,ITEM051,201710,2017,10,15,N,4,N,0.0,146,19.888889
