# Import Dataset

In [1]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999
%matplotlib inline 

In [None]:
sales19 = pd.read_csv('sales_case1_above19.csv')
sales19 = sales19.iloc[:,1:]
sales19 = sales19[sales19.판매단가!=0]

# Divide Dataset

In [29]:
Word = [col for col in sales19.columns if 'W_' in col] # 상품명
Cluster = [col for col in sales19.columns if 'C_' in col] # 상품군
Mother = [col for col in sales19.columns if 'M_' in col] # 마더코드
Product = [col for col in sales19.columns if 'P_' in col] # 상품코드

In [33]:
sales19_basic = sales19[['판매단가','월','일','시','분','노출(분)']]
sales19_word = sales19[Word]
sales19_cluster = sales19[Cluster]
sales19_mother = sales19[Mother]
sales19_product = sales19[Product]

## sales19_basic
- 판매단가, 월, 일, 시, 분, 노출(분)

### "월" dummy화

In [34]:
month_dummies = pd.get_dummies(sales19_basic['월'])
month_dummies.columns = ['month_' + str(col) for col in month_dummies.columns]

### "일"의 조건부 dummy화
- 월급일은 10일 or 25일
- dummies : 10일+3일 / 25일+3일 / 그 외

In [35]:
sales19_basic.loc[(sales19_basic['일']>=10)&(sales19_basic['일']<=13), '월급일1'] = 1
sales19_basic.loc[(sales19_basic['일']>=25)&(sales19_basic['일']<=28), '월급일2'] = 1
sales19_basic.iloc[:,[6,7]] = sales19_basic.iloc[:,[6,7]].fillna(0)

In [36]:
sales19_basic.drop(['월','일'],axis=1,inplace=True)

### "노출(분)"의 scaling
- 60분이 최대! 

In [37]:
sales19_basic['노출(분)'] = sales19_basic['노출(분)']/60

### "시"의 dummy화

In [38]:
night = [23,0,1,2]
dawn = [6,7,8]
morning = [9,10,11]
afternoon = [12,13,14,15,16,17]
dinner = [18,19,20,21,22]

In [39]:
sales19_basic.loc[(sales19_basic['시'].isin(night)), 'night'] = 1
sales19_basic.loc[(sales19_basic['시'].isin(dawn)), 'dawn'] = 1
sales19_basic.loc[(sales19_basic['시'].isin(morning)), 'morning'] = 1
sales19_basic.loc[(sales19_basic['시'].isin(afternoon)), 'afternoon'] = 1
sales19_basic.loc[(sales19_basic['시'].isin(dinner)), 'dinner'] = 1

In [40]:
sales19_basic.iloc[:,range(6,11)] = sales19_basic.iloc[:,range(6,11)].fillna(0)

In [41]:
sales19_basic.drop('시',axis=1,inplace=True)

### 각 제품의 마지막
- 각 홈쇼핑 코너의 시작 = "마더코드"의 변경점
- 각 홈쇼핑 코너의 시작 : change_start =1
- 각 홈쇼핑 코너의 끝 : change_finish =1

In [42]:
change_point = sales19[['마더코드']].ne(sales19[['마더코드']].shift()).apply(lambda x: x.index[x].tolist())

In [43]:
change_start = np.array(change_point.values[0])
change_finish = np.array(change_point.values[0])-1
change_finish = change_finish[1:]

In [44]:
sales19_basic.loc[(sales19_basic.index.isin(change_start)), 'change_start'] = 1
sales19_basic.loc[(sales19_basic.index.isin(change_finish)), 'change_finish'] = 1

In [45]:
sales19_basic[['change_start','change_finish']] = sales19_basic[['change_start','change_finish']].fillna(0)

In [46]:
sales19_basic.drop('분',axis=1,inplace=True)

### 계절

In [47]:
month_dummies['봄'] = month_dummies.iloc[:,[2,3,4,]].sum(axis=1)
month_dummies['여름'] = month_dummies.iloc[:,[5,6,7]].sum(axis=1)
month_dummies['가을'] = month_dummies.iloc[:,[8,9,10]].sum(axis=1)
month_dummies['겨울'] = month_dummies.iloc[:,[11,0,1]].sum(axis=1)

### 종합하기

In [48]:
sales19_final1 = pd.concat([sales19_basic,month_dummies],axis=1)

## sales19_word
- test에 19회 이상 등장 단어
- 불필요한것 빼주기 ( ex.1글자 )

In [49]:
long_word = [col for col in sales19_word.columns if len(col) >3 ] # 상품명

In [50]:
len(long_word) / len(sales19_word.columns)

0.8656716417910447

In [51]:
sales19_Lword = sales19_word.loc[:,long_word]

In [52]:
#sales19_Lword.sum().sort_values(ascending=True)

In [53]:
sales19_final2 = sales19_Lword.copy()

## sales19_cluster

In [54]:
sales19_final3 = sales19_cluster.copy()

## sales19_mother
- 모든 mothercode를 더미화 하지는 않을 것
- 후보 : 10회 / 20회 / 30회 이상 등장

In [57]:
temp10 = sales19_mother.sum().sort_values(ascending=True)>=10
above10 = [col for col in sales19_mother.columns if col in temp10[temp10==True].index]

temp20 = sales19_mother.sum().sort_values(ascending=True)>=20
above20 = [col for col in sales19_mother.columns if col in temp20[temp20==True].index]

temp30 = sales19_mother.sum().sort_values(ascending=True)>=30
above30 = [col for col in sales19_mother.columns if col in temp30[temp30==True].index]

In [58]:
sales19_mother_ab10 = sales19_mother.loc[:,above10]
sales19_mother_ab20 = sales19_mother.loc[:,above20]
sales19_mother_ab30 = sales19_mother.loc[:,above30]

- 일단은 20회 이상으로 선택

In [59]:
sales19_mother_ab20.shape

(37372, 602)

In [60]:
sales19_final4 = sales19_mother_ab20.copy()

## sales19_product
- 정확히 제품을 알려주는 것이기 때문에....
- 더미화 보류!

In [None]:

#sales19_y = sales19[['취급액']]

# sales19의 X변수 총정리 

### sales19_final1 (기본 정보)
- [시간 정보] 노출(분), 월급일 전후, 방영 시간대, 방송초반/후반부, 월, 계절,
- [가격 정보] 판매단가

### sales19_final2 (상품명 정보)
- 상품명에 자주 등장하는 단어 더미화
- 조건 1) test data에서 19회 이상 등장한 단어
- 조건 2) 2글자 이상 단어만

### sales19_final3 (상품군 정보)
- 12가지 상품군 더미화

### sales19_final4 (마더코드 정보)
- 20회 이상 등장한 마더코드만 더미화

### (sales19_final5) (상품코드 정보)
- 아직 사용할지 미정

In [61]:
sales_X = pd.concat([sales19_final1,sales19_final2,sales19_final3,sales19_final4],axis=1)

In [62]:
sales_X.shape

(37372, 815)

In [65]:
sales_Y = sales19_y.copy()

In [82]:
sales_X.to_csv('X_ver1.csv')
sales_Y.to_csv('Y.csv')

# Modeling

In [80]:
import time

In [67]:
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [68]:
x_train,x_test,y_train,y_test = train_test_split(sales_X,sales_Y,
                                                 test_size=0.2,random_state=42)

In [78]:
X = np.array(x_train)
y = np.array(y_train)

In [79]:
names = ['RandomForest', 'LGBM', 'XGB']
reg_list = [RandomForestRegressor(random_state=42), 
            lgb.LGBMRegressor(random_state=42), 
            xgb.XGBRegressor(random_state=42) ]

In [81]:
for name, reg in zip(names, reg_list):
    start = time.time()
    reg.fit(X,y)
    print('---- {} ----'.format(name))    
    print('cv score : ', cross_val_score(reg,X,y, cv=5).mean())
    print('time spent : ',time.time()-start)
    print('-----------'.format(name))    

---- RandomForest ----
cv score :  0.6434664125561793
time spent :  194.32971382141113
-----------
---- LGBM ----


MemoryError: 

In [None]:
def MAPE(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100