In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import lightgbm as lgb
import catboost as cat

import warnings

warnings.filterwarnings(action='ignore')

In [2]:
train = pd.read_csv('../data/preprocessed_train_final_v2.csv')
test = pd.read_csv('../data/preprocessed_test_final_v2.csv')
train.reset_index(drop=True,inplace=True)
test.reset_index(drop=True,inplace=True)

In [3]:
def mape(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

def print_score(pred, test):
    print('MSE:',mean_squared_error(pred,test))
    print('MAE:',mean_absolute_error(pred,test))
    print('r2_score:',r2_score(pred,test))
    print('MAPE:',mape(pred,test))

In [4]:
train.head()

Unnamed: 0,노출(분),마더코드,상품코드,판매단가,취급액,방송진행도,기온(°C),풍속(m/s),습도(%),1시간평균 미세먼지농도(㎍/㎥),...,1,2,3,4,5,6,7,8,9,히트상품
0,30.0,100075,200203,399000,50000,0.5,9.610555,1.909455,83.912505,22.797014,...,0,0,0,0,0,1568,1569,1133,939,0
1,30.0,100075,200203,399000,863000,1.0,9.610555,1.909455,83.912505,22.797014,...,0,0,0,0,0,1568,1569,1133,939,0
2,30.0,100075,200204,1190000,2281000,0.5,9.610555,1.909455,83.912505,22.797014,...,0,0,0,0,1568,1569,1,1,1,0
3,30.0,100075,200204,1190000,2281000,1.0,9.610555,1.909455,83.912505,22.797014,...,0,0,0,0,1568,1569,1,1,1,0
4,30.0,100075,200205,1390000,59023000,1.0,9.610555,1.909455,83.912505,22.797014,...,0,0,0,0,1568,1569,1,236,1,1


In [5]:
test.head()

Unnamed: 0,노출(분),마더코드,상품코드,판매단가,취급액,방송진행도,기온(°C),풍속(m/s),습도(%),1시간평균 미세먼지농도(㎍/㎥),...,1,2,3,4,5,6,7,8,9,히트상품
0,20.0,100650,201971,59800,-1.0,0.333333,17.587617,2.248339,91.765786,12.016986,...,0,0,0,0,0,1,8,1,26,0
1,20.0,100650,201971,59800,-1.0,0.666667,17.587617,2.248339,91.765786,12.016986,...,0,0,0,0,0,1,8,1,26,0
2,20.0,100650,201971,59800,-1.0,1.0,17.981129,2.50696,88.190405,11.258748,...,0,0,0,0,0,1,8,1,26,0
3,20.0,100445,202278,69900,-1.0,0.333333,17.981129,2.50696,88.190405,11.258748,...,0,0,0,0,235,130,861,763,740,0
4,20.0,100445,202278,69900,-1.0,0.666667,17.981129,2.50696,88.190405,11.258748,...,0,0,0,0,235,130,861,763,740,0


In [6]:
print(train.shape, test.shape)

(37372, 118) (2716, 118)


In [7]:
train.columns

Index(['노출(분)', '마더코드', '상품코드', '판매단가', '취급액', '방송진행도', '기온(°C)', '풍속(m/s)',
       '습도(%)', '1시간평균 미세먼지농도(㎍/㎥)',
       ...
       '1', '2', '3', '4', '5', '6', '7', '8', '9', '히트상품'],
      dtype='object', length=118)

In [8]:
test.columns

Index(['노출(분)', '마더코드', '상품코드', '판매단가', '취급액', '방송진행도', '기온(°C)', '풍속(m/s)',
       '습도(%)', '1시간평균 미세먼지농도(㎍/㎥)',
       ...
       '1', '2', '3', '4', '5', '6', '7', '8', '9', '히트상품'],
      dtype='object', length=118)

In [9]:
Ttrain = train.drop('취급액',axis=1)
Ttarget = train['취급액']
Ttest = test.drop('취급액',axis=1)

In [10]:
# min - max 스케일링
mm = MinMaxScaler()
mm = mm.fit(Ttrain)
Ttrain = mm.transform(Ttrain)
Ttest = mm.transform(Ttest)

In [11]:
Ttrain

array([[7.33570160e-01, 8.83392226e-02, 8.07799443e-02, ...,
        7.17542749e-01, 5.94800254e-01, 0.00000000e+00],
       [7.33570160e-01, 8.83392226e-02, 8.07799443e-02, ...,
        7.17542749e-01, 5.94800254e-01, 0.00000000e+00],
       [7.33570160e-01, 8.83392226e-02, 8.11778750e-02, ...,
        6.33312223e-04, 0.00000000e+00, 0.00000000e+00],
       ...,
       [4.67140320e-01, 9.68197880e-01, 9.63788301e-01, ...,
        8.73970868e-02, 4.36905517e-01, 1.00000000e+00],
       [4.67140320e-01, 9.68197880e-01, 9.63788301e-01, ...,
        8.73970868e-02, 4.36905517e-01, 1.00000000e+00],
       [4.67140320e-01, 9.68197880e-01, 9.63788301e-01, ...,
        8.73970868e-02, 4.36905517e-01, 1.00000000e+00]])

In [12]:
Ttest

array([[4.67140320e-01, 7.65606596e-01, 7.84321528e-01, ...,
        6.33312223e-04, 1.58528852e-02, 0.00000000e+00],
       [4.67140320e-01, 7.65606596e-01, 7.84321528e-01, ...,
        6.33312223e-04, 1.58528852e-02, 0.00000000e+00],
       [4.67140320e-01, 7.65606596e-01, 7.84321528e-01, ...,
        6.33312223e-04, 1.58528852e-02, 0.00000000e+00],
       ...,
       [2.00710480e-01, 1.16607774e-01, 1.09033028e-01, ...,
        1.77327422e-02, 9.25808497e-02, 0.00000000e+00],
       [4.67140320e-01, 3.07420495e-01, 3.48189415e-01, ...,
        6.33312223e-04, 0.00000000e+00, 0.00000000e+00],
       [3.58792185e-01, 3.07420495e-01, 3.48189415e-01, ...,
        6.33312223e-04, 0.00000000e+00, 0.00000000e+00]])

In [13]:
print(Ttrain.shape, Ttest.shape)

(37372, 117) (2716, 117)


In [14]:
folds = 10
KF = KFold(n_splits = folds, shuffle = True, random_state=42)

In [15]:
xgb_avg = 0
lgb_avg = 0
cat_avg = 0

xgb_preds = np.zeros([Ttest.shape[0],])
lgb_preds = np.zeros([Ttest.shape[0],])
cat_preds = np.zeros([Ttest.shape[0],])

# Catboost

In [16]:
cat_params = {
    'iterations':1000,
    'learning_rate':0.05,
    'depth':16,
    'loss_function':'MAE',
    'eval_metric':'MAPE',
    'thread_count':16,
    'rsm':0.95,
    'bagging_temperature':0.8
}
    
for i,(train_ind, test_ind) in enumerate(KF.split(Ttrain)):
    print('========Fold',i+1)
    Xtrain, XCV, ytrain, yCV = Ttrain[train_ind], Ttrain[test_ind], Ttarget[train_ind], Ttarget[test_ind]
    
    cat_model = cat.CatBoostRegressor(**cat_params)
    
    cat_model.fit(Xtrain,ytrain,eval_set=[(XCV,yCV)],early_stopping_rounds=30,verbose=False)
    
    pred = cat_model.predict(XCV)
    
    print_score(pred,yCV)
    
    cat_preds += cat_model.predict(Ttest)/folds
    cat_avg += mape(pred,yCV)/folds

print('\n\nAverage MAPE Score:',cat_avg)

MSE: 105860141751357.89
MAE: 6029612.948834241
r2_score: 0.6260189223285452
MAPE: 46.855937601339775
MSE: 91124741873847.38
MAE: 5928682.545741984
r2_score: 0.6445794876234305
MAPE: 43.97311691734133
MSE: 88240701945234.12
MAE: 5826442.897895915
r2_score: 0.6620942273350834
MAPE: 43.80996796829579
MSE: 101088877907491.8
MAE: 5968301.316213514
r2_score: 0.6084921506600804
MAPE: 63.604305331899845
MSE: 91926708060629.16
MAE: 5975684.739033878
r2_score: 0.670151168611727
MAPE: 46.40178509395499
MSE: 83769489206025.28
MAE: 5732536.441860436
r2_score: 0.6791684159694867
MAPE: 46.340723446797945
MSE: 97935622402459.12
MAE: 6096946.304048937
r2_score: 0.5991110012363692
MAPE: 44.256113574669065
MSE: 79031447229358.86
MAE: 5855724.420476607
r2_score: 0.703321319524371
MAPE: 45.602874751898554
MSE: 93609776088150.86
MAE: 6250709.269981254
r2_score: 0.6342034973373141
MAPE: 43.16660684762426
MSE: 112202658433972.3
MAE: 6135567.556116151
r2_score: 0.5698295564615918
MAPE: 41.28952059266409


Aver

# XGB model

In [17]:
xgb_params = {
    'n_estimators':1000,
    'learning_rate':0.05,
    'gamma':0,
    'subsample':0.75,
    'max_depth':10,
}

def xgb_mape(preds, dtrain):
    labels = dtrain.get_label()
    return np.mean(np.abs((labels - preds) / (labels)))*100

for i,(train_ind, test_ind) in enumerate(KF.split(Ttrain)):
    print('========Fold',i+1)
    Xtrain, XCV, ytrain, yCV = Ttrain[train_ind], Ttrain[test_ind], Ttarget[train_ind], Ttarget[test_ind]
    
    xgb_model = xgb.XGBRegressor(**xgb_params)
    
    xgb_model.fit(Xtrain,ytrain,eval_set=[(XCV,yCV)],eval_metric=['rmse','mae'],early_stopping_rounds=30,verbose=False)
    
    pred = xgb_model.predict(XCV)
    
    print_score(pred,yCV)
    
    xgb_preds += xgb_model.predict(Ttest)/folds
    xgb_avg += mape(pred,yCV)/folds
    

print('\n\nAverage MAPE Score:',xgb_avg)

MSE: 80108094964035.22
MAE: 5244255.035318749
r2_score: 0.7732094370182959
MAPE: 69.94010294647025
MSE: 65674680596600.375
MAE: 4960651.468806431
r2_score: 0.8011037771782186
MAPE: 49.81067730235827
MSE: 59974523395227.4
MAE: 4846814.700314607
r2_score: 0.816449874021059
MAPE: 65.95278497117275
MSE: 73968587515062.56
MAE: 4960265.690842839
r2_score: 0.7736382839235586
MAPE: 48.80221256786112
MSE: 59668417100341.414
MAE: 4800819.801984641
r2_score: 0.832474061038252
MAPE: 50.37384405668909
MSE: 60369780632733.32
MAE: 4841396.923433528
r2_score: 0.814231857866573
MAPE: 52.46713678147906
MSE: 58502534759186.2
MAE: 4746717.1256943345
r2_score: 0.8187458949272984
MAPE: 57.898460961605394
MSE: 54785862574793.44
MAE: 4901045.46642841
r2_score: 0.8348431573297925
MAPE: 47.47350714726585
MSE: 59134331722357.16
MAE: 5030829.119106915
r2_score: 0.8290748985724584
MAPE: 71.97888178755068
MSE: 73249683834261.34
MAE: 4844160.201452209
r2_score: 0.777843039426273
MAPE: 56.81824585119395


Average MAP

# LGBM

In [18]:
lgb_params = {
    'n_estimators':1000,
    'num_leaves':512,
    'learning_rate':0.05,
    'max_depth':100,
    'boosting_type':'gbdt',
    'n_jobs':-1
}

for i,(train_ind, test_ind) in enumerate(KF.split(Ttrain)):
    print('========Fold',i+1)
    Xtrain, XCV, ytrain, yCV = Ttrain[train_ind], Ttrain[test_ind], Ttarget[train_ind], Ttarget[test_ind]
    
    lgb_model = lgb.LGBMRegressor(**lgb_params)
    
    lgb_model.fit(Xtrain,ytrain,eval_set=[(XCV,yCV)],eval_metric='mape',early_stopping_rounds=30,verbose=False)
    
    pred = lgb_model.predict(XCV)
    
    print_score(pred,yCV)
    
    lgb_preds += lgb_model.predict(Ttest)/folds
    lgb_avg += mape(pred,yCV)/folds

print('\n\nAverage MAPE Score:',lgb_avg)

MSE: 86051560307609.6
MAE: 5478309.366375023
r2_score: 0.756645657886337
MAPE: 67.38915964178315
MSE: 73374505149578.39
MAE: 5259503.676043381
r2_score: 0.7746448820871941
MAPE: 40.61685913119562
MSE: 67653095231970.13
MAE: 5117039.980933403
r2_score: 0.7947746332027354
MAPE: 39.79381310609464
MSE: 81477349122518.33
MAE: 5259168.257461325
r2_score: 0.7469326038722635
MAPE: 47.98799621232065
MSE: 62118991869197.98
MAE: 4962203.700013014
r2_score: 0.8231158726645014
MAPE: 49.71989494838952
MSE: 64022038665899.47
MAE: 5019156.114829858
r2_score: 0.8051785007994121
MAPE: 49.45497114735147
MSE: 62946572778428.12
MAE: 4967715.439755333
r2_score: 0.8037380415810579
MAPE: 43.65349266106319
MSE: 61529399389368.06
MAE: 5209570.351945942
r2_score: 0.8096825898418936
MAPE: 36.43568074437462
MSE: 62215412124482.91
MAE: 5266145.542579846
r2_score: 0.8179231063458294
MAPE: 50.20616974615598
MSE: 79504037379901.62
MAE: 5092143.779152997
r2_score: 0.7577685388799371
MAPE: 44.562279578543986


Average M

# Ensemble

In [19]:
total_preds = (xgb_preds + lgb_preds + cat_preds)/3
total_preds.shape

(2716,)

In [28]:
total_preds = list(total_preds)

In [29]:
submission = pd.read_excel('../data/2020 빅콘테스트 데이터분석분야-챔피언리그_2020년 6월 판매실적예측데이터(평가데이터).xlsx', header=1)

In [38]:
count = 0
for idx in range(len(submission)):
    if submission.loc[idx,'상품군'] == '무형':
        submission.loc[idx,'취급액'] = 0
        continue
    submission.loc[idx,'취급액'] = total_preds[count]
    count+=1
    if count==2716:
        break

In [39]:
submission

Unnamed: 0,방송일시,노출(분),마더코드,상품코드,상품명,상품군,판매단가,취급액
0,2020-06-01 06:20:00,20.000000,100650,201971,잭필드 남성 반팔셔츠 4종,의류,59800,8.811414e+06
1,2020-06-01 06:40:00,20.000000,100650,201971,잭필드 남성 반팔셔츠 4종,의류,59800,1.322434e+07
2,2020-06-01 07:00:00,20.000000,100650,201971,잭필드 남성 반팔셔츠 4종,의류,59800,2.580127e+07
3,2020-06-01 07:20:00,20.000000,100445,202278,쿠미투니카 쿨 레이시 란쥬쉐이퍼&팬티,속옷,69900,1.867226e+07
4,2020-06-01 07:40:00,20.000000,100445,202278,쿠미투니카 쿨 레이시 란쥬쉐이퍼&팬티,속옷,69900,3.108103e+07
...,...,...,...,...,...,...,...,...
2886,2020-07-01 00:20:00,20.000000,100660,201989,쉴렉스 안마의자 렌탈서비스,무형,0,0.000000e+00
2887,2020-07-01 00:40:00,20.000000,100660,201989,쉴렉스 안마의자 렌탈서비스,무형,0,0.000000e+00
2888,2020-07-01 01:00:00,20.000000,100660,201989,쉴렉스 안마의자 렌탈서비스,무형,0,0.000000e+00
2889,2020-07-01 01:20:00,20.000000,100261,200875,아놀드파마 티셔츠레깅스세트,의류,69900,9.985345e+06


In [40]:
submission.to_excel('../data/final_submission.xlsx',index=False)