In [1]:
import os

import numpy as np
import pandas as pd
from tqdm import tqdm

# The notebook lives one level below the project root; the project packages
# and the `dataset/` folder are resolved relative to the working directory.
# Only move up when `dataset/` is not already visible, so that re-running this
# cell does not climb one more directory each time.
# NOTE(review): assumes `dataset/` exists only at the project root - confirm.
if not os.path.isdir('dataset'):
    os.chdir('..')

from algorithms.Model_LightGBM import LightGBM
from algorithms.Model_Generator import Generator

from preprocessing.preprocessing import preprocessing
from metrics.MAPE import MAPE

from utils import add_all_features

# Raw competition files, read relative to the project root.
train = pd.read_csv("dataset/original/train.csv")
test = pd.read_csv("dataset/original/x_test.csv")

In [2]:
# Preview the first rows of the raw training frame (includes the `target` column).
train.head()

Unnamed: 0.1,Unnamed: 0,sku,pack,size (GM),brand,price,POS_exposed w-1,volume_on_promo w-1,sales w-1,scope,target
0,WE 10 December 2016,2689,SINGLE,395.41,BRAND1,1.16,,,,0,24175.0
1,WE 17 December 2016,2689,SINGLE,395.41,BRAND1,1.15,1.0,17.676112,24175.0,0,23521.0
2,WE 24 December 2016,2689,SINGLE,395.41,BRAND1,1.16,1.0,24.482803,23521.0,0,22075.0
3,WE 31 December 2016,2689,SINGLE,395.41,BRAND1,1.16,0.0,19.410646,22075.0,0,16492.0
4,WE 07 January 2017,2689,SINGLE,395.41,BRAND1,1.16,0.0,29.81203,16492.0,0,25971.0


In [3]:
# Preview the first rows of the raw test frame (same columns as train, minus `target`).
test.head()

Unnamed: 0.1,Unnamed: 0,sku,pack,size (GM),brand,price,POS_exposed w-1,volume_on_promo w-1,sales w-1,scope
0,WE 29 June 2019,1027,SINGLE,114.23,BRAND2,0.56,10.0,24.386541,66337.0,1
1,WE 06 July 2019,1027,SINGLE,114.23,BRAND2,0.55,4.0,37.437842,49992.0,1
2,WE 13 July 2019,1027,SINGLE,114.23,BRAND2,0.45,0.0,28.039623,58062.0,1
3,WE 20 July 2019,1027,SINGLE,114.23,BRAND2,0.5,19.0,100.0,90908.0,1
4,WE 27 July 2019,1027,SINGLE,114.23,BRAND2,0.42,26.0,99.191839,108957.0,1


In [4]:
# Run configuration for the preprocessing / model-generator cells below.
useTest = True
useScope = True
isEvaluation = False
useSampleWeights = True
weights_type = 2
save = False

# Evaluation mode switches off both the test set and the scope filter.
if isEvaluation:
    useTest = useScope = False

In [5]:
# Build the modelling frame from the raw train/test files and add the
# engineered features; `add_all_features` also returns the names of the
# categorical columns it created.
df = preprocessing(train, test, useTest=useTest)

df, categorical_f = add_all_features(df)
categorical_f = ['sku', 'pack', 'brand'] + categorical_f

# Keep rows in chronological order for the date-wise generator below.
df = df.sort_values('Date')

#   --------------- Model -----------------

# Columns passed to the generator as `drop_columns`.
drop_cols = ['scope', 'Date', 'real_target', 'pack', 'size (GM)', 'cluster',
#              'year','week_of_the_year','brand','month'
            ]
# A dropped column cannot be declared categorical.
categorical_f = [x for x in categorical_f if x not in drop_cols]

# Cluster selection forwarded to the generator. Previously this constant was
# set to [1, 2] but the call below hard-coded `cluster=None`, so the knob had
# no effect. It is now wired through and defaults to None, which keeps the
# behaviour of the run shown here (and matches NAME).
CLUSTER = None       # Set CLUSTER = None if you want NOT to consider any cluster (e.g. [1, 2] otherwise)
NAME = 'lgb_no_cluster_1'

model = LightGBM()
model_gen = Generator(df, model,
                        categorical_features=categorical_f,
                        drop_columns=drop_cols,
                        isScope=useScope,
                        sample_weights_type=weights_type,
                        evaluation=isEvaluation,
                        useTest=useTest,
                        cluster=CLUSTER,
                        name=NAME)

# `prediction` is reused by the ensembling cells further down.
prediction = model_gen.run_generator(save)

model_gen.compute_MAPE()

6019it [00:00, 14471.23it/s]
6019it [00:00, 74489.66it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 43/43 [00:00<00:00, 1389.60it/s]
New categorical_feature is ['brand', 'sku']
25it [00:25,  1.03s/it]

Index(['Date', 'sku', 'target', 'real_target',
       'log_prediction_lgb_no_cluster_1', 'prediction_lgb_no_cluster_1'],
      dtype='object')
Standard MAPE = 8.658111203247575





# Linear Regression

In [6]:
from utils import dfs_gen
from sklearn.linear_model import LinearRegression

# Per-SKU linear regression: for every (train, test) pair yielded by
# `dfs_gen`, fit on the train part and predict the first row of the test part.
# `date`, `sku` and `pred` collect one entry per prediction and are turned
# into a DataFrame in the next cell.

# Columns that are not model inputs (identifiers and both target columns).
non_feature_cols = drop_cols + ['target', 'sku']

tot_mape = 0
date = []
sku = []
pred = []
for s in list(set(prediction.sku)):
    df_sku = df[df.sku == s]
    test_rows = df_sku[df_sku.Date >= '2019-06-29']
    test_dates = test_rows.drop_duplicates('Date').Date
    lr = []
    for df_train, df_test in dfs_gen(df_sku, test_dates):
        # Drop unlabelled rows from X and y together; filtering them
        # separately can leave the two with different lengths.
        labelled = df_train.dropna(subset=['real_target'])
        X = labelled.drop(non_feature_cols, axis=1).fillna(0)
        y = labelled.real_target
        reg = LinearRegression().fit(X, y)

        # Predict once and reuse the value for both accumulators.
        X_next = df_test.drop(non_feature_cols, axis=1).fillna(0)
        forecast = reg.predict(X_next)[0]

        date.append(list(df_test.Date)[0])
        sku.append(s)
        pred.append(float(forecast))
        lr.append(forecast)

    # Score only the test dates that have a known target (24 in the run shown
    # here). NOTE(review): like the original `lr[:24]`, this assumes the
    # unlabelled dates are the last ones - confirm.
    actual = test_rows['real_target'].dropna()
    tmp_mape = MAPE(actual, lr[:len(actual)])
    tot_mape += tmp_mape
    print(f"{s}: {tmp_mape}")

546: 20.28145929488786
1027: 21.824647227944485
1058: 16.66231048479628
549: 18.803192095840895
1065: 10.205832609757405
554: 9.878990915397127
1035: 18.15535088220804
686: 37.23712152675513
144: 43.31507748450789
688: 14.63137799072224
1206: 14.65073324085718
1051: 25.675719469530083


In [7]:
# Collect the per-(Date, sku) linear-regression predictions in one frame.
preds_lr = pd.DataFrame({
    'Date': date,
    'sku': sku,
    'pred_linear_regression': pred,
})
preds_lr.head()

Unnamed: 0,Date,sku,pred_linear_regression
0,2019-06-29,546,65464.221979
1,2019-07-06,546,83863.717336
2,2019-07-13,546,126157.56741
3,2019-07-20,546,160809.786634
4,2019-07-27,546,123505.347475


In [8]:
# Attach the linear-regression predictions to the LightGBM predictions.
# The join keys are spelled out so the merge cannot silently start joining on
# any other column the two frames happen to share.
df_ens = prediction.merge(preds_lr, on=['Date', 'sku'], how='left')
df_ens['pred_linear_regression'] = df_ens['pred_linear_regression'].astype(float)
df_ens.head()

Unnamed: 0,Date,sku,target,real_target,log_prediction_lgb_no_cluster_1,prediction_lgb_no_cluster_1,pred_linear_regression
0,2019-06-29,144,9.694555,16228.0,9.651065,15537.326992,38592.924306
1,2019-06-29,546,11.025295,61407.0,11.144993,69215.377494,65464.221979
2,2019-06-29,549,10.427565,33777.0,10.58459,39520.086357,37277.180659
3,2019-06-29,554,11.717286,122673.0,11.791167,132079.522759,140701.251675
4,2019-06-29,686,9.960718,21177.0,9.776227,17609.086558,47368.718446


In [9]:
# Score LightGBM on the rows where every column is available.
scored = df_ens.dropna()
lgb_mape = MAPE(scored.real_target, scored.prediction_lgb_no_cluster_1)
print(f"\033[1mLIGHTGBM MAPE:\033[0m{lgb_mape}")


[1mLIGHTGBM MAPE:[0m8.658111203247575


In [10]:
# Score the linear regression on the rows where every column is available.
scored = df_ens.dropna()
lr_mape = MAPE(scored.real_target, scored.pred_linear_regression)
print(f"\033[1mLINEAR REGRESSION MAPE:\033[0m{lr_mape}")

[1mLINEAR REGRESSION MAPE:[0m20.94348443526705


In [29]:
# Blend weights: `a` for the linear regression, `b` for LightGBM.
a = .1
b = .9
df_ens['ensemble'] = (
    a*df_ens.pred_linear_regression
    + b*df_ens.prediction_lgb_no_cluster_1
)

# Evaluate the blend on the fully populated rows only.
scored = df_ens.dropna()
ens_mape = MAPE(scored.real_target, scored.ensemble)
print(f"\033[1mENSEMBLE MAPE:\033[0m{ens_mape}")

[1mENSEMBLE MAPE:[0m7.763895006761777


# ARIMA

In [59]:
# NOTE(review): `statsmodels.tsa.arima_model` was deprecated in statsmodels
# 0.12 and removed in 0.13. Either pin statsmodels < 0.13 or migrate to
# `statsmodels.tsa.arima.model.ARIMA`, whose fit()/forecast() signatures differ
# and whose estimates may differ slightly from the numbers reported below.
from statsmodels.tsa.arima_model import ARIMA

# Per-SKU ARIMA(1, 1, 1): for every (train, test) pair yielded by `dfs_gen`,
# fit on the training targets and forecast one step ahead.
# `date`, `sku` and `pred` are reset here and consumed by the next cell.
tot_mape = 0
date = []
sku = []
pred = []
for s in list(set(prediction.sku)):
    df_sku = df[df.sku == s]
    test_rows = df_sku[df_sku.Date >= '2019-06-29']
    test_dates = test_rows.drop_duplicates('Date').Date
    arima_fc = []
    for df_train, df_test in dfs_gen(df_sku, test_dates):
        # Use a dedicated name: the original rebound the global `model`,
        # which holds the LightGBM wrapper created earlier.
        arima = ARIMA(df_train.real_target, order=(1, 1, 1))
        fitted = arima.fit(disp=-1)
        # One-step forecast; the standard error and the 95% confidence
        # interval returned alongside it are not used.
        fc, _, _ = fitted.forecast(1, alpha=0.05)
        date.append(list(df_test.Date)[0])
        sku.append(s)
        pred.append(fc[0])
        arima_fc.append(fc[0])

    # Score only the test dates that have a known target (24 in the run shown
    # here). NOTE(review): like the original `[:24]` slice, this assumes the
    # unlabelled dates are the last ones - confirm.
    actual = test_rows['real_target'].dropna()
    tmp_mape = MAPE(actual, arima_fc[:len(actual)])
    tot_mape += tmp_mape
    print(f"{s}: {tmp_mape}")

546: 18.123661004919597
1027: 19.700399799487
1058: 18.26791818527407
549: 22.404726130483528
1065: 16.344559878272015
554: 14.70656070642605
1035: 21.69367083453861


  warn("Maximum Likelihood optimization failed to converge. "


686: 22.671956605264633
144: 24.134782392981183


  warn("Maximum Likelihood optimization failed to converge. "


688: 17.51576256467541
1206: 16.025944655404693
1051: 22.846641707578538


In [60]:
# Collect the per-(Date, sku) ARIMA forecasts in one frame.
preds_AR = pd.DataFrame({
    'Date': date,
    'sku': sku,
    'pred_arima': pred,
})
preds_AR.head()

Unnamed: 0,Date,sku,pred_arima
0,2019-06-29,546,63702.375926
1,2019-07-06,546,59120.868624
2,2019-07-13,546,82707.618556
3,2019-07-20,546,143454.492894
4,2019-07-27,546,147948.413415


In [61]:
# Attach the ARIMA forecasts to the ensemble frame.
# The merge is keyed explicitly on (Date, sku), and any `pred_arima` column
# left from a previous run of this cell is dropped first. With implicit keys a
# re-run would also join on the float `pred_arima` values.
df_ens = (
    df_ens
    .drop(columns=['pred_arima'], errors='ignore')
    .merge(preds_AR, on=['Date', 'sku'], how='left')
)
df_ens.head()

Unnamed: 0,Date,sku,target,real_target,log_prediction_lgb_no_cluster_1,prediction_lgb_no_cluster_1,pred_linear_regression,ensemble,pred_arima
0,2019-06-29,144,9.694555,16228.0,9.651065,15537.326992,38592.924306,17301.585382,11707.875336
1,2019-06-29,546,11.025295,61407.0,11.144993,69215.377494,65464.221979,67493.472318,63702.375926
2,2019-06-29,549,10.427565,33777.0,10.58459,39520.086357,37277.180659,38527.826873,37564.084818
3,2019-06-29,554,11.717286,122673.0,11.791167,132079.522759,140701.251675,130213.901166,132601.011086
4,2019-06-29,686,9.960718,21177.0,9.776227,17609.086558,47368.718446,19935.273209,15119.18028


In [63]:
# Write the merged predictions to `preds.csv` (path relative to the current
# working directory), without the index column.
# NOTE(review): the execution counts show this cell ran as In [63], i.e. after
# the cell below (In [62]) that overwrites `ensemble` with the three-model
# blend. Run top to bottom, the file would contain the two-model `ensemble`
# instead - move this cell after the final ensemble cell.
# NOTE(review): the `save` flag from the configuration cell is not consulted here.
df_ens.to_csv('preds.csv', index=False)

In [62]:
# Blend weights: `a` linear regression, `b` LightGBM, `c` ARIMA.
a = .09
b = .89
# `c` was never assigned anywhere in the notebook, so this cell only ran
# because of leftover kernel state and fails with NameError on a fresh kernel.
# NOTE(review): 0.02 is assumed because it makes the weights sum to 1 -
# confirm it reproduces the reported ensemble MAPE (7.4457).
c = .02
df_ens['ensemble'] = (
    a*df_ens.pred_linear_regression
    + b*df_ens.prediction_lgb_no_cluster_1
    + c*df_ens.pred_arima
)

# Compare every model on the same fully populated rows.
scored = df_ens.dropna()
for label, column in [
    ("LIGHTGBM", 'prediction_lgb_no_cluster_1'),
    ("LINEAR REGRESSION", 'pred_linear_regression'),
    ("ARIMA", 'pred_arima'),
    ("ENSEMBLE", 'ensemble'),
]:
    print('\033[1m' + f"{label} MAPE: " + '\033[0m'
          + f"{MAPE(scored.real_target, scored[column])}")

[1mLIGHTGBM MAPE: [0m8.658111203247575
[1mLINEAR REGRESSION MAPE: [0m20.94348443526705
[1mARIMA MAPE: [0m19.536382038775443
[1mENSEMBLE MAPE: [0m7.445710821901784
