In [1]:
import os
os.chdir('..')
import pandas as pd
import numpy as np
from tqdm import tqdm
from algorithms.Model_LightGBM import LightGBM
from algorithms.Model_Generator import Generator

from preprocessing.preprocessing import preprocessing
from metrics.MAPE import MAPE

from utils import add_all_features

train = pd.read_csv("dataset/original/train.csv")
test = pd.read_csv("dataset/original/x_test.csv")

In [2]:
train.head()

Unnamed: 0.1,Unnamed: 0,sku,pack,size (GM),brand,price,POS_exposed w-1,volume_on_promo w-1,sales w-1,scope,target
0,WE 10 December 2016,2689,SINGLE,395.41,BRAND1,1.16,,,,0,24175.0
1,WE 17 December 2016,2689,SINGLE,395.41,BRAND1,1.15,1.0,17.676112,24175.0,0,23521.0
2,WE 24 December 2016,2689,SINGLE,395.41,BRAND1,1.16,1.0,24.482803,23521.0,0,22075.0
3,WE 31 December 2016,2689,SINGLE,395.41,BRAND1,1.16,0.0,19.410646,22075.0,0,16492.0
4,WE 07 January 2017,2689,SINGLE,395.41,BRAND1,1.16,0.0,29.81203,16492.0,0,25971.0


In [3]:
test.head()

Unnamed: 0.1,Unnamed: 0,sku,pack,size (GM),brand,price,POS_exposed w-1,volume_on_promo w-1,sales w-1,scope
0,WE 29 June 2019,1027,SINGLE,114.23,BRAND2,0.56,10.0,24.386541,66337.0,1
1,WE 06 July 2019,1027,SINGLE,114.23,BRAND2,0.55,4.0,37.437842,49992.0,1
2,WE 13 July 2019,1027,SINGLE,114.23,BRAND2,0.45,0.0,28.039623,58062.0,1
3,WE 20 July 2019,1027,SINGLE,114.23,BRAND2,0.5,19.0,100.0,90908.0,1
4,WE 27 July 2019,1027,SINGLE,114.23,BRAND2,0.42,26.0,99.191839,108957.0,1


In [4]:
useTest = True
useScope = True
isEvaluation = False
useSampleWeights, weights_type = True, 2
save = False

if isEvaluation:
    useTest = False
    useScope = False

In [5]:
df = preprocessing(train, test, useTest=useTest)

df, categorical_f = add_all_features(df)
categorical_f = ['sku', 'pack', 'brand'] + categorical_f

df = df.sort_values('Date')

#   --------------- Model -----------------

drop_cols = ['scope', 'Date', 'real_target', 'pack', 'size (GM)', 'cluster',
#              'year','week_of_the_year','brand','month'
            ]
categorical_f = [x for x in categorical_f if x not in drop_cols]

CLUSTER = [1,2]      # Set CLUSTER = None if you want NOT to consider any cluster
NAME = 'lgb_no_cluster_1'

model = LightGBM()
model_gen = Generator(df, model,
                        categorical_features=categorical_f,
                        drop_columns=drop_cols,
                        isScope=useScope,
                        sample_weights_type=weights_type,
                        evaluation=isEvaluation,
                        useTest=useTest,
                        cluster=None,
                        name=NAME)

prediction = model_gen.run_generator(save)

model_gen.compute_MAPE()

6019it [00:00, 14200.14it/s]
6019it [00:00, 73575.06it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 43/43 [00:00<00:00, 1346.96it/s]
New categorical_feature is ['brand', 'sku']
25it [00:25,  1.02s/it]

Index(['Date', 'sku', 'target', 'real_target',
       'log_prediction_lgb_no_cluster_1', 'prediction_lgb_no_cluster_1'],
      dtype='object')
Standard MAPE = 8.658111203247575





# Linear Regression

In [6]:
from utils import dfs_gen
from sklearn.linear_model import LinearRegression
tot_mape=0
date = [] 
sku = []
pred = []
for s in list(set(prediction.sku)):
    current_sku = s
    df_sku = df[df.sku == current_sku]
    test_dates = df_sku[df_sku.Date >= '2019-06-29']
    test_dates = test_dates.drop_duplicates('Date').Date
    generator = dfs_gen(df_sku, test_dates)
    lr = []
    for df_train, df_test in generator:
        X = df_train.drop(drop_cols+['target','sku'], axis=1).fillna(0)
        y = df_train.real_target.dropna()
        reg = LinearRegression().fit(X, y)
        date.append(list(df_test.Date)[0])
        sku.append(s)
        pred.append(float(reg.predict(df_test.drop(drop_cols+['target','sku'], axis=1).fillna(0))[0]))
        lr.append(reg.predict(df_test.drop(drop_cols+['target','sku'], axis=1).fillna(0))[0])
    tmp_mape = MAPE(df_sku[df_sku.Date >= '2019-06-29']['real_target'].dropna(), lr[:24])
    tot_mape+=tmp_mape
    print(f"{s}: {tmp_mape}")

546: 20.28145929488786
1027: 21.824647227944485
1058: 16.66231048479628
549: 18.803192095840895
1065: 10.205832609757405
554: 9.878990915397127
1035: 18.15535088220804
686: 37.23712152675513
144: 43.31507748450789
688: 14.63137799072224
1206: 14.65073324085718
1051: 25.675719469530083


In [7]:
from utils import dfs_gen
from sklearn.linear_model import LinearRegression
import numpy as np
tot_mape=0
date = [] 
sku = []
pred = []
for s in list(set(prediction.sku)):
    current_sku = s
    df_sku = df[df.sku == current_sku]
    test_dates = df_sku[df_sku.Date >= '2019-06-29']
    test_dates = test_dates.drop_duplicates('Date').Date
    generator = dfs_gen(df_sku, test_dates)
    lr = []
    for df_train, df_test in generator:
        X = df_train.drop(drop_cols+['target','sku'], axis=1).fillna(0)
        y = df_train.target.dropna()
        reg = LinearRegression().fit(X, y)
        date.append(list(df_test.Date)[0])
        sku.append(s)
        p = int(np.expm1(reg.predict(df_test.drop(drop_cols+['target','sku'], axis=1).fillna(0))[0]))
        pred.append(p)
        lr.append(p)
    tmp_mape = MAPE(df_sku[df_sku.Date >= '2019-06-29']['real_target'].dropna(), lr[:24])
    tot_mape+=tmp_mape
    print(f"{s}: {tmp_mape}")
print("=====================")
print(f'tot mape = {tot_mape/12}')

546: 5.754520455578618
1027: 7.085369991199207
1058: 6.4740100903294255
549: 6.318900252589818
1065: 3.9855990673228523
554: 3.7326927622325345
1035: 6.440088245729246
686: 11.865192836190865
144: 11.632010905202469
688: 6.850704565263428
1206: 5.048049336195392
1051: 9.943215423777653
tot mape = 7.094196160967626


In [8]:
from utils import dfs_gen
from sklearn.linear_model import LinearRegression
tot_mape=0
date = [] 
sku = []
pred = []
df_lr = df.copy()
df_lr = pd.concat([df_lr, pd.get_dummies(df_lr.seasons, prefix = "season_")]).drop(['seasons'],axis=1)

for s in list(set(prediction.sku)):
    current_sku = s
    df_sku = df_lr[df_lr.sku == current_sku]
    test_dates = df_sku[df_sku.Date >= '2019-06-29']
    test_dates = test_dates.drop_duplicates('Date').Date
    generator = dfs_gen(df_sku, test_dates)
    error = 0
    lr = []
    for df_train, df_test in generator:
        X = df_train.drop(drop_cols+['target','sku','brand'], axis=1).fillna(0)
        y = df_train.target.dropna()
        reg = LinearRegression().fit(X, y)
        date.append(list(df_test.Date)[0])
        sku.append(s)
        p = np.expm1(reg.predict(df_test.drop(drop_cols+['target','sku','brand'], axis=1).fillna(0))[0])
#         print(f"model pre: {p}")
#         print(f"prev error: {error}")
        p = int(p + p*error/200)
        tar = list(df_sku[(df_sku.Date==list(df_test.Date)[0])]['real_target'])[0]
        pred.append(p)
#         print(f"new prediction: {p}")
#         print(f"true: {tar}")
        error = (tar-p)*100/tar
#         print(f"error: {error}")
#         print("=========")
        lr.append(p)
    tmp_mape = MAPE(df_sku[df_sku.Date >= '2019-06-29']['real_target'].dropna(), lr[:24])
    tot_mape+=tmp_mape
    print(f"{s}: {tmp_mape}")
print("=====================")
print(f'tot mape = {tot_mape/12}')

546: 5.701044628852281
1027: 5.953423717950823
1058: 5.878893072805778
549: 5.550557394766109
1065: 3.618579227649542
554: 3.4444186943748623
1035: 5.459331757986799
686: 8.189448624694965
144: 8.671162158657559
688: 6.451723061365031
1206: 4.68061663224037
1051: 7.91724235586518
tot mape = 5.9597034439341074


In [9]:
preds_lr = pd.DataFrame()
preds_lr['Date'] = date
preds_lr['sku'] = sku
preds_lr['pred_linear_regression'] = pred
preds_lr.head()

Unnamed: 0,Date,sku,pred_linear_regression
0,2019-06-29,546,61942
1,2019-07-06,546,75812
2,2019-07-13,546,123899
3,2019-07-20,546,152217
4,2019-07-27,546,128648


In [10]:
df_ens = pd.merge(prediction, preds_lr, how='left')
df_ens['pred_linear_regression'] = df_ens['pred_linear_regression'].astype(float) 
df_ens.head()

Unnamed: 0,Date,sku,target,real_target,log_prediction_lgb_no_cluster_1,prediction_lgb_no_cluster_1,pred_linear_regression
0,2019-06-29,144,9.694555,16228.0,9.651065,15537.326992,21209.0
1,2019-06-29,546,11.025295,61407.0,11.144993,69215.377494,61942.0
2,2019-06-29,549,10.427565,33777.0,10.58459,39520.086357,33814.0
3,2019-06-29,554,11.717286,122673.0,11.791167,132079.522759,124900.0
4,2019-06-29,686,9.960718,21177.0,9.776227,17609.086558,27030.0


In [11]:
print('\033[1m'+ "LIGHTGBM MAPE:" +'\033[0m' \
      + f"{MAPE(df_ens.dropna().real_target, df_ens.dropna().prediction_lgb_no_cluster_1)}")


[1mLIGHTGBM MAPE:[0m8.658111203247575


In [12]:
print('\033[1m'+ "LINEAR REGRESSION MAPE:" +'\033[0m' \
      + f"{MAPE(df_ens.dropna().real_target, df_ens.dropna().pred_linear_regression)}")

[1mLINEAR REGRESSION MAPE:[0m5.959703443934108


In [13]:
a = .7
b = .3
df_ens['ensemble'] = a*df_ens.pred_linear_regression+b*df_ens.prediction_lgb_no_cluster_1
print('\033[1m'+ "ENSEMBLE MAPE:" +'\033[0m' + f"{MAPE(df_ens.dropna().real_target, df_ens.dropna().ensemble)}")

[1mENSEMBLE MAPE:[0m4.704022029830501


In [14]:
df_ens.to_csv('preds.csv', index=False)