In [1]:
import os

# Move up to the parent directory so the relative `dataset/...` paths below resolve
# (and, presumably, the project-local imports as well).
# Guarded on the dataset file being reachable: re-running this cell, or starting the
# kernel from the parent directory already, must not climb a further level.
if not os.path.exists("dataset/original/train.csv"):
    os.chdir('..')

import pandas as pd
import numpy as np
from tqdm import tqdm
from algorithms.Model_LightGBM import LightGBM
from algorithms.Model_Generator import Generator

from preprocessing.preprocessing import preprocessing
from metrics.MAPE import MAPE

from utils import add_all_features

# Raw competition files: labelled training rows and the unlabelled test features.
train = pd.read_csv("dataset/original/train.csv")
test = pd.read_csv("dataset/original/x_test.csv")

In [2]:
# Preview the first rows of the training frame (includes the `target` column).
train.head()

Unnamed: 0.1,Unnamed: 0,sku,pack,size (GM),brand,price,POS_exposed w-1,volume_on_promo w-1,sales w-1,scope,target
0,WE 10 December 2016,2689,SINGLE,395.41,BRAND1,1.16,,,,0,24175.0
1,WE 17 December 2016,2689,SINGLE,395.41,BRAND1,1.15,1.0,17.676112,24175.0,0,23521.0
2,WE 24 December 2016,2689,SINGLE,395.41,BRAND1,1.16,1.0,24.482803,23521.0,0,22075.0
3,WE 31 December 2016,2689,SINGLE,395.41,BRAND1,1.16,0.0,19.410646,22075.0,0,16492.0
4,WE 07 January 2017,2689,SINGLE,395.41,BRAND1,1.16,0.0,29.81203,16492.0,0,25971.0


In [3]:
# Preview the first rows of the test frame (same columns as train, without `target`).
test.head()

Unnamed: 0.1,Unnamed: 0,sku,pack,size (GM),brand,price,POS_exposed w-1,volume_on_promo w-1,sales w-1,scope
0,WE 29 June 2019,1027,SINGLE,114.23,BRAND2,0.56,10.0,24.386541,66337.0,1
1,WE 06 July 2019,1027,SINGLE,114.23,BRAND2,0.55,4.0,37.437842,49992.0,1
2,WE 13 July 2019,1027,SINGLE,114.23,BRAND2,0.45,0.0,28.039623,58062.0,1
3,WE 20 July 2019,1027,SINGLE,114.23,BRAND2,0.5,19.0,100.0,90908.0,1
4,WE 27 July 2019,1027,SINGLE,114.23,BRAND2,0.42,26.0,99.191839,108957.0,1


In [4]:
# ---- Run configuration -------------------------------------------------------
useTest = True           # forwarded to preprocessing(...) and Generator(...)
useScope = True          # forwarded to Generator(...) as isScope
isEvaluation = False     # forwarded to Generator(...) as evaluation
useSampleWeights = True  # NOTE(review): not read by any cell visible here
weights_type = 2         # forwarded to Generator(...) as sample_weights_type
save = False             # forwarded to run_generator(...)

# Evaluation mode overrides the two flags above.
if isEvaluation:
    useTest = useScope = False

In [None]:
# Build the modelling frame: preprocess train/test together, append the engineered
# features, then order the rows by date.
df = preprocessing(train, test, useTest=useTest)
df, extra_categorical_f = add_all_features(df)
df = df.sort_values('Date')

#   --------------- Model -----------------

# Columns handed to the Generator as `drop_columns`.
# Optional extras that were toggled here before: 'year', 'week_of_the_year', 'brand', 'month'
drop_cols = ['scope', 'Date', 'real_target', 'pack', 'size (GM)', 'cluster']

# Base categorical columns plus the ones reported by add_all_features,
# keeping only those that are not dropped.
categorical_f = [
    col
    for col in ['sku', 'pack', 'brand'] + extra_categorical_f
    if col not in drop_cols
]

# NOTE(review): CLUSTER is defined but the Generator below is built with cluster=None,
# so this value is not used by this cell — confirm which one is intended.
CLUSTER = [1,2]      # Set CLUSTER = None if you want NOT to consider any cluster
NAME = 'lgb_no_cluster_1'

model = LightGBM()
generator_options = dict(
    categorical_features=categorical_f,
    drop_columns=drop_cols,
    isScope=useScope,
    sample_weights_type=weights_type,
    evaluation=isEvaluation,
    useTest=useTest,
    cluster=None,
    name=NAME,
)
model_gen = Generator(df, model, **generator_options)

prediction = model_gen.run_generator(save)

model_gen.compute_MAPE()

6019it [00:00, 13471.11it/s]
6019it [00:00, 71850.24it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 43/43 [00:00<00:00, 1346.18it/s]
New categorical_feature is ['brand', 'sku']
11it [00:12,  1.10s/it]

# Linear Regression

In [None]:
from utils import dfs_gen
from sklearn.linear_model import LinearRegression

# First date of the forecast period; rows on or after it are predicted step by step.
TEST_START_DATE = '2019-06-29'
# Only the first 24 per-SKU predictions are scored.
# NOTE(review): presumably the number of test steps with a known target — confirm.
MAX_SCORED_STEPS = 24

tot_mape = 0   # running sum of the per-SKU MAPE values
date = []      # date of each predicted row
sku = []       # SKU of each predicted row
pred = []      # linear-regression prediction of each predicted row

# Fit one linear model per (SKU, test date) pair yielded by dfs_gen and collect
# the prediction for the first row of each test split.
for s in set(prediction.sku):
    df_sku = df[df.sku == s]
    df_sku_test = df_sku[df_sku.Date >= TEST_START_DATE]
    test_dates = df_sku_test.drop_duplicates('Date').Date

    lr = []
    for df_train, df_test in dfs_gen(df_sku, test_dates):
        # Keep features and target row-aligned: rows without a target are removed
        # from BOTH, instead of only from y (which would make fit() fail on a
        # length mismatch as soon as one target is missing).
        has_target = df_train.real_target.notna()
        X = df_train.loc[has_target].drop(drop_cols, axis=1).fillna(0)
        y = df_train.loc[has_target, 'real_target']
        reg = LinearRegression().fit(X, y)

        # Predict once and reuse the value for both the global and per-SKU lists.
        X_test = df_test.drop(drop_cols, axis=1).fillna(0)
        y_hat = reg.predict(X_test)[0]

        date.append(list(df_test.Date)[0])
        sku.append(s)
        pred.append(float(y_hat))
        lr.append(y_hat)

    tmp_mape = MAPE(df_sku_test['real_target'].dropna(), lr[:MAX_SCORED_STEPS])
    tot_mape += tmp_mape
    print(f"{s}: {tmp_mape}")

In [None]:
# Collect the per-row linear-regression outputs gathered above into one frame.
preds_lr = pd.DataFrame({
    'Date': date,
    'sku': sku,
    'pred_linear_regression': pred,
})
preds_lr.head()

In [None]:
# Left-join the linear-regression predictions onto the LightGBM predictions.
# No `on=` is given, so pandas joins on the columns the two frames have in common.
df_ens = prediction.merge(preds_lr, how='left')
lr_column = 'pred_linear_regression'
df_ens[lr_column] = df_ens[lr_column].astype(float)
df_ens.head()

In [None]:
# MAPE of the LightGBM predictions, computed on the rows of df_ens without any NaN.
complete_rows = df_ens.dropna()
lgb_mape = MAPE(complete_rows.real_target, complete_rows.prediction_lgb_no_cluster_1)
print('\033[1m' + "LIGHTGBM MAPE:" + '\033[0m' + f"{lgb_mape}")


In [None]:
# MAPE of the linear-regression predictions, computed on the rows of df_ens without any NaN.
complete_rows = df_ens.dropna()
lr_mape = MAPE(complete_rows.real_target, complete_rows.pred_linear_regression)
print('\033[1m' + "LINEAR REGRESSION MAPE:" + '\033[0m' + f"{lr_mape}")

In [None]:
# Blend weights: `a` scales the linear-regression prediction, `b` the LightGBM one.
a, b = .8, .2
df_ens['ensemble'] = (
    df_ens.pred_linear_regression * a
    + df_ens.prediction_lgb_no_cluster_1 * b
)

# Score the blend on the rows of df_ens without any NaN.
complete_rows = df_ens.dropna()
ensemble_mape = MAPE(complete_rows.real_target, complete_rows.ensemble)
print('\033[1m' + "ENSEMBLE MAPE:" + '\033[0m' + f"{ensemble_mape}")