In [111]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import sys
sys.path.append('../')
from algorithms.Base_Model import BaseModel

In [112]:
class LinearRegressionClass(BaseModel):
    """Linear-regression baseline that fits one model per SKU.

    The class relies on attributes that are not assigned in this class
    (``X_train``, ``X_test``, ``y_train``, ``evaluation``, ``name``,
    ``drop_columns``); presumably they are set by ``BaseModel.create`` --
    confirm against ``algorithms/Base_Model.py``.
    """

    def __init__(self):
        super().__init__()

    def create(self, train, test, categorical_features=None, drop_columns=None, name='', isScope=True, sample_weights=None, evaluation=False):
        """Forward the configuration to ``BaseModel.create`` and attach a fresh estimator.

        All arguments are passed through to the base class unchanged, except
        that ``categorical_features`` and ``drop_columns`` default to a new
        empty list per call (``None`` is treated as "empty") so that no list
        object is shared between calls or instances.

        Returns:
            self, to allow call chaining.
        """
        # A shared mutable default would leak state between calls if the base
        # class (or anything else) ever mutated it in place.
        categorical_features = [] if categorical_features is None else categorical_features
        drop_columns = [] if drop_columns is None else drop_columns

        super().create(train=train, test=test, categorical_features=categorical_features, drop_columns=drop_columns,
                       name=name, isScope=isScope, sample_weights=sample_weights, evaluation=evaluation)

        self.model = LinearRegression()
        return self

    def fit(self):
        """Fit the estimator on the per-SKU slices prepared by ``run``."""
        self.model.fit(self.X_train_tmp, self.y_train_tmp)

    def predict(self):
        """Predict on ``self.X_test_tmp`` and return the id/target/prediction columns.

        Two columns are written into ``self.X_test_tmp``:
        ``'log_prediction_' + name`` (the raw model output) and
        ``'prediction_' + name`` (``np.expm1`` of the raw output).
        NOTE(review): the ``expm1`` suggests the target was ``log1p``-transformed
        upstream -- confirm in the preprocessing code.

        Returns:
            DataFrame with columns ``Date``, ``sku``, ``target``,
            ``real_target`` and the two prediction columns.
        """
        log_col = 'log_prediction_' + self.name
        pred_col = 'prediction_' + self.name

        features = self.X_test_tmp.drop(['target', 'sku'] + list(self.drop_columns), axis=1)
        # If predict() is called again on the same frame, the prediction columns
        # added by the previous call must not be fed back in as features.
        features = features.drop(columns=[log_col, pred_col], errors='ignore')

        self.X_test_tmp[log_col] = self.model.predict(features)
        self.X_test_tmp[pred_col] = np.expm1(self.X_test_tmp[log_col])

        return self.X_test_tmp[['Date', 'sku', 'target', 'real_target', log_col, pred_col]]

    def plot_feature_importance(self):
        """Print the fitted coefficients (no plot is produced for this model)."""
        print(self.model.coef_)

    def run(self):
        """Fit and predict separately for every SKU present in the test frame.

        Missing values in ``X_train`` / ``X_test`` are replaced by 0 first.
        When ``self.evaluation`` is truthy nothing is fitted: a message is
        printed and an empty DataFrame is returned.

        Returns:
            The per-SKU outputs of ``predict`` concatenated into one DataFrame
            (empty if there was nothing to predict).
        """
        self.X_train = self.X_train.fillna(0)
        self.X_test = self.X_test.fillna(0)

        if self.evaluation:
            print('No Evaluation for Linear Regression')
            return pd.DataFrame()

        # Collect the pieces and concatenate once: concatenating inside the loop
        # re-copies everything accumulated so far on every iteration.
        per_sku_predictions = []
        for s in set(self.X_test.sku):
            mask_train = self.X_train.sku == s
            mask_test = self.X_test.sku == s
            self.X_train_tmp = self.X_train[mask_train].drop('sku', axis=1).copy()
            # Align the targets with the training rows of this SKU by index label.
            self.y_train_tmp = self.y_train.loc[self.X_train_tmp.index]
            self.X_test_tmp = self.X_test[mask_test].copy()
            self.fit()
            per_sku_predictions.append(self.predict())

        if not per_sku_predictions:
            return pd.DataFrame()
        return pd.concat(per_sku_predictions)

    def get_model(self):
        """Return the underlying estimator (the one fitted on the last SKU after ``run``)."""
        return self.model


In [113]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from algorithms.Model_Generator import Generator

import sys
#sys.path.append('.')

from preprocessing.preprocessing import preprocessing, convert_date
from metrics.MAPE import MAPE

from utils import add_all_features

# Read the two raw CSV files shipped in the original dataset folder.
train, test = (
    pd.read_csv("../dataset/original/train.csv"),
    pd.read_csv("../dataset/original/x_test.csv"),
)

In [114]:
# ---- Run configuration flags (consumed by the preprocessing and generator cells) ----
useTest, useScope = True, True
isEvaluation = False
completeCV = False
dataAugm = False
save = False
useSampleWeights = True
weights_type = 2

# Evaluation runs and complete-CV runs both switch off the test set and the scope flag.
if isEvaluation or completeCV:
    useTest = useScope = False

# Preprocess the raw frames, then let add_all_features extend the result;
# it returns the enriched frame together with a list of extra categorical columns.
df, categorical_f = add_all_features(
    preprocessing(train, test, useTest=useTest, dataAugmentation=dataAugm)
)

# These three columns are always treated as categorical and go first in the list.
categorical_f = ['sku', 'pack', 'brand'] + categorical_f

# Order the rows by the 'Date' column.
df = df.sort_values(by='Date')


0it [00:00, ?it/s][A
616it [00:00, 6157.54it/s][A
1187it [00:00, 6015.57it/s][A
1754it [00:00, 5904.25it/s][A
2365it [00:00, 5963.37it/s][A
2864it [00:00, 5631.92it/s][A
3329it [00:00, 5275.11it/s][A
3792it [00:00, 4878.06it/s][A
4381it [00:00, 5141.36it/s][A
4942it [00:00, 5271.57it/s][A
5454it [00:01, 4854.67it/s][A
6019it [00:01, 4890.74it/s][A

0it [00:00, ?it/s][A
3162it [00:00, 31618.97it/s][A
6019it [00:00, 26756.42it/s][A

  0%|          | 0/43 [00:00<?, ?it/s][A
100%|██████████| 43/43 [00:00<00:00, 331.69it/s][A


In [115]:
#   --------------- Model -----------------

# Label for this run, handed to the generator as `name`.
NAME = 'linear_reg'

# Cluster selection handed to the generator: a list such as [1,2,3],
# or None if you want NOT to consider any cluster.
CLUSTER = None

# Columns passed to the generator as `drop_columns`; any of them that is listed
# as categorical is removed from the categorical list as well.
drop_cols = ['scope', 'Date', 'real_target', 'pack', 'size (GM)', 'cluster']
categorical_f = list(filter(lambda feature: feature not in drop_cols, categorical_f))

model = LinearRegressionClass()
model_gen = Generator(
    df,
    model,
    name=NAME,
    cluster=CLUSTER,
    categorical_features=categorical_f,
    drop_columns=drop_cols,
    isScope=useScope,
    useTest=useTest,
    evaluation=isEvaluation,
    completeCV=completeCV,
    sample_weights_type=weights_type,
    dataAugmentation=dataAugm,
)

# The single positional argument is kept exactly as in the original call.
prediction = model_gen.run_generator(False)

# Optional diagnostics, currently disabled:
#print(model_gen.compute_MAPE())
#model_gen.plot_feature_importance()


0it [00:00, ?it/s][A
1it [00:00,  6.18it/s][A
2it [00:00,  6.27it/s][A
3it [00:00,  6.39it/s][A
4it [00:00,  6.42it/s][A
5it [00:00,  6.50it/s][A
6it [00:00,  6.49it/s][A
7it [00:01,  6.45it/s][A
8it [00:01,  6.39it/s][A
9it [00:01,  6.39it/s][A
10it [00:01,  6.41it/s][A
11it [00:01,  6.25it/s][A
12it [00:01,  6.07it/s][A
13it [00:02,  6.18it/s][A
14it [00:02,  6.28it/s][A
15it [00:02,  6.16it/s][A
16it [00:02,  5.97it/s][A
17it [00:02,  5.82it/s][A
18it [00:02,  5.71it/s][A
19it [00:03,  5.72it/s][A
20it [00:03,  5.73it/s][A
21it [00:03,  5.89it/s][A
22it [00:03,  5.98it/s][A
23it [00:03,  6.07it/s][A
24it [00:03,  5.92it/s][A
25it [00:04,  6.12it/s][A


In [122]:
# Show the rows whose real target is exactly zero.
prediction.loc[prediction['real_target'] == 0.0]

Unnamed: 0,Date,sku,target,real_target,log_prediction_linear_reg,prediction_linear_reg
315,2019-12-14,546,0.0,0.0,11.16109,70338.57355
1105,2019-12-14,1027,0.0,0.0,10.91955,55244.927711
1579,2019-12-14,1058,0.0,0.0,10.31732,30251.087468
473,2019-12-14,549,0.0,0.0,10.571782,39017.143092
1737,2019-12-14,1065,0.0,0.0,11.397453,89093.50861
631,2019-12-14,554,0.0,0.0,11.691284,119524.403902
1263,2019-12-14,1035,0.0,0.0,10.622325,41039.942721
789,2019-12-14,686,0.0,0.0,10.243523,28098.957523
157,2019-12-14,144,0.0,0.0,10.141482,25373.055402
947,2019-12-14,688,0.0,0.0,10.455362,34729.095999


In [117]:
from metrics.MAPE import MAPE

In [83]:
# MAPE of the linear-regression prediction, computed separately for each distinct date
# (dates are visited in order of first appearance in `prediction`).
mapes = {}
for day in prediction['Date'].drop_duplicates():
    day_rows = prediction.loc[prediction['Date'] == day]
    mapes[day] = MAPE(day_rows['real_target'], day_rows['prediction_linear_reg'])

In [84]:
# Display the per-date MAPE dictionary.
mapes

{Timestamp('2016-12-17 00:00:00'): 5.62601046401973,
 Timestamp('2016-12-24 00:00:00'): 4.123158516494981,
 Timestamp('2016-12-31 00:00:00'): 14.07147689404084,
 Timestamp('2017-01-07 00:00:00'): 66749.235305555,
 Timestamp('2017-01-14 00:00:00'): 5.792441245936549,
 Timestamp('2017-01-21 00:00:00'): 6.583473714793452,
 Timestamp('2017-01-28 00:00:00'): 3.9930042475310414,
 Timestamp('2017-02-04 00:00:00'): 2.8355709931961637,
 Timestamp('2017-02-11 00:00:00'): 8.278234532729547,
 Timestamp('2017-02-18 00:00:00'): 4.964193720406588,
 Timestamp('2017-02-25 00:00:00'): 3.780187342219423,
 Timestamp('2017-03-04 00:00:00'): 3.798165271265652,
 Timestamp('2017-03-11 00:00:00'): 4.304021856239639,
 Timestamp('2017-03-18 00:00:00'): 3.3538797806078895,
 Timestamp('2017-03-25 00:00:00'): 7.281167366905929,
 Timestamp('2017-04-01 00:00:00'): 8.973002917763234,
 Timestamp('2017-04-08 00:00:00'): 6.218221804421045,
 Timestamp('2017-04-15 00:00:00'): 6.165336291398189,
 Timestamp('2017-04-22 00:00

In [86]:
# (rows, columns) of the prediction frame.
prediction.shape

(5676, 6)

In [87]:
# Remove every row dated 2017-01-07 (the date whose MAPE stands out in the per-date table).
# A boolean filter is used instead of drop(<index labels>): dropping by label would also
# remove rows of other dates that happen to share an index label with a matched row.
mask = (prediction.Date == '2017-01-07')
prediction = prediction[~mask]

In [88]:
# Re-check (rows, columns) of the prediction frame; compare with the shape printed before the drop.
prediction.shape

(5633, 6)

In [118]:
# Overall MAPE on rounded predictions, leaving out the rows dated 2019-12-14.
scored_rows = prediction.loc[prediction['Date'] != '2019-12-14']
MAPE(scored_rows['real_target'], scored_rows['prediction_linear_reg'].round())

7.094275374070776

In [110]:
# Targets of one SKU in the window 2016-12-31 .. 2017-01-07 (both ends included).
s = 546
mask = prediction['Date'].between('2016-12-31', '2017-01-07') & (prediction['sku'] == s)
prediction.loc[mask, 'target']

Series([], Name: target, dtype: float64)

In [77]:
# Per-date MAPE of the linear-regression prediction, then display the resulting dictionary
# (dates are visited in order of first appearance in `prediction`).
mapes = {}
for day in prediction['Date'].drop_duplicates():
    day_rows = prediction.loc[prediction['Date'] == day]
    mapes[day] = MAPE(day_rows['real_target'], day_rows['prediction_linear_reg'])
mapes

{Timestamp('2016-12-17 00:00:00'): 5.62601046401973,
 Timestamp('2016-12-24 00:00:00'): 4.123158516494981,
 Timestamp('2016-12-31 00:00:00'): 14.07147689404084,
 Timestamp('2017-01-07 00:00:00'): 81983.34510083805,
 Timestamp('2017-01-14 00:00:00'): 5.792441245936549,
 Timestamp('2017-01-21 00:00:00'): 6.583473714793452,
 Timestamp('2017-01-28 00:00:00'): 3.9930042475310414,
 Timestamp('2017-02-04 00:00:00'): 2.8355709931961637,
 Timestamp('2017-02-11 00:00:00'): 8.278234532729547,
 Timestamp('2017-02-18 00:00:00'): 4.964193720406588,
 Timestamp('2017-02-25 00:00:00'): 3.780187342219423,
 Timestamp('2017-03-04 00:00:00'): 3.798165271265652,
 Timestamp('2017-03-11 00:00:00'): 4.304021856239639,
 Timestamp('2017-03-18 00:00:00'): 3.3538797806078895,
 Timestamp('2017-03-25 00:00:00'): 7.281167366905929,
 Timestamp('2017-04-01 00:00:00'): 8.973002917763234,
 Timestamp('2017-04-08 00:00:00'): 6.218221804421045,
 Timestamp('2017-04-15 00:00:00'): 6.165336291398189,
 Timestamp('2017-04-22 00: