In [1]:
import os
os.chdir('..')
import pandas as pd
import numpy as np
from tqdm import tqdm
from algorithms.Model_LightGBM import LightGBM
from algorithms.Model_Generator import Generator
from lightgbm import LGBMRegressor

from preprocessing.preprocessing import preprocessing
from metrics.MAPE import MAPE

from utils import add_all_features

train = pd.read_csv("dataset/original/train.csv")
test = pd.read_csv("dataset/original/x_test.csv")

In [2]:
useTest = True
useScope = True
isEvaluation = False
useSampleWeights, weights_type = True, 2
save = False

if isEvaluation:
    useTest = False
    useScope = False

In [3]:
def create_dict_of_series(df):
    series_dict = {}
    
    for sku in sorted(set(df.sku)):
        single_df = pd.DataFrame()
        for col in df.columns:
            single_df[str(col)] = df[df.sku == sku][str(col)]
                
        single_df = single_df.reset_index(drop = True)
        single_df = single_df.sort_values(['Date'])
        series_dict[sku] = single_df
    
    return series_dict

In [4]:
df = preprocessing(train, test, useTest=useTest)

df, categorical_f = add_all_features(df)
categorical_f = ['sku', 'pack', 'brand'] + categorical_f

df = df.sort_values('Date')

df_dict = create_dict_of_series(df[df.scope == 1])

6019it [00:01, 5010.40it/s]
6019it [00:00, 22732.44it/s]
100%|██████████| 43/43 [00:00<00:00, 522.35it/s]


In [33]:
#   --------------- Model -----------------

df = df.sort_values('Date')
test_dates = df[df.Date >= '2019-06-29']
test_dates = test_dates.drop_duplicates('Date').Date

real_values = []
predictions = []

for key in df_dict.keys():
    drop_cols = ['scope', 'Date', 'real_target', 'pack', 'size (GM)', 'cluster', 'sku'
    #            'year','week_of_the_year','brand','month'
                ]
    categorical_f = [x for x in categorical_f if x not in drop_cols]
    
    df_test = df_dict[key].tail(25)
    df_train = df_dict[key].drop(df_dict[key].tail(25).index)
    y_train = df_train['target']

    params = {
       # 'metric': 'huber',   # Se si cambia la metrica non si cambia l'ottimizzazione
       'verbose':-1,
       'boosting_type':'gbdt',
        'num_leaves':31,
        'max_depth':- 1,
        'learning_rate':0.1,
       'n_estimators':600,
       'min_split_gain':0.0,
       'subsample':1.0,
       'subsample_freq':0,
       'colsample_bytree':1.0,
       'reg_alpha':0.0,
       'reg_lambda':0.0,
       'random_state':None,
       'silent':True,
       'importance_type':'split',
        'tree_learner':'feature',
    }
    
    model = LGBMRegressor(**params)
    fitted = model.fit(df_train.drop(columns=['target'] + drop_cols), y_train, categorical_feature=categorical_f)
    
    df_test['log_predictions'] = fitted.predict(df_test.drop(['target'] + drop_cols, axis=1))
    #df_test['predictions'] = np.expm1(df_test['log_predictions'])
    
    real_values.extend(df_test['sales w-1'][1:]) 
    predictions.extend(df_test['log_predictions'][:-1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [34]:
print(MAPE(real_values, predictions))

1.5932146853477585


In [56]:
df = df.sort_values(['Date', 'sku'])
test_dates = df[df.Date >= '2019-06-29']
test_dates = test_dates.drop_duplicates('Date').Date

real_values = []
predictions = []

drop_cols = ['scope', 'Date', 'real_target', 'pack', 'size (GM)', 'cluster',
#             'year','week_of_the_year','brand','month'
            ]
categorical_f = [x for x in categorical_f if x not in drop_cols]

df_test = df[df.Date >= '2019-06-29']
df_train = df[df.Date < '2019-06-29']
y_train = df_train['target']

params = {
   # 'metric': 'huber',   # Se si cambia la metrica non si cambia l'ottimizzazione
   'verbose':-1,
   'boosting_type':'gbdt',
    'num_leaves':31,
    'max_depth':- 1,
    'learning_rate':0.1,
   'n_estimators':600,
   'min_split_gain':0.0,
   'subsample':1.0,
   'subsample_freq':0,
   'colsample_bytree':1.0,
   'reg_alpha':0.0,
   'reg_lambda':0.0,
   'random_state':None,
   'silent':True,
   'importance_type':'split',
    'tree_learner':'feature',
}

model = LGBMRegressor(**params)
fitted = model.fit(df_train.drop(columns=['target'] + drop_cols), y_train, categorical_feature=categorical_f)

df_test['predictions'] = fitted.predict(df_test.drop(['target'] + drop_cols, axis=1))

predictions.extend(df_test['predictions'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [20]:
df_test['real_target']

133     42589.0
134     46479.0
135     74608.0
136     86665.0
137     78182.0
138     51190.0
139     37043.0
140     37938.0
141     39576.0
142     42876.0
143     45584.0
144     69866.0
145     83078.0
146     80379.0
147     52082.0
148     37638.0
149     36387.0
150     86079.0
151    112674.0
152    111353.0
153     63359.0
154     35238.0
155     34242.0
156     37889.0
157         NaN
Name: real_target, dtype: float64

In [30]:
df = df.sort_values(['sku', 'Date'])
df[['real_target', 'sales w-1', 'target']].head(20)

Unnamed: 0,real_target,sales w-1,target
0,51320.0,10.497091,10.845855
1,66431.0,10.845855,11.103934
2,57001.0,11.103934,10.950842
3,15052.0,10.950842,9.619333
4,22016.0,9.619333,9.99957
5,21762.0,9.99957,9.987967
6,46125.0,9.987967,10.739132
7,65842.0,10.739132,11.095028
8,70514.0,11.095028,11.163581
9,41614.0,11.163581,10.636216
