In [2]:
import pandas as pd
# Directory holding the challenge CSV exports.
# NOTE(review): absolute, machine-specific location -- adjust before running elsewhere.
path = "/home/ichida/dev_env/ml/data/zara_challenge/zara_data_go_2019_all_dataset"

# Read the three raw tables in one pass (same files, same read order).
sales_stock_df, products_df, positions_df = (
    pd.read_csv(csv_file)
    for csv_file in (
        f"{path}/sales_stock.csv",
        f"{path}/products.csv",
        f"{path}/positions.csv",
    )
)

In [3]:
# Per (date_number, product_id): max, mean and min of `position`.
# Note that the column called plain 'position' holds the *minimum* for that day.
position_features = (
    positions_df
    .groupby(['date_number', 'product_id'])['position']
    .agg(['max', 'mean', 'min'])
    .reset_index()
    .rename(columns={'max': 'max_position', 'mean': 'mean_position', 'min': 'position'})
)

In [4]:
# Columns that identify one product variant on one day; sales and stock are
# summed over any rows that share all of them.
groupby_columns = ['product_id', 'family_id', 'subfamily_id', 'price', 'date_number', 'color_id', 'size_id']

# Join the product attributes onto the sales/stock records, then aggregate.
product_sales_stock = (
    products_df
    .merge(sales_stock_df, on='product_id')
    .groupby(groupby_columns)
    .agg({'sales': 'sum', 'stock': 'sum'})
    .reset_index()
)

In [5]:
# Display the column layout of the aggregated frame (grouping keys followed by the summed sales/stock).
product_sales_stock.columns

Index(['product_id', 'family_id', 'subfamily_id', 'price', 'date_number',
       'color_id', 'size_id', 'sales', 'stock'],
      dtype='object')

In [6]:
# Attach the per-day position statistics to every aggregated sales/stock row.
# pd.merge defaults to an inner join, so rows without a matching
# (date_number, product_id) position record are dropped.
all_features = pd.merge(product_sales_stock, position_features, on=['date_number', 'product_id'])

# Identifier columns the model should treat as categories, not as numbers.
categorical_columns = ['family_id', 'subfamily_id', 'size_id', 'color_id']

# Cast with DataFrame.astype rather than `df.loc[:, col] = ...astype('category')`:
# the .loc assignment writes into the existing column, and since pandas 2.0 that
# keeps the column's original integer dtype, silently discarding the cast.
# product_id is only needed as the join key, so it is dropped without casting.
all_features = (
    all_features
    .drop('product_id', axis=1)
    .astype({column: 'category' for column in categorical_columns})
)
all_features.dtypes

family_id        category
subfamily_id     category
price             float64
date_number         int64
color_id         category
size_id          category
sales               int64
stock               int64
max_position        int64
mean_position     float64
position            int64
dtype: object

In [7]:
# Chronological split on date_number:
#   train: days <= 79, validation: days 80..85, test: days > 85.
day = all_features['date_number']
train_features = all_features[day <= 79]
val_values = all_features[(day > 79) & (day <= 85)]
test_values = all_features[day > 85]

In [8]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
import numpy as np
import xgboost as xgb

class TypeSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only the DataFrame columns of one dtype.

    Parameters
    ----------
    dtype
        Anything accepted by ``DataFrame.select_dtypes(include=...)``,
        e.g. ``np.number`` or ``'category'``.
    """

    def __init__(self, dtype):
        self.dtype = dtype

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the selector itself."""
        return self

    def transform(self, X):
        """Return the columns of ``X`` whose dtype matches ``self.dtype``.

        ``X`` must be a pandas DataFrame; any other input fails the assertion.
        """
        assert isinstance(X, pd.DataFrame)
        wanted_dtypes = [self.dtype]
        return X.select_dtypes(include=wanted_dtypes)

# Preprocessing and model bundled into one estimator, so the encoder is fitted
# together with the regressor on whatever data `fit` receives.
transformer = Pipeline([
    ('features', FeatureUnion(n_jobs=1, transformer_list=[
        # Numeric columns are selected and passed through unchanged.
        ('numericals', Pipeline([
            ('selector', TypeSelector(np.number)),
        ])),
        # Categorical columns are one-hot encoded; handle_unknown='ignore' makes
        # categories unseen during fit encode as all zeros instead of raising.
        ('categoricals', Pipeline([
            ('selector', TypeSelector('category')),
            ('encoder', OneHotEncoder(handle_unknown='ignore')),
        ])),
    ])),
    # "reg:squarederror" is the current name of the squared-error objective
    # ("reg:linear" is its deprecated alias); n_jobs replaces the deprecated
    # nthread argument of the scikit-learn wrapper.
    ('clf', xgb.XGBRegressor(objective="reg:squarederror", booster="gbtree", n_jobs=3)),
])



 ### Hyperparameter tuning

In [9]:
from sklearn.model_selection import RandomizedSearchCV
# Search space: only the tree depth of the XGBoost step ('clf') is tuned.
param_grid = {
    'clf__max_depth': np.arange(3, 10, 1)
}

# n_iter=2 samples two depths from the grid; random_state pins which ones, so
# the search (and the best score reported below) is reproducible across runs.
# NOTE(review): cv=3 splits by row rather than by date_number, and val_values is
# not used here -- confirm that is intended for a time-ordered target.
randomized_mse = RandomizedSearchCV(param_distributions=param_grid, estimator=transformer, n_iter=2,
                                    scoring="neg_mean_squared_error", verbose=1, cv=3,
                                    random_state=42)

# Features are every column except the target; the target is the sales count.
x, y = train_features.drop('sales', axis=1), train_features['sales']
randomized_mse.fit(x, y)

Fitting 3 folds for each of 2 candidates, totalling 6 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 11.5min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('features',
                                              FeatureUnion(n_jobs=1,
                                                           transformer_list=[('numericals',
                                                                              Pipeline(memory=None,
                                                                                       steps=[('selector',
                                                                                               TypeSelector(dtype=<class 'numpy.number'>))],
                                                                                       verbose=False)),
                                                                             ('categoricals',
                                                                              Pipeline(memory=None,
                                          

In [10]:
# Best cross-validated score (negative MSE, so closer to 0 is better), then the
# refitted pipeline that achieved it -- one per line.
print(randomized_mse.best_score_, randomized_mse.best_estimator_, sep="\n")

-4.02988627543959
Pipeline(memory=None,
         steps=[('features',
                 FeatureUnion(n_jobs=1,
                              transformer_list=[('numericals',
                                                 Pipeline(memory=None,
                                                          steps=[('selector',
                                                                  TypeSelector(dtype=<class 'numpy.number'>))],
                                                          verbose=False)),
                                                ('categoricals',
                                                 Pipeline(memory=None,
                                                          steps=[('selector',
                                                                  TypeSelector(dtype='category')),
                                                                 ('encoder',
                                                                  OneHotEncoder(categories='auto',
   

In [12]:
from sklearn.metrics import mean_squared_error

# Score the tuned pipeline on the held-out test days.
y_test = test_values['sales']
x_test = test_values.drop(columns='sales')

best_model = randomized_mse.best_estimator_
preds_test = best_model.predict(x_test)

# Mean squared error between the observed and predicted sales.
mean_squared_error(y_test.values, preds_test)

5.234818480361037

In [19]:
# Test features plus prediction, observed sales and residual
# (residual = predicted - actual, so positive means over-prediction).
df_test = x_test.assign(
    predicted_sales=preds_test,
    sales=y_test,
    residual=lambda frame: frame['predicted_sales'] - frame['sales'],
)

# Show only rows of day 86 that actually sold something.
sold_on_day_86 = (df_test['sales'] != 0) & (df_test['date_number'] == 86)
df_test.loc[sold_on_day_86, ['sales', 'predicted_sales', 'residual']]

Unnamed: 0,sales,predicted_sales,residual
129,1,2.634530,1.634530
132,1,2.060550,1.060550
133,3,2.210149,-0.789851
725,1,1.787987,0.787987
727,2,0.791457,-1.208543
...,...,...,...
3394524,1,1.104982,0.104982
3394525,2,2.170871,0.170871
3395445,3,1.557298,-1.442702
3395446,1,1.387907,0.387907
