In [1]:
import pandas as pd
# Directory that holds the challenge CSV files.
path = (
    "/Users/ichida/dev/datasets/zara_challenge/"
    "zara_data_go_2019_all_dataset"
)

# Read the three raw tables used by the rest of the notebook.
positions_df = pd.read_csv(f"{path}/positions.csv")
products_df = pd.read_csv(f"{path}/products.csv")
sales_stock_df = pd.read_csv(f"{path}/sales_stock.csv")

In [2]:
# Per (date_number, product_id): max, mean and min of `position`.
position_stats = (
    positions_df
    .groupby(['date_number', 'product_id'])['position']
    .agg(['max', 'mean', 'min'])
)
position_features = position_stats.reset_index()
# Labels follow the aggregation order above; note that the *min* is the one
# stored under the plain name 'position'.
position_features.columns = [
    'date_number', 'product_id',
    'max_position', 'mean_position', 'position',
]

In [3]:
# Attach the product attributes to every sales/stock record
# (pd.merge defaults to an inner join on the shared key).
merged_sales_stock = pd.merge(products_df, sales_stock_df, on='product_id')

# Keys that identify one aggregated row.
groupby_columns = [
    'product_id', 'family_id', 'subfamily_id', 'price',
    'date_number', 'color_id', 'size_id',
]

# Sum sales and stock over all records sharing the same keys, then turn the
# group keys back into ordinary columns.
totals = {'sales': 'sum', 'stock': 'sum'}
product_sales_stock = (
    merged_sales_stock
    .groupby(groupby_columns)
    .agg(totals)
    .reset_index()
)

In [4]:
# Display the column labels of the aggregated frame as a sanity check.
product_sales_stock.columns

Index(['product_id', 'family_id', 'subfamily_id', 'price', 'date_number',
       'color_id', 'size_id', 'sales', 'stock'],
      dtype='object')

In [31]:
# Identifier columns that must be treated as categorical, not numeric.
categorical_columns = ['family_id', 'subfamily_id', 'size_id', 'color_id']

# Join the aggregated sales/stock rows with the position statistics on
# (date_number, product_id). pd.merge defaults to an inner join, so rows
# without a matching position record are dropped.
#
# The dtype conversion uses DataFrame.astype with a column->dtype mapping
# rather than `df.loc[:, col] = series.astype('category')`: since pandas 2.0
# a full-column `.loc` assignment writes into the existing column in place
# and keeps its original integer dtype, so the columns would silently stay
# numeric. `product_id` is only needed as a join key and is dropped, so it is
# not converted.
all_features = (
    pd.merge(product_sales_stock, position_features, on=['date_number', 'product_id'])
    .drop('product_id', axis=1)
    .astype({column: 'category' for column in categorical_columns})
)
all_features.dtypes

family_id        category
subfamily_id     category
price             float64
date_number         int64
color_id         category
size_id          category
sales               int64
stock               int64
max_position        int64
mean_position     float64
position            int64
dtype: object

In [32]:
# Split by date_number:
#   train      : date_number <= 79
#   validation : 79 < date_number <= 85
#   test       : date_number > 85
day = all_features['date_number']

train_features = all_features[day <= 79]
val_values = all_features[(day > 79) & (day <= 85)]
test_values = all_features[day > 85]

In [37]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
import numpy as np

class TypeSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the DataFrame columns of a given dtype.

    Parameters
    ----------
    dtype : object
        Dtype specification handed to ``DataFrame.select_dtypes`` (wrapped in
        a one-element list), e.g. ``np.number`` or ``'category'``.
    """

    def __init__(self, dtype):
        self.dtype = dtype

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the selector itself."""
        return self

    def transform(self, X):
        """Return the columns of ``X`` whose dtype matches ``self.dtype``.

        ``X`` must be a pandas DataFrame; anything else trips the assertion.
        """
        is_frame = isinstance(X, pd.DataFrame)
        assert is_frame
        wanted = [self.dtype]
        return X.select_dtypes(include=wanted)

# Preprocessing pipeline: two column-wise branches whose outputs are
# concatenated side by side by the FeatureUnion.
transformer = Pipeline([
    ('features', FeatureUnion(n_jobs=1, transformer_list=[
        # Branch 1: numeric columns are selected and passed through unchanged.
        ('numericals', Pipeline([
            ('selector', TypeSelector(np.number)),
        ])),  # numericals close

        # Branch 2: category-dtype columns are selected and one-hot encoded.
        # handle_unknown='ignore' makes the encoder tolerate categories at
        # transform time that were not seen during fit.
        ('categoricals', Pipeline([
            # The trailing comma here is required: without it Python parses
            # the two adjacent tuples as a call and raises
            # "TypeError: 'tuple' object is not callable".
            ('selector', TypeSelector('category')),
            ('encoder', OneHotEncoder(handle_unknown='ignore')),
        ])),  # categoricals close
    ])),  # features close
])  # pipeline close

