In [3]:
import pandas as pd
import numpy as np
import sklearn.preprocessing as prep

from collections import Counter

In [4]:
# Root data folder and the sub-folders derived from it.
DAT_DIR = '/Users/gfg/data/venture=Zalora/sg/'

# Cleaned tables and figures live directly under the root.
CLEAN_DIR = DAT_DIR + 'clean/'
FIG_DIR = DAT_DIR + 'fig/'

# Per-group data sits under the cleaned folder; group features one level deeper.
GROUP_DIR = CLEAN_DIR + 'groups/'
FEAT_DIR = GROUP_DIR + 'feats/'

## Helpers

In [5]:
def date_range(df):
    """Return the ``(smallest, largest)`` pair of values in ``df['date']``."""
    dates = df['date']
    earliest, latest = min(dates), max(dates)
    return (earliest, latest)

def load_feats(gname='dresses_female_autumn-winter'): # Load group feats
    """Load the feature table of one product group.

    The table is read from ``FEAT_DIR + '<gname>.csv'`` with the
    ``snapshot_date`` column parsed as datetimes; rows that have no
    ``sku_config`` value are discarded.

    Parameters
    ----------
    gname : str
        Group name, used verbatim as the CSV file stem.

    Returns
    -------
    pandas.DataFrame
        The group features without rows lacking a ``sku_config``.
    """
    fname = FEAT_DIR + '{}.csv'.format(gname)
    group_feat = pd.read_csv(fname, parse_dates=['snapshot_date'])

    # Rebind instead of mutating in place so the function stays chain-friendly.
    group_feat = group_feat.dropna(subset=['sku_config'])
    print('dropped NA configs')
    return group_feat

def categ_encode(cat_values, encoder=None):
    """Encode categorical values as opaque string codes.

    Parameters
    ----------
    cat_values : pandas.Series or list-like
        The values to encode.
    encoder : object exposing ``fit_transform``, optional
        Encoder to (re)fit on ``cat_values``. When omitted, a fresh
        ``sklearn.preprocessing.LabelEncoder`` is created.

    Returns
    -------
    pandas.Series
        The codes converted to ``str``. When ``cat_values`` is a Series the
        result carries the same index, so assigning it back to the parent
        frame aligns row by row.
    """
    if encoder is None:
        encoder = prep.LabelEncoder()
    codes = encoder.fit_transform(cat_values)
    # Keep the caller's index; a default RangeIndex would misalign (and yield
    # NaN) when the result is assigned to a frame with a different index.
    index = cat_values.index if isinstance(cat_values, pd.Series) else None
    return pd.Series(codes, index=index).astype(str)

def mask(c, df, encoder):
    """Add a ``masked_<c>`` column holding the encoded values of column ``c``.

    ``df`` is modified in place and also returned. ``encoder`` is forwarded
    to ``categ_encode``, which refits it on this column.
    """
    print('\t {}'.format(c))
    df['masked_{}'.format(c)] = categ_encode(df[c], encoder)
    return df

def mask_data(df, to_mask, gname='dresses_female_autumn-winter'):
    """Mask the sensitive columns of ``df`` and write both variants to disk.

    For every column in ``to_mask`` missing values are replaced by ``'na'``
    and a ``masked_<column>`` column is added through ``mask``. Two CSV files
    are then written to ``CLEAN_DIR``:

    * ``<gname>_with_mask.csv`` -- original columns plus their masked versions;
    * ``poc_<gname>.csv`` -- the same table with the original sensitive
      columns removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is copied, not modified.
    to_mask : iterable of str
        Names of the columns to mask.
    gname : str
        Name used to build the two output file names.

    Returns
    -------
    pandas.DataFrame
        The table that still contains both original and masked columns.
    """
    encoder = prep.LabelEncoder()

    print('Masking sensitive info...')
    masked_df = df.copy()
    for c in to_mask:
        # Assign the filled column back: calling fillna(inplace=True) on a
        # column selection is not guaranteed to update the parent frame.
        masked_df[c] = masked_df[c].fillna('na')
        masked_df = mask(c, masked_df, encoder)

    print('Removing original sensitive info...')
    # Build a separate frame; aliasing `masked_df` here would also strip the
    # originals from the "with_mask" output and from the returned table.
    poc_df = masked_df.drop(columns=list(to_mask))

    masked_df.to_csv(CLEAN_DIR + '{}_with_mask.csv'.format(gname), index=False)
    poc_df.to_csv(CLEAN_DIR + 'poc_{}.csv'.format(gname), index=False)
    return masked_df

# Preparation

## Load

In [None]:
# Catalog table, read from the cleaned-data folder.
prod_df = pd.read_csv(filepath_or_buffer=CLEAN_DIR + 'products.csv')

In [None]:
# Sales records at SKU-simple level; `ordered_date` is parsed as a datetime.
demand_simples = pd.read_csv(
    filepath_or_buffer=CLEAN_DIR + 'demand_simple.csv',
    parse_dates=['ordered_date'],
)
demand_simples.info()

In [None]:
# Sales records at SKU-config level; `ordered_date` is parsed as a datetime.
demand_config = pd.read_csv(
    filepath_or_buffer=CLEAN_DIR + 'demand_config.csv',
    parse_dates=['ordered_date'],
)
demand_config.info()

In [None]:
# Expose the order date under the common name `date`.
demand_config = demand_config.assign(date=demand_config['ordered_date'])

## Join tables

### Simple level
We join the `demand_simples` table with the `prod_df` table to attach the properties/features of each SKU simple to its daily demand (the response).

In [None]:
# Attach the catalog columns to every simple-level demand row. No `on=` is
# given, so pandas joins on all columns the two tables have in common.
# Manual sanity check: joined.shape[0] == demand_simples.shape[0]
# `sub_cat` is redundant after the join and is dropped straight away.
joined = demand_simples.merge(prod_df).drop(columns=['sub_cat'])

In [None]:
# Drop the image URL column. `errors='ignore'` keeps the cell re-runnable:
# a plain `del` raises KeyError once the column is already gone.
joined = joined.drop(columns=['image_url'], errors='ignore')

### Config level

To join all available config features with the response, we join the tables `demand_config`, `prod_df` and `group_df` as follows:
+ join `group_df` with `prod_df` by key `sku_config` to obtain `joined_config`
+ join `joined_config` with `demand_config` by tuple `(sku_config, date)` to obtain final `joined_config`

In [None]:
# Group under study and its group-level features
# (price, relative price, no. of competitors...).
gname = 'dresses_female_autumn-winter'
group_df = load_feats(gname=gname)

In [None]:
# Expose the snapshot date under the common name `date`.
group_df = group_df.assign(date=group_df['snapshot_date'])

In [None]:
# One catalog row per config, joined to the group features on the columns
# the two tables share (no explicit `on=` is passed).
joined_config = group_df.merge(prod_df.drop_duplicates(subset='sku_config'))

In [None]:
# Row count after the join, then whether the join lost rows versus `group_df`.
print(len(joined_config))
len(joined_config) < len(group_df)

In [None]:
# Column names available after the catalog join.
joined_config.columns.tolist()

In [None]:
# Column names of the config-level demand table.
demand_config.columns.tolist()

In [None]:
# Join the demand onto the config features (on the shared columns), then
# drop the two source date columns now that `date` carries the value.
joined_config = (
    joined_config
    .merge(demand_config)
    .drop(columns=['ordered_date', 'snapshot_date'])
)
print(len(joined_config))

In [None]:
# Date coverage before and after the join, one tuple per line.
print(*map(date_range, (demand_config, joined_config)), sep='\n')

## Mask sensitive info

We mask sensitive info via `sklearn.preprocessing.LabelEncoder`

### Encode

We should encode/mask the following info:
+ all name columns
+ `catalog_attribute_set_label`
+ `buying_planning_cat_type`
+ `sub_category_type`
+ `sub_cat_gender`
+ `supplier_email`
+ `supplier_source`
+ `short_description`

In [None]:
def columns_to_mask(df=None):
    """List the columns holding sensitive information.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Table whose columns are scanned for name columns. Defaults to the
        notebook-level ``joined`` table, which was previously the only option.

    Returns
    -------
    list of str
        Every column of ``df`` whose name contains ``'name'`` but not
        ``'size'``, followed by the fixed category, supplier and
        description columns.
    """
    if df is None:
        df = joined
    name_cols = [c for c in df.columns if ('name' in c) and ('size' not in c)]

    cat_cols = ['catalog_attribute_set_label', 'buying_planning_cat_type',
                'sub_category_type', 'sub_cat_gender']

    supplier_cols = ['supplier_email', 'supplier_source']
    to_mask = name_cols + cat_cols + supplier_cols + ['short_description']
    return to_mask

In [None]:
# Columns to mask; the name columns in this list are taken from `joined`.
to_mask = columns_to_mask()

#### All records

In [None]:
# Give the all-records run its own file name: with the default `gname` the
# output is named after a single group and is overwritten by the group run.
masked_df = mask_data(joined, to_mask, gname='all_records')

In [None]:
# `masked_cols` was never defined; derive it from `to_mask` using the
# 'masked_<column>' naming that `mask` applies.
masked_cols = ['masked_{}'.format(c) for c in to_mask]
masked_df[masked_cols].info()

#### Sample of 1 group

In [None]:
# Pass the configured group name explicitly so the output files follow
# `gname` instead of the function's hard-coded default.
# NOTE(review): `to_mask` was built from the columns of `joined`; confirm
# every listed column also exists in `joined_config`.
group_masked = mask_data(joined_config, to_mask, gname=gname)

### Decode

In [None]:
def get_brand_name(masked_id, df=None):
    """Return the original brand name behind a masked brand id.

    Parameters
    ----------
    masked_id : str or int
        Masked brand id to look up.
    df : pandas.DataFrame, optional
        Table holding ``masked_brand_name`` and ``brand_name`` columns.
        Defaults to the notebook-level ``masked_df``.

    Returns
    -------
    The ``brand_name`` of the first matching row.

    Raises
    ------
    IndexError
        If no row carries ``masked_id``.
    """
    if df is None:
        df = masked_df
    # Compare as strings on both sides: masked ids that were round-tripped
    # through a CSV file come back as integers and would never equal "566".
    is_match = df['masked_brand_name'].astype(str) == str(masked_id)
    return df.loc[is_match, 'brand_name'].iloc[0]

In [None]:
# Reload the masked table from disk and summarise its columns.
masked_df = pd.read_csv(filepath_or_buffer=CLEAN_DIR + 'masked_data.csv')
masked_df.info()

In [None]:
# Print the brand behind each of five masked brand ids.
top5 = ['566', '468', '315', '209', '459']
for mid in top5:
    pair = (mid, get_brand_name(mid))
    print('(masked_id, brand): {}'.format(pair))

# Inspect results returned by DataRobot 

In [6]:
# Folder holding the result files read further down (e.g. the reasons CSV).
RES_DIR = '/Users/gfg/projects/daas-markdown/res/'

## Reasons of predictions
First we inspect top reasons of high/low predictions where:
+ high predictions: predicted daily demand > 1.849=`ht`
+ low predictions: predicted daily demand < 0.975=`lt`

The thresholds `ht`, `lt` are defined by DataRobot based on AUC.

In [23]:
# Predictions together with their three reasons, read from the results folder.
reason_df = pd.read_csv(
    filepath_or_buffer=RES_DIR + 'xgboost_early_stop_compact_feats_RC_3.csv'
)
reason_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23388 entries, 0 to 23387
Data columns (total 11 columns):
row_id               23388 non-null int64
Prediction           23388 non-null float64
Reason 1 Strength    5386 non-null object
Reason 1 Feature     5386 non-null object
Reason 1 Value       5386 non-null object
Reason 2 Strength    5386 non-null object
Reason 2 Feature     5386 non-null object
Reason 2 Value       5386 non-null object
Reason 3 Strength    5386 non-null object
Reason 3 Feature     5386 non-null object
Reason 3 Value       5386 non-null object
dtypes: float64(1), int64(1), object(9)
memory usage: 2.0+ MB


In [29]:
def rename_cols(df, old_names, new_names):
    """Return a copy of ``df`` with ``old_names`` renamed to ``new_names``.

    The two sequences are paired position by position; ``df`` itself is
    left unchanged.
    """
    mapping = {old: new for old, new in zip(old_names, new_names)}
    renamed = df.rename(columns=mapping)
    return renamed

In [None]:
# rm blanks in column names
old_names = [c for c in reason_df.columns if ' ' in c]
new_names = [c.replace(' ', '_') for c in old_names]
# `rename_cols` returns a new frame; bind it back or the rename is lost.
reason_df = rename_cols(reason_df, old_names, new_names)

In [31]:
# Lower-case every column name.
old_names = reason_df.columns.tolist()
new_names = [name.lower() for name in old_names]
reason_df = rename_cols(df=reason_df, old_names=old_names, new_names=new_names)

In [32]:
# Show the column names after the renaming steps.
reason_df.columns

Index(['row_id', 'prediction', 'reason_1_strength', 'reason_1_feature',
       'reason_1_value', 'reason_2_strength', 'reason_2_feature',
       'reason_2_value', 'reason_3_strength', 'reason_3_feature',
       'reason_3_value'],
      dtype='object')

### Top 3 reasons of high predictions

In [34]:
# High predictions: rows whose prediction is strictly above `ht`.
ht = 1.849
high_df = reason_df[reason_df['prediction'] > ht]

#### Top features for increased demand

In [38]:
# Flag rows where at least one of the three reasons has a positive strength.
plus = ['+', '++', '+++']
is_increase = high_df[
    ['reason_1_strength', 'reason_2_strength', 'reason_3_strength']
].isin(plus).any(axis=1)

In [53]:
# Keep only the three feature columns of the flagged rows.
feats = ['reason_1_feature', 'reason_2_feature', 'reason_3_feature']
res = high_df.loc[is_increase, feats]

In [54]:
# Pool the feature names column by column: all first reasons, then all
# second reasons, then all third reasons.
# NOTE(review): all three features of a flagged row are pooled, including
# any whose own strength is not positive -- confirm this is intended.
up_demand_feats = [
    feat
    for col in ('reason_1_feature', 'reason_2_feature', 'reason_3_feature')
    for feat in res[col]
]

In [56]:
# Tally how often each feature name occurs among the pooled reasons.
counter = Counter()
counter.update(up_demand_feats)

In [58]:
# Display the 20 most frequent feature names with their counts.
counter.most_common(20)

[('sku_config', 1818),
 ('snapshot_date', 1426),
 ('config_updated_at', 668),
 ('catalog_type', 583),
 ('config_group', 446),
 ('rel_price', 412),
 ('discount_from_rrp', 374),
 ('snapshot_date (Day of Week)', 364),
 ('n_competitor', 330),
 ('config_created_at', 279),
 ('current_price', 257),
 ('supplier_currency', 245),
 ('snapshot_date (Day of Month)', 195),
 ('stock', 186),
 ('special_price_to_date', 180),
 ('activated_at_date', 128),
 ('dwh_created_at', 45),
 ('brand_identifier (Categorical Int)', 40),
 ('special_price_from_date', 30),
 ('color', 18)]

### Top 3 reasons of low predictions

In [None]:
# Low predictions: rows whose prediction is strictly below `lt`.
# The column is `prediction` (lower case) after the renaming cells, and the
# query needs an actual comparison to select rows.
# NOTE(review): the narrative earlier in this notebook quotes 0.975 for `lt`;
# confirm which value is correct.
lt = 0.981
low_df = reason_df.query('prediction < {}'.format(lt))

## Model X-ray

## Lift chart