# Inits

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
%matplotlib inline

## Load data

In [21]:
def load_correct(fname, dat_dir):
    df = pd.read_csv(dat_dir + fname)
    yr = fname.split('_')[0]
    df['year'] = yr
    return df

def mk_md_input(country='sg'):
    dat_dir = '/Users/gfg/data/markdown/clean/venture=Zalora/{}/'.format(country)
    frames = [load_correct(fname, dat_dir) for fname in os.listdir(dat_dir)]
    md_input = pd.concat(frames)
    print('Shape of md_input of country {}: {}'.format(str.upper(country) , md_input.shape))
    print('Years in md_input: {}'.format(md_input['year'].unique()))
    md_input.to_csv(dat_dir + 'md_input.csv', index=False)
    return md_input

In [23]:
id_md_input = mk_md_input(country='id')

Shape of md_input of country ID: (689422, 36)
Years in md_input: ['2017' '2018']


In [24]:
ph_md_input = mk_md_input(country='ph')

Shape of md_input of country PH: (739362, 36)
Years in md_input: ['2017' '2018']


In [25]:
my_md_input = mk_md_input(country='my')

Shape of md_input of country MY: (754603, 36)
Years in md_input: ['2017' '2018']


In [26]:
tw_md_input = mk_md_input(country='tw')

Shape of md_input of country TW: (135828, 36)
Years in md_input: ['2017' '2018']


In [27]:
hk_md_input = mk_md_input(country='hk')

Shape of md_input of country HK: (295250, 36)
Years in md_input: ['2017' '2018']


In [17]:
cols = ['sku_config_id', 'brand_name', 'config_page_views', 'snapshot_date']
df[cols].sort_values(['sku_config_id', 'snapshot_date']).head()

Unnamed: 0,sku_config_id,brand_name,config_page_views,snapshot_date
13928,00037SH9704B3DGS,ZALORA,5.0,2018-01-04
98787,00037SH9704B3DGS,ZALORA,12.0,2018-01-21
146108,00037SH9704B3DGS,ZALORA,18.0,2018-01-29
194820,00037SH9704B3DGS,ZALORA,14.0,2018-02-07
246550,00037SH9704B3DGS,ZALORA,16.0,2018-02-18


In [88]:
df.columns

Index(['sku_config_id', 'group_id', 'md_sub_category_raw', 'md_sub_category',
       'gender', 'activation_date', 'product_lifecycle', 'season',
       'season_year', 'season_duration', 'is_new', 'n_remain_days',
       'brand_name', 'color', 'tax_class', 'current_price', 'black_price',
       'is_visible', 'config_page_views', 'n_stock', 'n_sold', 'color_pop',
       'brand_pop', 'percent_discount_from_rrp', 'n_competitor',
       'rel_price_as_ratio', 'psv', 'tsv', 'is_slow_sku', 'weekday',
       'is_weekend', 'is_workday', 'gfg_created_at', 'snapshot_date',
       'mean_page_view', 'total_page_view'],
      dtype='object')

In [10]:
df.config_page_views.describe()

count    317126.000000
mean         43.693450
std          82.093147
min           0.000000
25%           9.000000
50%          21.000000
75%          48.000000
max        4752.000000
Name: config_page_views, dtype: float64

## Helpers

In [57]:
def load_from_tmp(fname):
    path = os.path.join(tmp_dat_dir, fname)
    return pd.read_csv(path)

In [128]:
def dump(df, fname, folder):
    path = os.path.join(folder, fname)
    df.to_csv(path, index=False)
    print('dumped to file {}'.format(path))

In [4]:
def to_percent(vals):
    total = sum(vals)
    return 100 * np.array(vals) / total

In [5]:
def agg_sale(df, attribs):
#     Agg sales based on given attributes
    res = df.groupby(attribs).agg({'n_sold': sum})
    res = res.reset_index().rename(columns={'n_sold': 'total_sale'})
    return res

In [129]:
del dump2tmp

### For color popularity

Color pop is measured in __percent__, and it should depend on:
+ sub-cat (current version)
+ year or better season (test?)

In [69]:
def color_popularity(sc, df):
#     Compute popularity of diff colors under a given sub-cat
#     print(sc)
    sub_df = df.query('md_sub_category == "{}"'.format(sc))
    sale_by_color = agg_sale(sub_df, attribs=['color'])
    sale_by_color['md_sub_category'] = sc
    sale_by_color['percent'] = to_percent(sale_by_color['total_sale'])

    return sale_by_color.sort_values('percent', ascending=False)

In [76]:
color_popularity('backpacks', df)['percent'].describe()

count    398.000000
mean       0.251256
std        1.336229
min        0.012770
25%        0.025540
50%        0.051079
75%        0.153237
max       25.015962
Name: percent, dtype: float64

In [79]:
def cal_color_pop_by_cat(df, sub_cats):
    print('computing color popularity under diff sub-cats...')
    frames = [color_popularity(sc, df) for sc in sub_cats]
    by_cat_color = pd.concat(frames)
    by_cat_color.rename(columns={'percent': 'color_pop'}, inplace=True)
    cols = ['color', 'md_sub_category', 'total_sale', 'color_pop']
    by_cat_color = by_cat_color[cols].sort_values('color')

    return by_cat_color

In [111]:
def cal_color_pop_in_year(yr):
    fname = os.path.join(tmp_dat_dir, '{}_md_input.csv'.format(yr))
    df = pd.read_csv(fname)
    print('# rows in loaded data: {}'.format(df.shape[0]))
    sub_cats = df.md_sub_category.unique()
    
    print('# sub-cats: {}'.format(len(sub_cats)))
    color_pop = cal_color_pop_by_cat(df, sub_cats)

    print('merging back to md_input...')
    # if there are old color pop data, drop them to avoid conflicts
    if 'color_pop' in df.columns: 
        df = df.drop(['color_pop'], axis='columns')
        
    cols = ['color', 'md_sub_category', 'color_pop']
    md_input = pd.merge(df, color_pop[cols], how='left')
    
    return md_input

### Brand popularity

In [26]:
def brand_pop_by_page_view(df):
    #   return  total & avg views of each brand taken over all of its configs 
    
    # mean views
    mean_views = df.groupby(['brand_name', 'md_sub_category']).agg({'config_page_views': np.mean}) # 'impressions': np.mean
    mean_views = mean_views.rename(columns={'config_page_views': 'mean_page_view'}).reset_index()
    # total views
    brand_views = df.groupby(['brand_name', 'md_sub_category']).agg({'config_page_views': sum})  # 'impressions': sum
    brand_views = brand_views.rename(columns={'config_page_views': 'total_page_view'}).reset_index()

    # merge total views and mean views
    brand_views = brand_views.merge(mean_views)
    return brand_views

In [92]:
def cal_brand_pop(md_input):
    
    print('Aggregating views of configs under each brand...')
    brand_pop = brand_pop_by_page_view(md_input)
    
    # drop old data to avoid conflict
    to_drop = ['brand_pop', 'mean_page_view', 'total_page_view']
    for c in to_drop:
        if c in md_input.columns:
            md_input.drop(c, axis='columns', inplace=True)
    
    print('merging back to md_input...')
    cols = ['brand_name', 'md_sub_category', 'mean_page_view', 'total_page_view']
    res = pd.merge(md_input, brand_pop[cols], how='left')
    
    return res

# Compute static features

As both color pop and brand pop depend on sub-cats, we first need to extract sub-cats existing in data.

In [17]:
sub_cats = df.md_sub_category.unique()
print('# sub-cats: {}'.format(len(sub_cats)))

# sub-cats: 85


In [20]:
sub_cats[:3]

array(['backpacks', 'ballerina_flats', 'beauty_acc_tools'], dtype=object)

## Color popularity

### Compute

In [47]:
color_pop = cal_color_pop_by_cat(df, sub_cats)
dump2tmp(color_pop, fname='{}_color_pop.csv'.format(yr))
print('saved color popularity to {}'.format(fname))

computing color popularity under diff sub-cats...
backpacks
ballerina_flats
beauty_acc_tools
belts_buckles
blouses_tunics
boots
bra
briefs
business_dress_shoes
cardigans_knitwear
cleansers_toners
dresses
eyewear
face_serums_treatments
fashion_bags
hats_caps
heels
hoodies_sweatshirts
jackets_coats
jeans
jewellery
leggings_tights
loafers_moccasins_boat_shoes
long_pants
nightwear_sleepwear
other_accessories
other_bags
other_shoes
outdoor_shoes
panties
playsuits_jumpsuits
polo_shirts
sandals_flip_flops
shirts
shorts
shoulder_bags
skirts
sling_bags
slip_ons_espadrilles
smart_casual_shoes
sneakers
socks
sports_bra
sports_lifestyle_accessories
sports_lifestyle_backpacks
sports_lifestyle_bags
sports_lifestyle_bottoms
sports_lifestyle_shoes
sports_lifestyle_tops
sports_performance_accessories
sports_performance_backpacks
sports_performance_bags
sports_performance_bottoms
sports_performance_shoes
sports_performance_tops
sports_swimwear
suits
swimwear_bottoms
swimwear_tops
t_shirts
top_handles
to

NameError: name 'yr' is not defined

In [48]:
color_pop.sort_values(['md_sub_category', 'color_pop'], ascending=[True, False]).head()

Unnamed: 0,color,md_sub_category,total_sale,color_pop
18,Black,backpacks,2482.0,28.042029
225,Navy,backpacks,395.0,4.462773
158,Grey,backpacks,222.0,2.508191
191,Light Grey,backpacks,154.0,1.739916
130,Eclipse Xhatch/Black Rubber,backpacks,127.0,1.434866


### Merge back with md_input

In [54]:
cols = ['color', 'md_sub_category', 'color_pop']
md_input = pd.merge(df, color_pop[cols], how='outer')

In [56]:
# check if we lost any color
print('No loss = {}'.format(md_input.shape[0] == df.shape[0]))

No loss = True


### Compare color popularity of years

In [67]:
_17_md_input = md_input
del md_input

In [82]:
_17_color_pop.rename(columns={'color_pop': '_17_color_pop'}, inplace=True)

In [83]:
_18_color_pop.rename(columns={'color_pop': '_18_color_pop'}, inplace=True)

In [84]:
cols = ['color', 'md_sub_category']
join_df = pd.merge(_17_color_pop, _18_color_pop, on=cols)

In [87]:
join_df.drop(['total_sale_x', 'total_sale_y'], axis='columns', inplace=True)

In [88]:
join_df.head()

Unnamed: 0,color,md_sub_category,_17_color_pop,_18_color_pop
0,33005602,wallets_purses,0.260586,0.019604
1,70'S Rinse,leggings_tights,0.443787,0.050556
2,80s Blue With Rips,jeans,0.021213,0.167138
3,90'S Bleach,leggings_tights,0.147929,0.050556
4,90'S Blue,shorts,0.113387,0.014257


In [89]:
join_df.query('color == "Black"').head()

Unnamed: 0,color,md_sub_category,_17_color_pop,_18_color_pop
617,Black,panties,20.101458,9.723546
618,Black,long_pants,27.733879,27.177368
619,Black,briefs,17.550411,10.28128
620,Black,jackets_coats,31.738035,29.285123
621,Black,traditional_dresses,8.235453,11.351428


## Brand popularity

In [32]:
_17_md_input = cal_brand_pop(2017)

Aggregating views in year 2017 of configs under each brand...
merging back to md_input...


In [36]:
cols = ['sku_config_id', 'brand_name', 'md_sub_category', 'mean_page_view', 'total_page_view']
_17_md_input[cols].sort_values(['md_sub_category', 'mean_page_view']).head(20)

Unnamed: 0,sku_config_id,brand_name,md_sub_category,mean_page_view,total_page_view
44927,HO685AC0RRAXMY,House of Avenues,backpacks,2.0,2.0
152175,NE182AC85EFQMY,New Look,backpacks,2.5,5.0
244133,NE182AC85EFQMY,New Look,backpacks,2.5,5.0
212210,FA880AC28YHVMY,Factorie,backpacks,3.0,3.0
9,SU527AC23IQIMY,Sunnydaysweety,backpacks,4.0,8.0
177724,SU527AC38IPTMY,Sunnydaysweety,backpacks,4.0,8.0
16519,PL604AC45AZIMY,PLAYBOY BUNNY,backpacks,5.642857,79.0
84739,PL604AC83MTOMY,PLAYBOY BUNNY,backpacks,5.642857,79.0
129114,PL604AC46AZHMY,PLAYBOY BUNNY,backpacks,5.642857,79.0
129118,PL604AC65MUGMY,PLAYBOY BUNNY,backpacks,5.642857,79.0


In [40]:
_17_md_input.brand_pop.describe()

count    173330.000000
mean       6940.283257
std        7205.770832
min           0.000000
25%         401.000000
50%        3636.000000
75%       14357.000000
max       18881.000000
Name: brand_pop, dtype: float64

## Compute all

### year 2017

In [104]:
_17_md_input = cal_color_pop_in_year(2017)

# sub-cats: 85
computing color popularity under diff sub-cats...
merging back to md_input...


In [105]:
_17_md_input.color_pop.describe()

count    335699.000000
mean          6.931715
std          11.007803
min           0.000589
25%           0.230899
50%           1.285297
75%           9.777187
max         100.000000
Name: color_pop, dtype: float64

In [106]:
_17_md_input.shape[0]

336317

In [107]:
_17_md_input = cal_brand_pop(_17_md_input)

Aggregating views of configs under each brand...
merging back to md_input...


In [108]:
cols = ['mean_page_view', 'total_page_view']
_17_md_input[cols].describe()

Unnamed: 0,mean_page_view,total_page_view
count,336306.0,336306.0
mean,43.752601,380296.5
std,30.692754,858047.2
min,0.0,0.0
25%,19.989011,5025.0
50%,34.722736,25600.0
75%,62.426072,147970.0
max,419.0,2933873.0


In [109]:
_17_md_input.shape[0]

336317

In [110]:
fname='{}_md_input.csv'.format(yr)
dump2tmp(_17_md_input, fname)

dumped to file /Users/gfg/data/venture=zalora/sg/clean/groups/tmp/2018_md_input.csv


### year 2018

In [113]:
_18_md_input = cal_color_pop_in_year(2018)
_18_md_input.color_pop.describe()

# rows in loaded data: 438599
# sub-cats: 85
computing color popularity under diff sub-cats...
merging back to md_input...


count    437657.000000
mean          6.565534
std          11.066952
min           0.000450
25%           0.194058
50%           1.099756
75%           8.548028
max         100.000000
Name: color_pop, dtype: float64

In [114]:
_18_md_input = cal_brand_pop(_18_md_input)
_18_md_input[cols].describe()

Aggregating views of configs under each brand...
merging back to md_input...


Unnamed: 0,mean_page_view,total_page_view
count,438594.0,438594.0
mean,39.955174,680814.2
std,30.52279,1478696.0
min,0.0,0.0
25%,15.475983,4832.0
50%,29.713904,31397.0
75%,58.811068,197884.0
max,134.972222,4743483.0


Why in results in two years are exactly the same? Smth weird, need to check.
Cause: accidentally overwrite data of 2018 by data of 2017. 

## Make final md_input

In [115]:
md_input = pd.concat([_17_md_input, _18_md_input])

In [116]:
md_input.sort_values('snapshot_date', inplace=True)
cols = ['sku_config_id', 'md_sub_category', 'color', 'color_pop','brand_name',  'mean_page_view', 'total_page_view']
md_input[cols].head()

Unnamed: 0,sku_config_id,md_sub_category,color,color_pop,brand_name,mean_page_view,total_page_view
0,222D0AC98E04A7GS,backpacks,OLIVE,0.101683,Something Borrowed,35.159919,17369.0
3126,DF044ZZ05BA578GS,sneakers,Silver,1.510398,Something Borrowed,43.613394,28654.0
3125,CO302SH0RNHPMY,sneakers,White,20.591025,Converse,72.628239,81271.0
3124,AD479SH64HMTMY,sneakers,Silver,1.510398,addicts anonymous,44.821429,6275.0
3123,VA142SH14GZHMY,sneakers,Black/True White,1.240423,VANS,75.035287,131837.0


In [117]:
dump2tmp(md_input, fname='md_input.csv')

dumped to file /Users/gfg/data/venture=zalora/sg/clean/groups/tmp/md_input.csv


In [124]:
md_input.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 774916 entries, 0 to 438598
Data columns (total 35 columns):
sku_config_id                774916 non-null object
group_id                     774916 non-null object
md_sub_category_raw          774915 non-null object
md_sub_category              774916 non-null object
gender                       774916 non-null object
activation_date              774138 non-null object
product_lifecycle            773996 non-null float64
season                       774916 non-null object
season_year                  774916 non-null int64
season_duration              773996 non-null float64
is_new                       774138 non-null float64
n_remain_days                773218 non-null float64
brand_name                   774916 non-null object
color                        773356 non-null object
tax_class                    774916 non-null float64
current_price                774916 non-null float64
black_price                  774916 non-null float64

In [127]:
md_input['year'] = pd.to_datetime(md_input['snapshot_date']).apply(lambda x: x.year)

In [131]:
dump(md_input, 'md_input.csv', ready_dir)

dumped to file /Users/gfg/data/venture=zalora/sg/clean/groups/ready/md_input.csv


## Make md_static_data
+ color pop 
+ brand pop

In [172]:
# add columns to match required format of md_static_data
def add_meta_data(df, static_param='feature_color_pop', country='sg', venture='Zalora'):
    res = df.copy()
    res['venture'] = venture
    res['country'] = country
    res['creation_date'] = pd.datetime.today().date()
    res['static_param'] = static_param
    return res    

### Color pop

In [174]:
def query_color_pop(md_input, country='sg'):
    cols = ['color', 'md_sub_category', 'year', 'color_pop']
    color_pop = md_input[cols]
    color_pop = add_meta_data(color_pop, static_param='feature_color_pop', country=country)
    color_pop = color_pop.rename(columns={'md_sub_category': 'var_1', 'color': 'var_2', 'year': 'var_3', 
                              'color_pop': 'metric'})
#     color_pop.to_csv(clean_dir + country + 'color_pop.csv', index=False)
    return color_pop

### Brand pop

In [175]:
def query_brand_pop(md_input, country='sg'):
    cols = ['brand_name', 'md_sub_category', 'year', 'mean_page_view']
    brand_pop = md_input[cols].copy()
    brand_pop.rename(columns={'mean_page_view': 'brand_pop'}, inplace=True)
    brand_pop = add_meta_data(brand_pop, static_param='feature_brand_pop', country=country)
    
    brand_pop = brand_pop.rename(columns={'md_sub_category': 'var_1', 'brand_name': 'var_2', 'year': 'var_3',
                                         'brand_pop': 'metric'})
    return brand_pop

### Static data

In [176]:
color_pop = query_color_pop(md_input, country='sg')
brand_pop = query_brand_pop(md_input, country='sg')
md_static_data = pd.concat([color_pop, brand_pop])

In [177]:
md_static_data.head()

Unnamed: 0,var_2,var_1,var_3,metric,venture,country,creation_date,static_param
0,OLIVE,backpacks,2017,0.101683,Zalora,sg,2018-04-12,feature_color_pop
3126,Silver,sneakers,2017,1.510398,Zalora,sg,2018-04-12,feature_color_pop
3125,White,sneakers,2017,20.591025,Zalora,sg,2018-04-12,feature_color_pop
3124,Silver,sneakers,2017,1.510398,Zalora,sg,2018-04-12,feature_color_pop
3123,Black/True White,sneakers,2017,1.240423,Zalora,sg,2018-04-12,feature_color_pop


In [178]:
md_static_data.query('static_param == "feature_brand_pop"').head()

Unnamed: 0,var_2,var_1,var_3,metric,venture,country,creation_date,static_param
0,Something Borrowed,backpacks,2017,35.159919,Zalora,sg,2018-04-12,feature_brand_pop
3126,Something Borrowed,sneakers,2017,43.613394,Zalora,sg,2018-04-12,feature_brand_pop
3125,Converse,sneakers,2017,72.628239,Zalora,sg,2018-04-12,feature_brand_pop
3124,addicts anonymous,sneakers,2017,44.821429,Zalora,sg,2018-04-12,feature_brand_pop
3123,VANS,sneakers,2017,75.035287,Zalora,sg,2018-04-12,feature_brand_pop


In [179]:
md_static_data.to_csv(clean_dir + 'md_static_data.csv')

### Check data created in python script

In [38]:
country='id'
data_country = '/Users/gfg/data/markdown/clean/venture=Zalora/{}/'.format(country)

In [39]:
md_static_data = pd.read_csv(data_country + 'md_static_data.csv')
md_static_data.shape

(41299, 10)

In [40]:
md_static_data.query('static_param == "feature_brand_pop"')['metric'].describe()

count    6512.000000
mean       39.096759
std        53.727679
min         0.000000
25%        10.453154
50%        22.000000
75%        48.744647
max      1389.950000
Name: metric, dtype: float64

In [43]:
id_md_input['brand_cat_year'] = id_md_input.brand_name + id_md_input.md_sub_category + id_md_input.year.apply(lambda x: str(x))

In [44]:
id_md_input['brand_cat_year'].nunique()

6560

In [41]:
md_static_data.query('static_param == "feature_color_pop"')['metric'].describe()

count    34739.000000
mean         0.466335
std          2.544698
min          0.002770
25%          0.018518
50%          0.054785
75%          0.187836
max        100.000000
Name: metric, dtype: float64

In [34]:
fname = data_country + 'full_md_input.csv'
id_md_input = pd.read_csv(fname)

In [29]:
cols = ['color_pop', 'brand_pop']
id_md_input[cols].describe()

Unnamed: 0,color_pop,brand_pop
count,1337943.0,1337910.0
mean,7.954642,71.38253
std,11.28177,62.73531
min,0.002673868,0.0
25%,0.3527337,31.97537
50%,2.592908,57.15293
75%,11.31429,90.99307
max,100.0,1034.5


In [35]:
id_md_input.shape

(1305820, 36)

In [36]:
id_md_input.drop_duplicates().shape

(1305820, 36)

In [31]:
test_df = pd.read_csv(data_country + 'test_md_input.csv')

In [32]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8260 entries, 0 to 8259
Data columns (total 35 columns):
sku_config_id                8260 non-null object
group_id                     8260 non-null object
md_sub_category_raw          8260 non-null object
md_sub_category              8260 non-null object
gender                       8260 non-null object
activation_date              8260 non-null object
product_lifecycle            8220 non-null float64
season                       8260 non-null object
season_year                  8260 non-null int64
season_duration              8220 non-null float64
is_new                       8260 non-null float64
n_remain_days                8220 non-null float64
brand_name                   8260 non-null object
color                        8260 non-null object
tax_class                    8260 non-null float64
current_price                8260 non-null float64
black_price                  8260 non-null float64
is_visible                   8260 non

In [33]:
test_df.year.describe()

count    8260.000000
mean     2017.460291
std         0.498451
min      2017.000000
25%      2017.000000
50%      2017.000000
75%      2018.000000
max      2018.000000
Name: year, dtype: float64

In [None]:
test_df.to_csv()

# Sanity check `static_data`

In [3]:
zal_dir = '/Users/gfg/data/markdown/clean/venture=Zalora/'

In [4]:
def load_data(country):
    country_dir = zal_dir + '{}/'.format(country)
    return pd.read_csv(country_dir + 'md_input.csv')

In [5]:
hk_md_input = load_data(country='hk')
hk_md_input.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 287993 entries, 0 to 287992
Data columns (total 36 columns):
sku_config_id                287993 non-null object
group_id                     287993 non-null object
md_sub_category_raw          287993 non-null object
md_sub_category              287993 non-null object
gender                       287993 non-null object
activation_date              287720 non-null object
product_lifecycle            287718 non-null float64
season                       287993 non-null object
season_year                  287993 non-null int64
season_duration              287718 non-null float64
is_new                       287720 non-null float64
n_remain_days                287445 non-null float64
brand_name                   287993 non-null object
color                        287768 non-null object
tax_class                    287993 non-null float64
current_price                287993 non-null float64
black_price                  287993 non-null float64

HK data is OK.

In [6]:
tw_md_input = load_data(country='tw')
tw_md_input.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135828 entries, 0 to 135827
Data columns (total 36 columns):
Unnamed: 0                   0 non-null float64
activation_date              135754 non-null object
black_price                  135828 non-null float64
brand_name                   135828 non-null object
brand_pop                    64241 non-null float64
color                        135409 non-null object
color_pop                    2 non-null float64
config_page_views            131898 non-null float64
current_price                135828 non-null float64
gender                       135828 non-null object
gfg_created_at               135828 non-null object
group_id                     135828 non-null object
is_new                       135754 non-null float64
is_slow_sku                  135736 non-null float64
is_visible                   135828 non-null float64
is_weekend                   135828 non-null float64
is_workday                   135828 non-null float64
md_su

TW data is currently not OK with `color_pop` and `brand_pop`, need to re-compute the static features.

In [7]:
# PH data
ph_md_input = load_data(country='ph')

In [12]:
ph_md_input.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 739362 entries, 0 to 739361
Data columns (total 35 columns):
activation_date              737524 non-null object
black_price                  739362 non-null float64
brand_name                   739362 non-null object
brand_pop                    205948 non-null float64
color                        726421 non-null object
color_pop                    5527 non-null float64
config_page_views            715957 non-null float64
current_price                739362 non-null float64
gender                       739362 non-null object
gfg_created_at               739362 non-null object
group_id                     739362 non-null object
is_new                       737524 non-null float64
is_slow_sku                  737933 non-null float64
is_visible                   739362 non-null float64
is_weekend                   739362 non-null float64
is_workday                   739362 non-null float64
md_sub_category              739362 non-null obje

We can see a problem with attributes `color_pop` and `brand_pop`, they have much less values than total. So there is some bug here!

## Dive into `color_pop`

In [16]:
n_color = ph_md_input.color.nunique()
print('# unique colors in data: {}'.format(n_color))

# unique colors in data: 22244


In [17]:
bug_df = ph_md_input[ph_md_input.color.notnull() & ph_md_input.color_pop.isnull()]
bug_df.shape

(720894, 35)

In [18]:
cols = ['sku_config_id', 'group_id', 'color', 'color_pop']
bug_df[cols].head()

Unnamed: 0,sku_config_id,group_id,color,color_pop
0,345D8AC86F2665GS,backpacks_female_autumn_winter,Black,
1,FD2E8ZZE53A879GS,backpacks_female_autumn_winter,Black,
2,24D9FAC657BCE7GS,backpacks_female_autumn_winter,Black,
3,MA134AC97NCOPH,backpacks_female_autumn_winter,Black,
4,119C3ZZ4B1AF6BGS,backpacks_female_autumn_winter,Burgundy,


In [13]:
ph_dir = zal_dir + 'ph/'; fname = ph_dir + 'md_input.csv'
ph_md_input.to_csv(fname, index=False, encoding='utf8')

In [14]:
static_feats = ['color_pop', 'brand_pop']  
price_feats = ['current_price', 'percent_discount_from_rrp']
canib_feats = ['n_competitor', 'rel_price_as_ratio']  
ga_feats = ['mean_views', 'mean_impressions', 'total_impressions']
time_feats = ['is_weekend', 'is_workday']
feats = static_feats + price_feats + canib_feats + time_feats

In [15]:
ph_md_input.dropna(subset=static_feats).shape[0]

218