In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

%matplotlib inline

## Helpers

In [48]:
# get stats on no. of rows per configs
def n_row_per_config(group_id, md_input):
    sub_df = md_input[md_input['group_id'] == group_id]
    res = sub_df.groupby('sku_config_id').size().reset_index()
    res.rename(columns={0: 'n_row'}, inplace=True)
    stats = res['n_row'].describe()
    return pd.DataFrame({'n_config': stats['count'], 'avg_rows_per_config': stats['mean'], 
                         'med_rows_per_config': stats['50%'], 
                         'min_rows_per_config': stats['min'], 'max_rows_per_config': stats['max']}, 
                        index=[group_id])

In [62]:
def load_md_input(dat_dir):
    md_input = pd.read_csv(dat_dir + 'md_input.csv')
    country = dat_dir.split('/')[-2]
    print('Shape of md_input of country {0}: {1}'.format(country, md_input.shape))
    return md_input

In [66]:
def cal_statistics(md_input):
    print('computing statistics for groups...')
    groups = md_input['group_id'].unique()
    
    frames = [n_row_per_config(gr_id, md_input) for gr_id in groups]
    stats = pd.concat(frames).reset_index()
    
    stats.sort_values('med_rows_per_config', ascending=False).head()
    stats.rename(columns={'index': 'group_id'}, inplace=True)
    return stats

In [70]:
def query_stats(country='sg'):
    dat_dir = '/Users/gfg/data/markdown/clean/venture=Zalora/{}/'.format(country)
    md_input = load_md_input(dat_dir)
    stats = cal_statistics(md_input)
    fname = dat_dir + 'stats.csv'
    stats.to_csv(fname, index=False)
    print('Dumped statistics to file {}'.format(fname))
    return stats

# Get statistics per country

In [69]:
tw_stats = query_stats(country = 'tw')

Shape of md_input of country tw: (135828, 36)
computing statistics for groups...
Dumped statistics to file /Users/gfg/data/markdown/clean/venture=Zalora/tw/stats.csv


In [68]:
sg_stats = query_stats(country = 'sg')

Shape of md_input of country sg: (774916, 36)
computing statistics for groups...
Dumped statistics to file /Users/gfg/data/markdown/clean/venture=Zalora/sg/stats.csv


In [None]:
my_stats = query_stats(country = 'my')

In [73]:
ph_stats = query_stats(country = 'ph')

Shape of md_input of country ph: (739362, 36)
computing statistics for groups...
Dumped statistics to file /Users/gfg/data/markdown/clean/venture=Zalora/ph/stats.csv


Unnamed: 0,group_id,avg_rows_per_config,max_rows_per_config,med_rows_per_config,min_rows_per_config,n_config
0,backpacks_female_autumn_winter,4.740458,30.0,3.0,1.0,262.0
1,backpacks_female_spring_summer,3.629060,25.0,2.0,1.0,585.0
2,backpacks_female_year_round,3.125000,40.0,2.0,1.0,80.0
3,backpacks_male_autumn_winter,4.553459,21.0,4.0,1.0,318.0
4,backpacks_male_spring_summer,2.760417,12.0,2.0,1.0,192.0
5,backpacks_unisex_autumn_winter,4.113043,17.0,3.0,1.0,115.0
6,backpacks_unisex_spring_summer,3.061611,20.0,2.0,1.0,211.0
7,backpacks_unisex_year_round,4.415094,30.0,3.0,1.0,106.0
8,ballerina_flats_female_autumn_winter,8.400835,83.0,4.0,1.0,1437.0
9,ballerina_flats_female_spring_summer,5.201169,49.0,3.0,1.0,1198.0


In [74]:
hk_stats = query_stats(country = 'hk')

Shape of md_input of country hk: (287993, 36)
computing statistics for groups...
Dumped statistics to file /Users/gfg/data/markdown/clean/venture=Zalora/hk/stats.csv


Unnamed: 0,group_id,avg_rows_per_config,max_rows_per_config,med_rows_per_config,min_rows_per_config,n_config
0,backpacks_female_autumn_winter,5.321429,51.0,2.0,1.0,196.0
1,backpacks_female_spring_summer,3.187166,26.0,2.0,1.0,187.0
2,backpacks_female_year_round,4.269231,14.0,3.0,1.0,26.0
3,backpacks_male_autumn_winter,4.603774,30.0,3.0,1.0,159.0
4,backpacks_unisex_autumn_winter,4.936975,28.0,3.0,1.0,238.0
5,backpacks_unisex_year_round,4.526316,24.0,3.0,1.0,76.0
6,ballerina_flats_female_autumn_winter,11.225490,77.0,5.0,1.0,816.0
7,ballerina_flats_female_spring_summer,4.236133,55.0,2.0,1.0,631.0
8,ballerina_flats_female_year_round,15.017857,68.0,7.0,1.0,112.0
9,beauty_acc_tools_female_year_round,3.314516,23.0,2.0,1.0,124.0


# Sample groups with most data

More data = a larger number of transactional records per config on average.
Metrics for amount of data: __median number of transactional records per config__.

In [84]:
def get_3_samples(country='sg'):
    print('Picking 3 samples for country {}'.format(country))
    dat_dir = '/Users/gfg/data/markdown/clean/venture=Zalora/{}/'.format(country)
    stats = pd.read_csv(dat_dir + 'stats.csv')
    
    sample1 = stats['group_id'].iloc[:50]
    pd.DataFrame(sample1).to_csv(dat_dir + 'sample1.csv', index=False)

    sample2 = stats['group_id'].iloc[50:100]
    pd.DataFrame(sample2).to_csv(dat_dir + 'sample2.csv', index=False)

    sample3 = stats['group_id'].iloc[100:150]
    pd.DataFrame(sample3).to_csv(dat_dir + 'sample3.csv', index=False)
    print('Dumped groups in each sample')

In [78]:
get_3_samples(country='sg')

Picking 3 samples for country sg
Dumped groups in each sample


In [79]:
get_3_samples(country='my')

Picking 3 samples for country my
Dumped groups in each sample


In [80]:
get_3_samples(country='id')

Picking 3 samples for country id
Dumped groups in each sample


In [81]:
get_3_samples(country='ph')

Picking 3 samples for country ph
Dumped groups in each sample


In [82]:
get_3_samples(country='hk')

Picking 3 samples for country hk
Dumped groups in each sample


In [83]:
get_3_samples(country='tw')

Picking 3 samples for country tw
Dumped groups in each sample
