In [1]:
import os
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import time

In [2]:
# Root directories used throughout the notebook. Both end with '/' because
# later cells build file paths by plain string concatenation.
res_dir = '/Users/gfg/projects/daas-markdown/res/'
# zal_dir has to be defined here: get_stats(), the basic_stats.csv export and
# the ph_dir cell all read it, and nothing else in the notebook assigns it.
zal_dir = '/Users/gfg/data/markdown/clean/venture=Zalora/'

In [3]:
# Path of the training-statistics CSV inside res_dir (plain concatenation,
# so res_dir must end with a path separator).
stat_file = res_dir + 'train_stats.csv'

In [4]:
# Load the training statistics table from stat_file.
stats = pd.read_csv(stat_file)

In [5]:
# List the column names of the loaded table.
stats.columns

Index([u'group_id', u'test_percent_after_drop_na', u'test_percent',
       u'train_percent_after_drop_na', u'train_size_after_drop_na',
       u'train_percent', u'n_config', u'n_sample', u'use_dummy_model',
       u'train_time', u'country', u'train_time_in_hour', u'date'],
      dtype='object')

In [6]:
# Rename the 'date' column to 'train_date'. rename() ignores labels that are
# absent by default, so this does nothing if the column is already renamed.
stats = stats.rename(columns={'date': 'train_date'})

In [7]:
# Derive the training time in hours.
# NOTE(review): dividing by 3600 presumes train_time is in seconds — confirm.
stats['train_time_in_hour'] = stats['train_time']/3600

In [8]:
# Write the updated table back over the file it was read from (stat_file),
# omitting the index column.
stats.to_csv(stat_file, index=False)

## Helpers

In [13]:
# List the columns of md_input.
# NOTE(review): in the visible cells md_input is only assigned further down, in
# the Drafts section (pd.read_csv(ph_dir + 'md_input.csv')), so this cell raises
# NameError when the notebook is run top to bottom on a fresh kernel.
md_input.columns

Index(['sku_config_id', 'group_id', 'md_sub_category_raw', 'md_sub_category',
       'gender', 'activation_date', 'product_lifecycle', 'season',
       'season_year', 'season_duration', 'is_new', 'n_remain_days',
       'brand_name', 'color', 'tax_class', 'current_price', 'black_price',
       'is_visible', 'config_page_views', 'n_stock', 'n_sold',
       'percent_discount_from_rrp', 'n_competitor', 'rel_price_as_ratio',
       'psv', 'tsv', 'is_slow_sku', 'weekday', 'is_weekend', 'is_workday',
       'gfg_created_at', 'snapshot_date', 'year', 'color_pop', 'brand_pop',
       'total_page_view'],
      dtype='object')

In [30]:
def get_stats(country, data_root=None):
    """Summarise group sizes and config counts for one country's md_input.csv.

    Reads ``<data_root>/<country>/md_input.csv``, groups the rows by
    ``group_id`` and returns a one-row DataFrame indexed by ``country``.

    Parameters
    ----------
    country : str
        Name of the country sub-directory (e.g. 'sg'); also used as the index
        label of the returned row.
    data_root : str, optional
        Directory holding one sub-directory per country. When omitted, the
        notebook-level ``zal_dir`` is used.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``n_group``, ``min_sample_size``,
        ``median_sample_size``, ``max_sample_size`` (rows per group) and
        ``min_n_config``, ``median_n_config``, ``max_n_config`` (distinct
        ``sku_config_id`` values per group). All values come from
        ``Series.describe()``.
    """
    if data_root is None:
        # Fall back to the notebook-wide data directory.
        data_root = zal_dir

    print('country {}'.format(country.upper()))
    input_path = os.path.join(data_root, country, 'md_input.csv')

    print('\t loading md_input...')
    # Only these two columns are used below; skipping the others keeps the
    # load cheap on wide inputs.
    md_input = pd.read_csv(input_path, usecols=['group_id', 'sku_config_id'])
    by_group = md_input.groupby('group_id')

    print('\t compute sample sizes of groups')
    size_summary = by_group.size().describe()
    sample_size_df = pd.DataFrame({'n_group': size_summary['count'],
                                   'min_sample_size': size_summary['min'],
                                   'median_sample_size': size_summary['50%'],
                                   'max_sample_size': size_summary['max']},
                                  index=[country])

    print('\t compute # configs of groups')
    config_summary = by_group['sku_config_id'].nunique().describe()
    n_config_df = pd.DataFrame({'min_n_config': config_summary['min'],
                                'median_n_config': config_summary['50%'],
                                'max_n_config': config_summary['max']},
                               index=[country])

    return pd.merge(sample_size_df, n_config_df, left_index=True, right_index=True)

In [38]:
def count_n_model(country):
    """Count the pickled models stored for ``country``.

    Looks in ``<res_dir>/<country>/models/`` (``res_dir`` is the notebook-level
    results directory) and returns the number of entries whose name ends in
    ``.pkl``.
    """
    model_dir = os.path.join(res_dir, country, 'models')
    # Match on the suffix, not a substring: names such as 'x.pkl.bak' are not models.
    return sum(1 for f in os.listdir(model_dir) if f.endswith('.pkl'))

In [45]:
def groups_with_models(country):
    """Return the group ids that have a pickled model for ``country``.

    Scans ``<res_dir>/<country>/models/`` (``res_dir`` is the notebook-level
    results directory) for entries ending in ``.pkl`` and strips that suffix
    to obtain the group id.

    Side effect: writes the ids, one per row under a ``group_id`` header, to
    ``groups_with_model.csv`` in the same models directory.

    Returns
    -------
    list of str
        Group ids in sorted order.
    """
    suffix = '.pkl'
    model_dir = os.path.join(res_dir, country, 'models')
    # Match and strip the suffix only: a substring test would pick up names
    # like 'x.pkl.bak', and str.replace would also remove '.pkl' from the
    # middle of a name. Sorting makes the result independent of the arbitrary
    # os.listdir order.
    groups = sorted(f[:-len(suffix)] for f in os.listdir(model_dir) if f.endswith(suffix))

    out_path = os.path.join(model_dir, 'groups_with_model.csv')
    pd.DataFrame({'group_id': groups}).to_csv(out_path, index=False)
    return groups

In [46]:
# Collect the group ids that have a model for 'tw'. The call also writes
# groups_with_model.csv into that country's models directory.
# NOTE(review): the helper reads res_dir at call time. Run top to bottom, this
# cell sees the value from the config cell, while the recorded execution counts
# show it was originally run after the later cell that points res_dir at
# '.../res/Zalora/'.
model_groups = groups_with_models(country='tw')
len(model_groups)

375

In [31]:
# Build the one-row summary for every country and stack them into one table
# (one row per country, indexed by country code).
countries = ['sg', 'id', 'my', 'ph', 'hk', 'tw']
frames = list(map(get_stats, countries))
new_stats = pd.concat(frames)

country SG
	 loading md_input...
	 compute sample sizes of groups
	 compute # configs of groups
country ID
	 loading md_input...
	 compute sample sizes of groups
	 compute # configs of groups
country MY
	 loading md_input...
	 compute sample sizes of groups
	 compute # configs of groups
country PH
	 loading md_input...
	 compute sample sizes of groups
	 compute # configs of groups
country HK
	 loading md_input...
	 compute sample sizes of groups
	 compute # configs of groups
country TW
	 loading md_input...
	 compute sample sizes of groups
	 compute # configs of groups


In [34]:
# Turn the country index labels into a regular 'country' column.
# Guarded so that running the cell again is a no-op: an unguarded second run
# would reset the fresh integer index and add a second, numeric 'country' column.
if 'country' not in new_stats.columns:
    new_stats = new_stats.reset_index().rename(columns={'index': 'country'})

In [36]:
# Save the per-country summary as basic_stats.csv directly under zal_dir,
# omitting the index column.
new_stats.to_csv(zal_dir + 'basic_stats.csv', index=False)

In [37]:
# Point res_dir at the Zalora results sub-directory.
# NOTE(review): this rebinds the res_dir set in the config cell at the top, so
# count_n_model() and groups_with_models(), which read res_dir when called,
# resolve to different directories depending on whether they run before or
# after this cell.
res_dir = '/Users/gfg/projects/daas-markdown/res/Zalora/'

In [40]:
# Number of trained models per country, for the countries listed here.
countries = ['sg', 'id']
n_model = list(map(count_n_model, countries))
train_stats = pd.DataFrame(
    {
        'country': countries,
        'n_trained_group': n_model,
    }
)

In [41]:
# Save the trained-model counts as train_basic_stats.csv under res_dir,
# omitting the index column.
train_stats.to_csv(res_dir + 'train_basic_stats.csv', index=False)

## Drafts

In [None]:
# prev stats
# NOTE(review): dat_dir is not assigned at notebook scope in the visible cells
# (it only exists as a local inside get_stats), so this draft cell needs it
# defined before it can run.
stats = pd.read_csv(dat_dir + 'stats.csv')
n_group = stats['group_id'].nunique()
# Series.max()/min() skip NaN; the builtin max()/min() compare element by
# element and return order-dependent results when NaN is present.
max_n_config, min_n_config = stats['n_config'].max(), stats['n_config'].min()

In [None]:
# Directory of the 'ph' data under zal_dir (kept with a trailing '/' because
# file names are appended to it by concatenation), and its stats.csv.
# NOTE(review): this rebinds `stats`, which the top of the notebook uses for
# the train_stats.csv table.
ph_dir = zal_dir + 'ph/'
stats = pd.read_csv(ph_dir + 'stats.csv')

In [11]:
# Load the dated sample-size export for 'ph'. The frame is not referenced
# again in the visible cells.
sample_sizes = pd.read_csv(ph_dir + 'sample_sizes_02_05_18.csv')

In [9]:
# Load the 'ph' md_input table; the draft cells below group it by group_id.
md_input = pd.read_csv(ph_dir + 'md_input.csv')

In [24]:
# Draft of the sample-size half of get_stats(), hard-wired to 'ph':
# rows per group_id, then count / min / median / max of those sizes.
res = (
    md_input
    .groupby('group_id')
    .size()
    .reset_index()
    .rename(columns={0: 'sample_size'})
)
new_stats = res['sample_size'].describe()
sample_size_df = pd.DataFrame(
    {
        'n_group': new_stats['count'],
        'min_sample_size': new_stats['min'],
        'median_sample_size': new_stats['50%'],
        'max_sample_size': new_stats['max'],
    },
    index=['ph'],
)

In [22]:
# Draft of the config-count half of get_stats(), hard-wired to 'ph':
# distinct sku_config_id values per group_id, then min / median / max.
res2 = (
    md_input
    .groupby('group_id')['sku_config_id']
    .nunique()
    .reset_index()
    .rename(columns={'sku_config_id': 'n_config'})
)
stats2 = res2['n_config'].describe()
n_config_df = pd.DataFrame(
    {
        'min_n_config': stats2['min'],
        'median_n_config': stats2['50%'],
        'max_n_config': stats2['max'],
    },
    index=['ph'],
)

In [25]:
# Put the two one-row summaries side by side, matched on their shared index.
sample_size_df.merge(n_config_df, left_index=True, right_index=True)

Unnamed: 0,max_sample_size,median_sample_size,min_sample_size,n_group,max_n_config,median_n_config,min_n_config
ph,87678.0,133.5,1.0,674.0,7489.0,23.0,1.0


In [23]:
# Display the one-row config-count summary for 'ph'.
n_config_df

Unnamed: 0,max_n_config,median_n_config,min_n_config
ph,7489.0,23.0,1.0


In [10]:
# Show the snapshot_date of the last five rows of md_input.
md_input.snapshot_date.tail()

838152    2018-05-02
838153    2018-05-02
838154    2018-05-02
838155    2018-05-02
838156    2018-05-02
Name: snapshot_date, dtype: object