In [1]:
import os
import pandas as pd
import numpy as np

## Helpers

In [2]:
res_dir = '/Users/gfg/projects/daas-markdown/res/'

def get_best(sorted_df, n_model=5):
    n_row = sorted_df.shape[0]
    idx = np.arange(0, n_row, step=n_model)
    best_df = sorted_df.iloc[idx].copy()
    return best_df

def load_metrics(country, venture='Zalora'):
    folder = os.path.join(res_dir, venture, country, '')
    return pd.read_csv(folder + 'all_metrics.csv')

In [37]:
train_stats = pd.read_csv(res_dir + 'train_stats.csv')
train_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3681 entries, 0 to 3680
Data columns (total 13 columns):
country                        3681 non-null object
group_id                       3681 non-null object
n_config                       3681 non-null float64
n_record_per_config            1839 non-null float64
n_sample                       3681 non-null float64
test_percent                   3681 non-null float64
test_percent_after_drop_na     3681 non-null float64
train_date                     1839 non-null object
train_percent                  3681 non-null float64
train_percent_after_drop_na    3681 non-null float64
train_size_after_drop_na       3681 non-null float64
train_time                     3681 non-null float64
use_dummy_model                3681 non-null float64
dtypes: float64(10), object(3)
memory usage: 373.9+ KB


# Find best model for each group per country

In [16]:
def find_best_model(country, venture):
    err_df = load_metrics(country, venture)
    best_df = get_best(err_df.sort_values(['group', 'test_medae']))
    best_df['country'] = country
    return best_df

print('finding best models for data of Zalora countries')
zal_countries = ['id', 'my', 'ph', 'sg', 'tw', 'hk']
frames = [find_best_model(country, venture='Zalora') for country in zal_countries]
best_df = pd.concat(frames)

print('shape of DF of best models: {}'.format(best_df.shape))

# del unneed cols
del best_df['n_config']
del best_df['n_samples']
del best_df['avg_records_per_config']

fname = res_dir + 'best_models.csv'
best_df.to_csv(fname, index=False)

In [30]:
best_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2433 entries, 185 to 1390
Data columns (total 11 columns):
country         2433 non-null object
group           2433 non-null object
model           2433 non-null object
test_medae      2323 non-null float64
test_medape     2323 non-null float64
test_r2         2323 non-null float64
test_rmse       2323 non-null float64
train_medae     2433 non-null float64
train_medape    2433 non-null float64
train_r2        2323 non-null float64
train_rmse      2433 non-null float64
dtypes: float64(8), object(3)
memory usage: 228.1+ KB


In [20]:
best_df.query('country == "id"')['model'].value_counts()

Random Forest              176
Lasso Regression           155
Boosted Regression Tree     91
XGBoost                     86
Ridge Regression            49
Name: model, dtype: int64

In [21]:
best_df.query('country == "my"')['model'].value_counts()

Random Forest              300
XGBoost                    153
Lasso Regression           113
Boosted Regression Tree     98
Ridge Regression            61
Name: model, dtype: int64

In [22]:
best_df.query('country == "ph"')['model'].value_counts()

Random Forest              195
Lasso Regression           126
Boosted Regression Tree     56
XGBoost                     42
Ridge Regression            30
Name: model, dtype: int64

In [23]:
best_df.query('country == "au"')['model'].value_counts()

Series([], Name: model, dtype: int64)