# Invalid OSA for Nestle cereals

In [1]:
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import datetime as dtm

### Load data

In [3]:
def load_pos_data(file_path):
    """
    Reads a POS extract from CSV and returns it with normalised columns.

    The 'SALES_DT' column is parsed as dates, all column names are lower-cased,
    and 'pos_item_qty' and 'price' are clipped to the range [0, 100000].

    param file_path: location of the CSV file, handed to pd.read_csv
    return dataframe: the cleaned POS data
    """
    raw = pd.read_csv(file_path, parse_dates=['SALES_DT'])
    pos = raw.rename(columns=str.lower)

    # Negative values and extreme outliers are capped rather than dropped.
    for column in ('pos_item_qty', 'price'):
        pos.loc[:, column] = pos[column].clip(lower=0, upper=100000)

    return pos

In [4]:
def load_alert_data(file_path):
    """
    Reads an alert extract from CSV.

    The 'SALES_DT' column is parsed as dates and all column names are lower-cased.

    param file_path: location of the CSV file, handed to pd.read_csv
    return dataframe: the alert data with lower-case column names
    """
    alerts = pd.read_csv(file_path, parse_dates=['SALES_DT'])
    return alerts.rename(columns=str.lower)

### Cluster stores

In [5]:
def add_daily_group(df, col_name, percentile_high, percentile_low, minimum_pos):
    """
    Assigns each store/day to one of the 'top', 'middle' or 'slow' selling groups.

    Two thresholds are taken as quantiles of df[col_name]. Rows above the high
    threshold are 'top', rows above the low threshold and up to the high one are
    'middle', and rows at or below the low threshold are 'slow'. If the low
    threshold is smaller than minimum_pos, every row is labelled 'slow'.
    Rows whose col_name value is missing keep a missing group (except in the
    all-'slow' case).

    The 'group' column is written onto the dataframe that is passed in, and the
    same dataframe is returned.

    param dataframe df: input dataframe for one item
    param string col_name: column to rank on, e.g. 'pos_item_qty' or 'expected_pos'
    param float percentile_high: quantile (0-1) separating 'top' from 'middle'
    param float percentile_low: quantile (0-1) separating 'middle' from 'slow'
    param int minimum_pos: minimum pos to define a slow selling item
    return dataframe: df with the added 'group' column
    """
    values = df[col_name]
    top_threshold_qty = values.quantile(percentile_high)
    middle_threshold_qty = values.quantile(percentile_low)

    if middle_threshold_qty < minimum_pos:
        df['group'] = 'slow'
        return df

    # Build the labels in a standalone object-dtype Series and assign it once.
    # Chained assignment (df.loc[:, 'group'][mask] = ...) writes to a temporary
    # under pandas Copy-on-Write and would leave the column untouched.
    labels = pd.Series(np.nan, index=df.index, dtype=object)
    labels[values <= middle_threshold_qty] = 'slow'
    labels[(values > middle_threshold_qty) & (values <= top_threshold_qty)] = 'middle'
    labels[values > top_threshold_qty] = 'top'
    df['group'] = labels

    return df


def select_percentile_comb(df, percentile):
    """
    Keeps the stores that appear often enough in a group.

    For every item/store/group the number of days is counted. Per item/group the
    given percentile of those day counts is the membership threshold; only
    item/store/group rows whose day count reaches that threshold are returned.

    param dataframe df: input dataframe with the columns 'retailer_item_id',
                        'organization_unit_num', 'group' and 'sales_dt'
    param float percentile: percentile used to calculate the minimum number of days
    return dataframe: valid stores for each retailer_item_id and group, with their
                      day count and the membership threshold
    """
    days_col = '# days in the group'
    store_keys = ['retailer_item_id', 'organization_unit_num', 'group']
    item_keys = ['retailer_item_id', 'group']

    # Number of days each store spent in each group, per item.
    day_counts = df.groupby(store_keys, as_index=False)['sales_dt'].count()
    day_counts = day_counts.rename(columns={'sales_dt': days_col})

    # Percentile of those day counts across the stores of one item/group.
    thresholds = day_counts.groupby(item_keys, as_index=False)[days_col].quantile(percentile)
    thresholds = thresholds.rename(columns={days_col: 'membership_threshold'})

    day_counts = day_counts.merge(thresholds, on=item_keys)
    reaches_threshold = day_counts[days_col] >= day_counts['membership_threshold']

    return day_counts[reaches_threshold]

In [6]:
def grouped_count_stores(df, group_columns, new_name):
    """
    Counts the 'organization_unit_num' entries per group.

    param dataframe df: input dataframe with a column 'organization_unit_num'
    param list group_columns: columns to group by
    param string new_name: name of the resulting count column
    return dataframe: one row per group with the group columns and the count
    """
    store_counts = df.groupby(group_columns, as_index=False)['organization_unit_num'].count()
    return store_counts.rename(columns={'organization_unit_num': new_name})


def invalid_osa(df, valid_osa_ratio):
    """ For a given date returns a table of all invalid OSA alerts

    Per item, the number of stores with a positive 'lost_sales_amt' is divided by
    the total number of stores. An item is returned when that ratio is at least
    valid_osa_ratio and the total number of stores is at least the value in the
    '0.1_daily_num_stores' column.

    param dataframe df: Input dataframe of one group and date
    param valid_osa_ratio: threshold ratio of number of LSV/total to reject osa alerts
    return dataframe: one row per returned item with both store counts and the ratio
    """
    item_keys = ['retailer_item_id', 'retailer_item_desc', '0.1_daily_num_stores']

    total_counts = grouped_count_stores(df, item_keys, 'num_total_stores')
    has_lost_sales = df['lost_sales_amt'] > 0
    lsv_counts = grouped_count_stores(df[has_lost_sales], item_keys, 'num_lsv_stores')

    # Items without any lost-sales store drop out of this inner merge.
    summary = lsv_counts.merge(total_counts)
    summary.loc[:, 'ratio'] = summary['num_lsv_stores'] / summary['num_total_stores']

    ratio_reached = summary['ratio'] >= valid_osa_ratio
    enough_stores = summary['num_total_stores'] >= summary['0.1_daily_num_stores']

    return summary[ratio_reached & enough_stores]

### Analysis

In [7]:
def store_clustering(df, col_name, percentile_high, percentile_low, minimum_pos, membership_percentile, minimum_day_ratio, start_date, end_date):
    """
    Assigns stores to selling groups per item, based on a date window of POS data.

    Steps:
      1. restrict df to start_date <= sales_dt < end_date,
      2. label every store/day of each item with add_daily_group,
      3. drop the 'slow' store/days,
      4. keep the item/store/group combinations selected by select_percentile_comb,
      5. drop item/groups whose membership threshold is below
         minimum_day_ratio * (number of days in the window).

    param dataframe df: POS data with the columns 'sales_dt', 'organization_unit_num',
                        'retailer_item_id', 'pos_item_qty' and 'price'
    param string col_name: column used to rank store/days (passed to add_daily_group)
    param float percentile_high: quantile separating 'top' from 'middle'
    param float percentile_low: quantile separating 'middle' from 'slow'
    param int minimum_pos: minimum pos to define a slow selling item
    param float membership_percentile: percentile passed to select_percentile_comb
    param float minimum_day_ratio: minimum accepted membership threshold, as a fraction
                                   of the window length in days
    param start_date: first day of the window (inclusive), e.g. '2019-10-01'
    param end_date: end of the window (exclusive), e.g. '2020-01-01'
    return dataframe: unique ('organization_unit_num', 'retailer_item_id', 'group') rows
    """
    selected_cols = ['sales_dt', 'organization_unit_num', 'retailer_item_id', 'pos_item_qty', 'price']
    output_cols = ['organization_unit_num', 'retailer_item_id', 'group']

    in_window = (df['sales_dt'] >= start_date) & (df['sales_dt'] < end_date)
    df_inp = df.loc[in_window, selected_cols]

    # Label each item separately. Iterating the groups (instead of groupby.apply)
    # keeps 'retailer_item_id' as a plain column on every pandas version and hands
    # add_daily_group its own copy to write the 'group' column on.
    labelled = [
        add_daily_group(item_df.copy(),
                        col_name=col_name,
                        percentile_high=percentile_high,
                        percentile_low=percentile_low,
                        minimum_pos=minimum_pos)
        for _, item_df in df_inp.groupby('retailer_item_id')
    ]
    if not labelled:
        # Nothing falls inside the window: no store can be assigned to a group.
        return pd.DataFrame(columns=output_cols)
    df_grouped = pd.concat(labelled)

    # drop all days in group 'slow'
    df_grouped = df_grouped[df_grouped['group'] != 'slow']

    membership_threshold_df = select_percentile_comb(df_grouped, membership_percentile)

    # Decide to accept threshold: it must cover a minimum share of the window.
    minimum_days = (pd.Timestamp(end_date) - pd.Timestamp(start_date)).days
    accepted = membership_threshold_df['membership_threshold'] >= minimum_day_ratio * minimum_days
    membership_threshold_df = membership_threshold_df[accepted]

    df_out = df_inp.merge(membership_threshold_df)

    # One row per store/item/group. Selecting the columns with a list and
    # de-duplicating directly replaces the tuple-style groupby selection, which
    # pandas >= 2.0 rejects.
    assigned_grouped = (df_out[output_cols]
                        .drop_duplicates()
                        .sort_values(output_cols)
                        .reset_index(drop=True))
    return assigned_grouped


In [8]:
# Retailer/client pair used to build the name of the POS extract loaded below.
# NOTE(review): the notebook title mentions Nestle cereals, but this cell loads
# the 'kraftheinz' extract -- confirm which client is intended.
retailer = 'tesco'
client = 'kraftheinz'
# load_pos_data parses the dates, lower-cases the column names and clips
# 'pos_item_qty' and 'price'.
df = load_pos_data(f'./data/{client}_{retailer}_Oct2019_March2020_pos.csv')

In [9]:
# Parameters
col_name = 'pos_item_qty'      # column the store/days are ranked on
percentile_high = 0.8          # quantile of col_name above which a store/day is 'top'
percentile_low = 0.6           # quantile of col_name at or below which a store/day is 'slow'
minimum_pos = 2                # if the low quantile is below this, the whole item is 'slow'
percentile = 0.8               # NOTE(review): not referenced in the cells shown here -- confirm it is still needed
membership_percentile = 0.8    # percentile of per-store day counts used as membership threshold
valid_osa_ratio = 0.6          # minimum share of stores with lost sales for invalid_osa to return an item
minimum_day_ratio = 0.05       # membership threshold must reach this fraction of the window length
MIN_TOTAL_STORE_PERCENT = 0.1  # fraction of the daily store count stored in '0.1_daily_num_stores'

# Add total number of stores for item day and calculate 10% of that number
df_total_stores = grouped_count_stores(df, ['retailer_item_id', 'sales_dt'], 'daily_num_stores')
# No merge keys are given, so the merge uses the columns both frames share.
df = df.merge(df_total_stores)
df.loc[:, '0.1_daily_num_stores'] = df['daily_num_stores'] * MIN_TOTAL_STORE_PERCENT


In [10]:
# Do store grouping
# The window is start_date <= sales_dt < end_date (end_date itself is excluded).
start_date = '2019-10-01'
end_date = '2020-01-01'
# Result: one row per store/item/group combination that passed the membership checks.
df_grouped = store_clustering(df, col_name, 
                              percentile_high, 
                              percentile_low, 
                              minimum_pos, 
                              membership_percentile, 
                              minimum_day_ratio, 
                              start_date, end_date)

In [11]:
# Attach the store group to the POS rows. No merge keys are given, so the merge
# uses the columns both frames share; rows without an assigned group drop out.
df_with_groups = df.merge(df_grouped)

# Invalid OSA: run invalid_osa once per sales date.
# NOTE(review): invalid_osa's docstring expects the data of one group and date,
# but the rows are grouped by date only here -- confirm whether 'group' should
# also be a grouping key.
df_invalid_osa = df_with_groups.groupby(['sales_dt']).apply(invalid_osa, 
                                                            valid_osa_ratio)