# New alerts logic, clustering Nestle_cereals

In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import datetime as dtm

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import style

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides pandas warnings (e.g. chained-assignment
# or deprecation warnings) — consider narrowing it.
warnings.filterwarnings('ignore')
# Use matplotlib's 'fivethirtyeight' style sheet and a wide default figure size.
style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = (20, 8)

### Load data

In [3]:
def load_files(file_path):
    """
    Reads a sales CSV file and normalises it for the alert analysis.

    Missing values are replaced by 0, all column names are lower-cased, the
    quantity and price columns are clipped to the range [0, 100000] and
    'drfe_pos_item_qty' is renamed to 'expected_pos'.

    param file_path: path (or file-like object) handed to pandas.read_csv; the file
                     needs a 'SALES_DT' column plus (case-insensitively) the columns
                     'drfe_pos_item_qty', 'pos_item_qty' and 'price'
    return dataframe: the cleaned dataframe
    """
    lowest, highest = 0, 100000

    raw = pd.read_csv(file_path, parse_dates=['SALES_DT'])
    sales = raw.fillna(0).rename(columns=str.lower)

    # Negative and implausibly large quantities/prices are capped.
    for column in ('drfe_pos_item_qty', 'pos_item_qty', 'price'):
        sales[column] = sales[column].clip(lower=lowest, upper=highest)

    return sales.rename(columns={'drfe_pos_item_qty': 'expected_pos'})

### Cluster stores

In [4]:
def add_daily_group(df, col_name='expected_pos', percentile_list=(0.8, 0.6), minimum_pos=3):
    """
    Assigns each store/day to one of the high- ('top'), middle- or slow-selling groups.

    The two thresholds are quantiles of df[col_name]. If the lower threshold is
    below minimum_pos, every row is labelled 'slow'. Otherwise:
        value <= lower threshold                    -> 'slow'
        lower threshold < value <= upper threshold  -> 'middle'
        value > upper threshold                     -> 'top'
    Rows whose value is missing (NaN) keep a missing group.

    The labels are written to a new column 'group' on the passed dataframe
    (the input is modified in place) and the same dataframe is returned.

    param df: input dataframe for one item
    param string col_name: choose between 'pos_item_qty' or 'expected_pos'
    param list percentile_list: [high percentile, low percentile]
    param int minimum_pos: minimum pos to define a slow selling item
    return dataframe: df with the additional column 'group'
    """
    percentile_high, percentile_low = percentile_list
    top_threshold_qty = df[col_name].quantile(percentile_high)
    middle_threshold_qty = df[col_name].quantile(percentile_low)

    if middle_threshold_qty < minimum_pos:
        df['group'] = 'slow'
        return df

    # Build the labels in a plain object array and assign the column once.
    # Chained indexing (df.loc[:, 'group'][mask] = ...) is unreliable and is
    # silently ignored when pandas Copy-on-Write is active.
    quantities = df[col_name].to_numpy()
    labels = np.full(len(df), np.nan, dtype=object)
    labels[quantities <= middle_threshold_qty] = 'slow'
    labels[(quantities > middle_threshold_qty) & (quantities <= top_threshold_qty)] = 'middle'
    labels[quantities > top_threshold_qty] = 'top'
    df['group'] = labels

    return df

def select_percentile_comb(df, percentile=0.8):
    """
    Selects, per item and group, the stores that appear in the group often enough.

    For every item/store/group the number of rows (days) is counted. Per item/group
    the requested percentile of these counts is the membership threshold; only
    stores whose day count reaches the threshold are kept.

    param dataframe df: input dataframe with the columns 'retailer_item_id',
                        'organization_unit_num', 'group' and 'sales_dt'
    param float percentile: percentile used to derive the minimum number of days
    return dataframe: valid stores for each retailer_item_id and group, with the
                      columns '# days in the group' and 'membership_threshold'
    """
    days_col = '# days in the group'
    threshold_col = 'membership_threshold'
    store_keys = ['retailer_item_id', 'organization_unit_num', 'group']
    item_keys = ['retailer_item_id', 'group']

    # How many days did each item/store spend in each group?
    days_per_store = df.groupby(store_keys, as_index=False)['sales_dt'].count()
    days_per_store = days_per_store.rename(columns={'sales_dt': days_col})

    # Percentile of those day counts across the stores of one item/group.
    thresholds = days_per_store.groupby(item_keys, as_index=False)[days_col].quantile(percentile)
    thresholds = thresholds.rename(columns={days_col: threshold_col})

    candidates = days_per_store.merge(thresholds, on=item_keys)
    reaches_threshold = candidates[days_col] >= candidates[threshold_col]

    return candidates[reaches_threshold]

In [5]:
def build_alert_table(df, col_name, membership_multiplier):
    """
    Builds the low-sales alert table.

    For every item/group/day the quartiles Q1, Q2 and Q3 of df[col_name] are
    calculated. A row is an alert when its value is at or below
        'outlier threshold' = Q1 - membership_multiplier * (Q3 - Q1).
    For the alert rows the lost sales value is estimated as
        'LSV Low Sales' = (Q2 - value) * price.

    The input dataframe is not modified.

    param dataframe df: input dataframe with the columns 'retailer_item_id', 'group',
                        'sales_dt', 'price' and the column named by col_name
    param string col_name: column holding the quantity to test
    param float membership_multiplier: multiplier of the interquartile range (Q3 - Q1)
    return dataframe: alert rows with the additional columns 'Q3', 'Q1', 'Q2',
                      'outlier threshold' and 'LSV Low Sales'
    """
    keys = ['retailer_item_id', 'group', 'sales_dt']

    # One groupby object is reused for the three quartiles; the column order
    # Q3, Q1, Q2 of the result is kept as before.
    grouped = df.groupby(keys, as_index=False)[col_name]
    quartiles = None
    for label, q in (('Q3', 0.75), ('Q1', 0.25), ('Q2', 0.5)):
        part = grouped.quantile(q).rename(columns={col_name: label})
        quartiles = part if quartiles is None else quartiles.merge(part, on=keys)

    iqr = quartiles['Q3'] - quartiles['Q1']
    quartiles['outlier threshold'] = quartiles['Q1'] - membership_multiplier * iqr

    merged = df.merge(quartiles, on=keys)

    # Explicit copy: the new column is added to an independent frame, not to a
    # slice of `merged` (avoids the SettingWithCopy pattern).
    df_low_sales_alert = merged[merged[col_name] <= merged['outlier threshold']].copy()
    df_low_sales_alert['LSV Low Sales'] = (
        (df_low_sales_alert['Q2'] - df_low_sales_alert[col_name]) * df_low_sales_alert['price']
    )
    return df_low_sales_alert

### Analysis

In [6]:
def analysis(df, alert_day, start_date='2019-09-01', end_date='2020-01-01',
             minimum_day_ratio=0.4, membership_multiplier=1, membership_percentile=0.8):
    """
    Clusters the stores per item on a historical window and builds the
    low-sales alerts for one day.

    Steps:
      1. Rows with start_date <= sales_dt < end_date are labelled per item with
         add_daily_group; rows labelled 'slow' are dropped.
      2. select_percentile_comb keeps the stores which appear often enough in a
         group; a group is only accepted if its membership threshold is at least
         minimum_day_ratio * (number of days between start_date and end_date).
      3. The rows of alert_day are joined with the assigned groups and passed to
         build_alert_table (on the column 'expected_pos').
         NOTE(review): the alerts use 'expected_pos', not 'pos_item_qty' — confirm
         this is intended.

    The three shape/date prints of the original implementation are kept.

    param dataframe df: input dataframe with the columns 'sales_dt', 'organization_unit_num',
                        'retailer_item_id', 'pos_item_qty', 'expected_pos' and 'price'
    param alert_day: day (compared against 'sales_dt') for which alerts are built
    param start_date: first day (inclusive) of the clustering window
    param end_date: last day (exclusive) of the clustering window
    param float minimum_day_ratio: minimum share of the window a group threshold must reach
    param float membership_multiplier: interquartile-range multiplier for build_alert_table
    param float membership_percentile: percentile passed to select_percentile_comb
    return tuple: (dataframe with the distinct organization_unit_num / retailer_item_id / group
                   assignments, sorted and with a fresh integer index; low-sales alert dataframe)
    raises ValueError: if no row of df lies inside the clustering window
    """
    selected_cols = ['sales_dt', 'organization_unit_num', 'retailer_item_id', 'pos_item_qty', 'expected_pos', 'price']
    group_cols = ['organization_unit_num', 'retailer_item_id', 'group']
    store_item_cols = ['organization_unit_num', 'retailer_item_id']

    print(df['sales_dt'].max())

    in_window = (df['sales_dt'] >= start_date) & (df['sales_dt'] < end_date)
    df_inp = df.loc[in_window, selected_cols]

    # Label every item separately. Iterating over the groups (instead of
    # groupby.apply) keeps 'retailer_item_id' as a regular column and the index
    # flat, independent of the pandas version's group_keys handling.
    labelled = [add_daily_group(item_df.copy()) for _, item_df in df_inp.groupby('retailer_item_id')]
    if not labelled:
        raise ValueError('no rows between start_date and end_date')
    df_grouped = pd.concat(labelled)

    print('Input dataframe shape:', df_inp.shape)
    # drop all days in group 'slow'
    df_grouped = df_grouped[df_grouped['group'] != 'slow']

    membership_threshold_df = select_percentile_comb(df_grouped, membership_percentile)

    # Decide to accept threshold: it has to cover a minimum share of the window.
    window_days = (pd.Timestamp(end_date) - pd.Timestamp(start_date)).days
    accepted = membership_threshold_df['membership_threshold'] >= minimum_day_ratio * window_days
    membership_threshold_df = membership_threshold_df[accepted]

    df_out = df_inp.merge(membership_threshold_df, on=['retailer_item_id', 'organization_unit_num'])
    print('Output dataframe shape:', df_out.shape)

    # Distinct store/item/group assignments. Subsetting a groupby with a bare
    # tuple of columns is rejected by pandas >= 2.0, and the groupby was only
    # used to drop duplicates anyway.
    assigned_grouped = (
        df_out[group_cols]
        .drop_duplicates()
        .sort_values(group_cols)
        .reset_index(drop=True)
    )

    df_lastDay = df[df['sales_dt'] == alert_day]
    df_lastDay = df_lastDay.merge(assigned_grouped, how='inner', on=store_item_cols)
    print('Assigned grope dataframe shape:', df_lastDay.shape)

    df_low_sales_alert = build_alert_table(df_lastDay, col_name='expected_pos', membership_multiplier=membership_multiplier)

    return assigned_grouped, df_low_sales_alert

In [8]:
# Load the historical data from the CSV file.
df = load_files('./data/nestle_cereals_tesco_historicalData.csv')
# Cluster the stores and build the low-sales alerts for the alert day 2020-01-27;
# analysis returns the store/item/group assignments and the alert rows.
store_item_groups, df_low_sales_alert = analysis(df, '2020-01-27')
# Preview the first alert rows.
df_low_sales_alert.head()

2020-02-03 00:00:00
Input dataframe shape: (2639197, 6)
Output dataframe shape: (205800, 9)
Assigned grope dataframe shape: (1715, 10)


Unnamed: 0,organization_unit_num,retailer_item_id,retailer_item_desc,retailer,client,sales_dt,pos_item_qty,price,expected_pos,group,Q3,Q1,Q2,outlier threshold,LSV Low Sales
116,2885,50385078,TESCO CORN FLAKES CEREAL 750G,TESCO,NESTLECEREALS,2020-01-27,7.0,0.75,2.93,top,20.83,17.37,18.91,13.91,11.985
145,5379,50385078,TESCO CORN FLAKES CEREAL 750G,TESCO,NESTLECEREALS,2020-01-27,22.0,0.75,11.38,top,20.83,17.37,18.91,13.91,5.6475
231,2547,77300979,TESCO CHOCO SNAPS CEREAL 350G,TESCO,NESTLECEREALS,2020-01-27,17.0,1.0,3.28,middle,7.78,5.82,6.56,3.86,3.28
334,2877,85116425,TESCO BRAN FLAKES 750G,TESCO,NESTLECEREALS,2020-01-27,26.0,1.05,13.23,top,23.965,19.7575,21.885,15.55,9.08775
361,2163,85116425,TESCO BRAN FLAKES 750G,TESCO,NESTLECEREALS,2020-01-27,16.0,1.05,14.22,top,23.965,19.7575,21.885,15.55,8.04825
