In [None]:
# libraries
import pandas as pd
import os
from os import path
import glob
import pyodbc
import pysftp
from io import BytesIO
import numpy as np
import requests, zipfile, io
from datetime import datetime, timedelta
# Show wide/long frames in full when they are displayed in the notebook.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
from datetime import date, timedelta
import time
import warnings
# Silence every warning for the rest of the notebook run.
warnings.filterwarnings("ignore")

# Root of the shared Box folder that holds the projection inputs.
# Set the MDLZ_BOX_PATH environment variable to point at your own Box Sync location
# instead of editing this cell; the fallback is the original hard-coded path.
BOX_PATH = os.environ.get(
    'MDLZ_BOX_PATH',
    '/Users/rohitkg/Box Sync/Mondelez: Demand forecasts during COVID-19/4. EDA & Descriptive analytics',
)

In [None]:
# Key dates shared by the cells below (all 'YYYY-MM-DD' strings)
max_train_date = '2020-05-09'      # latest week-ending date kept in the shipments data
max_forecast_date = '2020-08-08'   # last week-ending date added to the forecast horizon
growth_peak_dates = [              # first and last week of the growth-peak search window
    '2020-03-14',
    '2020-03-21',
]

In [None]:
# Mondelez LAN ID & password used by the Hive and SFTP connections below.
# Provide them through the MDLZ_LAN_ID / MDLZ_LAN_PWD environment variables so the
# password never has to be typed into (and saved with) the notebook.
# When a variable is not set, the original literal value is used as the fallback.
mdlz_lan_id = os.environ.get("MDLZ_LAN_ID", "bwn2456")
mdlz_lan_pwd = os.environ.get("MDLZ_LAN_PWD", "")

In [None]:
# Connection to Hive to read in POS raw data
# The ODBC connection string is built by joining every line of the template below
# with ';'. The template begins with a newline, so the first joined element is empty
# and the resulting string starts with ';'.
# uid / pwd are interpolated from mdlz_lan_id / mdlz_lan_pwd.
# NOTE(review): a ';' inside the password would presumably be read as a field
# separator by the driver — confirm before using such a password.
CONNECTION_STRING = ';'.join(f"""
Description=Hortonworks Knox DSN
Driver=/opt/cloudera/hiveodbc/lib/universal/libclouderahiveodbc.dylib
Host=mdzusvpclhdp101.mdzprod.local
port=8443
HttpPath=gateway/mondelez/hive
schema=default
ServiceDiscoveryMode=0
HiveServerType=2
AuthMech=3
ThriftTransport=2
SSL=1
TwoWaySSL=0
AllowSelfSignedServerCert=1
uid={mdlz_lan_id}
pwd={mdlz_lan_pwd}
""".splitlines())

In [None]:
# Connection to edge node to write out POS model output data
# Opens an SFTP session with the same LAN credentials used for Hive.
# NOTE(review): no edge_node.close() is visible in this part of the notebook —
# confirm the session is closed once the outputs have been written.
edge_node = pysftp.Connection(host="10.54.252.11", username=mdlz_lan_id, password=mdlz_lan_pwd)

# Table of Contents

* [0. Refresh Steps](#first-bullet)
* [1. Read in Shipments Data](#second-bullet)
* [2. Read in State Projections](#third-bullet)
* [3. Pre-Processing of Modeling Data](#fourth-bullet)
* [4. Naive Model - Products with Insufficient History](#fifth-bullet)
* [5. Milestone Modeling + Prophet Model - Products with Complete POS History](#sixth-bullet)
* [6. Final Output](#seventh-bullet)

# 0. Refresh Steps <a class="anchor" id="first-bullet"></a>

In [None]:
# Read in Ship Input dataset
with pyodbc.connect(CONNECTION_STRING, autocommit=True) as conn:
    ship_raw = pd.read_sql("SELECT * FROM default.cbda_ship_model_input", conn)

# Drop the "<table>." prefix created when reading the data from the Hive table.
# A column that arrives without a prefix is kept as-is instead of raising IndexError.
ship = ship_raw.copy()
ship.columns = [
    name.split(".")[1] if "." in name else name
    for name in ship.columns.values
]

# Refresh State Projections - Point it to Box!
projections_dates = pd.read_csv(f'{BOX_PATH}/Data/Projections/peak_deaths.csv')
projections = pd.read_csv(f'{BOX_PATH}/Data/Projections/projections_state.csv')

# 1. Read in Shipments Data <a class="anchor" id="second-bullet"></a>

### Filters for shipments data

In [None]:
ship['week_ending_date'] = pd.to_datetime(ship['week_ending_date'])


def _underscore_joined(frame, key_cols):
    """Return one string per row: the values of key_cols cast to str and joined with '_'."""
    return frame[key_cols].apply(lambda row: '_'.join(row.values.astype(str)), axis=1)


# Unique identifier for a PPG
ppg_id_cols = [
    'promoted_product_group_desc', 'product_segment_name', 'retailer_desc', 'channel',
    'mdlz_business', 'mdlz_category', 'mdlz_brand', 'mdlz_ppg',
]
ship['ppg_id'] = _underscore_joined(ship, ppg_id_cols)

# Unique identifier for a PPG/State/Retailer combination
cols = ['ppg_id', 'state', 'retailer']
ship['sell_id'] = _underscore_joined(ship, cols)

# Keep only week-ending dates after 2019-01-01
from_2019_onwards = ship['week_ending_date']>'2019-01-01'
ship_main = ship[from_2019_onwards]

In [None]:
# BUSINESS FILTERS PROVIDED BY MDLZ
# Each step narrows ship_main and prints the percentage of rows removed by that step.
def _pct_rows_lost(rows_before, rows_after):
    """Return the percentage of rows dropped between two row counts."""
    return 100*(rows_before - rows_after)/rows_before


og_len = len(ship_main)

# NEW/SHIPMENTS SPECIFIC - keep only weeks up to and including max_train_date
in_training_window = ship_main['week_ending_date']<=max_train_date
ship_main = ship_main[in_training_window]
print(f'rows lost including only data before today {_pct_rows_lost(og_len, len(ship_main))} %')
og_len = len(ship_main)

# Excluding low value categories
low_value_category = ship_main['mdlz_category'].isin(['None','Cookie','Display PRD'])
ship_main = ship_main[~low_value_category]
print(f'rows lost Excluding low value categories {_pct_rows_lost(og_len, len(ship_main))}%')
og_len = len(ship_main)

# Excluding blank and null PPG values
ppg_missing = (ship_main['mdlz_ppg']=='') | (ship_main['mdlz_ppg'].isnull())
ship_main = ship_main[~ppg_missing]
print(f'rows lost Excluding blank and null PPG values {_pct_rows_lost(og_len, len(ship_main))}%')
og_len = len(ship_main)

# Excluding PPGs with blank product hierarchy
hierarchy_blank = (
    (ship_main['mdlz_business']=='')
    & (ship_main['mdlz_category']=='')
    & (ship_main['mdlz_brand']=='')
    & (ship_main['mdlz_ppg']!='')
)
ship_main = ship_main[~hierarchy_blank]
print(f'rows lost Excluding PPGs with blank product hierarchy {_pct_rows_lost(og_len, len(ship_main))}%')
og_len = len(ship_main)

# Remove returns
non_negative_sales = (ship_main['pos_dollar']>=0.0) & (ship_main['pos_qty']>=0.0)
ship_main = ship_main[non_negative_sales]
print(f'rows lost Remove returns {_pct_rows_lost(og_len, len(ship_main))}%')
og_len = len(ship_main)

# Remove null sales
both_sales_null = (ship_main['pos_dollar'].isna()) & (ship_main['pos_qty'].isna())
ship_main = ship_main[~both_sales_null]
print(f'rows lost Remove null sales {_pct_rows_lost(og_len, len(ship_main))}%')
og_len = len(ship_main)

In [None]:
# After the filters neither sales measure may contain nulls
for sales_col in ('pos_qty', 'pos_dollar'):
    assert(ship_main[sales_col].isna().sum()==0)

In [None]:
# Summary Statistics: totals before (ship) and after (ship_main) the filters
rows_before, rows_after = len(ship), len(ship_main)
dollars_before = np.nansum(ship['pos_dollar'])
dollars_after = np.nansum(ship_main['pos_dollar'])
qty_before = np.nansum(ship['pos_qty'])
qty_after = np.nansum(ship_main['pos_qty'])

print ("ship Summary - Before the filters are applied")
print ("\nNumber of rows: {0:,.0f}".format(rows_before))
print ("Total Dollars: {0:,.0f}".format(dollars_before))
print ("Total Quantity: {0:,.0f}".format(qty_before))
print ("Number of unique products",ship['sell_id'].nunique())

print ("\nship Summary - After the filters are applied")
print ("\nNumber of rows: {0:,.0f}".format(rows_after))
print ("Total Dollars: {0:,.0f}".format(dollars_after))
print ("Total Quantity: {0:,.0f}".format(qty_after))
print ("Number of unique products:",ship_main['sell_id'].nunique())

# Share of revenue / volume removed by the filters, in percent
revenue_lost_pct = round((1 - (dollars_after/dollars_before))*100,2)
volume_lost_pct = round((1 - (qty_after/qty_before))*100,2)
print ("\n Revenue contribution (%) lost due to the filters",revenue_lost_pct)
print ("\n Volume contribution (%) lost due to the filters",volume_lost_pct)

### Calculate the YoY Growth by State, Retailer, PPG (Consider only those weeks which have both 2020 and 2019 sales)

In [None]:
# Calendar keys used to line up 2020 weeks with the same week number in 2019
ship_main['week_of_year'] = ship_main['week_ending_date'].dt.week
ship_main['year'] = ship_main['week_ending_date'].dt.year

is_2020 = ship_main['year']==2020
is_2019 = ship_main['year']==2019
ship_main_2020 = ship_main[is_2020]
ship_main_2019 = ship_main[is_2019]

# sell_ids that appear in 2019 but never in 2020
only_2019_sell_ids = set(ship_main_2019['sell_id']).difference(set(ship_main_2020['sell_id']))

# The 2019 side only contributes its measures; its date and id columns are dropped
ship_main_2019 = ship_main_2019.drop(['week_ending_date','year','sell_id','ppg_id'], axis = 1)
ship_main_2020 = ship_main_2020.drop('year', axis = 1)

# Left join keeps every 2020 row; overlapping measure columns get _x (2020) / _y (2019)
yoy_join_keys = ['state', 'promoted_product_group_desc',
                 'product_segment_name', 'retailer_desc','channel',
                 'retailer', 'mdlz_business', 'mdlz_category',
                 'mdlz_brand', 'mdlz_ppg', 'week_of_year']
ship_main_new = ship_main_2020.merge(ship_main_2019, on = yoy_join_keys, how ='left')

In [None]:
# _x columns are this year (2020), _y columns are last year (2019)
ty_ly_names = {
    'pos_qty_x':'pos_qty_ty',
    'pos_dollar_x':'pos_dollar_ty',
    'pos_qty_y':'pos_qty_ly',
    'pos_dollar_y':'pos_dollar_ly',
}
ship_main_new = ship_main_new.rename(columns=ty_ly_names)

# Year-over-year growth as a fraction of last year's value
dollars_this_year, dollars_last_year = ship_main_new['pos_dollar_ty'], ship_main_new['pos_dollar_ly']
ship_main_new['Growth_perc_sales'] = (dollars_this_year - dollars_last_year) / dollars_last_year
units_this_year, units_last_year = ship_main_new['pos_qty_ty'], ship_main_new['pos_qty_ly']
ship_main_new['Growth_perc_qty'] = (units_this_year - units_last_year) / units_last_year

In [None]:
# Sanity check: product counts before and after the YoY join
products_before_join = ship_main['sell_id'].nunique()
products_after_join = ship_main_new['sell_id'].nunique()
print ("Number of unique products before the YoY join",products_before_join)
print ("Number of unique products after the YoY join",products_after_join)

# 2. Read in State Projections <a class="anchor" id="third-bullet"></a>

In [None]:
def give_week_ending(date):
    """Return the Saturday of the Monday-to-Sunday week that contains `date`.

    A Sunday therefore maps to the Saturday one day earlier.
    """
    days_to_saturday = 5 - date.weekday()
    return date + timedelta(days=days_to_saturday)

In [None]:
def give_week_ending_incsun(peak_date):
    """Parse a 'YYYY-MM-DD' string and return the Saturday that ends its week.

    Monday to Saturday map to the Saturday of the same week; a Sunday maps to the
    following Saturday (six days later).
    """
    parsed = datetime.strptime(peak_date, '%Y-%m-%d')
    day_of_week = parsed.weekday()
    days_ahead = 6 if day_of_week == 6 else 5 - day_of_week
    return parsed + timedelta(days=days_ahead)

In [None]:
# Tag every daily projection with its week number and week-ending Saturday
projections['date'] = pd.to_datetime(projections['date'])
projections['week_of_year'] = projections['date'].dt.week
projections['week_ending_date'] = projections['date'].apply(give_week_ending)

# Sum the daily projections up to one row per state and week
weekly_totals = projections.groupby(['State','week_ending_date','week_of_year']).agg({'deaths_mean':'sum','new_pop_affected':'sum'})
projections_weekly = weekly_totals.reset_index().rename(columns={'State':'state'})

In [None]:
# Convert both milestone dates to their week-ending Saturday (Sundays roll forward)
projections_dates['peak_deaths_week'] = projections_dates['peak_deaths_date'].apply(give_week_ending_incsun)
projections_dates['thirtyperc_deaths_week'] = projections_dates['end_day'].apply(give_week_ending_incsun)

# 3. Pre-Processing of Modeling Data <a class="anchor" id="fourth-bullet"></a>

### Filtering of Products based on their 2019 and 2020 Lifespan

In [None]:
# Reattach the 2019 dataset (the earlier 2019 frame had its id columns dropped for the join)
ship_main_2019 = ship_main[ship_main['year']==2019]


def _row_key(row):
    """Join one row's values, cast to str, with '_'."""
    return '_'.join(row.values.astype(str))


# Unique identifier for a PPG
ship_main_2019['ppg_id'] = ship_main_2019[ppg_id_cols].apply(_row_key, axis=1)

# Unique identifier for a PPG/State/Retailer combination
cols = ['ppg_id', 'state', 'retailer']
ship_main_2019['sell_id'] = ship_main_2019[cols].apply(_row_key, axis=1)


In [None]:
# How many product groups have a row for every 2020 week present in the data?
num_2020_weeks = ship_main_new['week_ending_date'].nunique()
unique_product_group = ship_main_new.groupby(['sell_id']).agg({'week_ending_date': lambda x: x.nunique()}).reset_index()

# Split products by whether they cover all of those weeks
weeks_per_product = unique_product_group['week_ending_date']
complete_2020 = unique_product_group[weeks_per_product==num_2020_weeks]
incomplete_2020 = unique_product_group[weeks_per_product<num_2020_weeks]

print (f"losing number of products with lifespan less than {num_2020_weeks} weeks of data (%)",round(1 - (len(complete_2020) / len(unique_product_group)),2)*100)

# 2020 rows that belong to a complete product, used for both contribution figures
complete_2020_rows = ship_main_new.merge(complete_2020,on=['sell_id'],how='inner')
print (f"Losing 2020 revenue contribution (%) due to dropping products with less than {num_2020_weeks} weeks of data",round(1-sum(complete_2020_rows['pos_dollar_ty']) / sum(ship_main_new['pos_dollar_ty']),2)*100)
print (f"Losing 2020 volume contribution (%) due to dropping products with less than {num_2020_weeks} weeks of data",round(1-sum(complete_2020_rows['pos_qty_ty']) / sum(ship_main_new['pos_qty_ty']),2)*100)

In [None]:
# How many product groups have data for every one of the first 32 weeks of 2019?
# NOTE(review): 32 is presumably the week number of max_forecast_date — confirm.
num_2019_weeks_required = 32

# Distinct week-ending dates per product within the required 2019 window
ship_2019_window = ship_main_2019[ship_main_2019['week_of_year']<=num_2019_weeks_required]
unique_product_group_2019 = ship_2019_window.groupby(['sell_id']).agg({'week_ending_date': lambda x: x.nunique()}).reset_index()

# Split products by whether they cover the whole window
weeks_per_product_2019 = unique_product_group_2019['week_ending_date']
complete_2019 = unique_product_group_2019[weeks_per_product_2019==num_2019_weeks_required]
incomplete_2019 = unique_product_group_2019[weeks_per_product_2019<num_2019_weeks_required]

# All three figures are reported in percent, rounded after scaling, and the messages
# quote the week threshold that is actually applied.
share_products_lost = 1 - (len(complete_2019) / len(unique_product_group_2019))
print (f"Losing number of products with lifespan less than {num_2019_weeks_required} weeks of data (%)",round(share_products_lost*100,2))

# 2019 rows that belong to a complete product (merged once, used for both measures)
complete_2019_rows = ship_main_2019.merge(complete_2019,on=['sell_id'],how='inner')
share_revenue_lost = 1 - sum(complete_2019_rows['pos_dollar']) / sum(ship_main_2019['pos_dollar'])
share_volume_lost = 1 - sum(complete_2019_rows['pos_qty']) / sum(ship_main_2019['pos_qty'])
print (f"Losing 2019 revenue contribution (%) due to dropping products with less than {num_2019_weeks_required} weeks of data",round(share_revenue_lost*100,2))
print (f"Losing 2019 volume contribution (%) due to dropping products with less than {num_2019_weeks_required} weeks of data",round(share_volume_lost*100,2))


In [None]:
# Products that satisfy the lifespan filter for both 2019 and 2020
print ("2020 products",len(complete_2020))
print ("2019 products",len(complete_2019))

# sell_ids present in both complete sets
in_both_years = complete_2019.merge(complete_2020,how='inner',on=['sell_id'])
num_common = len(in_both_years)
num_not_common = len(ship_main_new['sell_id'].unique()) - num_common

print ("Common products",num_common)
print ("Non-common products",num_not_common)
print(f'Common rate {np.round(100*num_common/(num_common+num_not_common),2)}%')

In [None]:
# Products with complete history in both the 2019 window and 2020
common_products = complete_2019.merge(complete_2020,how='inner',on=['sell_id'])

# Rows of the filtered shipments that belong to one of those products (merged once)
common_product_rows = ship_main.merge(common_products,on=['sell_id'],how='inner')

# Share of 2019+2020 revenue / volume dropped with the other products, in percent.
# The message quotes the number of 2020 weeks actually required instead of a fixed count.
revenue_lost_pct = round((1 - sum(common_product_rows['pos_dollar']) / sum(ship_main['pos_dollar']))*100, 2)
volume_lost_pct = round((1 - sum(common_product_rows['pos_qty']) / sum(ship_main['pos_qty']))*100, 2)
print (f"Losing 2020 & 2019 revenue contribution (%) due to dropping products without complete 2019 history or all {num_2020_weeks} weeks of 2020 data",revenue_lost_pct)
print (f"Losing 2020 & 2019 volume contribution (%) due to dropping products without complete 2019 history or all {num_2020_weeks} weeks of 2020 data",volume_lost_pct)


In [None]:
# Number of distinct sell_ids in common_products (displayed as the cell output)
common_products['sell_id'].nunique()

In [None]:
# For Prophet to run only common products - Only required when prophet is rerun after all states reopen
# common_products[['sell_id']].to_parquet("./ship_model_data/wip/ship_common_products.parquet", engine="pyarrow")

### Generate Modeling data for products with insufficient history

In [None]:
# sell_ids already classified as either common or incomplete-in-2020
one_df = common_products.merge(incomplete_2020,how='outer',on=['sell_id'])

# Anything left over is complete in 2020 but not part of the common set
classified_ids = set(one_df['sell_id'].unique())
all_ids = set(ship_main_new['sell_id'].unique())
missing_products = list(all_ids - classified_ids)

missing_products_data = ship_main_new[ship_main_new['sell_id'].isin(missing_products)]
incomplete_2020_ids = incomplete_2020['sell_id'].unique()
incomplete_2020_data = ship_main_new[ship_main_new['sell_id'].isin(incomplete_2020_ids)]

# Rows of every product that will not go through the milestone model
incomplete_data = pd.concat([incomplete_2020_data, missing_products_data])

In [None]:
print ("Check - Number of unique products")
# The three product sets should add up to the products present after the YoY join
products_after_filters = ship_main_new['sell_id'].nunique()
products_accounted_for = len(common_products) + len(incomplete_2020) + len(missing_products)
print ("After all the business filters are applied and YoY calculation",products_after_filters)
print ("Combination of modeling complete data, incomplete 2020 data, missing products",products_accounted_for)


In [None]:
# Number of distinct sell_ids in incomplete_data (displayed as the cell output)
len(incomplete_data['sell_id'].unique())

# 4. Naive Model - Products with Insufficient History <a class="anchor" id="fifth-bullet"></a>

### Naive Model Declarations

In [None]:
# Most recent 2020 week present in the shipments data
latest_ship_week = ship_main_new['week_ending_date'].max()

# One row per projected week (with the number of state rows), oldest first
states_per_week = projections_weekly.groupby(['week_ending_date','week_of_year']).agg({'state':'count'})
date_list = states_per_week.reset_index().sort_values('week_ending_date')


###  Model Run

In [None]:
# Parameters
# Distinct 2020 week-ending dates present in the shipments data
weeks_2020 = set(ship_main_new['week_ending_date'].drop_duplicates())
# Expected number of 2020 weeks
num_weeks_expected = len(weeks_2020)
# Most recent shipment week as a 'YYYY-MM-DD' string
max_ship_date = str(max(weeks_2020).date())

# sell_ids handled by the naive model (everything in incomplete_data)
sell_id_not_complete = incomplete_data['sell_id'].unique()

# Rows of those sell_ids, grouped so the loop can fetch one product at a time
rows_to_model = ship_main_new[ship_main_new['sell_id'].isin(sell_id_not_complete)]
incomplete = rows_to_model.groupby('sell_id')

# Forecast weeks: projection Saturdays strictly between the last shipment week and
# max_forecast_date, plus max_forecast_date itself
is_saturday = projections['date'].dt.weekday==5
after_last_ship = projections['date']>max_ship_date
before_horizon = projections['date']<max_forecast_date
proj_dates = set(projections[(is_saturday)&(after_last_ship)&(before_horizon)]['date'])
proj_dates.add(pd.Timestamp(max_forecast_date))

start_time = time.time()
append_dfs_list = []
print(f'modeling for {len(incomplete.groups.keys())} sell_ids with incomplete data')
# For every sell_id with incomplete history, build the rows it is missing:
# zero-filled rows for 2020 weeks without data and empty rows for the forecast weeks.
# The per-product frames are collected in append_dfs_list and concatenated later.
for sell_id in incomplete.groups.keys():
    #Get sell_id data
    data = incomplete.get_group(sell_id)
    
    #Find 2020 weeks where sell_id does not have an instance
    weeks_missing = weeks_2020 - set(data['week_ending_date'])
    #should be less than number of weeks in 2020
    #assert len(weeks_missing)>0
    
    #Fill w 0's: one row per missing 2020 week, with every other column of
    #ship_main_new left empty and then both sales measures set to 0
    sell_id_missing_weeks_df1 = pd.DataFrame({'week_ending_date': list(weeks_missing)},columns=ship_main_new.columns)
    sell_id_missing_weeks_df1['pos_qty_ty'].fillna(0,inplace=True)
    sell_id_missing_weeks_df1['pos_dollar_ty'].fillna(0,inplace=True)
    
    #One row per forecast week; filling NaN with NaN leaves the sales measures empty
    sell_id_missing_weeks_df2 = pd.DataFrame({'week_ending_date': list(proj_dates)},columns=ship_main_new.columns)
    sell_id_missing_weeks_df2['pos_qty_ty'].fillna(np.nan,inplace=True)
    sell_id_missing_weeks_df2['pos_dollar_ty'].fillna(np.nan,inplace=True)
    
    #Stack the historical gaps on top of the forecast weeks
    sell_id_missing_weeks_df = pd.concat([sell_id_missing_weeks_df1,sell_id_missing_weeks_df2], axis=0, ignore_index=True)
    
    
    #Tag every generated row with its sell_id and check that no week appears twice
    sell_id_missing_weeks_df['sell_id'].fillna(sell_id,inplace=True)
    assert 0==sum(sell_id_missing_weeks_df['week_ending_date'].value_counts()>1)

    #Should not print.
    if len(sell_id_missing_weeks_df)==0: print(sell_id)
    
    #Append to make a list of df's that will be concated
    append_dfs_list.append(sell_id_missing_weeks_df)

end = time.time()

# Report how many products were processed and how long the loop took.
# The durations are formatted into the messages; previously they were passed as a
# second print argument, which left a literal "%s" in the output.
elapsed_seconds = end - start_time
print(f'Number incomplete sell_ids modeled: {len(incomplete.groups.keys())}')
print(f"--- Loop took {elapsed_seconds/60} minutes ---")
# Skip the per-product figure when nothing was modeled to avoid dividing by zero
if len(sell_id_not_complete) > 0:
    print(f"--- Time Per PPG: {elapsed_seconds/len(sell_id_not_complete)} seconds ---")

In [None]:
# Forecast 4/11-8/8 with rolling mean per week
# Stack the generated rows of every product and tag them with their week number
missing_2020_ppgs_final = pd.concat(append_dfs_list, axis=0,ignore_index=True)
missing_2020_ppgs_final['week_of_year'] = missing_2020_ppgs_final['week_ending_date'].dt.week

# Replace the empty last-year quantity with the 2019 value of the same product and week
missing_2020_ppgs_final = missing_2020_ppgs_final.drop(columns=['pos_qty_ly'])
last_year_qty = ship_main_2019[['sell_id','week_of_year','pos_qty']]
missing_2020_ppgs_final = missing_2020_ppgs_final.merge(last_year_qty, on=['sell_id','week_of_year'],how='left',validate='1:1')
missing_2020_ppgs_final = missing_2020_ppgs_final.rename(columns={'pos_qty':'pos_qty_ly'})

# Existing rows plus generated rows, ordered by product and week
existing_rows = ship_main_new[ship_main_new['sell_id'].isin(sell_id_not_complete)]
stacked_rows = pd.concat([existing_rows, missing_2020_ppgs_final], axis=0, ignore_index=True)
sell_id_not_complete_all_data = stacked_rows.sort_values(['sell_id','week_ending_date']).reset_index(drop=True)
          

In [None]:
#NEW
# Treat a missing last-year quantity as zero before averaging.
sell_id_not_complete_all_data['pos_qty_ly'].fillna(0, inplace=True)

# Centered rolling mean of last year's quantity within each sell_id.
# The window is 7 rows with min_periods=1, although the output column is named "6_week".
# Rows dated 2020-01-04 are left out of the calculation; the result is assigned back
# by index, so those rows presumably end up as NaN in the new column — confirm.
rolling_average_window = 7
sell_id_not_complete_all_data['pos_qty_ly_rolling_6_week_mean'] = sell_id_not_complete_all_data[sell_id_not_complete_all_data['week_ending_date']!='2020-01-04'].groupby(['sell_id'])['pos_qty_ly'].rolling(window = rolling_average_window, center=True, min_periods=1).mean().reset_index(0,drop=True)


In [None]:
# Descriptive columns are empty on the generated rows; copy them from the product's
# other rows by forward filling and then back filling within each sell_id.
descriptive_cols = ['state', 'promoted_product_group_desc', 'product_segment_name', 'retailer_desc','channel','retailer','mdlz_business','ppg_id','mdlz_category','mdlz_brand','mdlz_ppg']
for col in descriptive_cols:
    for fill_direction in ('ffill', 'bfill'):
        sell_id_not_complete_all_data[col]=sell_id_not_complete_all_data.groupby('sell_id')[col].fillna(method=fill_direction)


In [None]:
# Median year-over-year quantity growth per sell_id for weeks before 2020-03-01
pre_covid_rows = ship_main_new[ship_main_new['week_ending_date']<'2020-03-01']
pre_covid_growth = (pre_covid_rows['pos_qty_ty']-pre_covid_rows['pos_qty_ly'])/pre_covid_rows['pos_qty_ly']
non_covid_2020 = (
    pre_covid_rows.assign(pos_qty_diff=pre_covid_growth)
    .groupby(['sell_id'])['pos_qty_diff']
    .median()
    .reset_index()
    .rename(columns={'pos_qty_diff':'2020_med_pos_qty_diff'})
)

# Attach the median to every row of the naive-model data (at most one median per sell_id)
sell_id_not_complete_all_data = sell_id_not_complete_all_data.merge(non_covid_2020, on=['sell_id'], how='left', validate='m:1')


In [None]:
# sell_id_not_complete_all_data[sell_id_not_complete_all_data['sell_id']=='ALL OTHER BELVITA COOKIE PPG_Belvita Base_AWG SCBM_45_Total Biscuit MG_Cookies_Belvita Cookies_L90_MS_US3000030'].to_clipboard()

In [None]:
# Share of rows without a pre-covid median growth value
median_growth_col = '2020_med_pos_qty_diff'
print(sell_id_not_complete_all_data[median_growth_col].isna().mean())

# Forward fill within each sell_id, then neutralise what is left:
# missing and +infinite growth values become 0
sell_id_not_complete_all_data[median_growth_col]=sell_id_not_complete_all_data.groupby(['sell_id'])[median_growth_col].fillna(method='pad')
sell_id_not_complete_all_data[median_growth_col] = (
    sell_id_not_complete_all_data[median_growth_col]
    .fillna(0)
    .replace(np.inf, 0)
    .replace(np.nan, 0)
)


In [None]:
# The naive forecast is last year's rolling mean, with gaps treated as 0
sell_id_not_complete_all_data['forecast_quantity'] = sell_id_not_complete_all_data['pos_qty_ly_rolling_6_week_mean'].fillna(0)

before_latest_week = sell_id_not_complete_all_data['week_ending_date']<max_ship_date
on_latest_week = sell_id_not_complete_all_data['week_ending_date']==max_ship_date

#Set null where we have data
sell_id_not_complete_all_data.loc[before_latest_week, 'forecast_quantity'] = np.nan

# Fill max ship date with actual. For viz
sell_id_not_complete_all_data.loc[on_latest_week,'forecast_quantity']=sell_id_not_complete_all_data.loc[on_latest_week,'pos_qty_ty']

# Flag these rows as not coming from the common-product model
sell_id_not_complete_all_data['common_product']=0

ship_incomplete = sell_id_not_complete_all_data

In [None]:
def apply_forecast(week,avg,qty_ty,qty_ly,state):
    """Pick the forecast value for one row, relative to the global latest_ship_week.

    Returns qty_ty for the latest shipment week itself. For later weeks it returns
    avg when state is not the empty string and qty_ly when it is. Earlier weeks
    get NaN.
    """
    if week==latest_ship_week:
        return qty_ty

    if not week>latest_ship_week:
        return np.nan

    # Future week: an empty-string state falls back to last year's quantity
    return avg if state!='' else qty_ly

# 5. Milestone Modeling + Prophet Model - Products with Complete POS History <a class="anchor" id="sixth-bullet"></a>

### Prepare Modeling Data 

In [None]:
# Keep only the rows of products that are complete in both years
ship_model_data_raw = ship_main_new.merge(common_products['sell_id'],on=['sell_id'],how='inner')

# Descriptive and last-year columns are not needed by the milestone model
unused_model_cols = ['retailer', 'promoted_product_group_desc', 'product_segment_name', 'retailer_desc','channel','mdlz_business','mdlz_category','mdlz_brand','mdlz_ppg','ppg_id','pos_qty_ly','pos_dollar_ly']
ship_model_data = ship_model_data_raw.drop(unused_model_cols,axis=1)

# 1 when the state is missing, 0 otherwise
ship_model_data['state_null'] = np.where(ship_model_data['state'].isnull(), 1, 0)

In [None]:
# Subset the dataset on just the peak covid growth windows (3/14 & 3/21), i.e. weeks 11 and 12
in_peak_weeks = ship_model_data['week_of_year'].between(11, 12)
maximum_peak_data = ship_model_data[in_peak_weeks]

# For each sell id, take the row with the highest quantity growth in those two weeks
peak_row_labels = maximum_peak_data.groupby('sell_id')['Growth_perc_qty'].idxmax()
sell_maxium_peak = maximum_peak_data.loc[peak_row_labels][['sell_id','Growth_perc_qty']]
sell_maxium_peak = sell_maxium_peak.rename(columns={'Growth_perc_qty':'covid_max_growth'})

# Attach the peak growth to every row and flag products whose peak growth is negative
ship_model_data = ship_model_data.merge(sell_maxium_peak,on=['sell_id'],how='left')
ship_model_data['negative_covid_impact'] = ship_model_data['covid_max_growth'].apply(lambda x: 1 if x<0 else 0)


In [None]:
# Rolling 6-row median of the 2019 quantity within each sell_id, computed on rows
# sorted by product and week number
ship_main_2019 = ship_main_2019.sort_values(['sell_id','week_of_year'])
rolling_average_window = 6

rows_for_rolling = ship_main_2019[ship_main_2019['week_ending_date']!='2020-01-04']
qty_by_product = rows_for_rolling.groupby(['sell_id'])['pos_qty']
rolling_median_2019 = qty_by_product.rolling(window = rolling_average_window).median()

# Drop the sell_id level so the result lines up with the original row index
ship_main_2019['pos_qty_rolling_6_week_med'] = rolling_median_2019.reset_index(0,drop=True)


In [None]:
# Totals of the modeling data, compared with all filtered 2020 shipments
model_rows = len(ship_model_data)
model_products = len(ship_model_data['sell_id'].unique())
model_dollars = np.nansum(ship_model_data['pos_dollar_ty'])
model_qty = np.nansum(ship_model_data['pos_qty_ty'])
all_2020_dollars = np.nansum(ship_main_2020['pos_dollar'])
all_2020_qty = np.nansum(ship_main_2020['pos_qty'])

print ("\nship Model data Summary")
print ("\nNumber of rows: {0:,.0f}".format(model_rows))
print ("\nNumber of unique products: {0:,.0f}".format(model_products))
print ("Total Dollars: {0:,.0f}".format(model_dollars))
print ("Total Quantity: {0:,.0f}".format(model_qty))
print ("\nRevenue contribution (%) lost due to the product filters",round((1 - (model_dollars/all_2020_dollars)),3))
print ("\nVolume contribution (%) lost due to the product filters",round((1 - (model_qty/all_2020_qty)),3))


### Variable declarations for the model

In [None]:
# One row per projected week (with the number of state rows), oldest first
weekly_state_counts = projections_weekly.groupby(['week_ending_date','week_of_year']).agg({'state':'count'})
date_list = weekly_state_counts.reset_index().sort_values('week_ending_date')

# Most recent week present in the 2020 shipments
latest_ship_week = ship_main_2020['week_ending_date'].max()

### Model Run

In [None]:
def growth_decay_multiplier(growth):
    """Map a growth fraction (1.0 == 100%) to its decay multiplier.

    Negative growth            -> 0.8
    0% to 50% (inclusive)      -> 0.2
    above 50% up to 100%       -> 0.15
    above 100% up to 500%      -> 0.1
    above 500% up to 1000%     -> 0.08
    anything else (incl. NaN)  -> 0.05
    """
    if growth<0:
        return 0.8

    growth_pct = growth*100
    # Upper bound of each band in percent, checked from the lowest band upwards
    for upper_bound, multiplier in ((50, 0.2), (100, 0.15), (500, 0.1), (1000, 0.08)):
        if growth_pct <= upper_bound:
            return multiplier

    return 0.05

In [None]:
# Empty frame (headers only) that the modeling loop appends each product's rows to
ship_merged_columns = [
    'week_ending_date','sell_id','state','pos_qty_ty','pos_dollar_ty','week_of_year_x',
    'Growth_perc_sales','Growth_perc_qty','week_of_year_y',
    'first_milestone_date','second_milestone_date','third_milestone_date',
]
ship_merged = pd.DataFrame(columns=ship_merged_columns)

In [None]:
# Milestone (growth decay) model: for every (sell_id, state) group with a known state,
# extend the actuals with the projected future weeks, fill forecast1 (growth) and
# forecast_quantity across three milestone periods, and append the rows to ship_merged.
#product_groups = dataset[dataset['state_null']==0].groupby(['sell_id','state'])
product_groups = ship_model_data[ship_model_data['state_null']==0].groupby(['sell_id','state'])

start_time = time.time()
print(f'Modeling {len(product_groups.groups.keys())} unique sell_ids')
for key in product_groups.groups.keys():
    
    # Actuals for this product group; reset_index gives a 0..n-1 index and keeps the
    # old one in an 'index' column
    data = product_groups.get_group(key).reset_index() # 14 week actuals dataframe for each product group
    
    # key is (sell_id, state); remember the per-group constants before the merge adds empty rows
    state_value = key[1]
    state_null_value = data['state_null'].iat[0]
    covid_impact_value = data['negative_covid_impact'].iat[0]
    
    # To get future weeks for each unique product group
    data = data.merge(date_list[['week_ending_date','week_of_year']], on=['week_ending_date'],how ='outer') 
    
    # Fill in sell_id and state values for the future forecasting weeks 
    data['sell_id'] = data["sell_id"].fillna(key[0])
    data['state'] = data["state"].fillna(key[1])
    data['state_null'] = data["state_null"].fillna(state_null_value)
    data['negative_covid_impact'] = data["negative_covid_impact"].fillna(covid_impact_value)
        
    
    ############################################ Growth Decay Modeling ######################################################
    
    # The deaths peak is taken from the national row for every product; the 30% date is
    # taken from the row of this product's state. Both .iat[0] lookups raise IndexError
    # when no matching row exists.
    death_peak_date = projections_dates[projections_dates['Geography'] == 'United States of America']['peak_deaths_week'].iat[0]
    death_thirty_perc_date = projections_dates[projections_dates['state'] == state_value]['thirtyperc_deaths_week'].iat[0]
    
    # Before covid outbreak median & average growth
    # NOTE(review): `average` is not used anywhere else inside this loop.
    median = data[(data['week_ending_date']>='2020-01-01') & (data['week_ending_date'] <'2020-03-01')]['Growth_perc_qty'].median() 
    average = data[(data['week_ending_date']>='2020-01-01') & (data['week_ending_date'] <'2020-03-01')]['Growth_perc_qty'].mean()
    
    # Find the highest growth value inside the growth_peak_dates window and its corresponding week
    peak_growth_value = data.loc[data[(data['week_ending_date']>=growth_peak_dates[0]) & (data['week_ending_date']<=growth_peak_dates[1])]['Growth_perc_qty'].idxmax()]['Growth_perc_qty']
    peak_growth_date = data.loc[data[(data['week_ending_date']>=growth_peak_dates[0]) & (data['week_ending_date']<=growth_peak_dates[1])]['Growth_perc_qty'].idxmax()]['week_ending_date']
    
    
    data['forecast1'] = 0.0
    data['forecast_quantity'] = 0.0

    
    # Replicate the latest actual growth and quantity as the very first forecast value - just for visualization (including stop gap measure)
    latest_growth_value = data.loc[data['week_ending_date'] == latest_ship_week]['Growth_perc_qty'].iat[0]
    latest_ship_value = data.loc[data['week_ending_date'] == latest_ship_week]['pos_qty_ty'].iat[0]
    
#     data.loc[(data['week_ending_date'] >= latest_ship_week) & (data['week_ending_date'] < death_peak_date),'forecast1'] = latest_growth_value
#     data.loc[(data['week_ending_date'] >= latest_ship_week) & (data['week_ending_date'] < death_peak_date),'forecast_quantity'] = latest_ship_value
    data.loc[(data['week_ending_date'] == latest_ship_week),'forecast1'] = latest_growth_value
    data.loc[(data['week_ending_date'] == latest_ship_week),'forecast_quantity'] = latest_ship_value
    
    
    
    
    ###### First Milestone Period ######
    
    first_milestone_date = death_peak_date #US specific 
    # Check if the first two milestones coincide
    if death_thirty_perc_date <= death_peak_date:
        second_milestone_date = death_peak_date + timedelta(days=21) #Based on median days being 28 across states
    else:
        second_milestone_date = death_thirty_perc_date
    
    first_milestone_date = first_milestone_date + timedelta(days=7) # Shifting the range by one week because the latest pos week coincides with the US deaths peak date
    ### Optimization to create a prior table,dictionary based on sell_id, growth_value 
    # Seed the first milestone week: decayed peak growth, and a quantity 10% below the
    # latest actual (or latest actual / 0.9 for products flagged negative_covid_impact)
    data.loc[(data['week_ending_date'] == first_milestone_date),'forecast1'] = growth_decay_multiplier(peak_growth_value) * peak_growth_value
    if data.loc[data['week_ending_date'] == first_milestone_date]['negative_covid_impact'].iat[0]==0:
        data.loc[(data['week_ending_date'] == first_milestone_date),'forecast_quantity'] = 0.9 * latest_ship_value
    else: 
        data.loc[(data['week_ending_date'] == first_milestone_date),'forecast_quantity'] = latest_ship_value / 0.9 
    second_milestone_date_updated = second_milestone_date + timedelta(days=7)
    first_milestone_range = data.loc[(data['week_ending_date'] >= first_milestone_date) & (data['week_ending_date']< second_milestone_date_updated),'forecast1'].index.values
    # Carry the values forward one row at a time: each row i in the range writes row i+1
    # (growth x0.9; quantity x0.9, or /0.9 for negative-impact products)
    for i in range(first_milestone_range[0], first_milestone_range[-1]+1): 
        data.loc[i+1, 'forecast1'] = data.loc[i, 'forecast1'] * 0.9 
        if data.loc[i,'negative_covid_impact']==0:
            data.loc[i+1, 'forecast_quantity'] = data.loc[i, 'forecast_quantity'] * 0.9
        else:
            data.loc[i+1, 'forecast_quantity'] = data.loc[i, 'forecast_quantity'] / 0.9


    ###### Second Milestone Period ######
    # The state is assumed to reopen 14 days after the shifted second milestone
    state_open_date = second_milestone_date_updated + timedelta(days=14) 
    state_open_date = pd.to_datetime(state_open_date)
    second_milestone_range = data.loc[(data['week_ending_date'] >= second_milestone_date_updated) & (data['week_ending_date']< state_open_date),'forecast1'].index.values
    
    # NOTE(review): .loc label slices include both endpoints, so this assignment covers
    # two rows — confirm against the "1 week only" intent of the original comment.
    data.loc[second_milestone_range[0]:second_milestone_range[0]+1, 'forecast_quantity'] = 0   # Setting this as 0 in the first milestone range = 1 week only
    # Growth halves from one row to the next during this period
    for i in range(second_milestone_range[0], second_milestone_range[-1]+1):
        data.loc[i+1, 'forecast1'] = data.loc[i, 'forecast1'] * 0.5

    
    ###### Third Milestone Period ######
    # From the reopening date onwards growth is set to the pre-covid median
    data.loc[(data['week_ending_date']>= state_open_date),'forecast1'] = median
    
    # Keep the milestone dates and the baseline on every row for post-processing
    data['first_milestone_date'] = death_peak_date
    data['second_milestone_date'] = second_milestone_date
    data['third_milestone_date'] = state_open_date
    data['median_baseline'] = median
    
    ship_merged = ship_merged.append(data)

end = time.time()
# Report the wall-clock duration of the modeling loop. The value is formatted into the
# message; previously it was passed as a second print argument, which left a literal
# "%s" in the output.
print(f"--- {(end - start_time)/60} minutes ---")

### Post-Processing of the Model Output

In [None]:
# Read in Prophet's forecasts, ordered by product and week
prht_fcst_common = pd.read_feather("./ship_prophet_results/0430_cleaned/ship_0430_cleaned_v2.feather")
prht_fcst_common = prht_fcst_common.sort_values(['sell_id','week_ending_date'])

# Calculate rolling 6 week median of prophet's forecast to exclude effect of promotions
rolling_average_window = 6
prophet_by_product = prht_fcst_common.groupby(['sell_id'])['prht_frcst']
prophet_rolling_median = prophet_by_product.rolling(window = rolling_average_window).median()
prht_fcst_common['pos_qty_prht_rolling_6_week_med'] = prophet_rolling_median.reset_index(0,drop=True)

# Only the smoothed forecast is carried forward
prht_fcst_common = prht_fcst_common.drop(['prht_frcst'], axis=1)

In [None]:
# Save a copy of the model output df - dont use it, overwritten by ship_merged
ship_merged_raw = ship_merged.copy()

### Post-processing of the model output
# Remove helper columns, keep the projections' week number and flag the rows as common products
ship_merged = (
    ship_merged
    .drop(['week_of_year_x','index','covid_max_growth'],axis=1)
    .rename(columns={'week_of_year_y':'week_of_year'})
)
ship_merged['common_product']=1

# Merge model output with 2019 dataset to obtain the pos_qty_ly
ship_merged = (
    ship_merged
    .merge(ship_main_2019,on=['sell_id','week_of_year'],how='left',suffixes=('', '_y'))
    .rename(columns={'pos_qty':'pos_qty_ly','pos_dollar':'pos_dollar_ly'})
    .drop(['week_ending_date_y','state_y','year'],axis=1)
)

# Merge in prophet's forecast for common products
ship_merged = ship_merged.merge(prht_fcst_common, how='left', on=['sell_id','week_ending_date'])

# Use prophet's forecast if available, otherwise keep the 2019 rolling median
prophet_median = ship_merged['pos_qty_prht_rolling_6_week_med']
ship_merged['pos_qty_rolling_6_week_med'] = np.where(prophet_median.notnull(),
                                                     prophet_median,
                                                     ship_merged['pos_qty_rolling_6_week_med'])
ship_merged = ship_merged.drop(['pos_qty_prht_rolling_6_week_med'],axis=1)

# Subset the dataset on just the peak covid growth windows (3/14 & 3/21), i.e. weeks 11 and 12
in_peak_weeks = (ship_merged['week_of_year']>=11) & (ship_merged['week_of_year']<=12)
maximum_peak_data = ship_merged[in_peak_weeks]

# For each sell id, take last year's quantity from the row with the highest growth in those weeks
peak_row_labels = maximum_peak_data.groupby('sell_id')['Growth_perc_qty'].idxmax()
sell_maxium_ly = maximum_peak_data.loc[peak_row_labels][['sell_id','pos_qty_ly']]
sell_maxium_ly = sell_maxium_ly.rename(columns={'pos_qty_ly':'max_ship_ly'})

ship_merged = ship_merged.merge(sell_maxium_ly,on=['sell_id'],how='left')

def apply_forecast_value_positive(week, growth, qty_ly_rolling, qty_ty, qty_ly_max, z, third_m, second_m, fcst_qty, high_growth_decline):
    """Pick the forecast quantity for one row of a product handled by the "positive" rules.

    The arguments arrive positionally from a row-wise ``DataFrame.apply``, so
    the parameter order must match the column order used by the caller.
    ``qty_ly_max``, ``z`` and ``third_m`` are accepted only to keep that
    positional layout identical to ``apply_forecast_value_negative``; they are
    not read here.

    Returns
    -------
    * ``qty_ty`` when ``week`` equals the module-level ``latest_ship_week``
      (actual value, kept for visualisation purposes);
    * ``fcst_qty`` for weeks after ``latest_ship_week`` and before ``second_m``;
    * from ``second_m`` onwards, ``qty_ly_rolling`` when ``high_growth_decline``
      equals True, otherwise ``(1 + growth) * qty_ly_rolling``;
    * ``np.nan`` for anything else.
    """
    # Current ship week: show the actual quantity.
    if week == latest_ship_week:
        return qty_ty

    # Before the second milestone the existing forecast is left alone.
    if week > latest_ship_week and week < second_m:
        return fcst_qty

    # From the second milestone on, fall back to the rolling baseline,
    # scaled by the growth rate unless the high-growth-decline flag is set.
    if week >= second_m:
        if high_growth_decline == True:
            return qty_ly_rolling
        return (1 + growth) * qty_ly_rolling

    return np.nan

def apply_forecast_value_negative(week, growth, qty_ly_rolling, qty_ty, qty_ly_max, z, third_m, second_m, fcst_qty, high_growth_decline):
    """Pick the forecast quantity for one row of a product handled by the "negative" rules.

    The arguments arrive positionally from a row-wise ``DataFrame.apply``, so
    the parameter order must match the column order used by the caller.
    ``z`` is accepted for that positional layout only and is not read here.

    Returns
    -------
    * ``qty_ty`` when ``week`` equals the module-level ``latest_ship_week``
      (actual value, kept for visualisation purposes);
    * ``fcst_qty`` for weeks after ``latest_ship_week`` and before ``second_m``;
    * ``(1 + growth) * qty_ly_max`` from ``second_m`` up to (not including)
      ``third_m``;
    * from ``third_m`` onwards, ``qty_ly_rolling`` when ``high_growth_decline``
      equals True, otherwise ``(1 + growth) * qty_ly_rolling``;
    * ``np.nan`` for anything else.
    """
    # Current ship week: show the actual quantity.
    if week == latest_ship_week:
        return qty_ty

    # Before the second milestone the existing forecast is left alone.
    if week > latest_ship_week and week < second_m:
        return fcst_qty

    # Between the second and third milestones, scale last year's peak quantity.
    if week >= second_m and week < third_m:
        return (1 + growth) * qty_ly_max

    # From the third milestone on, use the rolling baseline, scaled by the
    # growth rate unless the high-growth-decline flag is set.
    if week >= third_m:
        if high_growth_decline == True:
            return qty_ly_rolling
        return (1 + growth) * qty_ly_rolling

    return np.nan

def floor_growth(median, growth, week, z): # positive growing products
    """Return ``growth``, raised to ``median`` for weeks after ``latest_ship_week``.

    For weeks up to and including the module-level ``latest_ship_week`` the
    growth value is returned unchanged. ``z`` is unused; it is accepted so a
    four-column row can be unpacked positionally into this function.
    """
    needs_floor = week > latest_ship_week and growth < median
    return median if needs_floor else growth

def ceil_growth(median, growth, week, z): # Negative growing products
    """Return ``growth``, capped at ``median`` for weeks after ``latest_ship_week``.

    For weeks up to and including the module-level ``latest_ship_week`` the
    growth value is returned unchanged. ``z`` is unused; it is accepted so a
    four-column row can be unpacked positionally into this function.
    """
    needs_cap = week > latest_ship_week and growth > median
    return median if needs_cap else growth

# Bound the growth rate in `forecast1` by `median_baseline` for weeks after
# latest_ship_week: floored when negative_covid_impact == 0, capped otherwise.
# The selected columns are unpacked positionally into floor_growth / ceil_growth,
# so their order must match those signatures (median, growth, week, z).
# The flag is read by label rather than by position (`x[3]`): integer keys on a
# label-indexed Series are deprecated as positional lookups in recent pandas.
ship_merged['forecast1'] = ship_merged[['median_baseline','forecast1','week_ending_date','negative_covid_impact']]\
                                    .apply(lambda x: floor_growth(*x) if x['negative_covid_impact'] == 0 \
                                            else ceil_growth(*x), axis=1)

# Derive the forecast quantity row by row. The column order below is the
# positional argument order of apply_forecast_value_positive / _negative:
# (week, growth, qty_ly_rolling, qty_ty, qty_ly_max, z, third_m, second_m,
#  fcst_qty, high_growth_decline).
ship_merged['forecast_quantity'] = ship_merged[['week_ending_date','forecast1','pos_qty_rolling_6_week_med',
                                                'pos_qty_ty','max_ship_ly','negative_covid_impact',
                                                'third_milestone_date','second_milestone_date',
                                                'forecast_quantity','high_growth_decline']]\
                                    .apply(lambda x: apply_forecast_value_positive(*x) if x['negative_covid_impact'] == 0  \
                                            else apply_forecast_value_negative(*x), axis=1)

# Convert the date columns to datetime only AFTER the row-wise functions above
# have run, exactly as before: those functions compare the columns in their
# pre-conversion form against latest_ship_week.
ship_merged['week_ending_date'] = pd.to_datetime(ship_merged['week_ending_date'])
ship_merged['first_milestone_date'] = pd.to_datetime(ship_merged['first_milestone_date'])
ship_merged['second_milestone_date'] = pd.to_datetime(ship_merged['second_milestone_date'])
ship_merged['third_milestone_date'] = pd.to_datetime(ship_merged['third_milestone_date'])



def update_forecast_positive(week, forecast1, fcst_growth, pos_qty_rolling, fcst_qty, first_m, second_m, neg_covid, high_growth_decline):
    """Floor the forecast quantity of one row between the latest ship week and the second milestone.

    Inside that window, and only when ``high_growth_decline`` is not True, a
    forecast whose implied growth (``fcst_growth``) is below ``forecast1`` is
    replaced by ``(1 + forecast1) * pos_qty_rolling``. In every other case
    ``fcst_qty`` is returned unchanged.

    ``first_m`` and ``neg_covid`` are unused; they are accepted so a row can be
    unpacked positionally into this function.
    """
    in_adjustment_window = (
        week > latest_ship_week
        and week < second_m
        and high_growth_decline != True
    )
    if in_adjustment_window and fcst_growth < forecast1:
        # Floor the value at the growth bound.
        return (1 + forecast1) * pos_qty_rolling
    return fcst_qty

def update_forecast_negative(week, forecast1, fcst_growth, pos_qty_rolling, fcst_qty, first_m, second_m, neg_covid, high_growth_decline):
    """Cap the forecast quantity of one row between the latest ship week and the second milestone.

    Inside that window, and only when ``high_growth_decline`` is not True, a
    forecast whose implied growth (``fcst_growth``) is above ``forecast1`` is
    replaced by ``(1 + forecast1) * pos_qty_rolling``. In every other case
    ``fcst_qty`` is returned unchanged.

    ``first_m`` and ``neg_covid`` are unused; they are accepted so a row can be
    unpacked positionally into this function.
    """
    in_adjustment_window = (
        week > latest_ship_week
        and week < second_m
        and high_growth_decline != True
    )
    if in_adjustment_window and fcst_growth > forecast1:
        # Cap the value at the growth bound.
        return (1 + forecast1) * pos_qty_rolling
    return fcst_qty

# Called row-wise with the columns:
# 'week_ending_date','second_milestone_date_y','pos_qty_ly','pos_qty_rolling_6_week_med'
def find_promo_uplift(week, second_m, qty_ly, qty_rolling):
    """Return how far last year's quantity exceeds the rolling baseline, never below 0.

    The uplift is only computed for weeks on or after ``max_train_date`` plus
    14 days; earlier weeks get 0. ``second_m`` is unused and is accepted so a
    four-column row can be unpacked positionally into this function.
    """
    first_uplift_week = pd.to_datetime(max_train_date) + timedelta(days=14)
    if week >= first_uplift_week:
        uplift = qty_ly - qty_rolling
        if uplift > 0:
            return uplift
    return 0

In [None]:
# Growth implied by the current forecast relative to last year's quantity.
# Rows where pos_qty_ly is 0 produce inf/NaN here.
ship_merged['forecast_growth'] = (ship_merged['forecast_quantity'] - ship_merged['pos_qty_ly']) / (ship_merged['pos_qty_ly'])

# Re-bound the forecast before the second milestone. The column order below is
# the positional argument order of update_forecast_positive / _negative:
# (week, forecast1, fcst_growth, pos_qty_rolling, fcst_qty, first_m, second_m,
#  neg_covid, high_growth_decline).
# The flag is read by label rather than by position (`x[7]`): integer keys on a
# label-indexed Series are deprecated as positional lookups in recent pandas.
ship_merged['forecast_quantity_new'] = ship_merged[['week_ending_date','forecast1','forecast_growth',
                                                    'pos_qty_rolling_6_week_med','forecast_quantity',
                                                    'first_milestone_date','second_milestone_date',
                                                    'negative_covid_impact','high_growth_decline']]\
                                            .apply(lambda x: update_forecast_positive(*x) if x['negative_covid_impact'] == 0
                                                   else update_forecast_negative(*x), axis=1)

# Replace the old forecast column with the re-bounded one.
ship_merged.drop(['forecast_quantity'],axis=1,inplace=True)
ship_merged.rename(columns={'forecast_quantity_new':'forecast_quantity'},inplace=True)

# Backup snapshot taken after the positive/negative logic has been applied.
ship_merged_copy = ship_merged.copy()

In [None]:
# Independent snapshot of ship_merged as it stands after the new logic.
ship_merged_copy_new_logic = ship_merged.copy(deep=True)

In [None]:
# Sanity check: combined last-year quantity of the two frames about to be stacked.
print(ship_merged['pos_qty_ly'].sum()+ship_incomplete['pos_qty_ly'].sum())

# Align ship_incomplete's column name with ship_merged before stacking.
# This renames the shared ship_incomplete frame in place.
# NOTE(review): the source column is called "..._mean" while the target is "..._med";
# confirm the two really hold comparable statistics.
ship_incomplete.rename(columns={'pos_qty_ly_rolling_6_week_mean':'pos_qty_rolling_6_week_med'},inplace=True)

# Stack the two frames row-wise with a fresh 0..n-1 index. Columns present in only
# one of them are filled with NaN for the other frame's rows.
# NOTE(review): re-running this cell appends ship_incomplete again -- run once per kernel.
ship_merged = pd.concat([ship_merged, ship_incomplete], axis=0, ignore_index=True)

In [None]:
def _print_ship_main_share(title, sell_ids):
    """Print the share of ship_main's sell ids and pos_qty covered by `sell_ids`."""
    all_sell_ids = ship_main['sell_id'].unique()
    covered_rows = ship_main[ship_main['sell_id'].isin(sell_ids)]
    print(title)
    print('sell ids', np.round(100*len(sell_ids)/len(all_sell_ids),2),'%')
    print('pos qty', np.round(100*covered_rows['pos_qty'].sum()/ship_main['pos_qty'].sum(),2),'%')


# Products flagged common_product == 1.
common_products = ship_merged.loc[ship_merged['common_product']==1, 'sell_id'].unique()
_print_ship_main_share('common products (all 2019+2020) weeks available', common_products)
print()

# Products flagged common_product == 0.
incomplete2020_products = ship_merged.loc[ship_merged['common_product']==0, 'sell_id'].unique()
_print_ship_main_share('non-common 2020 products (not max 2020 weeks available)', incomplete2020_products)
print()

# Products present in ship_main that never made it into ship_merged.
not_modeled_df = ship_main[~ship_main['sell_id'].isin(ship_merged['sell_id'].unique())]
not_modeled = not_modeled_df['sell_id'].unique()
_print_ship_main_share('Not Modeled', not_modeled)


In [None]:
# Build one [state, first, second, third milestone date] row per state.
# The rows are collected in a plain list; the following cell converts that list
# into a DataFrame. (An empty DataFrame used to be created here first and was
# immediately overwritten by the list -- that dead assignment has been removed.)

# National peak-deaths week: used as the first milestone for every state.
death_peak_date = projections_dates[projections_dates['Geography'] == 'United States of America']['peak_deaths_week'].iat[0]

state_dates_df = []
for state in projections_dates[projections_dates['state'].notna()]['state'].unique():
    # First row for this state supplies its 30%-deaths week.
    death_thirty_perc_date = projections_dates[projections_dates['state'] == state]['thirtyperc_deaths_week'].iat[0]

    if death_thirty_perc_date <= death_peak_date:
        # State reached its 30% week no later than the national peak: use a
        # fixed offset from the national peak instead.
        # NOTE(review): the original comment cited "median days being 28 across
        # states" while the code adds 21 days -- confirm which is intended.
        second_milestone_date = death_peak_date + timedelta(days=21)
    else:
        second_milestone_date = death_thirty_perc_date

    # Third milestone: two weeks after the second.
    state_open_date = second_milestone_date + timedelta(days=14)

    state_dates_df.append([state, death_peak_date, second_milestone_date, state_open_date])
    


In [None]:
# Turn the collected per-state rows into a DataFrame with named milestone columns.
state_dates_df = pd.DataFrame(
    data=state_dates_df,
    columns=['state', 'first_milestone_date', 'second_milestone_date', 'third_milestone_date'],
)


In [None]:
# Attach the per-state milestone dates; validate='m:1' makes pandas raise if a
# state appears more than once in state_dates_df. Milestone columns that already
# exist in ship_merged get the default _x (left) / _y (right) suffixes.
ship_merged = pd.merge(
    ship_merged,
    state_dates_df,
    how='left',
    on='state',
    validate='m:1',
)

In [None]:
#FS Constant growth until third milestone
# NOTE(review): despite the heading, the code below holds the *quantity* of the
# last training week constant; it does not apply a growth rate.

# Record each sell_id's actual quantity on the last training week ...
ship_merged.loc[ship_merged['week_ending_date']==max_train_date ,'last_ship_week_pos_qty'] = ship_merged['pos_qty_ty']
# ... and carry it forward to the following rows of the same sell_id.
# groupby().ffill() replaces the deprecated groupby().fillna(method='pad') and
# gives the same result.
# NOTE(review): forward-filling relies on rows being in date order within each
# sell_id; this cell does not sort -- confirm the order is guaranteed upstream.
ship_merged['last_ship_week_pos_qty'] = ship_merged.groupby(['sell_id'])['last_ship_week_pos_qty'].ffill()

# For uncommon products in channel '30', from the last training week up to (not
# including) the state-level third milestone, use that carried-forward quantity
# as the forecast.
ship_merged.loc[((ship_merged['common_product']==0)&\
                 (ship_merged['channel']=='30')&\
                 (ship_merged['week_ending_date']>=max_train_date) & \
                 (ship_merged['week_ending_date']<ship_merged['third_milestone_date_y']) ), 
                'forecast_quantity'] = ship_merged['last_ship_week_pos_qty']

In [None]:
def uncommon_promo_damp(s):
    """Dampen `promo_uplift` for uncommon products that have zero last-year weeks nearby.

    For every row, the number of weeks with ``pos_qty_ly == 0`` is counted in a
    centred 7-row window within the same ``sell_id``. The uplift is multiplied
    by ``1 - count / 7``; rows with ``common_product == 1`` always keep a
    multiplier of 1.

    Parameters
    ----------
    s : pandas.DataFrame
        Must contain ``sell_id``, ``week_ending_date``, ``pos_qty_ly``,
        ``common_product`` and ``promo_uplift``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in. It is sorted IN PLACE by
        ``sell_id`` and ``week_ending_date`` and gains the columns
        ``zero_median``, ``7_window_0_count``, ``uncommon_dampen_multiplier``
        and ``promo_uplift_new``.
    """
    s.sort_values(['sell_id', 'week_ending_date'], ascending=[True,True], inplace=True)
    print('CAUTION: sorting shipments dataframe by sell_id and time: early -> late')

    # Marker column: 1 where last year's quantity was zero, NaN elsewhere, so
    # that a rolling count() tallies the zero weeks.
    s.loc[s['pos_qty_ly']==0, 'zero_median']=1

    # transform() keeps the frame's own index. groupby().apply() prepends the
    # group key on recent pandas, which makes the column assignment fail.
    # min_periods=0 keeps the truncated windows at the start and end of each
    # series counted instead of returning NaN there.
    s['7_window_0_count'] = s.groupby('sell_id')['zero_median'].transform(
        lambda x: x.rolling(7, center=True, min_periods=0).count()
    )

    s['uncommon_dampen_multiplier'] = 1-s['7_window_0_count']/7
    # Common products are never dampened.
    s.loc[s['common_product']==1, 'uncommon_dampen_multiplier']=1
    s['promo_uplift_new'] = s['promo_uplift'] * s['uncommon_dampen_multiplier']
    return s

In [None]:
# Double the ECOMM DIRECT forecast for every week after the last training week,
# for rows flagged as uncommon (0) or common (1) products alike.
ship_merged.loc[
    ((ship_merged['common_product'] == 0) | (ship_merged['common_product'] == 1))
    & (ship_merged['week_ending_date'] > max_train_date)
    & (ship_merged['retailer_desc'] == 'ECOMM DIRECT'),
    'forecast_quantity'
] = ship_merged['forecast_quantity'] * 2

# Promo uplift - to review
# Row-wise: how far last year's quantity exceeds the rolling baseline (see find_promo_uplift).
ship_merged['promo_uplift'] = (
    ship_merged[['week_ending_date', 'second_milestone_date_y', 'pos_qty_ly', 'pos_qty_rolling_6_week_med']]
    .apply(lambda row: find_promo_uplift(*row), axis=1)
)

# DOT and McLane only capture returns (as negatives), so for the affected
# state/retailer pairs the forecast is simply last year's quantity.
ship_merged.loc[
    ship_merged['state'].isin(['AR', 'IL']) & (ship_merged['retailer_desc'] == 'DOT TOTAL'),
    'forecast_quantity'
] = ship_merged['pos_qty_ly']
ship_merged.loc[
    (ship_merged['state'] == 'IL') & (ship_merged['retailer_desc'] == 'MCLANE'),
    'forecast_quantity'
] = ship_merged['pos_qty_ly']

# Discard rows whose forecast is +inf, then blank out the forecast for every
# week before the last training week.
ship_merged = ship_merged[ship_merged['forecast_quantity'].ne(np.inf)]
ship_merged.loc[ship_merged['week_ending_date'] < max_train_date, 'forecast_quantity'] = np.nan


# Promo uplift reduction for uncommon products
# Remember the mean uplift of the common products so that we can verify below
# that dampening left their `promo_uplift` column untouched.
common_promo_mean = ship_merged[ship_merged['common_product']==1]['promo_uplift'].mean()

# Sorts ship_merged by sell_id / week_ending_date and adds `promo_uplift_new`.
ship_merged = uncommon_promo_damp(ship_merged)

# Compare with a tolerance: the rows were re-ordered, so the floating-point sum
# behind the mean may differ in its last bits. equal_nan=True keeps the check
# from failing when there are no common products at all (both means are NaN).
assert np.isclose(
    common_promo_mean,
    ship_merged[ship_merged['common_product']==1]['promo_uplift'].mean(),
    equal_nan=True,
), 'promo_uplift of common products changed during dampening'

# Uncommon products
#ship_merged.loc[~ship_merged['sell_id'].isin(ship_incomplete['sell_id'].unique()),'common_product']=1

# 6. Final Output <a class="anchor" id="seventh-bullet"></a>

### Merging Naive and Milestone Modeling outputs

In [None]:
# Size of the final modelling frame: row count and number of distinct sell ids.
print("Without merging with the states data, number of rows", ship_merged.shape[0])
print("Without merging with the states data, number of product groups", len(ship_merged["sell_id"].unique()))

In [None]:
# Write out Model output parquet file to edge node
# The frame is serialised to parquet in memory, written to the edge node through the
# SFTP connection opened at the top of the notebook, then copied from there into HDFS.
start = time.time()

# Serialise into an in-memory buffer so nothing is written to the local disk.
out_buffer = BytesIO()
ship_merged.to_parquet(out_buffer, index=False)

# NOTE(review): the file name ends in ".gzip", but to_parquet() is called without a
# `compression` argument, so the library default codec is used -- confirm that
# downstream readers do not rely on the extension.
destination_path = "ship_model_results.parquet.gzip"
# destination_path = "ship_model_results.csv"
# Relative remote path; the third positional argument (32768) is presumably the buffer
# size in bytes -- verify against the pysftp documentation.
with edge_node.open(destination_path, 'w+', 32768) as f:
    f.write(out_buffer.getvalue())

print('Writing out took', time.time()-start, 'seconds.')

# Add edge node to Hadoop connection
# Ask the edge node for its working directory; [:-1] strips the last byte of the first
# output line (presumably the trailing newline) before decoding.
edge_node_pwd = edge_node.execute("pwd")[0][:-1].decode("utf-8")
# Copy the uploaded file into the ./CBDA/ directory on HDFS.
# NOTE(review): the command's output is discarded, so a failed `hdfs dfs -put` (for
# example because the target file already exists) would go unnoticed -- confirm.
edge_node.execute(f"hdfs dfs -put {edge_node_pwd}/{destination_path} ./CBDA/")