In [None]:
import pandas as pd
from fbprophet import Prophet
from dateutil.relativedelta import relativedelta as rd
import time
import numpy as np
from datetime import date, timedelta
import os
import glob
import warnings
import plotly.graph_objects as go

# Silence every warning for the rest of the notebook.
# NOTE(review): this is a blanket filter, so pandas/Prophet deprecation and
# copy warnings raised by later cells will not be shown either.
warnings.filterwarnings("ignore")
# Let displayed DataFrames show up to 500 rows before pandas truncates them
pd.set_option('display.max_rows', 500)

# Last week (inclusive) used as training data in the cells below
max_train_date = '2020-02-15' # Do not change since weeks after this started displaying COVID effects

# First week (inclusive) used when comparing the forecast against last year
curr_week_date = '2020-04-18' # Update to latest POS data week

# Directory the per-batch Prophet output feather files are written to / read from
PATH = "./pos_prophet_results/0430"

### Read in Raw POS data

In [None]:
# Read in POS data
# NOTE(review): `pyodbc` and `CONNECTION_STRING` are not imported/defined in any cell
# visible in this notebook; they must be provided before this cell runs on a fresh kernel.
# A pyodbc connection used in a `with` block is only committed on exit, not closed,
# so the connection is closed explicitly here.
conn = pyodbc.connect(CONNECTION_STRING, autocommit=True)
try:
    pos_raw = pd.read_sql("SELECT * FROM default.cbda_pos_model_input", conn)
finally:
    conn.close()

# Drop prefix created when reading the data from Hive table ("table.column" -> "column").
# Names that carry no prefix are kept unchanged instead of raising IndexError.
pos = pos_raw.copy()
pos.columns = [name.split(".")[1] if "." in name else name for name in pos.columns.values]

# Convert to datetime format
pos['week_ending_date'] = pd.to_datetime(pos['week_ending_date'])


def _join_as_key(frame, columns, sep='_'):
    """Return one string per row: str() of each listed column joined with ``sep``.

    Column-wise equivalent of ``frame[columns].apply(lambda row: sep.join(...), axis=1)``;
    it avoids building a Python object per row.
    """
    parts = [frame[column].map(str) for column in columns]
    key = parts[0]
    for part in parts[1:]:
        key = key + sep + part
    return key


# Create a unique identifier for PPG
cols = ['mdlz_business', 'mdlz_category', 'mdlz_brand', 'mdlz_ppg']
pos['ppg_id'] = _join_as_key(pos, cols)

# Create a unique identifier for PPG/State/Retailer
cols = ['ppg_id', 'state', 'retailer']
pos['sell_id'] = _join_as_key(pos, cols)

# BUSINESS FILTERS PROVIDED BY MDLZ
from_2019 = pos['week_ending_date'] > '2019-01-01'  # Week Ending Filter from January 2019
low_value_category = pos['mdlz_category'].isin(['None', 'Cookie', 'Display PRD'])  # Excluding low value categories
blank_ppg = (pos['mdlz_ppg'] == '') | pos['mdlz_ppg'].isnull()  # Excluding blank and null PPG values
blank_hierarchy = (
    (pos['mdlz_business'] == '')
    & (pos['mdlz_category'] == '')
    & (pos['mdlz_brand'] == '')
    & (pos['mdlz_ppg'] != '')
)  # Excluding PPGs with blank product hierarchy
# Remove returns. Null sales are removed by the same test, because NaN > 0 is False.
positive_sales = (pos['pos_dollar'] > 0.0) & (pos['pos_qty'] > 0.0)

keep_rows = from_2019 & ~low_value_category & ~blank_ppg & ~blank_hierarchy & positive_sales

# Filter and sort in one chain so the sort yields a new frame instead of
# modifying a filtered slice in place
pos_main = pos[keep_rows].sort_values(['sell_id', 'week_ending_date'], ascending=[True, True])


In [None]:
# Peek at the first two filtered POS rows
pos_main.head(2)

### Read in and filter for Common Products - with full history

In [None]:
# Lookup of the "common" sell_ids, read from a CSV next to the notebook
common_products_file = './common_sell_id_pos_0418.csv'
common_products = pd.read_csv(common_products_file)

In [None]:
# Display the common-products lookup (pandas shows at most display.max_rows rows)
common_products

In [None]:
## Subset for common sell_ids
# The lookup is de-duplicated first so that a sell_id listed twice in the CSV
# cannot multiply POS rows in the inner merge.
common_sell_ids = common_products[['sell_id']].drop_duplicates()
pos_main_model = pos_main.merge(common_sell_ids, on=['sell_id'], how='inner')

# Training data - Pre-COVID period; Do not update this date
# Sorted for prophet. Filtering and sorting are chained so the sort returns a new
# frame rather than sorting a filtered slice in place.
pos_main_top = (
    pos_main_model[pos_main_model['week_ending_date'] <= max_train_date]
    .sort_values(['sell_id', 'week_ending_date'], ascending=[True, True])
)

### Prophet Modeling

In [None]:
# Group the training rows per sell_id and report how many series will be modeled
product_groups = pos_main_top.groupby(['sell_id'])
n_series = len(product_groups.groups)
print(f'Modeling {n_series} unique sell_ids')

In [None]:
# Run settings for the Prophet loop
BATCH_SIZE = 500                 # sell_id results accumulated before a batch is flushed
FORECAST_WEEKS = 25              # weekly periods forecast past the last training week
CAP_FLOOR_CUTOFF = '2019-08-01'  # cap/floor are taken from actuals on or before this date
# Writing stays switched off (as it was in this cell) because PATH already holds
# feather files; set to True to regenerate them.
WRITE_BATCHES = False


def _fit_and_forecast(train, cap, floor):
    """Fit Prophet on one series and return its history followed by the forecast rows.

    Parameters
    ----------
    train : DataFrame with columns ``ds`` (week ending date) and ``y`` (pos_qty).
    cap, floor : upper / lower saturation values for logistic growth.

    Returns
    -------
    DataFrame with the training rows (ds, y, cap, floor) stacked on top of the
    forecast rows (ds, yhat, trend).
    """
    train = train.copy()
    # Since only 1 year of training data is used, apply logistic growth to prevent overfitting
    train["cap"] = cap
    train["floor"] = floor
    m = Prophet(growth='logistic', yearly_seasonality=20, changepoint_prior_scale=0.05,
                weekly_seasonality=False, daily_seasonality=False, seasonality_prior_scale=1,
                uncertainty_samples=10)
    m.fit(train)

    # NOTE(review): freq='W' is pandas' Sunday-anchored weekly alias, so the forecast
    # dates are Sundays; give_week_ending later in this notebook moves them to Saturday.
    future = m.make_future_dataframe(periods=FORECAST_WEEKS, freq='W', include_history=False)
    future["cap"] = cap
    future["floor"] = floor
    fcst = m.predict(future)[['ds', 'yhat', 'trend']]

    return pd.concat([train, fcst], axis=0, ignore_index=True)


def _flush_batch(frames, batch_no):
    """Concatenate one batch of per-sell_id frames; write it to PATH when WRITE_BATCHES is on."""
    batch_df = pd.concat(frames, axis=0, ignore_index=True)
    if WRITE_BATCHES:
        os.makedirs(PATH, exist_ok=True)
        # Timestamp plus batch number: two flushes in the same second cannot collide
        timestr = time.strftime("%H%M%S")
        filename = f"{timestr}_{batch_no:04d}.feather"
        batch_df.to_feather(os.path.join(PATH, filename))
    return batch_df


# Grouping by the plain column name gives scalar group keys
product_groups = pos_main_top.groupby('sell_id')
total_sell_id = product_groups.ngroups

print(f'Modeling {total_sell_id} unique sell_ids')

# Initialize variables
start_time = time.time()
all_sell_id_list = []
skipped_sell_ids = []
batch_no = 0

# Loop over for each time series and generate forecasts
for big_counter, (key, group) in enumerate(product_groups, start=1):
    train = group[['week_ending_date', 'pos_qty']].reset_index(drop=True)
    train.columns = ['ds', 'y']

    reference = train.loc[train.ds <= CAP_FLOOR_CUTOFF, 'y']
    max_val = reference.max()
    min_val = reference.min()

    # Logistic growth needs a cap strictly above the floor. A series with no rows on or
    # before the cutoff (NaN cap/floor) or a constant one is skipped and reported,
    # rather than letting one bad series abort the whole run.
    if max_val > min_val:
        sell_id_predict_df = _fit_and_forecast(train, max_val, min_val)
        sell_id_predict_df['sell_id'] = key
        all_sell_id_list.append(sell_id_predict_df)
    else:
        skipped_sell_ids.append(key)

    is_final = big_counter == total_sell_id
    if is_final:
        print("Final Sell ID")

    if all_sell_id_list and (is_final or len(all_sell_id_list) == BATCH_SIZE):
        # Flush out the output file along with timestamp.
        # With WRITE_BATCHES off the batch only survives in `output_df` until the next flush.
        batch_no += 1
        output_df = _flush_batch(all_sell_id_list, batch_no)
        if not is_final:
            # Reset the accumulator for the next batch
            all_sell_id_list = []

if not WRITE_BATCHES:
    print("ERROR!! Not writing to feather due to existing feather files!")
if skipped_sell_ids:
    print(f'Skipped {len(skipped_sell_ids)} sell_ids without a usable cap/floor on or before {CAP_FLOOR_CUTOFF}')

end = time.time()

# The elapsed time is interpolated inside the f-string; the earlier form passed
# `{...}` as a separate argument, which printed a Python set such as {12.34}.
print(f"--- Total Training+Predicting --- {np.round(end - start_time, 2)} s")

In [None]:
# Shell out to list the contents of the PATH directory ({PATH} is interpolated by IPython)
!ls {PATH}

### Read in the model output files

In [None]:
# Read back in the smaller feather files and concatenate
print(f"Path: {PATH}/*.feather")
# Sorted so the row order of the combined frame does not depend on the
# order in which the filesystem happens to list the files
all_files = sorted(glob.glob(PATH + "/*.feather"))

# Fail with a clear message instead of pandas' "No objects to concatenate"
if not all_files:
    raise FileNotFoundError(f"No .feather files found under {PATH}")

li = [pd.read_feather(filename) for filename in all_files]

# Keep only date / actual / forecast (plus sell_id) and give them business names
final = (
    pd.concat(li, axis=0, ignore_index=True)
    .drop(["cap", "floor", "trend"], axis=1)
    .rename(columns={'ds': 'week_ending_date', 'y': 'pos_qty', 'yhat': 'forecast_quantity'})
)
# Untouched copy of the combined output, taken before later cells modify `final`
final_raw = final.copy()

In [None]:
# Convert to Saturday
def give_week_ending(date):
    """Return the Saturday of the Monday-to-Sunday week that contains ``date``.

    ``date`` must offer ``weekday()`` and support adding a ``timedelta``.
    A Sunday therefore maps back to the Saturday one day before it.
    """
    days_to_saturday = 5 - date.weekday()  # Monday is 0, Saturday is 5
    return date + timedelta(days=days_to_saturday)

In [None]:
# Move every date onto the Saturday of its week
final['week_ending_date'] = final['week_ending_date'].apply(give_week_ending)

# For the viz: in the last training week, show the actual as the forecast value
is_last_train_week = final['week_ending_date'] == max_train_date
final.loc[is_last_train_week, 'forecast_quantity'] = final['pos_qty']

# Drop the extra last-training-week record, i.e. the one left without a forecast value
is_leftover_duplicate = is_last_train_week & final['forecast_quantity'].isnull()
final = final[~is_leftover_duplicate]

In [None]:
# Create 2020 TY and 2019 LY columns
week_ending = final['week_ending_date']
if hasattr(week_ending.dt, 'isocalendar'):
    # Series.dt.week is deprecated and removed in pandas 2.0; isocalendar().week is the
    # replacement. Cast to int64 to keep the dtype the merge key had before
    # (assumes no missing dates here - TODO confirm).
    week_of_year = week_ending.dt.isocalendar().week.astype('int64')
else:
    # Older pandas without isocalendar()
    week_of_year = week_ending.dt.week

# assign() returns a new frame, so no columns are set on a filtered slice
# NOTE(review): week_of_year is the ISO week while year is the calendar year; the two can
# disagree for dates in the first days of January or last days of December.
final = final.assign(week_of_year=week_of_year, year=week_ending.dt.year)

final_2020 = final[final['year'] == 2020].drop('year', axis=1)
final_2019 = final.loc[final['year'] == 2019, ['sell_id', 'week_of_year', 'pos_qty']]

# Line up each 2020 week with the same week number of 2019 for the same sell_id;
# only pos_qty exists on both sides, so it becomes pos_qty_ty / pos_qty_ly
final_new = final_2020.merge(final_2019, on=['sell_id', 'week_of_year'], how='left',
                             suffixes=('_ty', '_ly'))

In [None]:
# Identify if products are showing very high growth or decline
# Forecast vs. last year's actual, in absolute and relative terms
final_diff = final_new.assign(
    difference=lambda frame: frame["forecast_quantity"] - frame["pos_qty_ly"]
)
final_diff = final_diff.assign(
    percent_diff=final_diff['difference'] / final_diff['pos_qty_ly']
)

# Only weeks from the current POS week onward count towards the average
from_current_week = final_diff.week_ending_date >= curr_week_date
final_diff = final_diff[from_current_week]
final_diff_agg = (
    final_diff
    .groupby("sell_id")
    .agg({"percent_diff": "mean"})
    .reset_index()
)

# If high growth / decline then identify the product as such so as not to double count/apply the growth
mean_abs_change = final_diff_agg['percent_diff'].abs()
final_diff_agg['high_growth_decline'] = mean_abs_change > 0.5

In [None]:
# Write out consolidated prophet model output feather file, to be read back in POS_2_Modeling.ipynb
# Keep only the weeks after the training window and rename the forecast column
after_training = final_new[final_new.week_ending_date > max_train_date]
prht_out = (
    after_training
    .drop(["pos_qty_ty", "week_of_year", 'pos_qty_ly'], axis=1)
    .rename(columns={'forecast_quantity': 'prht_frcst'})
    .reset_index(drop=True)
)

# Attach the high growth / decline flag per sell_id
growth_flags = final_diff_agg[["sell_id", "high_growth_decline"]]
prht_out = prht_out.merge(growth_flags, how="inner", on="sell_id")

prht_out.to_feather(f'./pos_prophet_results/0430_cleaned/pos_0430_cleaned_v2.feather')

In [None]:
# Display the frame that was just written out (pandas shows at most display.max_rows rows)
prht_out

### Visualize the Output

In [None]:
# Top sell_ids by pos_qty
qty_by_sell_id = final.groupby('sell_id')['pos_qty'].sum().reset_index()
ranked_by_qty = qty_by_sell_id.sort_values('pos_qty', ascending=False)
# First ten sell_ids after sorting by total quantity, largest first
top_sell_ids = ranked_by_qty['sell_id'].head(10)

In [None]:
# Plot top 10 sell_ids
for sell_id in top_sell_ids:
    to_chart = final[final['sell_id'] == sell_id]
    weeks = to_chart['week_ending_date']

    print(sell_id)

    # One line for the actuals, one for the forecast
    actual_trace = go.Scatter(
        x=weeks,
        y=to_chart['pos_qty'],
        name='pos_qty_ty',
        line=dict(color='blue', width=2),
    )
    forecast_trace = go.Scatter(
        x=weeks,
        y=to_chart['forecast_quantity'],
        name='forecast_quantity',
        line=dict(color='red', width=2),
    )

    fig = go.Figure()
    fig.add_trace(actual_trace)
    fig.add_trace(forecast_trace)
    fig.update_layout(title=sell_id)
    fig.show()