In [0]:
!pip install xgboost
!pip install xlrd==1.2.0
!pip install eli5
!pip install shap

In [0]:
import pandas as pd
import numpy as np
import datetime
from datetime import datetime as dt
from datetime import timedelta
from sklearn.ensemble import RandomForestRegressor
import pickle
import xgboost
import os
import warnings
import math
import eli5
import shap
from pandas.tseries.offsets import BDay
warnings.filterwarnings('ignore')

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [0]:
# Run identifiers; the phase-2 date selects the best-model lookup table read below
run_date_phase_1 = '2022_02_25'
run_date_phase_2 = '2022_03_20'

# Best-model lookup per material (Spark table pulled into pandas)
material_best_models = f'material_best_model_vlookup_{run_date_phase_2}'
material_best_model = spark.read.table(material_best_models).toPandas()

# SMI data (Excel workbook; the path below is a placeholder to be filled in)
smi_raw_data = 'enter file path to SMI_data_bricks_raw_data_csv.xlsx'
smi_1 = pd.read_excel(smi_raw_data)

# Financial data (Excel workbook; the path below is a placeholder to be filled in)
risk_raw_data = 'enter file path to Risk_Support_raw_data.xlsx'
fin = pd.read_excel(risk_raw_data)

# Order frequency data
freq_raw_data = 'order_freq_19_01'
freqc = spark.read.table(freq_raw_data)
freq = freqc.toPandas()

# Historical model data, keeping only rows whose PO_Create_Date is not the
# '1888-01-01' value (presumably a placeholder date) and whose delivered
# quantity is non-zero
lead_time_mod = 'model_data_vf_19_01'
historical_data = spark.read.table(lead_time_mod).toPandas()
keep_rows = (
  (historical_data['PO_Create_Date'] != '1888-01-01')
  & (historical_data['Delivered_Quantity'] != 0)
)
historical_data = historical_data[keep_rows]

In [0]:
# Updating vendor-site geography codes on the model data.
# Sites 'madison' and 'payette' in region 'id' are assigned country 'us'.
# A single vectorised mask replaces the former row-by-row loop, which re-ran
# the same full-frame assignment once for every matching row.
idaho_sites = (
  historical_data['updated_VS_location'].isin(['madison', 'payette'])
  & (historical_data['updated_VS_region'] == 'id')
)
historical_data.loc[idaho_sites, 'updated_VS_country'] = 'us'

# Regions that are themselves country codes set the country directly.
historical_data.loc[historical_data['updated_VS_region']=='de','updated_VS_country']='de'
historical_data.loc[historical_data['updated_VS_region']=='br','updated_VS_country']='br'
# Region 'vn' is recoded to 'vt' for both country and region; create_exw_vars
# looks the country up under the 'vt' code, so the order of these two lines matters.
historical_data.loc[historical_data['updated_VS_region']=='vn','updated_VS_country']='vt'
historical_data.loc[historical_data['updated_VS_country']=='vt','updated_VS_region']='vt'

In [0]:
# Load the 'weather_data_weekly' table into pandas and expose its 'name'
# column as 'location', the key the weather helpers filter and join on.
daily_weather_historical_v1 = (
  spark.read.table('weather_data_weekly')
  .toPandas()
  .rename(columns={'name': 'location'})
)

In [0]:
#Helper returning the PO+2W, PO+4W and PO+6W dates plus the vendor location key
def po_delta_week_dates(po_create_date, vs_location, vs_region):
  """Return the dates 2, 4 and 6 weeks after a PO date, and the location key.

  Args:
    po_create_date: PO creation date; anything pd.to_datetime accepts.
    vs_location: vendor-site location name (string).
    vs_region: vendor-site region code (string).

  Returns:
    Tuple (po_date_2w, po_date_4w, po_date_6w, location): the three dates are
    'YYYY-MM-DD' strings 14, 28 and 42 days after po_create_date, and location
    is '<vs_location>,<vs_region>'.
  """
  base_date = pd.to_datetime(po_create_date)
  shifted_dates = [
    (base_date + timedelta(days=day_offset)).strftime('%Y-%m-%d')
    for day_offset in (14, 28, 42)
  ]
  location_key = vs_location + ',' + vs_region
  return (*shifted_dates, location_key)

In [0]:
#Reading weather forecast data
daily_weather_15day_forecast = spark.read.table('weather_forecast_ves').toPandas()
daily_weather_15day_forecast['datetime'] = pd.to_datetime(daily_weather_15day_forecast['datetime'])
for text_col in ('conditions', 'icon'):
  daily_weather_15day_forecast[text_col] = daily_weather_15day_forecast[text_col].astype(str)

# Precipitation only counts as rainfall on days whose 'conditions' text
# contains 'Rain' (case-sensitive); every other day contributes 0.
daily_weather_15day_forecast['rainfall_mm'] = [
  precip if 'Rain' in conditions else 0
  for conditions, precip in zip(daily_weather_15day_forecast['conditions'],
                                daily_weather_15day_forecast['precip'])
]

# ISO week number of each forecast day. Series.dt.week was deprecated in
# pandas 1.1 and removed in 2.0, so use isocalendar() when it is available and
# keep the dtype .dt.week used to produce (int64, or float64 with NaN when a
# date is missing). Older pandas without isocalendar() keeps using .dt.week.
forecast_dt = daily_weather_15day_forecast['datetime'].dt
if hasattr(forecast_dt, 'isocalendar'):
  forecast_weeks = forecast_dt.isocalendar().week
  forecast_weeks = forecast_weeks.astype('float64' if forecast_weeks.isna().any() else 'int64')
else:
  forecast_weeks = forecast_dt.week
daily_weather_15day_forecast['Weeks'] = forecast_weeks

# Daily 0/1 flags derived from the 'icon' text
for flag_col, keyword in (('fog_count', 'fog'), ('rain_count', 'rain'), ('snow_count', 'snow')):
  daily_weather_15day_forecast[flag_col] = [
    1 if keyword in icon else 0 for icon in daily_weather_15day_forecast['icon']
  ]
daily_weather_15day_forecast['Years'] = daily_weather_15day_forecast['datetime'].dt.year

# Weekly mean of the continuous measures per location
daily_weather_15day_forecast_v1 = pd.pivot_table(
  daily_weather_15day_forecast,
  values=['tempmax','tempmin','snowdepth','windspeed','rainfall_mm'],
  index=['location','Weeks'],
  aggfunc='mean',
).reset_index()

# Event-day counts: summed within each (location, week, year), then averaged
# across years so every (location, week) ends up with a single row
daily_weather_15day_forecast_v121 = pd.pivot_table(
  daily_weather_15day_forecast,
  values=['fog_count','rain_count','snow_count'],
  index=['location','Weeks','Years'],
  aggfunc='sum',
).reset_index()
daily_weather_15day_forecast_v122 = pd.pivot_table(
  daily_weather_15day_forecast_v121,
  values=['fog_count','rain_count','snow_count'],
  index=['location','Weeks'],
  aggfunc='mean',
).reset_index()

daily_weather_15day_forecast_v1 = daily_weather_15day_forecast_v1.merge(
  daily_weather_15day_forecast_v122, on=['location','Weeks'], how='left')

In [0]:
#Function for creating daily weather variables from historical weather, historical weekly weather average and weather forecast
def _iso_week_numbers(dates):
  """Return the ISO week number of every entry in a datetime Series.

  Series.dt.week was deprecated in pandas 1.1 and removed in 2.0, so
  Series.dt.isocalendar().week is preferred and cast back to the int64 dtype
  that .dt.week returned. pandas versions without isocalendar() keep using
  .dt.week. `dates` must not contain NaT (the int64 cast would fail).
  """
  accessor = dates.dt
  if hasattr(accessor, 'isocalendar'):
    return accessor.isocalendar().week.astype('int64')
  return accessor.week


def create_daily_weather_vars(po_date, vs_location, vs_region, mat_id):
  """Build the eight weather features for one purchase order.

  The horizon (2, 4 or 6 weeks after the PO date) is chosen from the 75th
  percentile of the material's positive historical lead times:
  <= 21 days -> 2 weeks, <= 35 days -> 4 weeks, otherwise (including when no
  positive lead time exists) 6 weeks. Whether that horizon is fed from the
  weekly historical table or from the weekly forecast table is decided by
  comparing the PO+2W/4W/6W dates with the material's latest historical
  PO_Create_Date (rules 1-4 below).

  Only the selected horizon/source combination is computed; the values are the
  same ones the earlier implementation returned after computing all of them.

  Args:
    po_date: PO creation date (string or timestamp).
    vs_location: vendor-site location name.
    vs_region: vendor-site region code.
    mat_id: material number, matched against historical_data['Material_No.'].

  Returns:
    List [rainfall_mm, snowdepth, tempmax, tempmin, windspeed, fog_count,
    snow_count, rain_count]; tempmin is the minimum over the horizon, every
    other entry the maximum. Entries are NaN when the chosen weather table has
    no rows for the location/weeks.

  Raises:
    ValueError: if historical_data holds no rows for mat_id.

  Side effects:
    Converts historical_data['PO_Create_Date'] to datetime in place (once).
  """
  # Shared frame is converted in place, as before, but only when still needed
  if not pd.api.types.is_datetime64_any_dtype(historical_data['PO_Create_Date']):
    historical_data['PO_Create_Date'] = pd.to_datetime(historical_data['PO_Create_Date'])

  material_history = historical_data[historical_data['Material_No.']==mat_id]
  if material_history.empty:
    # The builtin max() used previously raised ValueError here too; keep the
    # exception type but say which material is missing
    raise ValueError('no historical rows found for material {!r}'.format(mat_id))

  # Series.max() skips NaT, unlike the builtin max() over the column
  last_historical_date = material_history['PO_Create_Date'].max()
  # 75th percentile of positive lead times (what describe()[6] picked by position)
  lead_time_cutoff = material_history.loc[material_history['lead_time']>0, 'lead_time'].quantile(0.75)

  po_date_2w, po_date_4w, po_date_6w, location = po_delta_week_dates(po_date, vs_location, vs_region)

  # Horizon index into the (2W, 4W, 6W) tuples below; a NaN cutoff lands on 6W
  if lead_time_cutoff <= 21:
    horizon = 0
  elif lead_time_cutoff <= 35:
    horizon = 1
  else:
    horizon = 2

  last_seen = pd.to_datetime(last_historical_date)
  po_ts, ts_2w, ts_4w, ts_6w = (
    pd.to_datetime(value) for value in (po_date, po_date_2w, po_date_4w, po_date_6w)
  )

  # One flag per horizon (2W, 4W, 6W): True -> forecast table, False -> historical table
  if ts_6w <= last_seen:
    #Rule 1: PO + 6W is not after the last historical date -> all historical
    use_forecast = (False, False, False)
  elif ts_2w <= last_seen and ts_4w >= last_seen:
    #Rule 2: PO + 2W is covered by history but PO + 4W is not -> 2W historical, 4W/6W forecast
    use_forecast = (False, True, True)
  elif ts_4w <= last_seen and ts_6w >= last_seen:
    #Rule 3: PO + 4W is covered by history but PO + 6W is not -> 2W/4W historical, 6W forecast
    use_forecast = (False, False, True)
  elif po_ts >= last_seen or (po_ts <= last_seen and ts_2w >= last_seen):
    #Rule 4: PO date or PO + 2W is at/after the last historical date -> all forecast
    use_forecast = (True, True, True)
  else:
    #No rule matched (e.g. last historical date is NaT) -> default to historical
    use_forecast = (False, False, False)

  weekly_weather = daily_weather_15day_forecast_v1 if use_forecast[horizon] else daily_weather_historical_v1
  horizon_end = (po_date_2w, po_date_4w, po_date_6w)[horizon]

  # One row per calendar day from the PO date to PO + 6W, tagged with its ISO week
  calendar = pd.DataFrame(pd.date_range(start=po_date, end=po_date_6w), columns=['dates'])
  calendar['location'] = location
  calendar['Weeks'] = _iso_week_numbers(calendar['dates'])

  # Attach the weekly weather rows of this location to each day, then cut to the horizon
  local_weather = weekly_weather[weekly_weather['location']==location]
  daily_weather = pd.merge(calendar, local_weather, on=['location','Weeks'], how='left')
  in_horizon = daily_weather[(daily_weather['dates']>=po_date) & (daily_weather['dates']<=horizon_end)]

  feature_columns = ['rainfall_mm', 'snowdepth', 'tempmax', 'tempmin', 'windspeed',
                     'fog_count', 'snow_count', 'rain_count']
  # tempmin is the only feature reported as a minimum
  return [
    in_horizon[column].min() if column == 'tempmin' else in_horizon[column].max()
    for column in feature_columns
  ]

In [0]:
# Extreme weather event tables, one for vendors within the US and one for
# vendors outside it; missing values are replaced with 0 after loading.
extreme_weather_event_US = (
  spark.read.table('Extreme_weather_data_for_vendors_within_US')
  .toPandas()
  .fillna(0)
)

extreme_weather_event_outside_US = (
  spark.read.table('Extreme_weather_data_for_vendors_outside_US')
  .toPandas()
  .fillna(0)
)

In [0]:
# Extreme weather event columns kept as features, one entry per line.
# create_exw_vars returns its values in exactly this order (counts first,
# then durations), so the order here must not change.
imp_ext_weather_count_cols = [
  'winter_storm_count',
  'flash_flood_count',
  'heat_count',
  'hail_count',
  'thunderstorm_wind_count',
  'lightning_count',
  'high_wind_count',
  'wildfire_count',
  'heavy_snow_count',
  'strong_wind_count',
  'winter_weather_count',
  'extreme_cold/wind_chill_count',
  'flood_count',
  'heavy_rain_count',
  'blizzard_count',
  'cold/wind_chill_count',
  'drought_count',
  'tornado_count',
  'lakeshore_flood_count',
  'frost/freeze_count',
  'tropical_storm_count',
  'coastal_flood_count',
  'marine_high_wind_count',
  'marine_thunderstorm_wind_count',
  'storm_surge/tide_count',
  'tropical_depression_count',
  'dense_fog_count',
  'hurricane_count',
  'marine_tropical_storm_count',
  'ice_storm_count',
  'marine_dense_fog_count',
  'excessive_heat_count',
  'waterspout_count',
  'dense_smoke_count',
  'sleet_count',
]

imp_ext_weather_duration_cols = [
  'marine_high_wind_duration',
  'marine_thunderstorm_wind_duration',
  'storm_surge/tide_duration',
  'waterspout_duration',
  'tropical_depression_duration',
  'marine_tropical_storm_duration',
]

In [0]:
#Function for creating extreme weather events variables
def create_exw_vars(po_create_date, vs_location, vs_region, vs_country):
  """Build the extreme weather event features for one purchase order.

  Rows are taken from the calendar months spanned by the PO date and the
  PO + 6 weeks date (wrapping over the year end when needed). Vendors in
  'us', 'mx' or 'ca' are looked up by location name in
  extreme_weather_event_US; every other vendor is looked up by coast side in
  extreme_weather_event_outside_US.

  Args:
    po_create_date: PO creation date (string or timestamp).
    vs_location: vendor-site location name.
    vs_region: vendor-site region code.
    vs_country: vendor-site country code.

  Returns:
    List with one value per entry of imp_ext_weather_count_cols (sum of the
    column over the selected rows) followed by one value per entry of
    imp_ext_weather_duration_cols (mean of the column over the selected rows
    multiplied by the count of the matching '<event>_count' column). A column
    missing from the table yields 0; a duration with no selected rows is NaN.

  Raises:
    ValueError: if vs_country is outside us/mx/ca and has no coast mapping.

  NOTE(review): durations used to be multiplied by whatever count was left over
  from the last count column ('sleet_count'). They now use the matching event's
  count, which changes those feature values - re-validate any model trained on
  the previous computation.
  """
  _, _, po_date_6w, location = po_delta_week_dates(po_create_date, vs_location, vs_region)
  first_month = pd.to_datetime(po_create_date).month
  last_month = pd.to_datetime(po_date_6w).month

  #Vendors in US, Mexico and Canada are matched by location name
  if vs_country in ['us', 'mx', 'ca']:
    events = extreme_weather_event_US
    place_mask = events['name'] == location
  #Vendors outside US, Mexico and Canada are matched by coast side
  else:
    coast_by_country = {'au': 'West', 'tw': 'West', 'vt': 'West',
                        'es': 'East', 'de': 'East', 'br': 'East'}
    if vs_country not in coast_by_country:
      # Previously surfaced as an UnboundLocalError on coast_side
      raise ValueError('no coast side is defined for vendor country {!r}'.format(vs_country))
    events = extreme_weather_event_outside_US
    place_mask = events['Coast'] == coast_by_country[vs_country]

  if first_month < last_month:
    month_mask = (events['Months'] >= first_month) & (events['Months'] <= last_month)
  else:
    # Window wraps over the year end (e.g. December -> January)
    month_mask = (events['Months'] >= first_month) | (events['Months'] <= last_month)
  window_rows = events[place_mask & month_mask]

  # Event counts, also kept by column name so durations can find their own count
  counts_by_col = {}
  for count_col in imp_ext_weather_count_cols:
    if count_col in events.columns:
      counts_by_col[count_col] = window_rows[count_col].dropna().sum()
    else:
      counts_by_col[count_col] = 0
  count_col_list = [counts_by_col[count_col] for count_col in imp_ext_weather_count_cols]

  duration_col_list = []
  for dur_col in imp_ext_weather_duration_cols:
    # Column presence is checked on the table actually being read
    if dur_col not in events.columns:
      overall_event_duration = 0
    else:
      mean_duration = window_rows[dur_col].dropna().mean()
      matching_count_col = dur_col[:-len('_duration')] + '_count'
      overall_event_duration = mean_duration * counts_by_col.get(matching_count_col, 0)
    duration_col_list.append(overall_event_duration)

  return count_col_list+duration_col_list

In [0]:
# Traffic incident data: missing values are replaced with 0 first, then both
# incident date columns are parsed to datetimes (order kept from the original).
# NOTE(review): because fillna(0) runs before parsing, a missing incident date
# is parsed from 0 rather than left missing - confirm this is intended.
traffic_data_final = spark.read.table('Traffic_dataset_raw').toPandas().fillna(0)
for date_col in ('INCIDENT_START_DATE', 'INCIDENT_END_DATE'):
  traffic_data_final[date_col] = pd.to_datetime(traffic_data_final[date_col])

In [0]:
#Function for traffic variables
def create_traffic_vars(po_create_date, plant_id, vs_location, vs_region, vs_id, mat_id):
  """Count traffic incidents per delay type for one purchase order.

  Reads the module-level frames ``historical_data`` and ``traffic_data_final``
  and calls ``po_delta_week_dates`` to obtain the 2/4/6-week horizon dates.

  The look-ahead window is picked from the 75th percentile of the material's
  positive historical lead times:
    * cutoff <= 21        -> window ends at the 2-week date
    * 21 < cutoff <= 35   -> window ends at the 4-week date
    * otherwise           -> window ends at the 6-week date (this includes a
                             NaN cutoff, i.e. no positive lead-time history)

  For every distinct ``DELAY_TYPE`` in ``traffic_data_final`` the count is
    (incidents on this plant/vendor pair starting inside
     [po_create_date, window end])
    + (incidents on this plant/vendor pair already started and not yet ended
       on po_create_date).

  Parameters:
    po_create_date: PO creation date, compared against the incident dates.
    plant_id: value matched against ``traffic_data_final['Plant_ID']``.
    vs_location, vs_region: forwarded unchanged to ``po_delta_week_dates``.
    vs_id: value matched against ``traffic_data_final['updated_VS_ID']``.
    mat_id: value matched against ``historical_data['Material_No.']``.

  Returns:
    list of int, one count per distinct ``DELAY_TYPE`` in the order returned
    by ``traffic_data_final['DELAY_TYPE'].unique()``.
  """
  # 75th percentile of the positive lead times. This is the same statistic as
  # the '75%' row of describe(), but obtained without integer-position access
  # on a label-indexed Series (deprecated in recent pandas releases).
  positive_lead_times = historical_data.loc[(historical_data['Material_No.']==mat_id) &
                                            (historical_data['lead_time']>0), 'lead_time']
  lead_time_cutoff = positive_lead_times.quantile(0.75)
  po_date_2w, po_date_4w, po_date_6w, _ = po_delta_week_dates(po_create_date, vs_location, vs_region)

  # Only one horizon is ever returned, so choose it before filtering.
  if lead_time_cutoff <= 21:
    window_end = po_date_2w
  elif lead_time_cutoff <= 35:
    window_end = po_date_4w
  else:
    window_end = po_date_6w

  # Masks that do not depend on the delay type are built once.
  incident_start = traffic_data_final['INCIDENT_START_DATE']
  incident_end = traffic_data_final['INCIDENT_END_DATE']
  on_route = (traffic_data_final['Plant_ID']==plant_id) & (traffic_data_final['updated_VS_ID']==vs_id)
  starts_in_window = on_route & (incident_start>=po_create_date) & (incident_start<=window_end)
  # NOTE(review): both filters include equality on the start date, so an
  # incident that starts exactly on po_create_date (and has not ended) is
  # counted twice. Kept as-is to match the existing feature definition --
  # confirm whether that is intended.
  ongoing_at_po = on_route & (incident_start<=po_create_date) & (incident_end>=po_create_date)

  col_list = []
  for event_type in traffic_data_final['DELAY_TYPE'].unique().tolist():
    is_event_type = traffic_data_final['DELAY_TYPE']==event_type
    col_list.append(int((starts_in_window & is_event_type).sum()) +
                    int((ongoing_at_po & is_event_type).sum()))

  return col_list

In [0]:
# Historical port-congestion table: read from Spark, convert to pandas and
# replace every missing value with 0.
port_data_hist = (
    spark.read.table('Port_congestion_data')
    .toPandas()
    .fillna(0)
)

In [0]:
# Display the distinct country codes present in the historical port data.
pd.unique(port_data_hist['Country'])

In [0]:
# Reading forecast port congestion and tagging each row with its ISO week number.
port_data = spark.read.table('ves_import_congestion_01_03')
port_data = port_data.toPandas()
# Series.dt.week was deprecated in pandas 1.1 and removed in 2.0;
# dt.isocalendar().week yields the same ISO week numbers.
forecast_iso_week = pd.to_datetime(port_data['dates']).dt.isocalendar().week
# isocalendar() returns a nullable UInt32 column. Cast back to the plain numpy
# dtype the old accessor produced: int64, or float64 with NaN when a date is missing.
port_data['Week'] = forecast_iso_week.astype('float64' if forecast_iso_week.isna().any() else 'int64')

In [0]:
#Function for creating port congestion variables
def create_port_congestion_vars(historical_data, po_create_date, vs_location, vs_region, vs_country, mat_id):
  historical_data['PO_Create_Date'] = pd.to_datetime(historical_data['PO_Create_Date'])
  historical_data_port_subset = historical_data[historical_data['Material_No.']==mat_id]
  last_historical_date = max(historical_data_port_subset['PO_Create_Date'])
  lead_time_cutoff = historical_data[(historical_data['Material_No.']==mat_id) & 
                                     (historical_data['lead_time']>0)]['lead_time'].describe()[6]
  po_date_2w, po_date_4w, po_date_6w, location = po_delta_week_dates(po_create_date, vs_location, vs_region)
  PO_week=pd.to_datetime(po_create_date).week
  PO_2week=pd.to_datetime(po_date_2w).week
  PO_4week=pd.to_datetime(po_date_4w).week
  PO_6week=pd.to_datetime(po_date_6w).week
  
  port_congestion_col_dict = {}
  #Creating historical port congestion variables
  if vs_country in ['ca', 'mx']:
    port_congestion_col_dict['Port_wait_time_mean_2w'] = 0
    port_congestion_col_dict['Port_wait_time_median_2w'] = 0
    port_congestion_col_dict['Port_wait_time_max_2w'] = 0
    port_congestion_col_dict['Port_wait_time_mean_4w'] = 0
    port_congestion_col_dict['Port_wait_time_median_4w'] = 0
    port_congestion_col_dict['Port_wait_time_max_4w'] = 0
    port_congestion_col_dict['Port_wait_time_mean_6w'] = 0
    port_congestion_col_dict['Port_wait_time_median_6w'] = 0
    port_congestion_col_dict['Port_wait_time_max_6w'] = 0
  if vs_country=='us': #(vendors within US)
    if PO_week<PO_2week:
      port_congestion_col_dict['Port_wait_time_mean_2w']=port_data_hist[(port_data_hist['PortState']==vs_region) & 
                                                                        (port_data_hist['Week']>=PO_week) & 
                                                                        (port_data_hist['Week']<=PO_2week)]['TotalWait'].mean()
      port_congestion_col_dict['Port_wait_time_median_2w']=port_data_hist[(port_data_hist['PortState']==vs_region) & 
                                                                          (port_data_hist['Week']>=PO_week) & 
                                                                          (port_data_hist['Week']<=PO_2week)]['TotalWait'].median()
      port_congestion_col_dict['Port_wait_time_max_2w']=port_data_hist[(port_data_hist['PortState']==vs_region) & 
                                                                       (port_data_hist['Week']>=PO_week) & 
                                                                       (port_data_hist['Week']<=PO_2week)]['TotalWait'].max()
    else:
      port_congestion_col_dict['Port_wait_time_mean_2w']=port_data_hist[(port_data_hist['PortState']==vs_region) & 
                                                                        ((port_data_hist['Week']>=PO_week) | 
                                                                        (port_data_hist['Week']<=PO_2week))]['TotalWait'].mean()
      port_congestion_col_dict['Port_wait_time_median_2w']=port_data_hist[(port_data_hist['PortState']==vs_region) & 
                                                                          ((port_data_hist['Week']>=PO_week) | 
                                                                          (port_data_hist['Week']<=PO_2week))]['TotalWait'].median()
      port_congestion_col_dict['Port_wait_time_max_2w']=port_data_hist[(port_data_hist['PortState']==vs_region) & 
                                                                       ((port_data_hist['Week']>=PO_week) | 
                                                                       (port_data_hist['Week']<=PO_2week))]['TotalWait'].max()
    
    if PO_week<PO_4week:
      port_congestion_col_dict['Port_wait_time_mean_4w']=port_data_hist[(port_data_hist['PortState']==vs_region) & 
                                                                        (port_data_hist['Week']>=PO_week) & 
                                                                        (port_data_hist['Week']<=PO_4week)]['TotalWait'].mean()
      port_congestion_col_dict['Port_wait_time_median_4w']=port_data_hist[(port_data_hist['PortState']==vs_region) & 
                                                                          (port_data_hist['Week']>=PO_week) & 
                                                                          (port_data_hist['Week']<=PO_4week)]['TotalWait'].median()
      port_congestion_col_dict['Port_wait_time_max_4w']=port_data_hist[(port_data_hist['PortState']==vs_region) & 
                                                                       (port_data_hist['Week']>=PO_week) & 
                                                                       (port_data_hist['Week']<=PO_4week)]['TotalWait'].max()
    else:
      port_congestion_col_dict['Port_wait_time_mean_4w']=port_data_hist[(port_data_hist['PortState']==vs_region) & 
                                                                        ((port_data_hist['Week']>=PO_week) | 
                                                                        (port_data_hist['Week']<=PO_4week))]['TotalWait'].mean()
      port_congestion_col_dict['Port_wait_time_median_4w']=port_data_hist[(port_data_hist['PortState']==vs_region) & 
                                                                          ((port_data_hist['Week']>=PO_week) | 
                                                                          (port_data_hist['Week']<=PO_4week))]['TotalWait'].median()
      port_congestion_col_dict['Port_wait_time_max_4w']=port_data_hist[(port_data_hist['PortState']==vs_region) & 
                                                                       ((port_data_hist['Week']>=PO_week) | 
                                                                       (port_data_hist['Week']<=PO_4week))]['TotalWait'].max()
    
    if PO_week<PO_6week:
      port_congestion_col_dict['Port_wait_time_mean_6w']=port_data_hist[(port_data_hist['PortState']==vs_region) & 
                                                                        (port_data_hist['Week']>=PO_week) & 
                                                                        (port_data_hist['Week']<=PO_6week)]['TotalWait'].mean()
      port_congestion_col_dict['Port_wait_time_median_6w']=port_data_hist[(port_data_hist['PortState']==vs_region) & 
                                                                          (port_data_hist['Week']>=PO_week) & 
                                                                          (port_data_hist['Week']<=PO_6week)]['TotalWait'].median()
      port_congestion_col_dict['Port_wait_time_max_6w']=port_data_hist[(port_data_hist['PortState']==vs_region) & 
                                                                       (port_data_hist['Week']>=PO_week) & 
                                                                       (port_data_hist['Week']<=PO_6week)]['TotalWait'].max()
    else:
      port_congestion_col_dict['Port_wait_time_mean_6w']=port_data_hist[(port_data_hist['PortState']==vs_region) & 
                                                                        ((port_data_hist['Week']>=PO_week) | 
                                                                        (port_data_hist['Week']<=PO_6week))]['TotalWait'].mean()
      port_congestion_col_dict['Port_wait_time_median_6w']=port_data_hist[(port_data_hist['PortState']==vs_region) & 
                                                                          ((port_data_hist['Week']>=PO_week) | 
                                                                          (port_data_hist['Week']<=PO_6week))]['TotalWait'].median()
      port_congestion_col_dict['Port_wait_time_max_6w']=port_data_hist[(port_data_hist['PortState']==vs_region) & 
                                                                       ((port_data_hist['Week']>=PO_week) | 
                                                                       (port_data_hist['Week']<=PO_6week))]['TotalWait'].max()
    
  elif ((vs_country=='au') | (vs_country=='vt') | (vs_country=='tw')): #(vendors in APAC)
    if PO_week<PO_2week:
      port_congestion_col_dict['Port_wait_time_mean_2w']=port_data_hist[((port_data_hist['Country']==vs_country) |
                                                                         (port_data_hist['Coast']=='West Coast'))& 
                                                                        (port_data_hist['Week']>=PO_week) & 
                                                                        (port_data_hist['Week']<=PO_2week)]['TotalWait'].mean()
      port_congestion_col_dict['Port_wait_time_median_2w']=port_data_hist[((port_data_hist['Country']==vs_country) |(port_data_hist['Coast']=='West Coast'))& 
                                                (port_data_hist['Week']>=PO_week) & (port_data_hist['Week']<=PO_2week)]['TotalWait'].median()
      port_congestion_col_dict['Port_wait_time_max_2w']=port_data_hist[((port_data_hist['Country']==vs_country) |(port_data_hist['Coast']=='West Coast'))& 
                                             (port_data_hist['Week']>=PO_week) & (port_data_hist['Week']<=PO_2week)]['TotalWait'].max()
    else:
      port_congestion_col_dict['Port_wait_time_mean_2w']=port_data_hist[((port_data_hist['Country']==vs_country) |
                                                                         (port_data_hist['Coast']=='West Coast'))& 
                                                                        ((port_data_hist['Week']>=PO_week) | 
                                                                        (port_data_hist['Week']<=PO_2week))]['TotalWait'].mean()
      port_congestion_col_dict['Port_wait_time_median_2w']=port_data_hist[((port_data_hist['Country']==vs_country) |(port_data_hist['Coast']=='West Coast'))& 
                                                ((port_data_hist['Week']>=PO_week) | (port_data_hist['Week']<=PO_2week))]['TotalWait'].median()
      port_congestion_col_dict['Port_wait_time_max_2w']=port_data_hist[((port_data_hist['Country']==vs_country) |(port_data_hist['Coast']=='West Coast'))& 
                                             ((port_data_hist['Week']>=PO_week) | (port_data_hist['Week']<=PO_2week))]['TotalWait'].max()
    
    if PO_week<PO_4week:
      port_congestion_col_dict['Port_wait_time_mean_4w']=port_data_hist[((port_data_hist['Country']==vs_country) |(port_data_hist['Coast']=='West Coast'))& (port_data_hist['Week']>=PO_week) & (port_data_hist['Week']<=PO_4week)]['TotalWait'].mean()
      port_congestion_col_dict['Port_wait_time_median_4w']=port_data_hist[((port_data_hist['Country']==vs_country) |(port_data_hist['Coast']=='West Coast'))& (port_data_hist['Week']>=PO_week) & (port_data_hist['Week']<=PO_4week)]['TotalWait'].median()
      port_congestion_col_dict['Port_wait_time_max_4w']=port_data_hist[((port_data_hist['Country']==vs_country) |(port_data_hist['Coast']=='West Coast'))& (port_data_hist['Week']>=PO_week) & (port_data_hist['Week']<=PO_4week)]['TotalWait'].max()
    else:
      port_congestion_col_dict['Port_wait_time_mean_4w']=port_data_hist[((port_data_hist['Country']==vs_country) |(port_data_hist['Coast']=='West Coast'))& ((port_data_hist['Week']>=PO_week) | (port_data_hist['Week']<=PO_4week))]['TotalWait'].mean()
      port_congestion_col_dict['Port_wait_time_median_4w']=port_data_hist[((port_data_hist['Country']==vs_country) |(port_data_hist['Coast']=='West Coast'))& ((port_data_hist['Week']>=PO_week) | (port_data_hist['Week']<=PO_4week))]['TotalWait'].median()
      port_congestion_col_dict['Port_wait_time_max_4w']=port_data_hist[((port_data_hist['Country']==vs_country) |(port_data_hist['Coast']=='West Coast'))& ((port_data_hist['Week']>=PO_week) | (port_data_hist['Week']<=PO_4week))]['TotalWait'].max()
    
    if PO_week<PO_6week:
      port_congestion_col_dict['Port_wait_time_mean_6w']=port_data_hist[((port_data_hist['Country']==vs_country) |(port_data_hist['Coast']=='West Coast'))& (port_data_hist['Week']>=PO_week) & (port_data_hist['Week']<=PO_6week)]['TotalWait'].mean()
      port_congestion_col_dict['Port_wait_time_median_6w']=port_data_hist[((port_data_hist['Country']==vs_country) |(port_data_hist['Coast']=='West Coast'))& (port_data_hist['Week']>=PO_week) & (port_data_hist['Week']<=PO_6week)]['TotalWait'].median()
      port_congestion_col_dict['Port_wait_time_max_6w']=port_data_hist[((port_data_hist['Country']==vs_country) |(port_data_hist['Coast']=='West Coast'))& (port_data_hist['Week']>=PO_week) & (port_data_hist['Week']<=PO_6week)]['TotalWait'].max()
    else:
      port_congestion_col_dict['Port_wait_time_mean_6w']=port_data_hist[((port_data_hist['Country']==vs_country) |(port_data_hist['Coast']=='West Coast'))& ((port_data_hist['Week']>=PO_week) | (port_data_hist['Week']<=PO_6week))]['TotalWait'].mean()
      port_congestion_col_dict['Port_wait_time_median_6w']=port_data_hist[((port_data_hist['Country']==vs_country) |(port_data_hist['Coast']=='West Coast'))& ((port_data_hist['Week']>=PO_week) | (port_data_hist['Week']<=PO_6week))]['TotalWait'].median()
      port_congestion_col_dict['Port_wait_time_max_6w']=port_data_hist[((port_data_hist['Country']==vs_country) |(port_data_hist['Coast']=='West Coast'))& ((port_data_hist['Week']>=PO_week) | (port_data_hist['Week']<=PO_6week))]['TotalWait'].max()
    
    
  elif ((vs_country=='es') | (vs_country=='br') | (vs_country=='de')): #(vendors in South America and EU)
    if PO_week<PO_2week:
      port_congestion_col_dict['Port_wait_time_mean_2w']=port_data_hist[((port_data_hist['Country']==vs_country) |(port_data_hist['Coast']=='East Coast'))& (port_data_hist['Week']>=PO_week) & (port_data_hist['Week']<=PO_2week)]['TotalWait'].mean()
      port_congestion_col_dict['Port_wait_time_median_2w']=port_data_hist[((port_data_hist['Country']==vs_country) |(port_data_hist['Coast']=='East Coast'))& (port_data_hist['Week']>=PO_week) & (port_data_hist['Week']<=PO_2week)]['TotalWait'].median()
      port_congestion_col_dict['Port_wait_time_max_2w']=port_data_hist[((port_data_hist['Country']==vs_country) |(port_data_hist['Coast']=='East Coast'))& (port_data_hist['Week']>=PO_week) & (port_data_hist['Week']<=PO_2week)]['TotalWait'].max()
    else:
      port_congestion_col_dict['Port_wait_time_mean_2w']=port_data_hist[((port_data_hist['Country']==vs_country) |(port_data_hist['Coast']=='East Coast'))& ((port_data_hist['Week']>=PO_week) | (port_data_hist['Week']<=PO_2week))]['TotalWait'].mean()
      port_congestion_col_dict['Port_wait_time_median_2w']=port_data_hist[((port_data_hist['Country']==vs_country) |(port_data_hist['Coast']=='East Coast'))& ((port_data_hist['Week']>=PO_week) | (port_data_hist['Week']<=PO_2week))]['TotalWait'].median()
      port_congestion_col_dict['Port_wait_time_max_2w']=port_data_hist[((port_data_hist['Country']==vs_country) |(port_data_hist['Coast']=='East Coast'))& ((port_data_hist['Week']>=PO_week) | (port_data_hist['Week']<=PO_2week))]['TotalWait'].max()
    
    if PO_week<PO_4week:
      port_congestion_col_dict['Port_wait_time_mean_4w']=port_data_hist[((port_data_hist['Country']==vs_country) |(port_data_hist['Coast']=='East Coast'))& (port_data_hist['Week']>=PO_week) & (port_data_hist['Week']<=PO_4week)]['TotalWait'].mean()
      port_congestion_col_dict['Port_wait_time_median_4w']=port_data_hist[((port_data_hist['Country']==vs_country) |(port_data_hist['Coast']=='East Coast'))& (port_data_hist['Week']>=PO_week) & (port_data_hist['Week']<=PO_4week)]['TotalWait'].median()
      port_congestion_col_dict['Port_wait_time_max_4w']=port_data_hist[((port_data_hist['Country']==vs_country) |(port_data_hist['Coast']=='East Coast'))& (port_data_hist['Week']>=PO_week) & (port_data_hist['Week']<=PO_4week)]['TotalWait'].max()
    else:
      port_congestion_col_dict['Port_wait_time_mean_4w']=port_data_hist[((port_data_hist['Country']==vs_country) |(port_data_hist['Coast']=='East Coast'))& ((port_data_hist['Week']>=PO_week) | (port_data_hist['Week']<=PO_4week))]['TotalWait'].mean()
      port_congestion_col_dict['Port_wait_time_median_4w']=port_data_hist[((port_data_hist['Country']==vs_country) |(port_data_hist['Coast']=='East Coast'))& ((port_data_hist['Week']>=PO_week) | (port_data_hist['Week']<=PO_4week))]['TotalWait'].median()
      port_congestion_col_dict['Port_wait_time_max_4w']=port_data_hist[((port_data_hist['Country']==vs_country) |(port_data_hist['Coast']=='East Coast'))& ((port_data_hist['Week']>=PO_week) | (port_data_hist['Week']<=PO_4week))]['TotalWait'].max()
    
    if PO_week<PO_6week:
      port_congestion_col_dict['Port_wait_time_mean_6w']=port_data_hist[((port_data_hist['Country']==vs_country) |(port_data_hist['Coast']=='East Coast'))& (port_data_hist['Week']>=PO_week) & (port_data_hist['Week']<=PO_6week)]['TotalWait'].mean()
      port_congestion_col_dict['Port_wait_time_median_6w']=port_data_hist[((port_data_hist['Country']==vs_country) |(port_data_hist['Coast']=='East Coast'))& (port_data_hist['Week']>=PO_week) & (port_data_hist['Week']<=PO_6week)]['TotalWait'].median()
      port_congestion_col_dict['Port_wait_time_max_6w']=port_data_hist[((port_data_hist['Country']==vs_country) |(port_data_hist['Coast']=='East Coast'))& (port_data_hist['Week']>=PO_week) & (port_data_hist['Week']<=PO_6week)]['TotalWait'].max()
    else:
      port_congestion_col_dict['Port_wait_time_mean_6w']=port_data_hist[((port_data_hist['Country']==vs_country) |(port_data_hist['Coast']=='East Coast'))& ((port_data_hist['Week']>=PO_week) | (port_data_hist['Week']<=PO_6week))]['TotalWait'].mean()
      port_congestion_col_dict['Port_wait_time_median_6w']=port_data_hist[((port_data_hist['Country']==vs_country) |(port_data_hist['Coast']=='East Coast'))& ((port_data_hist['Week']>=PO_week) | (port_data_hist['Week']<=PO_6week))]['TotalWait'].median()
      port_congestion_col_dict['Port_wait_time_max_6w']=port_data_hist[((port_data_hist['Country']==vs_country) |(port_data_hist['Coast']=='East Coast'))& ((port_data_hist['Week']>=PO_week) | (port_data_hist['Week']<=PO_6week))]['TotalWait'].max()
  
  ##Creating forecast port congestion variables
  ##If PO date is greater than last historical date then take port congestion variables from port congestion forecast data
  if (pd.to_datetime(po_create_date) >= pd.to_datetime(last_historical_date)) or (pd.to_datetime(po_create_date) <= pd.to_datetime(last_historical_date) and pd.to_datetime(po_date_2w) >= pd.to_datetime(last_historical_date)):
    if vs_country == 'us':
      if PO_week<PO_2week:
        port_congestion_col_dict['Port_wait_time_mean_2w']= port_data[(port_data['PortState']==vs_region) & 
                                                                               (port_data['Week']>=PO_week) & 
                                                                               (port_data['Week']<=PO_2week)]['TotalWait'].mean()
        port_congestion_col_dict['Port_wait_time_median_2w']= port_data[(port_data['PortState']==vs_region) & 
                                                                                 (port_data['Week']>=PO_week) & 
                                                                                 (port_data['Week']<=PO_2week)]['TotalWait'].median()
        port_congestion_col_dict['Port_wait_time_max_2w']= port_data[(port_data['PortState']==vs_region) & 
                                                                                 (port_data['Week']>=PO_week) & 
                                                                                 (port_data['Week']<=PO_2week)]['TotalWait'].max()
      else:
        port_congestion_col_dict['Port_wait_time_mean_2w']= port_data[(port_data['PortState']==vs_region) & 
                                                                               ((port_data['Week']>=PO_week) | 
                                                                               (port_data['Week']<=PO_2week))]['TotalWait'].mean()
        port_congestion_col_dict['Port_wait_time_median_2w']= port_data[(port_data['PortState']==vs_region) & 
                                                                                 ((port_data['Week']>=PO_week) | 
                                                                                 (port_data['Week']<=PO_2week))]['TotalWait'].median()
        port_congestion_col_dict['Port_wait_time_max_2w']= port_data[(port_data['PortState']==vs_region) & 
                                                                                 ((port_data['Week']>=PO_week) | 
                                                                                 (port_data['Week']<=PO_2week))]['TotalWait'].max()
      
      if PO_week<PO_4week:
        port_congestion_col_dict['Port_wait_time_mean_4w']= port_data[(port_data['PortState']==vs_region) & 
                                                                               (port_data['Week']>=PO_week) & 
                                                                               (port_data['Week']<=PO_4week)]['TotalWait'].mean()
        port_congestion_col_dict['Port_wait_time_median_4w']= port_data[(port_data['PortState']==vs_region) & 
                                                                                 (port_data['Week']>=PO_week) & 
                                                                                 (port_data['Week']<=PO_4week)]['TotalWait'].median()
        port_congestion_col_dict['Port_wait_time_max_4w']= port_data[(port_data['PortState']==vs_region) & 
                                                                                 (port_data['Week']>=PO_week) & 
                                                                                 (port_data['Week']<=PO_4week)]['TotalWait'].max()
      else:
        port_congestion_col_dict['Port_wait_time_mean_4w']= port_data[(port_data['PortState']==vs_region) & 
                                                                               ((port_data['Week']>=PO_week) | 
                                                                               (port_data['Week']<=PO_4week))]['TotalWait'].mean()
        port_congestion_col_dict['Port_wait_time_median_4w']= port_data[(port_data['PortState']==vs_region) & 
                                                                                 ((port_data['Week']>=PO_week) | 
                                                                                 (port_data['Week']<=PO_4week))]['TotalWait'].median()
        port_congestion_col_dict['Port_wait_time_max_4w']= port_data[(port_data['PortState']==vs_region) & 
                                                                                 ((port_data['Week']>=PO_week) | 
                                                                                 (port_data['Week']<=PO_4week))]['TotalWait'].max()
      
      if PO_week<PO_6week:
        port_congestion_col_dict['Port_wait_time_mean_6w']= port_data[(port_data['PortState']==vs_region) & 
                                                                               (port_data['Week']>=PO_week) & 
                                                                               (port_data['Week']<=PO_6week)]['TotalWait'].mean()
        port_congestion_col_dict['Port_wait_time_median_6w']= port_data[(port_data['PortState']==vs_region) & 
                                                                                 (port_data['Week']>=PO_week) & 
                                                                                 (port_data['Week']<=PO_6week)]['TotalWait'].median()
        port_congestion_col_dict['Port_wait_time_max_6w']= port_data[(port_data['PortState']==vs_region) & 
                                                                                 (port_data['Week']>=PO_week) & 
                                                                                 (port_data['Week']<=PO_6week)]['TotalWait'].max()
      else:
        port_congestion_col_dict['Port_wait_time_mean_6w']= port_data[(port_data['PortState']==vs_region) & 
                                                                               ((port_data['Week']>=PO_week) | 
                                                                               (port_data['Week']<=PO_6week))]['TotalWait'].mean()
        port_congestion_col_dict['Port_wait_time_median_6w']= port_data[(port_data['PortState']==vs_region) & 
                                                                                 ((port_data['Week']>=PO_week) | 
                                                                                 (port_data['Week']<=PO_6week))]['TotalWait'].median()
        port_congestion_col_dict['Port_wait_time_max_6w']= port_data[(port_data['PortState']==vs_region) & 
                                                                                 ((port_data['Week']>=PO_week) | 
                                                                                 (port_data['Week']<=PO_6week))]['TotalWait'].max()
    elif ((vs_country=='au') | (vs_country=='vt') | (vs_country=='tw')):
      if PO_week<PO_2week:
        port_congestion_col_dict['Port_wait_time_mean_2w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='West Coast'))&
                                                                               (port_data['Week']>=PO_week) & 
                                                                               (port_data['Week']<=PO_2week)]['TotalWait'].mean()
        port_congestion_col_dict['Port_wait_time_median_2w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='West Coast'))&
                                                                                 (port_data['Week']>=PO_week) & 
                                                                                 (port_data['Week']<=PO_2week)]['TotalWait'].median()
        port_congestion_col_dict['Port_wait_time_max_2w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='West Coast'))&
                                                                                 (port_data['Week']>=PO_week) & 
                                                                                 (port_data['Week']<=PO_2week)]['TotalWait'].max()
      else:
        port_congestion_col_dict['Port_wait_time_mean_2w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='West Coast'))&
                                                                               ((port_data['Week']>=PO_week) | 
                                                                               (port_data['Week']<=PO_2week))]['TotalWait'].mean()
        port_congestion_col_dict['Port_wait_time_median_2w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='West Coast'))&
                                                                                 ((port_data['Week']>=PO_week) | 
                                                                                 (port_data['Week']<=PO_2week))]['TotalWait'].median()
        port_congestion_col_dict['Port_wait_time_max_2w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='West Coast'))&
                                                                                 ((port_data['Week']>=PO_week) | 
                                                                                 (port_data['Week']<=PO_2week))]['TotalWait'].max()
      
      if PO_week<PO_4week:
        port_congestion_col_dict['Port_wait_time_mean_4w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='West Coast'))&
                                                                               (port_data['Week']>=PO_week) & 
                                                                               (port_data['Week']<=PO_4week)]['TotalWait'].mean()
        port_congestion_col_dict['Port_wait_time_median_4w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='West Coast'))&
                                                                                 (port_data['Week']>=PO_week) & 
                                                                                 (port_data['Week']<=PO_4week)]['TotalWait'].median()
        port_congestion_col_dict['Port_wait_time_max_4w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='West Coast'))&
                                                                                 (port_data['Week']>=PO_week) & 
                                                                                 (port_data['Week']<=PO_4week)]['TotalWait'].max()
      else:
        port_congestion_col_dict['Port_wait_time_mean_4w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='West Coast'))&
                                                                               ((port_data['Week']>=PO_week) | 
                                                                               (port_data['Week']<=PO_4week))]['TotalWait'].mean()
        port_congestion_col_dict['Port_wait_time_median_4w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='West Coast'))&
                                                                                 ((port_data['Week']>=PO_week) | 
                                                                                 (port_data['Week']<=PO_4week))]['TotalWait'].median()
        port_congestion_col_dict['Port_wait_time_max_4w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='West Coast'))&
                                                                                 ((port_data['Week']>=PO_week) | 
                                                                                 (port_data['Week']<=PO_4week))]['TotalWait'].max()
      
      if PO_week < PO_6week:
        port_congestion_col_dict['Port_wait_time_mean_6w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='West Coast'))&
                                                                               (port_data['Week']>=PO_week) & 
                                                                               (port_data['Week']<=PO_6week)]['TotalWait'].mean()
        port_congestion_col_dict['Port_wait_time_median_6w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='West Coast'))&
                                                                                 (port_data['Week']>=PO_week) & 
                                                                                 (port_data['Week']<=PO_6week)]['TotalWait'].median()
        port_congestion_col_dict['Port_wait_time_max_6w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='West Coast'))&
                                                                                 (port_data['Week']>=PO_week) & 
                                                                                 (port_data['Week']<=PO_6week)]['TotalWait'].max()
      else:
        port_congestion_col_dict['Port_wait_time_mean_6w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='West Coast'))&
                                                                               ((port_data['Week']>=PO_week) | 
                                                                               (port_data['Week']<=PO_6week))]['TotalWait'].mean()
        port_congestion_col_dict['Port_wait_time_median_6w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='West Coast'))&
                                                                                 ((port_data['Week']>=PO_week) | 
                                                                                 (port_data['Week']<=PO_6week))]['TotalWait'].median()
        port_congestion_col_dict['Port_wait_time_max_6w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='West Coast'))&
                                                                                 ((port_data['Week']>=PO_week) | 
                                                                                 (port_data['Week']<=PO_6week))]['TotalWait'].max()
    elif ((vs_country=='es') | (vs_country=='br') | (vs_country=='de')):
      if PO_week<PO_2week:
        port_congestion_col_dict['Port_wait_time_mean_2w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='East Coast'))&
                                                                               (port_data['Week']>=PO_week) & 
                                                                               (port_data['Week']<=PO_2week)]['TotalWait'].mean()
        port_congestion_col_dict['Port_wait_time_median_2w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='East Coast'))&
                                                                                 (port_data['Week']>=PO_week) & 
                                                                                 (port_data['Week']<=PO_2week)]['TotalWait'].median()
        port_congestion_col_dict['Port_wait_time_max_2w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='East Coast'))&
                                                                                 (port_data['Week']>=PO_week) & 
                                                                                 (port_data['Week']<=PO_2week)]['TotalWait'].max()
      else:
        port_congestion_col_dict['Port_wait_time_mean_2w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='East Coast'))&
                                                                               ((port_data['Week']>=PO_week) | 
                                                                               (port_data['Week']<=PO_2week))]['TotalWait'].mean()
        port_congestion_col_dict['Port_wait_time_median_2w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='East Coast'))&
                                                                                 ((port_data['Week']>=PO_week) | 
                                                                                 (port_data['Week']<=PO_2week))]['TotalWait'].median()
        port_congestion_col_dict['Port_wait_time_max_2w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='East Coast'))&
                                                                                 ((port_data['Week']>=PO_week) | 
                                                                                 (port_data['Week']<=PO_2week))]['TotalWait'].max()
      if PO_week<PO_4week:
        port_congestion_col_dict['Port_wait_time_mean_4w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='East Coast'))&
                                                                               (port_data['Week']>=PO_week) & 
                                                                               (port_data['Week']<=PO_4week)]['TotalWait'].mean()
        port_congestion_col_dict['Port_wait_time_median_4w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='East Coast'))&
                                                                                 (port_data['Week']>=PO_week) & 
                                                                                 (port_data['Week']<=PO_4week)]['TotalWait'].median()
        port_congestion_col_dict['Port_wait_time_max_4w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='East Coast'))&
                                                                                 (port_data['Week']>=PO_week) & 
                                                                                 (port_data['Week']<=PO_4week)]['TotalWait'].max()
      else:
        port_congestion_col_dict['Port_wait_time_mean_4w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='East Coast'))&
                                                                               ((port_data['Week']>=PO_week) | 
                                                                               (port_data['Week']<=PO_4week))]['TotalWait'].mean()
        port_congestion_col_dict['Port_wait_time_median_4w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='East Coast'))&
                                                                                 ((port_data['Week']>=PO_week) | 
                                                                                 (port_data['Week']<=PO_4week))]['TotalWait'].median()
        port_congestion_col_dict['Port_wait_time_max_4w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='East Coast'))&
                                                                                 ((port_data['Week']>=PO_week) | 
                                                                                 (port_data['Week']<=PO_4week))]['TotalWait'].max()
      if PO_week<PO_6week:
        port_congestion_col_dict['Port_wait_time_mean_6w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='East Coast'))&
                                                                               (port_data['Week']>=PO_week) & 
                                                                               (port_data['Week']<=PO_6week)]['TotalWait'].mean()
        port_congestion_col_dict['Port_wait_time_median_4w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='East Coast'))&
                                                                                 (port_data['Week']>=PO_week) & 
                                                                                 (port_data['Week']<=PO_6week)]['TotalWait'].median()
        port_congestion_col_dict['Port_wait_time_max_6w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='East Coast'))&
                                                                                 (port_data['Week']>=PO_week) & 
                                                                                 (port_data['Week']<=PO_6week)]['TotalWait'].max()
      else:
        port_congestion_col_dict['Port_wait_time_mean_6w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='East Coast'))&
                                                                               ((port_data['Week']>=PO_week) | 
                                                                               (port_data['Week']<=PO_6week))]['TotalWait'].mean()
        port_congestion_col_dict['Port_wait_time_median_4w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='East Coast'))&
                                                                                 ((port_data['Week']>=PO_week) | 
                                                                                 (port_data['Week']<=PO_6week))]['TotalWait'].median()
        port_congestion_col_dict['Port_wait_time_max_6w']= port_data[((port_data['Country']==vs_country) |
                                                                               (port_data['Coast']=='East Coast'))&
                                                                                 ((port_data['Week']>=PO_week) | 
                                                                                 (port_data['Week']<=PO_6week))]['TotalWait'].max()
  if lead_time_cutoff <= 21:
    col_list = [port_congestion_col_dict[key] for key in list(port_congestion_col_dict.keys()) if '_2w' in key]
  elif lead_time_cutoff > 21 and lead_time_cutoff <= 35:
    col_list = [port_congestion_col_dict[key] for key in list(port_congestion_col_dict.keys()) if '_4w' in key]
  else:
    col_list = [port_congestion_col_dict[key] for key in list(port_congestion_col_dict.keys()) if '_6w' in key]
  
  return col_list

In [0]:
#Feature creation : "impact to business"
# 'Impact types' is the leading label of 'Impact to Business' (the text before the
# first ' - '). The vectorised .str accessor leaves a missing entry as NaN instead of
# raising AttributeError the way a Python-level .split() on a float NaN does.
smi_1['Impact types']=smi_1['Impact to Business'].str.split(' - ').str[0]

#cleaning data removing null values, defining the datatypes and renaming columns
# Rows without a vendor number or vendor site cannot be cast to integer ids, so they
# are removed before the casts below.
smi_1.dropna(subset=['Vendor Number','Vendor Site'],inplace=True)
smi_1.rename(columns={'Vendor Number':'Vendor',
                      'Vendor Site':'updated_VS_ID',
                      'PO Number':'Purchase_Order',
                      'Material Number':'Material_No.',
                      'Material Order Quantity':'Purchase_Order_Scheduled_Qty'},inplace=True)
# Each id column is cast exactly once (the original cast them both before and after the rename).
smi_1['Vendor']=smi_1['Vendor'].astype(np.int64)
smi_1['updated_VS_ID']=smi_1['updated_VS_ID'].astype(np.int64)
smi_1['Material_No.']=smi_1['Material_No.'].astype(np.int64)
smi_1['Purchase_Order_Scheduled_Qty']=smi_1['Purchase_Order_Scheduled_Qty'].astype(float)

In [0]:
def smi_var(smi_data, po_create_date, vs_id, mat_id):
  """Count supplier complaints for one vendor site / material up to a PO creation date.

  Every filter is evaluated on ``smi_data`` only. The previous implementation mixed in
  the notebook-level ``smi_1`` frame for the complaint-type counts, which only worked
  when the caller happened to pass ``smi_1`` itself.

  Parameters
  ----------
  smi_data : pandas.DataFrame
      Complaint records. Columns read: 'updated_VS_ID', 'Material_No.',
      'Incident Date', 'Impact types', 'Status' and 'Complaint Types'.
  po_create_date : value accepted by ``pd.to_datetime`` and comparable with 'Incident Date'
      Upper bound (inclusive) of every look-back window.
  vs_id : value compared for equality with 'updated_VS_ID'
  mat_id : value compared for equality with 'Material_No.'

  Returns
  -------
  list of int
      23 values in this order:
      complaints in the last 90 days, in the last 365 days,
      90-day Major, 90-day Minor, 365-day Major, 365-day Minor,
      90-day Open, 90-day Open Major, 90-day Open Minor,
      365-day counts for the complaint types Documentation, Packaging, Transportation,
      Ingredient, Consumer, Foreign Material, Indigenous Material,
      then a 0/1 flag per complaint type (same order) telling whether its count is non-zero.
  """
  l3m_date = pd.to_datetime(po_create_date) - timedelta(90)
  l12m_date = pd.to_datetime(po_create_date) - timedelta(365)

  incident_date = smi_data['Incident Date']
  # Conditions shared by every count: same vendor site, same material, incident not after the PO date.
  for_site_material = ((smi_data['updated_VS_ID']==vs_id) &
                       (smi_data['Material_No.']==mat_id) &
                       (incident_date<=po_create_date))
  in_l3m = for_site_material & (incident_date>=l3m_date)
  in_l12m = for_site_material & (incident_date>=l12m_date)

  is_major = smi_data['Impact types']=='Major'
  is_minor = smi_data['Impact types']=='Minor'
  is_open = smi_data['Status']=='Open'

  def count(mask):
    # Number of rows selected by a boolean mask, as a plain Python int.
    return int(mask.sum())

  # Order matters: it is the order in which the per-type values appear in the returned list.
  complaint_types = ['Documentation', 'Packaging', 'Transportation', 'Ingredient',
                     'Consumer', 'Foreign Material', 'Indigenous Material']
  type_counts = [count(in_l12m & (smi_data['Complaint Types']==complaint_type))
                 for complaint_type in complaint_types]
  type_flags = [1 if type_count != 0 else 0 for type_count in type_counts]

  smi_var_list = [count(in_l3m),
                  count(in_l12m),
                  count(in_l3m & is_major),
                  count(in_l3m & is_minor),
                  count(in_l12m & is_major),
                  count(in_l12m & is_minor),
                  count(in_l3m & is_open),
                  count(in_l3m & is_open & is_major),
                  count(in_l3m & is_open & is_minor)] + type_counts + type_flags
  return smi_var_list

In [0]:
# Names of the SMI (supplier complaint) feature columns that are kept.
# The order mirrors the list returned by smi_var, position by position.
smi_col_list = [
    # complaint counts over the 3 / 12 month look-back windows
    'no.of.complaint_3_months',
    'no.of.complaint_12_months',
    'no.of.complaint_3_months_major',
    'no.of.complaint_3_months_minor',
    'no.of.complaint_12_months_major',
    'no.of.complaint_12_months_minor',
    'no.of.complaint_3_months_open',
    'no.of.complaint_3_months_open_major',
    'no.of.complaint_3_months_open_minor',
    # 12 month counts per complaint type
    'no.of.Complaint_type_Documentation_l12m',
    'no.of.Complaint_type_Packaging_l12m',
    'no.of.Complaint_type_Transportation_l12m',
    'no.of.Complaint_type_Ingredient_l12m',
    'no.of.Complaint_type_Consumer_l12m',
    'no.of.Complaint_type_Foreign_Material_l12m',
    'no.of.Complaint_type_Indigenous_Material_l12m',
    # 0/1 presence flags per complaint type
    'Complaint_type_Documentation_l12m_distinct',
    'Complaint_type_Packaging_l12m_distinct',
    'Complaint_type_Transportation_l12m_distinct',
    'Complaint_type_Ingredient_l12m_distinct',
    'Complaint_type_Consumer_l12m_distinct',
    'Complaint_type_Foreign_Material_l12m_distinct',
    'Complaint_type_Indigenous_Material_l12m_distinct',
]

In [0]:
# Financial risk data: discard suppliers without a risk level, then swap the
# 'Risk Level' and 'Rating' columns for their one-hot encoded counterparts.
fin.drop(index=fin.index[fin['Risk Level'].isnull()],inplace=True)

# data1 holds the two columns to encode, data2 their dummy (indicator) columns.
data1=fin[['Risk Level','Rating']].copy()
data2=pd.get_dummies(data1)

# Append the indicators, remove the raw columns and align the supplier key name
# ('Supplier Number' -> 'Vendor').
financial=(pd.concat([fin,data2],axis=1)
           .drop(columns=['Risk Level','Rating'])
           .rename(columns={'Supplier Number':'Vendor'}))

In [0]:
# One-hot encode the order-frequency category: the 'Delivery_freq' column is replaced by
# indicator columns named 'Delivery_freq_<value>' (e.g. the 'Delivery_freq_Daily' column
# that po_qty_to_del_qty reads from this frame).
# NOTE(review): the cell rebinds `freq`, so it appears not to be re-runnable once
# 'Delivery_freq' has been consumed - confirm before re-executing on a live kernel.
freq=pd.get_dummies(freq,columns=['Delivery_freq'])

In [0]:
#po_qty_to_del_qty ratio attribute generation
def po_qty_to_del_qty(historical_data, plant_id, vs_id, mat_id, po_create_date, po_qty):
  """Ratio of a PO quantity to the recent average delivered quantity.

  The average is taken over the most recent deliveries (ordered by 'Delivery_date') of
  the given plant / vendor site / material. The number of deliveries averaged depends
  on the delivery frequency found in the notebook-level ``freq`` frame:
  5 for 'Daily', 3 for 'Weekly', 2 for anything else (including an unknown frequency).
  When fewer deliveries exist, all of them are averaged.

  Parameters
  ----------
  historical_data : pandas.DataFrame
      Columns read: 'Plant_ID', 'Material_No.', 'updated_VS_ID', 'Delivery_date',
      'Delivered_Quantity'. Side effect: 'Delivery_date' is converted to datetime
      in place on this frame if it is not already a datetime column.
  plant_id, vs_id, mat_id : values compared for equality with the key columns
  po_create_date : unused
      NOTE(review): the history is not restricted to deliveries made before this
      date - confirm whether that is intended.
  po_qty : number
      Quantity of the purchase order being scored.

  Returns
  -------
  float
      ``po_qty`` divided by the moving average of 'Delivered_Quantity'.

  Raises
  ------
  IndexError
      When no history row matches the plant / vendor site / material (unchanged
      from the previous implementation).
  """
  # Parse the dates only once; on later calls the column is already datetime.
  if not pd.api.types.is_datetime64_any_dtype(historical_data['Delivery_date']):
    historical_data['Delivery_date'] = pd.to_datetime(historical_data['Delivery_date'])

  history = historical_data[(historical_data['Plant_ID']==plant_id) &
                            (historical_data['Material_No.']==mat_id) &
                            (historical_data['updated_VS_ID']==vs_id)]
  history = history.sort_values(['Delivery_date'])

  # Frequency rows for the same key; the explicit copy avoids writing into a view of `freq`.
  freq_cols = [col for col in freq.columns.tolist() if col != 'days']
  freq_df = freq[(freq['Plant_ID']==plant_id) &
                 (freq['updated_VS_ID']==vs_id) &
                 (freq['Material_No.']==mat_id)][freq_cols].drop_duplicates().copy()

  # Collapse the one-hot 'Delivery_freq_<label>' columns back into a single label.
  # A label whose dummy column does not exist in `freq` is skipped instead of raising KeyError.
  freq_df['Delivery_Freq.'] = None
  for label in ['Daily', 'Weekly', 'Half Yearly', 'Monthly', 'Quarterly', 'Yearly']:
    dummy_col = 'Delivery_freq_' + label
    if dummy_col in freq_df.columns:
      freq_df.loc[freq_df[dummy_col]==1, 'Delivery_Freq.'] = label
  freq_df = freq_df.drop(columns=[col for col in freq_df.columns.tolist() if '_freq_' in col])

  history = history.merge(freq_df, on=['Plant_ID', 'updated_VS_ID', 'Material_No.'], how='left')

  # Frequency of the first history row decides the window; .iloc[0] raises IndexError on an empty history.
  delivery_freq = history['Delivery_Freq.'].iloc[0]
  if delivery_freq == 'Daily':
    window = 5
  elif delivery_freq == 'Weekly':
    window = 3
  else:
    window = 2

  # tail(window) returns every row when the history is shorter than the window.
  MovingAverage = history.tail(window)['Delivered_Quantity'].mean()

  POqty_to_AvgDelQty = po_qty/MovingAverage
  return POqty_to_AvgDelQty

In [0]:
# list of columns to keep for models using internal and external variables
def create_col_list_order_int_ext(df):
  """Ordered column list for the models that use internal and external variables.

  The list is a fixed leading section, followed by every column of ``df`` whose name
  contains one of the substrings in ``dynamic_patterns`` (grouped pattern by pattern,
  keeping the column order of ``df`` inside each group), followed by a fixed trailing
  section. Only ``df.columns`` is read.
  """
  available_cols = df.columns.tolist()

  # Fixed leading section: vendor / distance / quantity features and complaint counts.
  leading_cols = [
      'vendor_percentage', 'vendor_material_cnt_per_plant', 'distance', 'POqty_to_AvgDelQty',
      'no.of.complaint_3_months', 'no.of.complaint_12_months',
      'no.of.complaint_3_months_major', 'no.of.complaint_3_months_minor',
      'no.of.complaint_12_months_major', 'no.of.complaint_12_months_minor',
      'no.of.complaint_3_months_open', 'no.of.complaint_3_months_open_major',
      'no.of.complaint_3_months_open_minor',
      'no.of.Complaint_type_Documentation_l12m', 'no.of.Complaint_type_Packaging_l12m',
      'no.of.Complaint_type_Transportation_l12m', 'no.of.Complaint_type_Ingredient_l12m',
      'no.of.Complaint_type_Consumer_l12m', 'no.of.Complaint_type_Foreign_Material_l12m',
      'no.of.Complaint_type_Indigenous_Material_l12m',
  ]

  # Substrings that select columns out of df; the pattern order fixes the group order,
  # and a column matching several patterns is listed once per matching pattern.
  dynamic_patterns = ['_2W', '_4W', '_6W', 'Port_wait_time_',
                      'closed_road_or_infinte_delay_count_', 'moderate_count_', 'major_count_']
  dynamic_cols = []
  for pattern in dynamic_patterns:
    dynamic_cols.extend(col for col in available_cols if pattern in col)

  # Fixed trailing section, part 1: weather-event counts and durations.
  weather_event_cols = [
      'coastal_flood_count', 'high_wind_count', 'dense_smoke_count', 'tropical_storm_count',
      'wildfire_count', 'winter_weather_count', 'blizzard_count', 'cold/wind_chill_count',
      'tropical_depression_count', 'hurricane_count', 'heavy_rain_count',
      'marine_tropical_storm_count', 'lakeshore_flood_count', 'sleet_count', 'drought_count',
      'marine_dense_fog_count', 'waterspout_count', 'frost/freeze_count',
      'thunderstorm_wind_count', 'dense_fog_count', 'strong_wind_count',
      'marine_thunderstorm_wind_count', 'tornado_count', 'hail_count', 'storm_surge/tide_count',
      'marine_high_wind_count', 'winter_storm_count', 'extreme_cold/wind_chill_count',
      'heat_count', 'lightning_count', 'excessive_heat_count', 'flash_flood_count',
      'flood_count', 'heavy_snow_count', 'ice_storm_count',
      'waterspout_duration', 'storm_surge/tide_duration', 'marine_high_wind_duration',
      'marine_tropical_storm_duration', 'tropical_depression_duration',
      'marine_thunderstorm_wind_duration',
  ]

  # Fixed trailing section, part 2: encoded categorical attributes.
  categorical_cols = [
      'dom_or_int_int', 'sku_type_single', 'nestle_managed_freight_yes',
      'mode_of_transport_road', 'Material_group_type_R', 'single_or_multisource_single',
      'Risk_Level_High', 'Risk_Level_Low', 'Risk_Level_Medium',
      'Rating_A', 'Rating_B', 'Rating_C', 'Rating_D', 'Rating_D_-_FND', 'Rating_F',
      'Rating_Inactive', 'Rating_Out_of_Scope',
      'Delivery_freq_Daily', 'Delivery_freq_Half_Yearly', 'Delivery_freq_Monthly',
      'Delivery_freq_Quarterly', 'Delivery_freq_Weekly', 'Delivery_freq_Yearly',
  ]

  # Fixed trailing section, part 3: complaint-type presence flags.
  complaint_flag_cols = [
      'Complaint_type_Documentation_l12m_distinct', 'Complaint_type_Packaging_l12m_distinct',
      'Complaint_type_Transportation_l12m_distinct', 'Complaint_type_Ingredient_l12m_distinct',
      'Complaint_type_Consumer_l12m_distinct', 'Complaint_type_Foreign_Material_l12m_distinct',
      'Complaint_type_Indigenous_Material_l12m_distinct',
  ]

  col_list_order = (leading_cols + dynamic_cols + weather_event_cols
                    + categorical_cols + complaint_flag_cols)
  return col_list_order

In [0]:
#Getting columns for daily weather data
def getting_daily_weather_col_list(historical_data, mat_id):
  """Return the daily-weather feature column names for a material.

  The look-back window suffix (2W, 4W or 6W) is chosen from the 75th
  percentile of the material's positive ``lead_time`` values:

  * cutoff <= 21        -> ``*_2W`` columns
  * 21 < cutoff <= 35   -> ``*_4W`` columns
  * anything else       -> ``*_6W`` columns (this includes a NaN cutoff,
    which is what an empty selection produces)

  Parameters
  ----------
  historical_data : pandas.DataFrame
      Must contain the columns ``'Material_No.'`` and ``'lead_time'``.
  mat_id
      Value compared against ``historical_data['Material_No.']``.

  Returns
  -------
  list of str
      Eight column names sharing the selected window suffix.
  """
  positive_lead_times = historical_data.loc[
      (historical_data['Material_No.'] == mat_id) & (historical_data['lead_time'] > 0),
      'lead_time']
  # Same statistic that describe() reports as '75%', but read without relying on
  # positional integer indexing of a label-indexed Series (deprecated in pandas).
  lead_time_cutoff = positive_lead_times.quantile(0.75)
  if lead_time_cutoff <= 21:
    window = '2W'
  elif lead_time_cutoff <= 35:
    window = '4W'
  else:
    window = '6W'
  weather_prefixes = ('rainfall_mm_', 'snowdepth_', 'tempmax_', 'tempmin_', 'windspeed_',
                      'fog_count_', 'snow_count_', 'rain_count_')
  return [prefix + window for prefix in weather_prefixes]

#Getting columns for traffic data
def getting_traffic_col_list(historical_data, mat_id):
  """Return the traffic feature column names for a material.

  The look-back window suffix (2w, 4w or 6w) is chosen from the 75th
  percentile of the material's positive ``lead_time`` values:
  <= 21 -> ``2w``, above 21 and up to 35 -> ``4w``, otherwise ``6w``
  (a NaN cutoff, produced by an empty selection, also lands on ``6w``).

  Parameters
  ----------
  historical_data : pandas.DataFrame
      Must contain the columns ``'Material_No.'`` and ``'lead_time'``.
  mat_id
      Value compared against ``historical_data['Material_No.']``.

  Returns
  -------
  list of str
      Three column names sharing the selected window suffix.
  """
  positive_lead_times = historical_data.loc[
      (historical_data['Material_No.'] == mat_id) & (historical_data['lead_time'] > 0),
      'lead_time']
  # Same statistic that describe() reports as '75%', but read without relying on
  # positional integer indexing of a label-indexed Series (deprecated in pandas).
  lead_time_cutoff = positive_lead_times.quantile(0.75)
  if lead_time_cutoff <= 21:
    window = '2w'
  elif lead_time_cutoff <= 35:
    window = '4w'
  else:
    window = '6w'
  traffic_prefixes = ('closed_road_or_infinte_delay_count_', 'moderate_count_', 'major_count_')
  return [prefix + window for prefix in traffic_prefixes]

#Getting columns for port congestion data
def getting_port_congestion_col_list(historical_data, mat_id):
  """Return the port-congestion feature column names for a material.

  The look-back window suffix (2w, 4w or 6w) is chosen from the 75th
  percentile of the material's positive ``lead_time`` values:
  <= 21 -> ``2w``, above 21 and up to 35 -> ``4w``, otherwise ``6w``
  (a NaN cutoff, produced by an empty selection, also lands on ``6w``).

  Parameters
  ----------
  historical_data : pandas.DataFrame
      Must contain the columns ``'Material_No.'`` and ``'lead_time'``.
  mat_id
      Value compared against ``historical_data['Material_No.']``.

  Returns
  -------
  list of str
      Three column names sharing the selected window suffix.
  """
  positive_lead_times = historical_data.loc[
      (historical_data['Material_No.'] == mat_id) & (historical_data['lead_time'] > 0),
      'lead_time']
  # Same statistic that describe() reports as '75%', but read without relying on
  # positional integer indexing of a label-indexed Series (deprecated in pandas).
  lead_time_cutoff = positive_lead_times.quantile(0.75)
  if lead_time_cutoff <= 21:
    window = '2w'
  elif lead_time_cutoff <= 35:
    window = '4w'
  else:
    window = '6w'
  port_prefixes = ('Port_wait_time_mean_', 'Port_wait_time_median_', 'Port_wait_time_max_')
  return [prefix + window for prefix in port_prefixes]

#Getting columns for extreme weather data
def getting_extreme_weather_col_list():
  """Return the extreme-weather feature columns: count columns followed by duration columns.

  Both inputs are the module-level names ``imp_ext_weather_count_cols`` and
  ``imp_ext_weather_duration_cols``; they are combined with ``+`` in that order.
  """
  extreme_weather_cols = imp_ext_weather_count_cols + imp_ext_weather_duration_cols
  return extreme_weather_cols

In [0]:
# Columns kept (in this order) for the models that use internal variables only;
# pred_output selects df_int[col_list_order_int] before predicting with those models.
col_list_order_int = [
    # vendor / order level features
    'vendor_percentage',
    'vendor_material_cnt_per_plant',
    'distance',
    'POqty_to_AvgDelQty',
    # complaint counts over 3 and 12 month windows
    'no.of.complaint_3_months',
    'no.of.complaint_12_months',
    'no.of.complaint_3_months_major',
    'no.of.complaint_3_months_minor',
    'no.of.complaint_12_months_major',
    'no.of.complaint_12_months_minor',
    'no.of.complaint_3_months_open',
    'no.of.complaint_3_months_open_major',
    'no.of.complaint_3_months_open_minor',
    # complaint counts by type (l12m)
    'no.of.Complaint_type_Documentation_l12m',
    'no.of.Complaint_type_Packaging_l12m',
    'no.of.Complaint_type_Transportation_l12m',
    'no.of.Complaint_type_Ingredient_l12m',
    'no.of.Complaint_type_Consumer_l12m',
    'no.of.Complaint_type_Foreign_Material_l12m',
    'no.of.Complaint_type_Indigenous_Material_l12m',
    # flags taken from historical data
    'dom_or_int_int',
    'sku_type_single',
    'nestle_managed_freight_yes',
    'mode_of_transport_road',
    'Material_group_type_R',
    'single_or_multisource_single',
    # vendor risk level and rating flags
    'Risk_Level_High',
    'Risk_Level_Low',
    'Risk_Level_Medium',
    'Rating_A',
    'Rating_B',
    'Rating_C',
    'Rating_D',
    'Rating_D_-_FND',
    'Rating_F',
    'Rating_Inactive',
    'Rating_Out_of_Scope',
    # delivery frequency flags
    'Delivery_freq_Daily',
    'Delivery_freq_Half_Yearly',
    'Delivery_freq_Monthly',
    'Delivery_freq_Quarterly',
    'Delivery_freq_Weekly',
    'Delivery_freq_Yearly',
    # distinct complaint type columns (l12m)
    'Complaint_type_Documentation_l12m_distinct',
    'Complaint_type_Packaging_l12m_distinct',
    'Complaint_type_Transportation_l12m_distinct',
    'Complaint_type_Ingredient_l12m_distinct',
    'Complaint_type_Consumer_l12m_distinct',
    'Complaint_type_Foreign_Material_l12m_distinct',
    'Complaint_type_Indigenous_Material_l12m_distinct',
]

In [0]:
def _load_pickled_model(file_name):
  """Unpickle and return the model object stored at ``file_name``, closing the file afterwards."""
  # NOTE(review): pickle.load can execute arbitrary code from the file; only point this at trusted model files.
  with open(file_name, "rb") as model_file:
    return pickle.load(model_file)


def _explain_prediction(loaded_model, model_input, feature_values_df, top_n=5):
  """Return the top positive contributors to the prediction for the first row of ``model_input``.

  eli5 is tried first; if it raises, SHAP values from ``shap.TreeExplainer`` are used instead.
  Only features with weight >= 0 are kept, '<BIAS>' is removed, and features whose value
  (row 0 of ``feature_values_df``) is <= 0 are dropped unless the feature name contains
  'tempmax' or 'tempmin'.  The result has the columns 'feature', 'weight' and 'method'
  ('eli5' or 'shap') and at most ``top_n`` rows, sorted by descending weight.
  """
  try:
    explanation = eli5.explain_prediction_df(estimator=loaded_model, doc=model_input.iloc[0])
    explanation = explanation[['feature', 'weight', 'value']].drop(columns=['value'])
    explanation['method'] = 'eli5'
  except Exception:
    #If eli5 explanations are not available then generating explanations using shap values
    explainer = shap.TreeExplainer(loaded_model)
    shap_values = explainer.shap_values(model_input.iloc[0])
    explanation = pd.DataFrame(shap_values, model_input.columns, columns=['weight'])
    explanation = explanation.reset_index().rename(columns={'index': 'feature'})
    explanation['method'] = 'shap'

  positive = explanation[explanation['weight'] >= 0].sort_values(['weight'], ascending=False)
  positive = positive[positive['feature'] != '<BIAS>']

  #Attaching the feature values (row 0 of the feature frame) to each explained feature
  feature_values = feature_values_df.transpose().reset_index()
  feature_values = feature_values.rename(columns={'index': 'feature', 0: 'fet_val'})
  merged = positive.merge(feature_values, on=['feature'], how='inner')

  #Temperature features are always kept; every other feature is kept unless its value is <= 0
  keep_row = pd.Series(
      [('tempmax' in feature.lower()) or ('tempmin' in feature.lower()) or not (fet_val <= 0)
       for feature, fet_val in zip(merged['feature'], merged['fet_val'])],
      index=merged.index, dtype=bool)
  merged = merged.loc[keep_row].reset_index(drop=True).drop(columns=['fet_val'])
  return merged.head(top_n)


def pred_output(historical_data, plant_id, vs_id, mat_id, po_create_date, po_qty):
  """Predict the lead time for a purchase order and explain the prediction.

  Parameters
  ----------
  historical_data : pandas.DataFrame
      Historical order data; filtered here on 'Plant_ID', 'updated_VS_ID' and 'Material_No.'.
  plant_id, vs_id, mat_id
      Values matched against 'Plant_ID', 'updated_VS_ID' and 'Material_No.' / 'Material_ID'.
  po_create_date : str
      PO creation date in ``%Y-%m-%d`` format.
  po_qty
      PO quantity, forwarded to ``po_qty_to_del_qty``.

  Returns
  -------
  tuple
      ``(lead_time_pred, eli5_output_final)``.  ``eli5_output_final`` is a DataFrame with
      the columns 'feature', 'weight' and 'method' (at most 5 rows for model predictions).

  Notes
  -----
  Relies on module-level names defined elsewhere in the notebook: ``material_best_model``,
  ``smi_1``, ``smi_col_list``, ``financial``, ``freq``, ``col_list_order_int``,
  ``run_date_phase_1``, ``run_date_phase_2`` and the ``smi_var`` / ``po_qty_to_del_qty`` /
  ``create_*`` / ``getting_*`` helpers.  The ``.unique()[0]`` lookups raise ``IndexError``
  when the plant / vendor site / material combination has no matching rows.
  """
  historical_data_v1 = historical_data[(historical_data['Plant_ID']==plant_id) &
                                       (historical_data['updated_VS_ID']==vs_id)]
  mat_history = historical_data_v1[historical_data_v1['Material_No.']==mat_id]
  vs_location = historical_data_v1['updated_VS_location'].unique()[0]
  vs_region = historical_data_v1['updated_VS_region'].unique()[0]
  vs_country = historical_data_v1['updated_VS_country'].unique()[0]
  if mat_id not in material_best_model['Material_ID'].unique().tolist():#for 3 material, best model is not available hardcoding it with xgboost
    mat_model = 'xgb_int_ext_reg'
  else:
    mat_model = material_best_model[material_best_model['Material_ID']==mat_id]['best_score_model'].unique()[0]

  #Getting Main Vendor ID from historical data to get financial data for the corresponding Main Vendor
  vendor = mat_history['Vendor'].unique()[0]

  #Getting distance between Plant and VS from historical data
  distance = historical_data_v1['distance'].unique()[0]

  #Getting PO Create Month from po_create_date
  PO_Create_Month = 'PO_Create_Month_'+str(datetime.datetime.strptime(po_create_date, "%Y-%m-%d").month)

  #Getting vendor percentage and vendor material cnt per plant feature from historical data
  vendor_percentage = mat_history['vendor_percentage'].unique()[0]
  vendor_material_cnt_per_plant = mat_history['vendor_material_cnt_per_plant'].unique()[0]

  #Getting SMI Variables corresponding to the PO Create Date
  smi_var_list = smi_var(smi_1, po_create_date, vs_id, mat_id)

  #Getting financial data for the main vendor ID generated above, if financial data for a vendor is not available, taking the variables as 0
  fin_cols = [col for col in financial.columns.tolist() if col != 'Supplier']
  vendor_fin_df = financial[financial['Vendor']==vendor][fin_cols].drop_duplicates()
  if vendor_fin_df.shape[0] == 0:
    # Build exactly one all-zero row.  The previous fallback added one row per column name and
    # then overwrote the row labelled 'Vendor' with the vendor id, so the Risk/Rating lookup
    # below could return the vendor id instead of 0 when 'Vendor' was the first column.
    zero_row = {col: 0 for col in fin_cols}
    zero_row['Vendor'] = vendor
    vendor_fin_df = pd.DataFrame([zero_row], columns=fin_cols)

  #Getting freq data for a Plant, Material and VS
  freq_df = freq[(freq['Plant_ID']==plant_id) &
                 (freq['updated_VS_ID']==vs_id) &
                 (freq['Material_No.']==mat_id)][[col for col in freq.columns.tolist() if col != 'days']].drop_duplicates()

  #Renaming freq column to match with columns in Historical data (no-op when the column is absent)
  freq_df = freq_df.rename(columns={'Delivery_freq_Half Yearly':'Delivery_freq_Half_Yearly'})

  #Getting PO Qty to Average Delivery QTY based on the historical delivery data between a given Plant and a given Vendor for a given Material
  POqty_to_AvgDelQty = po_qty_to_del_qty(historical_data, plant_id, vs_id, mat_id, po_create_date, po_qty)

  #Getting the domestic/international flag, sku_type, single_or_multisource, mode_of_transport, nestle_managed_freight, material group type from Historical data
  dom_or_int_int = historical_data_v1['dom_or_int_int'].unique()[0]
  sku_type_single = mat_history['sku_type_single'].unique()[0]
  single_or_multisource_single = mat_history['single_or_multisource_single'].unique()[0]
  mode_of_transport_road = mat_history['mode_of_transport_road'].unique()[0]
  nestle_managed_freight_yes = mat_history['nestle_managed_freight_yes'].unique()[0]
  Material_group_type_R = mat_history['Material_group_type_R'].unique()[0]

  #Creating a copy from freq df to add other independent features to get the final data containing independent features for predicting lead time
  df_int = freq_df.copy()
  df_int['distance'] = distance
  df_int['vendor_percentage'] = vendor_percentage
  df_int['vendor_material_cnt_per_plant'] = vendor_material_cnt_per_plant

  #Joining column names with '_' to match with historical data
  for col in vendor_fin_df.columns.tolist():
    if ('Risk' in col) or ('Rating' in col):
      col1 = '_'.join(col.split(' '))
      df_int[col1] = vendor_fin_df[col].unique()[0]

  #Adding SMI variables and other features to the independent feature dataset
  for i in range(len(smi_var_list)):
    df_int[smi_col_list[i]] = smi_var_list[i]
  df_int['POqty_to_AvgDelQty'] = POqty_to_AvgDelQty
  df_int['dom_or_int_int'] = dom_or_int_int
  df_int['sku_type_single'] = sku_type_single
  df_int['single_or_multisource_single'] = single_or_multisource_single
  df_int['mode_of_transport_road'] = mode_of_transport_road
  df_int['nestle_managed_freight_yes'] = nestle_managed_freight_yes
  df_int['Material_group_type_R'] = Material_group_type_R
  df_int = df_int[col_list_order_int]

  #Getting the external feature column names matching the material's lead-time window
  df_int_ext = df_int.copy()
  daily_weather_col_list = getting_daily_weather_col_list(historical_data, mat_id)
  traffic_col_list = getting_traffic_col_list(historical_data, mat_id)
  port_congestion_col_list = getting_port_congestion_col_list(historical_data, mat_id)
  extreme_weather_col_list = getting_extreme_weather_col_list()
  #Getting daily weather events
  weather_var_list = create_daily_weather_vars(po_create_date, vs_location, vs_region, mat_id)
  #Getting extreme weather events
  exw_vars = create_exw_vars(po_create_date, vs_location, vs_region, vs_country)
  #Getting traffic data
  traffic_vars = create_traffic_vars(po_create_date, plant_id, vs_location, vs_region, vs_id, mat_id)
  #Getting port congestion
  port_congestion_vars = create_port_congestion_vars(historical_data, po_create_date, vs_location, vs_region, vs_country, mat_id)

  for i in range(len(weather_var_list)):
    df_int_ext[daily_weather_col_list[i]] = weather_var_list[i]

  for i in range(len(exw_vars)):
    df_int_ext[extreme_weather_col_list[i]] = exw_vars[i]

  for i in range(len(traffic_vars)):
    df_int_ext[traffic_col_list[i]] = traffic_vars[i]

  for i in range(len(port_congestion_vars)):
    df_int_ext[port_congestion_col_list[i]] = port_congestion_vars[i]

  df_int_ext = df_int_ext.reset_index()

  #Ordering the columns in the same order in which the materials were modelled
  col_list_order_int_ext = create_col_list_order_int_ext(df_int_ext)
  df_int_ext = df_int_ext[col_list_order_int_ext].fillna(0)

  #Extracting the model object from the directory where the models were saved
  if mat_model != 'avg':
    if '_ext' in mat_model:
      #Models using internal + external variables are stored under the phase 2 run date and predict the lead time directly
      mat_model_name = mat_model.split('_int')[0]
      file_name = '/dbfs/FileStore/models/' + run_date_phase_2+'/'+str(mat_id)+'/'+mat_model_name+'_int_ext_reg.pkl'
      loaded_model = _load_pickled_model(file_name)
      lead_time_pred = loaded_model.predict(df_int_ext)[0]
      eli5_output_final = _explain_prediction(loaded_model, df_int_ext, df_int_ext)
    else:
      #Models using internal variables only are stored under the phase 1 run date and predict a business day count
      file_name = '/dbfs/FileStore/models/' + run_date_phase_1+'/'+str(mat_id)+'/'+mat_model+'.pkl'   #enter file path
      loaded_model = _load_pickled_model(file_name)
      bus_day_count_pred = loaded_model.predict(df_int)[0]
      #Feature values are read from df_int_ext because its index was reset (row label 0)
      eli5_output_final = _explain_prediction(loaded_model, df_int, df_int_ext)
      #Converting the predicted business day count into calendar days from the PO create date
      po_created = pd.to_datetime(po_create_date)
      lead_time_pred = ((po_created + BDay(int(math.ceil(bus_day_count_pred)))) - po_created).days
  else:
    #If model is not built for a material then using monthly average as predicted lead time
    lead_time_pred = historical_data_v1[(historical_data_v1['Material_No.']==mat_id) &
                                        (historical_data_v1[PO_Create_Month]==1)]['lead_time'].mean()
    mat_avg = mat_history['lead_time'].mean()
    if lead_time_pred > mat_avg:
      weight = lead_time_pred-mat_avg
    else:
      weight = 0
    eli5_details = {
      'feature' : [PO_Create_Month],
      'weight' : weight,
      'method' : 'none'
    }
    eli5_output_final = pd.DataFrame(eli5_details)

  #If the lead time is not predicted then, getting the prediction as the average lead time for a Plant, VS and Material combo
  if math.isnan(lead_time_pred):
    lead_time_pred = mat_history['lead_time'].mean()
    eli5_details = {
      'feature' : ['material_average_lead_time'],
      'weight' : 0,
      'method' : 'none'
    }
    eli5_output_final = pd.DataFrame(eli5_details)

  return lead_time_pred,eli5_output_final

In [0]:
# def pred_output(historical_data, plant_id, vs_id, mat_id, po_create_date, po_qty):
#   historical_data_v1 = historical_data[(historical_data['Plant_ID']==plant_id) & 
#                                        (historical_data['updated_VS_ID']==vs_id)]
#   vs_location = historical_data_v1['updated_VS_location'].unique()[0]
#   vs_region = historical_data_v1['updated_VS_region'].unique()[0]
#   vs_country = historical_data_v1['updated_VS_country'].unique()[0]
#   if mat_id not in material_best_model['Material_ID'].unique().tolist():#for 3 material, best model is not available hardcoding it with xgboost
#     mat_model = 'xgb_int_ext_reg'
#   else:
#     mat_model = material_best_model[material_best_model['Material_ID']==mat_id]['best_score_model'].unique()[0]
  
#   #Getting Main Vendor ID from histrical data to get financial data for the corresponding Main Vendor
#   vendor = historical_data_v1[historical_data_v1['Material_No.']==mat_id]['Vendor'].unique()[0]
  
#   #Getting distance between Plant and VS from historical data
#   distance = historical_data_v1['distance'].unique()[0]
  
#   #Getting PO Create Month from po_create_date
#   PO_Create_Month = 'PO_Create_Month_'+str(datetime.datetime.strptime(po_create_date, "%Y-%m-%d").month)
  
#   #Getting vendor percentage and vendor material cnt per plant feature from historical data
#   vendor_percentage = historical_data_v1[historical_data_v1['Material_No.']==mat_id]['vendor_percentage'].unique()[0]
#   vendor_material_cnt_per_plant = historical_data_v1[historical_data_v1['Material_No.']==mat_id]['vendor_material_cnt_per_plant'].unique()[0]
  
#   #Getting SMI Variables corresponding to the PO Create Date
#   smi_var_list = smi_var(smi_1, po_create_date, vs_id, mat_id)
  
#   #Getting financial data for the main vendor ID generated above, if financial data for a vendor is not available, taking the variables as 0
#   test_fin=financial[financial['Vendor']==vendor]
#   if test_fin.shape[0]>0:
#     vendor_fin_df = financial[financial['Vendor']==vendor][[col for col in financial.columns.tolist() if col != 'Supplier']].drop_duplicates()
#   else:
#     vendor_fin_df = financial[financial['Vendor']==vendor][[col for col in financial.columns.tolist() if col != 'Supplier']].drop_duplicates()
#     for i in vendor_fin_df.columns.tolist():
#       vendor_fin_df.loc[i]=0
#     vendor_fin_df.loc['Vendor']=vendor
    
#   #Getting freq data for a Plant, Material and VS
#   freq_df = freq[(freq['Plant_ID']==plant_id) & 
#                  (freq['updated_VS_ID']==vs_id) & 
#                  (freq['Material_No.']==mat_id)][[col for col in freq.columns.tolist() if col != 'days']].drop_duplicates()
  
#   #Renaming freq column to match with columns in Historical data
#   if 'Delivery_freq_Half Yearly' in freq_df.columns.tolist():
#     freq_df.rename(columns={'Delivery_freq_Half Yearly':'Delivery_freq_Half_Yearly'}, inplace=True)
    
#   #Getting PO Qty to Average Delivery QTY based on the historical delivery data between an given Plant and a given Vendor for a given Material
#   POqty_to_AvgDelQty = po_qty_to_del_qty(historical_data, plant_id, vs_id, mat_id, po_create_date, po_qty)
  
#   #Getting the domestic/international flag, sku_type, simgle_or_multisource, mode_of_transport, nestle_managed_frieght, material group type from Historical data
#   dom_or_int_int = historical_data_v1['dom_or_int_int'].unique()[0]
#   sku_type_single = historical_data_v1[historical_data_v1['Material_No.']==mat_id]['sku_type_single'].unique()[0]
#   single_or_multisource_single = historical_data_v1[historical_data_v1['Material_No.']==mat_id]['single_or_multisource_single'].unique()[0]
#   mode_of_transport_road = historical_data_v1[historical_data_v1['Material_No.']==mat_id]['mode_of_transport_road'].unique()[0]
#   nestle_managed_freight_yes = historical_data_v1[historical_data_v1['Material_No.']==mat_id]['nestle_managed_freight_yes'].unique()[0]
#   Material_group_type_R = historical_data_v1[historical_data_v1['Material_No.']==mat_id]['Material_group_type_R'].unique()[0]
  
#   #Creating a copy from freq df to add other independent features to get the final data containg independent features for predicting lead time
#   df_int = freq_df.copy()
#   df_int['distance'] = distance
#   df_int['vendor_percentage'] = vendor_percentage
#   df_int['vendor_material_cnt_per_plant'] = vendor_material_cnt_per_plant
# #   df_int[PO_Create_Month] = 1
  
#   #Joining column names with '_' to match with historical data
#   for col in vendor_fin_df.columns.tolist():
#     if ('Risk' in col) or ('Rating' in col):
#       col1 = '_'.join(col.split(' '))
#       df_int[col1] = vendor_fin_df[col].unique()[0]
  
#   #Adding SMI variables and other features to the independent feature dataset
#   for i in range(len(smi_var_list)):
#     df_int[smi_col_list[i]] = smi_var_list[i]
#   df_int['POqty_to_AvgDelQty'] = POqty_to_AvgDelQty
#   df_int['dom_or_int_int'] = dom_or_int_int
#   df_int['sku_type_single'] = sku_type_single
#   df_int['single_or_multisource_single'] = single_or_multisource_single
#   df_int['mode_of_transport_road'] = mode_of_transport_road
#   df_int['nestle_managed_freight_yes'] = nestle_managed_freight_yes
#   df_int['Material_group_type_R'] = Material_group_type_R
#   df_int = df_int[col_list_order_int]
  
#   #Getting daily weather data corresponding to VS location and PO Create Date
#   df_int_ext = df_int.copy()
#   daily_weather_col_list = getting_daily_weather_col_list(historical_data, mat_id)
#   traffic_col_list = getting_traffic_col_list(historical_data, mat_id)
#   port_congestion_col_list = getting_port_congestion_col_list(historical_data, mat_id)
#   extreme_weather_col_list = getting_extreme_weather_col_list()
#   #Getting daily weather events
#   weather_var_list = create_daily_weather_vars(po_create_date, vs_location, vs_region, mat_id)
#   #Getting extreme weather events 
#   exw_vars = create_exw_vars(po_create_date, vs_location, vs_region, vs_country)
#   #Getting traffic data 
#   traffic_vars = create_traffic_vars(po_create_date, plant_id, vs_location, vs_region, vs_id, mat_id)
#   #Getting port congestion 
#   port_congestion_vars = create_port_congestion_vars(historical_data, po_create_date, vs_location, vs_region, vs_country, mat_id)
  
#   for i in range(len(weather_var_list)):
#     df_int_ext[daily_weather_col_list[i]] = weather_var_list[i]
  
#   for i in range(len(exw_vars)):
#     df_int_ext[extreme_weather_col_list[i]] = exw_vars[i]
  
#   for i in range(len(traffic_vars)):
#     df_int_ext[traffic_col_list[i]] = traffic_vars[i]
  
#   for i in range(len(port_congestion_vars)):
#     df_int_ext[port_congestion_col_list[i]] = port_congestion_vars[i]
  
#   df_int_ext = df_int_ext.reset_index()
  
#   #Ordering the columns in the same order in which the materials were modelled
#   col_list_order_int_ext = create_col_list_order_int_ext(df_int_ext)
#   df_int_ext = df_int_ext[col_list_order_int_ext]
#   df_int_ext.fillna(0, inplace=True)
  
#   #Extracting the model object from the directory where the models were saved
#   if mat_model != 'avg':
#     if '_ext' in mat_model:
#       mat_model_name = mat_model.split('_int')[0]
#       file_name = '/dbfs/FileStore/models/' + '2022_03_20/'+str(mat_id)+'/'+mat_model_name+'_int_ext_reg.pkl'
#       loaded_model = pickle.load(open(file_name, "rb"))
#       lead_time_pred = loaded_model.predict(df_int_ext)[0]
#       #print(lead_time_pred)
#       #print(df_int_ext.iloc[0])
#       #print(eli5.explain_prediction(estimator=loaded_model, doc=df_int_ext.iloc[0]))
#       eli5_output = eli5.explain_prediction_df(estimator=loaded_model, doc=df_int_ext.iloc[0])
#       eli5_output = eli5_output[['feature', 'weight', 'value']]
#       eli5_output.drop(columns=['value'],inplace=True)
#       eli5_output['method']='eli5'
#       eli5_output_v1 = eli5_output[eli5_output['weight']>=0]
#       eli5_output_v1.sort_values(['weight'],ascending=False,inplace=True)
#       eli5_output_v2 = eli5_output_v1[eli5_output_v1['feature']!='<BIAS>']
#       df_int=df_int_ext.transpose().reset_index()
#       df_int.rename(columns={'index':'feature',0:'fet_val'},inplace=True)
#       eli5_output_v2=eli5_output_v2.merge(df_int,on=['feature'],how='inner')
#       for feature,fest_val,index in zip(eli5_output_v2['feature'],eli5_output_v2['fet_val'],eli5_output_v2.index):
#         if (('tempmax'.lower() not in feature.lower()) & ('tempmin'.lower() not in feature.lower())):
#           if fest_val<=0:
#             eli5_output_v2.drop(index,inplace=True)
#           else:
#             pass
#         else:
#           pass
#       eli5_output_v2.reset_index(inplace=True)
#       eli5_output_v2.drop(columns=['index'],inplace=True)
#       eli5_output_final = eli5_output_v2.head(5)
#     elif '_int_ext' not in mat_model:
#       file_name = '/dbfs/FileStore/models/' + run_date_phase_1+'/'+str(mat_id)+'/'+mat_model+'.pkl'
#       loaded_model = pickle.load(open(file_name, "rb"))
#       bus_day_count_pred = loaded_model.predict(df_int)[0]
#       #Generating explainations for predictions with eli5
#       eli5_output = eli5.explain_prediction_df(estimator=loaded_model, doc=df_int.iloc[0])
#       eli5_output = eli5_output[['feature', 'weight', 'value']]
#       eli5_output.drop(columns=['value'],inplace=True)
#       eli5_output['method']='eli5'
#       eli5_output_v1 = eli5_output[eli5_output['weight']>=0]
#       eli5_output_v1.sort_values('weight',ascending=False,inplace=True)
#       eli5_output_v2 = eli5_output_v1[eli5_output_v1['feature']!='<BIAS>']
#       df_int=df_int_ext.transpose().reset_index()
#       df_int.rename(columns={'index':'feature',0:'feature_val'},inplace=True)
#       eli5_output_v2=eli5_output_v2.merge(df_int,on=['feature'],how='inner')
#       for feature,fest_val,index in zip(eli5_output_v2['feature'],eli5_output_v2['feature_val'],eli5_output_v2.index):
#         if (('tempmax'.lower() not in feature.lower()) & ('tempmin'.lower() not in feature.lower())):
#           if fest_val<=0:
#             eli5_output_v2.drop(index,inplace=True)
#           else:
#             pass
#         else:
#           pass
#       eli5_output_v2.reset_index(inplace=True)
#       eli5_output_v2.drop(columns=['index'],inplace=True)
#       eli5_output_final = eli5_output_v2.head(5)
#       lead_time_pred = ((pd.to_datetime(po_create_date) + BDay(int(math.ceil(bus_day_count_pred)))) - pd.to_datetime(po_create_date)).days
#   else:
#     #If model is not built for a material then using monthly average as predicted lead time
#     lead_time_pred = historical_data_v1[(historical_data_v1['Material_No.']==mat_id) & 
#                                         (historical_data_v1[PO_Create_Month]==1)]['lead_time'].mean()
#     mat_avg = historical_data_v1[(historical_data_v1['Material_No.']==mat_id)]['lead_time'].mean()
#     if lead_time_pred > mat_avg:
#       weight = lead_time_pred-mat_avg
#     else:
#       weight = 0
#     eli5_details = {
#       'feature' : [PO_Create_Month],
#       'weight' : weight,
#       'method' : 'none'
#     }
#     eli5_output_final = pd.DataFrame(eli5_details)
  
#   #If the lead time is not predicted then, getting the prediction as the average business day count for a Plant, VS and Material combo
#   if math.isnan(lead_time_pred) == True:
#     lead_time_pred = historical_data_v1[(historical_data_v1['Material_No.']==mat_id)]['lead_time'].mean()
#     eli5_details = {
#       'feature' : ['material_average_lead_time'],
#       'weight' : 0,
#       'method' : 'none'
#     }
#     eli5_output_final = pd.DataFrame(eli5_details)
    
#   return lead_time_pred,eli5_output_final

In [0]:
# lead_time_predictor,eli5_output_final  = pred_output(historical_data, 5004, 100922625, 22001236,
# '2021-08-04',39670.0)

In [0]:
# eli5_output_final

In [0]:
# lead_time_predictor,eli5_output_final = pred_output(historical_data, 5004, 101042953, 22002218,
# '2022-01-20',40320.0)

In [0]:
# lead_time_predictor,eli5_output_final = pred_output(historical_data, 5004, 100365002, 22002248,
# '2022-02-28',207.0)

In [0]:
# lead_time_predictor,eli5_output_final = pred_output(historical_data, 5959, 100838018, 22013999,
# '2021-12-01',165.0)

In [0]:
# lead_time_predictor,eli5_output_final = pred_output(historical_data, 5004, 100922611, 43058765,
# '2022-02-08',1000.0)
# eli5_output_final

In [0]:
# 5004 101042953 22002218 2022-01-20 40320.0
# 5004 100365002 22002248 2022-02-28 207.0
# 5004 100922611 43058765 2022-02-08 1000.0
# 5004 101112438 22001316 2022-03-08 584.0
# 5959 100838018 22013999 2021-12-01 165.0