In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn import linear_model 
from sklearn import metrics 
from sklearn.feature_selection import RFECV 
from sklearn.svm import SVR 
from sklearn.model_selection import cross_val_score
import pickle

In [127]:
#Select the Store and Version of Clustered Data 
Store = 160
data = pd.read_csv('Store'+str(Store)+'\Store'+str(Store)+'_order_15-min-cluster_20180103-20190711.csv',index_col=0)

data['BusinessDate'] = pd.to_datetime(data['BusinessDate'])
#Specify Date-Time Variable Types
date_time = ['OrderStartDateTime','NormalDateTime','BumpedDateTime','CookingDateTime','DayHalfHour','DayQuarterHour']
for i in date_time:
    data.loc[:,i] = pd.to_datetime(data.loc[:,i], format="%Y-%m-%d %H:%M:%S.%f") 

In [128]:
Time = 40
data['LongOrder'] = np.where(data['OrderTime-Min']<=Time, 0, 1)

# Merge Labor Variable

### Labor Category

In [129]:
labor = pd.read_csv('labor_20180103-20190711.csv')
labor = labor[labor['StoreKey']==Store]
#Specify Variable Type for Date Time Variable
labor.loc[:,'BusinessDate'] = pd.to_datetime(labor.loc[:,'BusinessDate'])

#Clean Hour Description
labor['Hour'] = np.where(labor['HourDescription'].str.contains('PM'), labor['HourDescription'].str.slice(0,2).str.replace(' ','').astype('int')+12, labor['HourDescription'].str.slice(0,2).str.replace(' ','').astype('int'))
labor['Hour'] = np.where(labor['Hour']==12,0,labor['Hour'])
labor['Hour'] = np.where(labor['Hour']==24,12,labor['Hour'])
labor.columns= ['BusinessDate','StoreKey', 'LaborCategory','HourDescription','ActualLaborHrs','Hour']
labor.head()

Unnamed: 0,BusinessDate,StoreKey,LaborCategory,HourDescription,ActualLaborHrs,Hour
29037,2018-06-26,160,Prep,1 AM,0.38,1
29038,2018-06-26,160,Dishwasher,1 AM,0.65,1
29039,2018-06-26,160,Bar,1 AM,0.52,1
29040,2018-06-26,160,Busser,1 AM,1.22,1
29041,2018-06-26,160,Cook,1 PM,3.12,13


In [130]:
#Merge Each Labor Category into data
labor_cat = []
for i in labor['LaborCategory'].unique(): 
    r = labor[labor['LaborCategory'] == i][['StoreKey','BusinessDate','Hour','ActualLaborHrs']]
    r.columns = ['StoreKey','BusinessDate','Hour','LaborHrs_'+i]
    data = data.merge(right = r, how = 'left', on = ['StoreKey','BusinessDate','Hour'])
    data['LaborHrs_'+i] = np.where(pd.isna(data['LaborHrs_'+i]),0,data['LaborHrs_'+i])
    labor_cat.append('LaborHrs_'+i)

In [131]:
labor['BusinessDate'].min()

Timestamp('2018-06-26 00:00:00')

In [132]:
labor['BusinessDate'].max()

Timestamp('2019-07-11 00:00:00')

## Labor at Station

In [133]:
labor1 = pd.read_csv('Store'+str(Store)+'\Store'+str(Store)+'_employee_labor_20180103-20190711.csv')
labor1['BusinessDate'] = pd.to_datetime(labor1['BusinessDate'])
labor1.head()

Unnamed: 0,BusinessDate,StoreKey,Hour,EmployeeKey,JobName,LaborCategoryName,ActualLaborMinutes
0,2018-06-26,160,7,63032,Prep Cook,Prep,40
1,2018-06-26,160,8,63032,Prep Cook,Prep,60
2,2018-06-26,160,9,63032,Prep Cook,Prep,60
3,2018-06-26,160,10,63032,Prep Cook,Prep,60
4,2018-06-26,160,11,63032,Prep Cook,Prep,60


In [134]:
schedule = pd.read_csv('employee_station_20180103-20190711.csv')
schedule = schedule[schedule['StoreKey']==Store]
schedule['BusinessDate'] = pd.to_datetime(schedule['BusinessDate'])
schedule['StartTime'] = pd.to_datetime(schedule['StartTime'])
schedule['EndTime'] = pd.to_datetime(schedule['EndTime'])
schedule['LocationName'] = schedule['LocationName'].str.lower()
schedule.head()

Unnamed: 0,BusinessDate,StoreKey,EmployeeKey,JobName,LaborCategoryName,StartTime,EndTime,LocationName
21475,2018-02-20,160,268579,Line Cook1,Cook,2018-02-20 09:00:00,2018-02-20 16:00:00,saladst
21476,2018-02-20,160,140930,Line Cook1,Cook,2018-02-20 09:00:00,2018-02-20 16:00:00,broilck
21477,2018-02-20,160,107746,Line Cook1,Cook,2018-02-20 09:00:00,2018-02-20 16:00:00,grillck
21478,2018-02-20,160,63897,Line Cook1,Cook,2018-02-20 09:00:00,2018-02-20 16:00:00,pizzast
21479,2018-02-20,160,169393,Line Cook1,Cook,2018-02-20 09:00:00,2018-02-20 16:00:00,fryck


In [135]:
Station = pd.Series(data['StationName'].unique()).str.lower()
s_clean = [i.replace('broiler', 'broil') for i in Station]
s_clean = [i.replace('sandwich', 'sand') for i in s_clean]
s_clean = [i.replace('apps', 'app') for i in s_clean]
s_sort = [i for i in s_clean if (' / ' not in i)] + [i for i in s_clean if (' / ' in i)]

schedule['AtStation'] = None
labor_st = []
for i in s_sort:
    name = 'Labor_'+str(i)
    labor_st.append(name)
    if ' / ' not in i: 
        schedule[name] = np.where(schedule['LocationName'].str.contains(i),1,0)
        schedule['AtStation'] = np.where(schedule['LocationName'].str.contains(i),1,schedule['AtStation'])
    elif ' / ' in i: #if combo station, must contain key words from both station 
        i1,i2  = i.split(' / ')
        print(i1,i2)
        schedule[name] = np.where(((schedule['LocationName'].str.contains(i1))|(schedule['LocationName'].str.contains(i2)))&(pd.isnull(schedule['AtStation'])),1,0)
        schedule['AtStation'] = np.where(((schedule['LocationName'].str.contains(i1))|(schedule['LocationName'].str.contains(i2)))&(pd.isnull(schedule['AtStation'])),1,schedule['AtStation'])


broil app
grill fry
pasta saute


In [136]:
station_location = {}
for i in s_sort: 
    name = 'Labor_'+str(i)
    station_location[i] = schedule[schedule[name]==1]['LocationName'].unique()
station_location

{'saute': array(['sautest', 'sautepl', 'sautefl', 'sautemd'], dtype=object),
 'salad': array(['saladst', 'saladpr', 'saladfn', 'saladmd'], dtype=object),
 'fry': array(['fryck', 'frypl'], dtype=object),
 'pizza': array(['pizzast', 'pizzafn'], dtype=object),
 'sand': array([], dtype=object),
 'broil / app': array(['broilck', 'broilpl', 'broilmd'], dtype=object),
 'grill / fry': array(['grillck', 'grillpl'], dtype=object),
 'pasta / saute': array(['pastast', 'pastapl', 'pastafl', 'pastamd'], dtype=object)}

In [137]:
schedule['LocationName'].unique()

array(['saladst', 'broilck', 'grillck', 'pizzast', 'fryck', 'pastast',
       'sautest', 'scraping', 'racking', 'cleansd', 'potwash', 'fill1',
       'greet1', '.din 16', '.din 13', '.din 06', '.din 02', 'bar 01',
       '.din 09', '.din 04', '.din 14', 'bar 04', '.din 10', 'ost',
       'oven', 'saladpr', 'slicer', 'sauceck', 'prodrl', 'pprepcr',
       'cash1', 'dessert 1', 'crbcsh1', 'barcw1', 'busd01', 'busd05',
       'busd04', 'upfdrn1', 'portion', 'no_location', 'napkin1',
       'train cook', 'train fd', 'kitchfl', 'steward', 'strbar',
       '.din 15', 'bar 02', 'cash2', 'dessert 2', 'barcw2', 'barmw1',
       'busd03', 'busd02', 'fdrun2', 'frypl', 'pastapl', 'sautepl',
       'dsfloat', 'dskrun1', 'dskrun2', 'crowd1', '.din 11', '.din 07',
       '.din 05', '.din 08', '.din 17', '.din 12', 'bar 03', '.din 03',
       'fdrun3', 'fdrun5', 'fdrun4', 'carts', 'train bake', 'pprepfl',
       'train bus', 'bsrncpt', '.din 01', 'train bar', 'train server',
       'scrape', 'busb03',

In [138]:
#Uncategorized Location
list(schedule[pd.isnull(schedule['AtStation'])]['LocationName'].unique())

['scraping',
 'racking',
 'cleansd',
 'potwash',
 'fill1',
 'greet1',
 '.din 16',
 '.din 13',
 '.din 06',
 '.din 02',
 'bar 01',
 '.din 09',
 '.din 04',
 '.din 14',
 'bar 04',
 '.din 10',
 'ost',
 'oven',
 'slicer',
 'sauceck',
 'prodrl',
 'pprepcr',
 'cash1',
 'dessert 1',
 'crbcsh1',
 'barcw1',
 'busd01',
 'busd05',
 'busd04',
 'upfdrn1',
 'portion',
 'no_location',
 'napkin1',
 'train cook',
 'train fd',
 'kitchfl',
 'steward',
 'strbar',
 '.din 15',
 'bar 02',
 'cash2',
 'dessert 2',
 'barcw2',
 'barmw1',
 'busd03',
 'busd02',
 'fdrun2',
 'dsfloat',
 'dskrun1',
 'dskrun2',
 'crowd1',
 '.din 11',
 '.din 07',
 '.din 05',
 '.din 08',
 '.din 17',
 '.din 12',
 'bar 03',
 '.din 03',
 'fdrun3',
 'fdrun5',
 'fdrun4',
 'carts',
 'train bake',
 'pprepfl',
 'train bus',
 'bsrncpt',
 '.din 01',
 'train bar',
 'train server',
 'scrape',
 'busb03',
 'strbake',
 'upcrwd1',
 'train dish',
 'prodfl',
 'pat 01',
 'pat 02',
 'pat 03',
 'train prep',
 'upcarts',
 'do not use',
 'upgrt1',
 'busp01',
 '

In [139]:
#Keep the relationship between Employee and Station
employee_station = schedule[pd.isnull(schedule['AtStation'])==False].drop(columns=['JobName','LaborCategoryName','StartTime','EndTime','LocationName'])

In [140]:
labor2 = labor1.merge(right=employee_station, on=['StoreKey','EmployeeKey','BusinessDate']) 
labor2 = labor2.drop(columns='EmployeeKey').groupby(['StoreKey','BusinessDate','Hour']+labor_st).sum().reset_index()

#Fill in Hours worked at Station during the Hour period
for i in labor_st:
    labor2.loc[:,i] = labor2.loc[:,i]*labor2.loc[:,'ActualLaborMinutes'] / 60
labor2 = labor2.drop(columns='ActualLaborMinutes')

In [141]:
labor2 = labor2.groupby(['StoreKey','BusinessDate','Hour']).sum().reset_index() 

In [142]:
data = data.merge(right=labor2,how='left',on=['StoreKey','BusinessDate','Hour'])

In [143]:
data.columns

Index(['StoreKey', 'BusinessDate', 'TimeKey', 'CheckNum', 'GuestCount',
       'TableOpenMinutes', 'ProductKey', 'CourseName', 'IXIName',
       'MajorCodeName', 'MinorCodeName', 'StationName', 'SentTime',
       'OrderStartDateTime', 'NormalDateTime', 'CookingDateTime',
       'BumpedDateTime', 'NatHolidayDesc', 'EmployeeKey', 'TicketTime', 'RNK',
       'PROD_RNK', 'ORDER_RNK', 'StationKey', 'DateKey', 'TypeofServiceNum',
       'ChannelKey', 'OpenHour', 'OpenMinute', 'CloseHour', 'CloseMinute',
       'TypeofServiceCat', 'ChannelName', 'StartTime', 'OffSiteOrder',
       'OnSiteOrder', 'TotalOrder', 'OnSiteItem', 'OffSiteItem', 'TotalItem',
       'HalfHourStart', 'QuarterHour', 'HalfHour', 'Hour', 'DayHalfHour',
       'DayQuarterHour', 'DayOfWeek', 'DayOfWeekName', 'Week', 'Weekday',
       'OrderTimeMin', 'Holiday', 'BumpedDateTime_fill', 'OpenDateTime',
       'CloseDateTime', 'OnSiteOrder_fill', 'OffSiteOrder_fill',
       'OnSiteItem_fill', 'OffSiteItem_fill', 'OnSiteGuest', '

In [144]:
#NA creates for time slots with no time worked (?), fill in 0
data.loc[:,labor_st] = data.loc[:,labor_st].fillna(0)

In [145]:
labor1['BusinessDate'].min()

Timestamp('2018-06-26 00:00:00')

In [146]:
labor1['BusinessDate'].max()

Timestamp('2019-07-11 00:00:00')

In [147]:
schedule['BusinessDate'].min()

Timestamp('2018-01-03 00:00:00')

In [148]:
schedule['BusinessDate'].max()

Timestamp('2019-07-11 00:00:00')

In [149]:
del labor1
del schedule

# Logistic Regression

In [109]:
#Check if Dates from different Sources Align
data = data[data['BusinessDate']>='2018-06-26'] 

In [110]:
var = ['LongOrder','DayOfWeek','QuarterHour','DayQuarterHour','OffSiteOrder','OnSiteOrder','OnSiteItem','OffSiteItem','GuestCount','Class','OnSiteGuest']+labor_cat+labor_st
var_remove = ['LongOrder','DayOfWeek','QuarterHour','DayQuarterHour','OffSiteOrder','OnSiteOrder','OnSiteItem','OffSiteItem','GuestCount','OnSiteGuest','OnSiteGuest^2','OffSiteOrder^2','OnSiteOrder^2','OnSiteItem^2','OffSiteItem^2','GuestCount^2','Class']

reg_data = data[(data['BusinessDate']<=pd.to_datetime('2019-06-18'))][var].reset_index().drop(columns='index')
ot_data = data[(data['BusinessDate']>=pd.to_datetime('2019-06-19'))&(data['BusinessDate']<='2019-06-25')][var].reset_index().drop(columns='index') 

In [111]:
#Create Dummy Var
#Squrared Terms:
s = ['OffSiteOrder','OnSiteOrder','OnSiteItem','OffSiteItem','OnSiteGuest','GuestCount']
c = []
for i in s: 
    name = str(i)+'^2'
    reg_data.loc[:,name] = reg_data.loc[:,i]**2
    ot_data.loc[:,name] = ot_data.loc[:,i]**2
    c.append(name)
c=c+s+[None]
#Class Dummy
n = data['Class'].unique()
for i in n:
    for j in c:
        if j!=None:
            name = 'Class'+str(i)+'_'+str(j)
            reg_data[name] = np.where(reg_data['Class']==i,reg_data[j],0)
            ot_data[name] = np.where(ot_data['Class']==i,ot_data[j],0)
        else: 
            name = 'Class'+str(i)
            reg_data[name] = np.where(reg_data['Class']==i,1,0)
            ot_data[name] = np.where(ot_data['Class']==i,1,0)

#Spliting Training & Testing
from random import sample
n_total = len(reg_data)
n_train = round(n_total*0.8)
train = sample(range(n_total),n_train) 

train_data = reg_data.loc[train,:]
test_data = reg_data.drop(index=train)
reg_X = reg_data.drop(columns = var_remove)
train_X = train_data.drop(columns = var_remove)
test_X = test_data.drop(columns = var_remove)
ot_X  = ot_data.drop(columns = var_remove)

In [112]:
log = linear_model.LogisticRegression()
log.fit(train_X,train_data['LongOrder'])



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [125]:
test_data['pred_prob'] = list(pd.DataFrame(log.predict_proba(test_X)).iloc[:,1])
train_data['pred_prob'] = list(pd.DataFrame(log.predict_proba(train_X)).iloc[:,1])
reg_data['pred_prob'] = list(pd.DataFrame(log.predict_proba(reg_X)).iloc[:,1])
ot_data['pred_prob'] = list(pd.DataFrame(log.predict_proba(ot_X)).iloc[:,1])

threshold=0.005
test_data['pred'] = np.where(test_data['pred_prob']>=threshold,1,0)
train_data['pred'] = np.where(train_data['pred_prob']>=threshold,1,0)
reg_data['pred'] = np.where(reg_data['pred_prob']>=threshold,1,0)
ot_data['pred'] = np.where(ot_data['pred_prob']>=threshold,1,0)

In [126]:
print('training accuracy', sum(((train_data['LongOrder']==1)&(train_data['pred']==1))|((train_data['LongOrder']==0)
&(train_data['pred']==0)))/train_data.shape[0]) 
print('test accuracy', sum(((test_data['LongOrder']==1)&(test_data['pred']==1))|((test_data['LongOrder']==0)
&(test_data['pred']==0)))/test_data.shape[0]) 

print('training accuracy-1:', sum(((train_data['LongOrder']==1)&(train_data['pred']==1)))/train_data[train_data['LongOrder']==1].shape[0])
print('training accuracy-0:', sum(((train_data['LongOrder']==0)&(train_data['pred']==0)))/train_data[train_data['LongOrder']==0].shape[0])

print('test accuracy-1:', sum(((test_data['LongOrder']==1)&(test_data['pred']==1)))/test_data[test_data['LongOrder']==1].shape[0])
print('test accuracy-0:', sum(((test_data['LongOrder']==0)&(test_data['pred']==0)))/test_data[test_data['LongOrder']==0].shape[0])


#print('training RMSE', metrics.mean_squared_error(reg_data['LongOrder'],reg_data['pred'])**0.5)
#print('testing RMSE', metrics.mean_squared_error(test_data['OrderTime'],test_data['pred'])**0.5)
#print('out-of-time RMSE', metrics.mean_squared_error(ot_data['LongOrder'],ot_data['pred'])**0.5) 

training accuracy 0.9228395986221357
test accuracy 0.9194847985622285
training accuracy-1: 0.2119205298013245
training accuracy-0: 0.923644378471988
test accuracy-1: 0.1
test accuracy-0: 0.92046783625731


In [98]:
train_data[['OnSiteOrder','OnSiteItem','LongOrder','pred_prob','pred']].sort_values(by=['pred_prob'],ascending=False)

Unnamed: 0,OnSiteOrder,OnSiteItem,LongOrder,pred_prob,pred
84711,34.0,89.0,1,0.987123,1
84705,34.0,91.0,1,0.982874,1
84713,33.0,86.0,0,0.980968,1
84717,34.0,88.0,0,0.980574,1
84746,33.0,90.0,0,0.979450,1
29689,27.0,61.0,1,0.976682,1
84706,31.0,78.0,1,0.975691,1
84708,31.0,82.0,1,0.966051,1
84712,32.0,75.0,0,0.933631,1
87800,18.0,50.0,0,0.892892,1


In [99]:
pd.DataFrame(log.predict_proba(train_X),columns=['0','1']).sort_values(by='1',ascending=False)

Unnamed: 0,0,1
33222,0.012877,0.987123
50724,0.017126,0.982874
73225,0.019032,0.980968
26206,0.019426,0.980574
108312,0.020550,0.979450
15061,0.023318,0.976682
107790,0.024309,0.975691
70525,0.033949,0.966051
111250,0.066369,0.933631
108468,0.107108,0.892892


In [140]:
sum(((train_data['LongOrder']==1)&(train_data['pred']==1)))

16

In [141]:
train_data[train_data['LongOrder']==1].shape[0]

152

In [142]:
sum(((reg_data['LongOrder']==1)&(reg_data['pred']==1))|((reg_data['LongOrder']==0)&(reg_data['pred']==0)))

166057

In [38]:
reg_data['LongOrder']

0         0
1         0
2         0
3         0
4         0
5         0
6         0
7         0
8         0
9         0
10        0
11        0
12        0
13        0
14        0
15        0
16        0
17        0
18        0
19        0
20        0
21        0
22        0
23        0
24        0
25        0
26        0
27        0
28        0
29        0
         ..
166895    0
166896    0
166897    0
166898    0
166899    0
166900    0
166901    0
166902    0
166903    0
166904    0
166905    0
166906    0
166907    0
166908    0
166909    0
166910    0
166911    0
166912    0
166913    0
166914    0
166915    0
166916    0
166917    0
166918    0
166919    0
166920    0
166921    0
166922    0
166923    0
166924    0
Name: LongOrder, Length: 166925, dtype: int32

# Nueral Net

In [None]:
from sklearn.neural_network import MLPClassifier