In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Load the two raw inputs: the bill records and the restaurant portfolio table.
# NOTE(review): absolute local Windows paths — the notebook only runs on a machine with this folder layout.
big_bills = pd.read_csv('D:/Project_Restaurant_Chain/Full/big_bills.csv')
restaurant_portfolio = pd.read_csv('D:/Project_Restaurant_Chain/Full/restaurant_portfolio.csv')

In [3]:
# Keep only the bills whose cover count lies in the range [0, 26).
covers = big_bills['total_covers']
has_valid_covers = (covers >= 0) & (covers < 26)
big_bills_selected = big_bills[has_valid_covers]

In [4]:
big_bills_selected.head()

Unnamed: 0.1,Unnamed: 0,branch_number,concept_number,revenue_centre_number,brand,sale_date,guest_check_number,check_datetimestart,check_datetimestop,total_covers,total_gross_sales,total_net_sales,total_service_charge,total_tip,total_discount,total_cost,total_stock_cost,total_tesco_rebate
0,0,3338,1,2,Prezzo,2016-06-19,25,2016-06-19 13:57:47,2016-06-19 15:59:55,0,18.92,15.766,13.67,0.0,-90.0,57.41,1.149,45.25
1,1,2001,11,1,Prezzo,2016-06-19,7,2016-06-19 12:39:55,2016-06-19 22:18:13,0,0.0,0.0,,,,0.0,,
2,2,3204,1,1,Prezzo,2016-06-19,10,2016-06-19 12:44:17,2016-06-19 16:27:22,0,0.0,0.0,,,,0.0,1.127,
3,3,2001,11,1,Prezzo,2016-06-19,50,2016-06-19 15:33:50,2016-06-19 16:44:04,0,0.0,0.0,,,,0.0,,
4,4,3204,1,1,Prezzo,2016-06-19,20,2016-06-19 13:08:28,2016-06-19 13:54:32,0,0.0,0.0,,,,0.0,3.488,


In [5]:
# Attach the restaurant attributes to every selected bill (inner join on the branch number).
merged_df = big_bills_selected.merge(restaurant_portfolio, on='branch_number')

## Feature Selection and Manipulation

Based on the columns available after the merge, select the features we deem useful.

### Dropping features

In [6]:
# First batch of columns to discard, listed in the order they are dropped.
features_to_drop_first_batch = [
    'Unnamed: 0_x',
    'Unnamed: 0_y',
    'brand_x',
    'brand_y',
    'concept_number_x',
    'concept_number_y',
    'total_net_sales',
    'exec_employee_id',
    'ops_area_employee_id',
    'region',
    'status',
    'total_service_charge',
    'total_tesco_rebate',
    'company',
    'description',
    'area_chef_employee_id',
]

In [7]:
# Remove the first batch of columns from merged_df in place.
# Re-running this cell raises a KeyError because the columns are already gone.
merged_df.drop(features_to_drop_first_batch, axis=1, inplace=True)

In [8]:
merged_df.columns

Index(['branch_number', 'revenue_centre_number', 'sale_date',
       'guest_check_number', 'check_datetimestart', 'check_datetimestop',
       'total_covers', 'total_gross_sales', 'total_tip', 'total_discount',
       'total_cost', 'total_stock_cost', 'post_code', 'town', 'county',
       'country', 'location', 'rating', 'total_seats', 'total_inside_seats',
       'ground_seats', '1st_floor_seats', 'conservatory_seats', 'bar_seats',
       'outside_seats', 'private_dining_covers', 'ops_dir_employee_id',
       'ops_manager_employee_id'],
      dtype='object')

In [9]:
merged_df.head()

Unnamed: 0,branch_number,revenue_centre_number,sale_date,guest_check_number,check_datetimestart,check_datetimestop,total_covers,total_gross_sales,total_tip,total_discount,...,total_seats,total_inside_seats,ground_seats,1st_floor_seats,conservatory_seats,bar_seats,outside_seats,private_dining_covers,ops_dir_employee_id,ops_manager_employee_id
0,3338,2,2016-06-19,25,2016-06-19 13:57:47,2016-06-19 15:59:55,0,18.92,0.0,-90.0,...,204.0,144.0,128.0,0.0,0.0,16.0,60.0,No,626.0,999059.0
1,3338,1,2016-06-19,2,2016-06-19 13:05:28,2016-06-19 13:06:00,0,-15.87,,,...,204.0,144.0,128.0,0.0,0.0,16.0,60.0,No,626.0,999059.0
2,3338,1,2016-06-19,11,2016-06-19 16:29:55,2016-06-19 16:30:07,0,0.0,,,...,204.0,144.0,128.0,0.0,0.0,16.0,60.0,No,626.0,999059.0
3,3338,1,2016-06-19,103,2016-06-19 20:19:09,2016-06-19 20:30:52,0,3.69,,,...,204.0,144.0,128.0,0.0,0.0,16.0,60.0,No,626.0,999059.0
4,3338,1,2016-06-19,10,2016-06-19 12:52:23,2016-06-19 12:52:36,0,4.99,,,...,204.0,144.0,128.0,0.0,0.0,16.0,60.0,No,626.0,999059.0


### Missing data imputation

There are two types of data: numerical and categorical.
Because we have limited knowledge of the numerical data, we replace 'NaN' with 0 by default, and make special arrangements for a few special cases.

In [10]:
# Replace missing values with 0 in the sales figures and the seat counts.
# A single DataFrame-level call is used instead of `merged_df.<col>.fillna(..., inplace=True)`:
# that chained form works on a column object pulled out of the frame and, under pandas
# Copy-on-Write, no longer writes the result back into `merged_df`.
# 'private_dining_covers' contains text values, so its gaps become the integer 0 next to
# the existing strings; that column is cleaned further below.
columns_to_zero_fill = [
    'total_gross_sales',
    'total_tip',
    'total_discount',
    'total_cost',
    'total_stock_cost',
    'total_seats',
    'total_inside_seats',
    'ground_seats',
    '1st_floor_seats',
    'conservatory_seats',
    'bar_seats',
    'outside_seats',
    'private_dining_covers',
]
merged_df.fillna({column: 0 for column in columns_to_zero_fill}, inplace=True)

After data imputation:

In [11]:
merged_df.conservatory_seats.value_counts()

0.0     5065214
5.0       21149
8.0       18262
31.0      16113
18.0      15794
4.0       15088
Name: conservatory_seats, dtype: int64

This distribution is too lopsided (almost every value is 0), so this feature must be dropped.

### One Hot Encoding
As for the categorical features, their values must be one-hot encoded. To avoid creating too many new attributes, high-cardinality attributes must be deleted first. The number of distinct values of each candidate feature is listed below.

branch_number: 258

revenue_centre_number: 3 (but heavily concentrated on one type)

sale_date: 366

guest_check_number: 370

post_code: 258

town: 221

county: 38

location: 258

rating: 7

ops_dir_employee_id: 2 (very hard to imagine this is going to be important)

ops_manager_employee_id: 8 (very hard to imagine this is gonna be important)

In general, we have to only keep 'county' and 'rating' for now. But for 'private_dining_covers', the data must be cleaned further.

The geographical values must also be dealt with here.

## Geographical information extraction

In [12]:
merged_df.town.value_counts(dropna = False)

London              342163
NaN                 171110
Glasgow             102310
Lincoln              74334
Stevenage            60588
York                 56772
Milton Keynes        55198
Chester              54502
Chelmsford           47491
Edinburgh            45179
Cardiff              44993
Bristol              44451
Grays                42041
Braintree            41983
Leeds                41824
Nottingham           41401
Aberdeen             41069
Bournemouth          39419
Newport              38579
Cheltenham           36373
Manchester           35914
Canterbury           35876
Leicester            32196
Norwich              31836
Bath                 31787
Birmingham           31784
Cambridge            31761
Wembley              31307
Sheffield            30542
Fareham              30424
                     ...  
Lytham St. Annes     12795
Farnham              12740
Bromsgrove           12682
Gloucester           12585
St Austell           12463
Bridgwater           12399
C

Too many 'NaN' values.

In [20]:
merged_df.location.value_counts(dropna = False)

Kings Cross                 93219
Lincoln                     74334
Stevenage LP                47315
Northumberland Ave          45855
Euston                      45078
St Martins Lane             43459
Lakeside                    42041
Braintree                   41983
Aberdeen                    41069
Canterbury                  35876
Edinburgh 2                 32363
Victoria Place              32036
Bath                        31787
Cambridge                   31761
Wembley                     31307
Bournemouth                 31284
Glasgow                     30943
Fareham                     30424
Glass House Street          30148
Haymarket                   29702
Milton Keynes L/Park        29323
Hemel Hempstead             29279
Cheshire Oaks               29223
Cardiff 2                   29150
Liffey Valley               29123
Plymouth                    29020
Chelmsford                  28583
York 2                      28526
Hull                        28456
Manchester Med

It is hard to use values of this kind directly.

In [18]:
merged_df.post_code.value_counts(dropna = False)

N1C 4AH      93219
LN1 1YX      74334
SG1 2UA      47315
WC2N 5BW     45855
NW1 2BD      45078
WC2N 4BF     43459
RM20 2ZN     42041
CM7 8YH      41983
AB11 5RG     41069
CT1 2BA      35876
EH1 1SB      32363
SW1W 9SJ     32036
BA1 1SX      31787
CB2 1UJ      31761
HA9 0FD      31307
BH1 2BZ      31284
G53 6AG      30943
PO15 7PD     30424
W1B 5DL      30148
SW1Y 4BP     29702
MK1 1ST      29323
HP2 4JS      29279
CH65 9HD     29223
CF10 2EF     29150
NaN          29123
PL1 3GE      29020
CM1 1GD      28583
YO32 9AE     28526
HU2 8LN      28456
M50 2HF      28456
             ...  
EN5 5TD      12538
PL25 5AZ     12463
TA6 3BY      12399
CA1 1DN      12361
DL1 5AD      12340
L37 3NW      12235
HA6 2PY      12168
KT19 8DA     12044
B13 8DD      11980
CO10 2ET     11962
TR7 1DN      11854
NR19 2AP     11698
GL20 5AB     11396
GU34 1HD     11335
SR7 9HU      10878
SN15 1ES     10767
TR11 3QA     10681
FY6 7DF      10549
RG14 5DH     10226
 HU17 8AJ     9860
CO9 2AR       9859
DT1 1UJ     

In [30]:
# print more entries
pd.options.display.max_rows = 1000
np.set_printoptions(threshold = 1000)

In [37]:
# Distinct postcodes (missing values included), most frequent first.
post_code_index = merged_df['post_code'].value_counts(dropna=False).index

In [38]:
# Copy the index into a plain list so every entry is printed when inspected.
# `list(...)` replaces the element-by-element append loop.
post_code_list = list(post_code_index)

In [39]:
post_code_list

['N1C 4AH',
 'LN1 1YX',
 'SG1 2UA',
 'WC2N 5BW',
 'NW1 2BD',
 'WC2N 4BF',
 'RM20 2ZN',
 'CM7 8YH',
 'AB11 5RG',
 'CT1 2BA',
 'EH1 1SB',
 'SW1W 9SJ',
 'BA1 1SX',
 'CB2 1UJ',
 'HA9 0FD',
 'BH1 2BZ',
 'G53 6AG',
 'PO15 7PD',
 'W1B 5DL',
 'SW1Y 4BP',
 'MK1 1ST',
 'HP2 4JS',
 'CH65 9HD',
 'CF10 2EF',
 nan,
 'PL1 3GE',
 'CM1 1GD',
 'YO32 9AE',
 'HU2 8LN',
 'M50 2HF',
 'W1K 6WZ',
 'IP33 1LW',
 'YO1 9RG',
 'HA1 1HS',
 'BS1 5DB',
 'PE30 1JJ',
 'OX9 3EW',
 'RG1 2HF',
 'SE10 0QJ',
 'OX26 6FA',
 'G1 2ER',
 'BA16 0BB',
 'W1H 7LU',
 'RH10 1BT',
 'MK10 0BA',
 'CT10 1LU',
 'CH4 0DP',
 'PO6 4TP',
 'LS1 8TL',
 'BA14 8AH',
 'G51 4BP',
 'NN1 2DA',
 'PE1 1NA',
 'PO19 1EE',
 'NG1 4AA',
 'AL10 0XY',
 'CH45 2PB',
 'DN21 2NA',
 'NW7 3TG',
 'TW18 4WB',
 'HP23 4AB',
 'SP1 2PF',
 'SO50 5FX',
 'CO15 1QZ',
 'EX4 3EB',
 'SS6 7EW',
 'W8 5BA',
 'NN16 0AQ',
 'SG6 3BY',
 'EN2 6LU',
 'TN1 1RT',
 'W1J 7RS',
 'PO30 1AB',
 'BL1 2AL',
 'SL4 1PL',
 'TQ2 5EG',
 'BT1 4QG',
 'CM9 5EP',
 'CM23 2LD',
 'IP1 1DT',
 'IP15 5AQ',
 'HP9

Inspecting the values, 'nan' and ' HU17 8AJ' (note the leading space) must be dealt with.

In [41]:
# Strip stray surrounding whitespace from every postcode (the data contains ' HU17 8AJ')
# rather than patching that single value by hand. Missing postcodes stay NaN.
merged_df['post_code'] = merged_df['post_code'].str.strip()

In [102]:
# Label missing postcodes with the text placeholder 'NaN' so they can be matched as a string.
merged_df['post_code'] = merged_df['post_code'].fillna('NaN')

In [103]:
merged_df.post_code.value_counts(dropna = False)

N1C 4AH     93219
LN1 1YX     74334
SG1 2UA     47315
WC2N 5BW    45855
NW1 2BD     45078
WC2N 4BF    43459
RM20 2ZN    42041
CM7 8YH     41983
AB11 5RG    41069
CT1 2BA     35876
EH1 1SB     32363
SW1W 9SJ    32036
BA1 1SX     31787
CB2 1UJ     31761
HA9 0FD     31307
BH1 2BZ     31284
G53 6AG     30943
PO15 7PD    30424
W1B 5DL     30148
SW1Y 4BP    29702
MK1 1ST     29323
HP2 4JS     29279
CH65 9HD    29223
CF10 2EF    29150
NaN         29123
PL1 3GE     29020
CM1 1GD     28583
YO32 9AE    28526
HU2 8LN     28456
M50 2HF     28456
W1K 6WZ     28446
IP33 1LW    28277
YO1 9RG     28246
HA1 1HS     28047
BS1 5DB     28044
PE30 1JJ    27776
OX9 3EW     27367
RG1 2HF     27083
SE10 0QJ    26786
OX26 6FA    26653
G1 2ER      26575
BA16 0BB    26352
W1H 7LU     26155
RH10 1BT    26062
MK10 0BA    25875
CT10 1LU    25686
CH4 0DP     25279
PO6 4TP     25188
LS1 8TL     25110
BA14 8AH    25076
G51 4BP     24975
NN1 2DA     24866
PE1 1NA     24684
PO19 1EE    24493
NG1 4AA     24382
AL10 0XY  

Use the table downloaded online to extract the area it belongs to.

Refer to this page for more information, https://en.wikipedia.org/wiki/Regions_of_England.

#### Note that the table differs slightly from the Wikipedia page.

In [104]:
# Lookup table with one row per postcode prefix ('Postcode_head') and the area it belongs to ('Area').
# NOTE(review): absolute local path — see the data-loading cell at the top.
postcodeareas = pd.read_csv('D:/Project_Restaurant_Chain/Full/postcodeareas.csv')

In [105]:
postcodeareas.head()

Unnamed: 0,Postcode_head,Area
0,GY,Channel Islands
1,JE,Channel Islands
2,CB,East England
3,CM,East England
4,CO,East England


In [106]:
# Pull the postcode column out as a plain Python list, one entry per row.
post_code = merged_df['post_code'].tolist()

In [107]:
from collections import Counter

In [108]:
Counter(post_code)

Counter({'AB11 5RG': 41069,
         'AL10 0XY': 23979,
         'AL3 4DD': 17036,
         'AL5 2HX': 13546,
         'AL8 6AL': 17498,
         'B13 8DD': 11980,
         'B17 9NJ': 19804,
         'B61 8HQ': 12682,
         'B70 7PP': 14033,
         'B75 5BP': 7285,
         'B90 3AG': 18339,
         'B97 4EQ': 17521,
         'BA1 1SX': 31787,
         'BA14 8AH': 25076,
         'BA16 0BB': 26352,
         'BA20 1EG': 9166,
         'BH1 2BZ': 31284,
         'BH2 5DD': 8135,
         'BH21 1LT': 18098,
         'BH23 1DY': 14087,
         'BH24 1AN': 17099,
         'BL1 2AL': 21989,
         'BN18 9AJ': 16113,
         'BN2 5WA': 20558,
         'BN21 1BA': 16795,
         'BN27 1AR': 14253,
         'BN7 1YE': 14268,
         'BR3 1AG': 16475,
         'BR7 5AG': 15257,
         'BS1 5DB': 28044,
         'BS37 4FT': 16407,
         'BT1 4QG': 21608,
         'BT12 6HU': 5235,
         'CA1 1DN': 12361,
         'CB10 1EX': 15288,
         'CB2 1UJ': 31761,
         'CB7 4JZ'

In [118]:
def _postcode_area_letters(code):
    """Return the leading area letters of a postcode, or 'NaN' when there is none.

    The area is the first character, plus the second character when that one is
    not a digit: 'N1C 4AH' -> 'N', 'WC2N 5BW' -> 'WC'.
    Anything that is not a non-empty string (for example a real NaN float), as well
    as the 'NaN' placeholder itself, is reported as 'NaN'.
    """
    if not isinstance(code, str) or code in ('', 'NaN'):
        return 'NaN'
    head = code[:2]
    # A one-character code has no second character to test, so it is returned as is.
    if len(head) == 2 and head[1].isdigit():
        return head[0]
    return head


# One area prefix per row, in the same order as `post_code`.
post_code_header = [_postcode_area_letters(code) for code in post_code]

In [119]:
Counter(post_code_header)

Counter({'AB': 41069,
         'AL': 72059,
         'B': 101644,
         'BA': 92381,
         'BH': 88703,
         'BL': 21989,
         'BN': 81987,
         'BR': 31732,
         'BS': 44451,
         'BT': 26843,
         'CA': 12361,
         'CB': 105355,
         'CF': 58428,
         'CH': 78325,
         'CM': 201842,
         'CO': 62393,
         'CT': 79253,
         'CV': 49605,
         'DA': 16599,
         'DE': 27827,
         'DL': 21745,
         'DN': 23721,
         'DT': 43594,
         'E': 38083,
         'EH': 45179,
         'EN': 49718,
         'EX': 52181,
         'FY': 38692,
         'G': 102310,
         'GL': 60354,
         'GU': 136432,
         'HA': 114520,
         'HG': 28394,
         'HP': 104890,
         'HR': 17139,
         'HU': 38316,
         'IG': 36411,
         'IP': 126159,
         'KT': 62989,
         'KY': 17905,
         'L': 12235,
         'LE': 51853,
         'LN': 74334,
         'LS': 41824,
         'M': 64370,
       

In [143]:
# Pre-fill the result with 0; rows whose prefix is never matched keep this value.
UK_area_list = [0] * len(post_code_header)

In [147]:
# Map each postcode prefix to its area with one dictionary lookup per row.
# The previous nested loop rescanned the whole `postcodeareas` table for every row.
# As before, a prefix listed more than once resolves to its last entry, and a prefix
# that is not in the table (including the 'NaN' placeholder) is recorded as 0.
area_by_postcode_head = dict(zip(postcodeareas.Postcode_head, postcodeareas.Area))
UK_area_list = [area_by_postcode_head.get(header, 0) for header in post_code_header]

In [148]:
Counter(UK_area_list)

Counter({0: 29123,
         'East Midlands': 206454,
         'North East': 61131,
         'Wales (North)': 16546,
         'North West': 302372,
         'Northern Ireland': 26843,
         'Wales (South)': 123500,
         'West Midlands': 201406,
         'South West': 562700,
         'Scotland (South & Central)': 165394,
         'Home Counties': 596951,
         'South East': 691792,
         'Yorkshire & Humberside': 219569,
         'Greater London': 1104297,
         'East England': 802473,
         'Scotland (North)': 41069})

Create a new data frame to hold the values and save as csv file.

In [171]:
# Column layout for the new frame: a single 'Area' column holding one label per row.
data = {'Area': UK_area_list}

In [173]:
# Build the frame; it gets a default 0..n-1 index, i.e. the position in UK_area_list.
Area_df = pd.DataFrame(data)

In [174]:
# Persist the area labels so they can be reloaded without recomputing them.
# The index is written too, as the first (unnamed) column of the file.
# NOTE(review): this relative path writes to the current working directory, while the reload
# cell reads 'D:/Project_Restaurant_Chain/Full/Area_of_post_codes.csv' — confirm both are the same location.
Area_df.to_csv('Area_of_post_codes.csv')

#### Read the csv again and join the two dataset

In [13]:
# Reload the saved area labels. Read back from CSV, the label that was the integer 0
# arrives as the text '0', and the saved index arrives as an 'Unnamed: 0' column.
# NOTE(review): absolute local path — see the data-loading cell at the top.
Area_df = pd.read_csv('D:/Project_Restaurant_Chain/Full/Area_of_post_codes.csv')

In [15]:
# Append the area columns to merged_df; `join` aligns the two frames on their index.
# NOTE(review): assumes the rows of Area_df are in the same order as merged_df — confirm.
# Re-running this cell fails because the joined columns already exist.
merged_df = merged_df.join(Area_df)

In [39]:
merged_df.Area.value_counts()

Greater London                1104297
East England                   802473
South East                     691792
Home Counties                  596951
South West                     562700
North West                     302372
Yorkshire & Humberside         219569
East Midlands                  206454
West Midlands                  201406
Scotland (South & Central)     165394
Wales (South)                  123500
North East                      61131
Scotland (North)                41069
0                               29123
Northern Ireland                26843
Wales (North)                   16546
Name: Area, dtype: int64

There is no need to keep so many regions, so some of them must be combined, for example the south and the north of Wales. There is also the value '0': these 29123 rows had 'NaN' as their postcode and so could not be matched to an area. We know from the first part of the data cleaning that exactly 29123 transactions come from the Ireland branches, which do not have UK postcodes, so these rows are labelled 'Ireland'.

In [41]:
# Collapse the sub-regions of Scotland and Wales, and name the '0' placeholder 'Ireland'.
area_relabelling = {
    'Scotland (South & Central)': 'Scotland',
    'Scotland (North)': 'Scotland',
    'Wales (North)': 'Wales',
    'Wales (South)': 'Wales',
    '0': 'Ireland',
}
merged_df['Area'] = merged_df['Area'].replace(area_relabelling)

In [42]:
merged_df.Area.value_counts()

Greater London            1104297
East England               802473
South East                 691792
Home Counties              596951
South West                 562700
North West                 302372
Yorkshire & Humberside     219569
Scotland                   206463
East Midlands              206454
West Midlands              201406
Wales                      140046
North East                  61131
Ireland                     29123
Northern Ireland            26843
Name: Area, dtype: int64

Now this looks pretty clean, we should drop extra features we will not use now.

In [43]:
# Second batch of columns to discard, listed in the order they are dropped.
features_to_drop_second_batch = [
    'branch_number',
    'revenue_centre_number',
    'sale_date',
    'guest_check_number',
    'post_code',
    'town',
    'county',
    'location',
    'ops_dir_employee_id',
    'ops_manager_employee_id',
    'country',
    'conservatory_seats',
]

In [44]:
# Remove the second batch of columns from merged_df in place.
# Re-running this cell raises a KeyError because the columns are already gone.
merged_df.drop(features_to_drop_second_batch, axis=1, inplace=True)

In [45]:
merged_df.private_dining_covers.value_counts()

No         2845219
0          1034002
0           189601
40          175866
30          133667
60          114157
12           88629
14           83620
10           74419
NO           64508
20-40        41898
135          30943
25 - 50      30347
15 - 30      29291
15 - 50      29150
15           24753
8            21949
24 & 24      20402
24           19920
20           19650
50           18098
20 - 30      16572
25           15493
50-70        15484
28           13982
Name: private_dining_covers, dtype: int64

This is a mess, and values must be reassigned.

'0': 0, 'No': 0, 'NO': 0

We don't know what '24 & 24' is, but maybe we can keep it for now.

In [46]:
# Every spelling of "no private dining" ('0', 'No', 'NO') becomes the number 0.
no_private_dining = dict.fromkeys(['0', 'No', 'NO'], 0)
merged_df['private_dining_covers'] = merged_df['private_dining_covers'].replace(no_private_dining)

In [47]:
merged_df.private_dining_covers.value_counts()

0          4133330
40          175866
30          133667
60          114157
12           88629
14           83620
10           74419
20-40        41898
135          30943
25 - 50      30347
15 - 30      29291
15 - 50      29150
15           24753
8            21949
24 & 24      20402
24           19920
20           19650
50           18098
20 - 30      16572
25           15493
50-70        15484
28           13982
Name: private_dining_covers, dtype: int64

There are still too many distinct values, so some of them must be replaced further. Values given as a range are replaced with the midpoint of the range (rounded up to a whole number), and '24 & 24' is replaced with 24.

In [48]:
# One representative number for each range-style entry.
range_to_single_value = {
    '20-40': 30,
    '25 - 50': 38,
    '15 - 30': 23,
    '15 - 50': 33,
    '24 & 24': 24,
    '20 - 30': 25,
    '50-70': 60,
}
merged_df['private_dining_covers'] = merged_df['private_dining_covers'].replace(range_to_single_value)

In [49]:
merged_df.private_dining_covers.value_counts()

0      4133330
40      175866
30      133667
60      114157
12       88629
14       83620
10       74419
30       41898
135      30943
38       30347
23       29291
33       29150
15       24753
8        21949
24       20402
24       19920
20       19650
50       18098
25       16572
25       15493
60       15484
28       13982
Name: private_dining_covers, dtype: int64

Replace the remaining numeric strings with their numeric values.

In [50]:
# Convert every remaining digit string ('40', '135', ...) to a number in one step.
# This replaces a hand-written mapping that listed '60' twice and would silently leave
# any value missing from it as text; pd.to_numeric raises if a non-numeric value remains.
merged_df['private_dining_covers'] = pd.to_numeric(merged_df['private_dining_covers'])

In [51]:
merged_df.private_dining_covers.value_counts()

0      4133330
40      175866
30      175565
60      129641
12       88629
14       83620
10       74419
24       40322
25       32065
135      30943
38       30347
23       29291
33       29150
15       24753
8        21949
20       19650
50       18098
28       13982
Name: private_dining_covers, dtype: int64

In [52]:
merged_df.private_dining_covers.value_counts().sort_index()

0      4133330
8        21949
10       74419
12       88629
14       83620
15       24753
20       19650
23       29291
24       40322
25       32065
28       13982
30      175565
33       29150
38       30347
40      175866
50       18098
60      129641
135      30943
Name: private_dining_covers, dtype: int64

It seems much better now since they are all values.

#### Do one-hot coding

Compared to the first version, very few new columns are created.

In [54]:
# The categorical columns that are expanded into 0/1 indicator columns.
columns_to_one_hot_encode = ['Area', 'rating']

In [55]:
#one hot encoding
def one_hot_encode_columns(df, list_of_columns_to_one_hot_encode):
    """Return `df` with indicator columns added for the given categorical columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It is not modified.
    list_of_columns_to_one_hot_encode : list of str
        Columns to expand. Each one is treated as categorical whatever its dtype.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every original column (the encoded ones included)
        followed by one '<column>_<value>' indicator column per distinct value.
        Missing values get no indicator column.
    """
    # Cast a copy of the selected columns rather than assigning back into `df`,
    # so the caller's frame keeps its original dtypes.
    categorical_part = df[list_of_columns_to_one_hot_encode].astype('object')
    one_hot = pd.get_dummies(categorical_part, dummy_na=False)
    return df.join(one_hot)

In [56]:
# Add the indicator columns for 'Area' and 'rating'; the result is a new frame.
processed_merged_df = one_hot_encode_columns(merged_df,columns_to_one_hot_encode)

Drop 'Area' and 'rating' now.

In [57]:
# The original text columns are removed now that their indicator columns exist.
# Re-running this cell raises a KeyError because the columns are already gone.
processed_merged_df.drop(['Area', 'rating'], axis=1, inplace=True)

In [58]:
processed_merged_df.head()

Unnamed: 0,check_datetimestart,check_datetimestop,total_covers,total_gross_sales,total_tip,total_discount,total_cost,total_stock_cost,total_seats,total_inside_seats,...,Area_South West,Area_Wales,Area_West Midlands,Area_Yorkshire & Humberside,rating_A,rating_B,rating_C,rating_D,rating_E,rating_F
0,2016-06-19 13:57:47,2016-06-19 15:59:55,0,18.92,0.0,-90.0,57.41,1.149,204.0,144.0,...,0,0,0,0,0,1,0,0,0,0
1,2016-06-19 13:05:28,2016-06-19 13:06:00,0,-15.87,0.0,0.0,15.87,-1.916,204.0,144.0,...,0,0,0,0,0,1,0,0,0,0
2,2016-06-19 16:29:55,2016-06-19 16:30:07,0,0.0,0.0,0.0,0.0,1.52,204.0,144.0,...,0,0,0,0,0,1,0,0,0,0
3,2016-06-19 20:19:09,2016-06-19 20:30:52,0,3.69,0.0,0.0,-3.69,0.214,204.0,144.0,...,0,0,0,0,0,1,0,0,0,0
4,2016-06-19 12:52:23,2016-06-19 12:52:36,0,4.99,0.0,0.0,-4.99,1.722,204.0,144.0,...,0,0,0,0,0,1,0,0,0,0


In [None]:
# Drop the leftover 'Unnamed: 0' column.
# NOTE(review): presumably the saved index that came in when Area_of_post_codes.csv was read back — confirm.
processed_merged_df.drop(['Unnamed: 0'], axis=1, inplace=True)

In [61]:
processed_merged_df.columns

Index(['check_datetimestart', 'check_datetimestop', 'total_covers',
       'total_gross_sales', 'total_tip', 'total_discount', 'total_cost',
       'total_stock_cost', 'total_seats', 'total_inside_seats', 'ground_seats',
       '1st_floor_seats', 'bar_seats', 'outside_seats',
       'private_dining_covers', 'Area_East England', 'Area_East Midlands',
       'Area_Greater London', 'Area_Home Counties', 'Area_Ireland',
       'Area_North East', 'Area_North West', 'Area_Northern Ireland',
       'Area_Scotland', 'Area_South East', 'Area_South West', 'Area_Wales',
       'Area_West Midlands', 'Area_Yorkshire & Humberside', 'rating_A',
       'rating_B', 'rating_C', 'rating_D', 'rating_E', 'rating_F'],
      dtype='object')

Two timestamps are available for each bill: the time the check was opened (start) and the time it was closed (stop). From these we compute how much time was spent on the meal.

In [62]:
from datetime import datetime

In [63]:
datetimeFormat = '%Y-%m-%d %H:%M:%S'


def get_check_time_difference(df, columns_names):
    """Return the duration of every check in whole minutes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the two timestamp columns as text in `datetimeFormat`.
    columns_names : sequence of str
        `[start_column, stop_column]`, in that order.

    Returns
    -------
    pandas.Series
        Integer minutes (stop minus start, rounded to the nearest minute),
        indexed like `df`. A stop time earlier than the start time gives a
        negative value.
    """
    start_column, stop_column = columns_names[0], columns_names[1]
    # Read from the frame that was passed in (not from a notebook-level variable) and
    # parse whole columns at once, so any index works and repeated calls are independent.
    start_dt = pd.to_datetime(df[start_column], format=datetimeFormat)
    end_dt = pd.to_datetime(df[stop_column], format=datetimeFormat)
    # total_seconds() covers the full interval; `.seconds` alone would drop whole days.
    elapsed_minutes = (end_dt - start_dt).dt.total_seconds() / 60
    return elapsed_minutes.round().astype('int64')

In [64]:
# Duration of every check in minutes, one value per row of processed_merged_df.
check_time_difference_in_minutes = get_check_time_difference(processed_merged_df, ['check_datetimestart', 'check_datetimestop'])  

In [65]:
check_time_difference_in_minutes

0          122
1            1
2            0
3           12
4            0
5           35
6            0
7            0
8           42
9            0
10         613
11          37
12          36
13           4
14          24
15          49
16          82
17           0
18           3
19         177
20           0
21           0
22          66
23          19
24           0
25           0
26          22
27          35
28          80
29          83
          ... 
5151590     82
5151591    110
5151592    381
5151593     26
5151594     40
5151595     41
5151596     31
5151597     85
5151598     50
5151599     41
5151600     72
5151601     91
5151602     60
5151603     72
5151604     35
5151605     67
5151606     77
5151607     86
5151608     82
5151609     69
5151610     68
5151611     78
5151612     86
5151613     75
5151614     68
5151615     61
5151616     71
5151617    134
5151618    116
5151619     86
dtype: int64

In [66]:
# Wrap the Series in a one-column DataFrame so it can be saved and joined.
# NOTE(review): the column is named '..._in_seconds' although get_check_time_difference divides
# by 60, so the values are minutes. The name is left unchanged here because files saved
# with it may already be read elsewhere — confirm before renaming.
check_time_difference_in_minutes_df = check_time_difference_in_minutes.to_frame(name = 'check_time_difference_in_seconds')

In [67]:
check_time_difference_in_minutes_df

Unnamed: 0,check_time_difference_in_seconds
0,122
1,1
2,0
3,12
4,0
5,35
6,0
7,0
8,42
9,0


In [68]:
# Save the durations to the current working directory; the index is written as the first column.
check_time_difference_in_minutes_df.to_csv('check_time_difference_in_minutes.csv')

In [69]:
# Add the duration column to the feature frame; `join` aligns the two frames on their index.
processed_merged_new_df = processed_merged_df.join(check_time_difference_in_minutes_df)

In [70]:
processed_merged_new_df

Unnamed: 0,check_datetimestart,check_datetimestop,total_covers,total_gross_sales,total_tip,total_discount,total_cost,total_stock_cost,total_seats,total_inside_seats,...,Area_Wales,Area_West Midlands,Area_Yorkshire & Humberside,rating_A,rating_B,rating_C,rating_D,rating_E,rating_F,check_time_difference_in_seconds
0,2016-06-19 13:57:47,2016-06-19 15:59:55,0,18.92,0.00,-90.00,57.41,1.149,204.0,144.0,...,0,0,0,0,1,0,0,0,0,122
1,2016-06-19 13:05:28,2016-06-19 13:06:00,0,-15.87,0.00,0.00,15.87,-1.916,204.0,144.0,...,0,0,0,0,1,0,0,0,0,1
2,2016-06-19 16:29:55,2016-06-19 16:30:07,0,0.00,0.00,0.00,0.00,1.520,204.0,144.0,...,0,0,0,0,1,0,0,0,0,0
3,2016-06-19 20:19:09,2016-06-19 20:30:52,0,3.69,0.00,0.00,-3.69,0.214,204.0,144.0,...,0,0,0,0,1,0,0,0,0,12
4,2016-06-19 12:52:23,2016-06-19 12:52:36,0,4.99,0.00,0.00,-4.99,1.722,204.0,144.0,...,0,0,0,0,1,0,0,0,0,0
5,2016-06-19 18:03:58,2016-06-19 18:39:00,0,5.18,0.00,0.00,-5.18,0.934,204.0,144.0,...,0,0,0,0,1,0,0,0,0,35
6,2016-06-19 18:37:44,2016-06-19 18:37:52,0,5.18,0.00,0.00,-5.18,0.741,204.0,144.0,...,0,0,0,0,1,0,0,0,0,0
7,2016-06-19 18:39:21,2016-06-19 18:39:29,0,4.29,0.00,0.00,-4.29,0.667,204.0,144.0,...,0,0,0,0,1,0,0,0,0,0
8,2016-06-19 12:49:02,2016-06-19 13:30:46,0,0.00,0.00,0.00,0.00,0.000,204.0,144.0,...,0,0,0,0,1,0,0,0,0,42
9,2016-06-19 12:09:40,2016-06-19 12:09:44,0,0.00,0.00,0.00,0.00,0.667,204.0,144.0,...,0,0,0,0,1,0,0,0,0,0


Remove the two previous attributes now.

In [71]:
# The raw timestamps are removed now that the duration has been derived from them.
# Re-running this cell raises a KeyError because the columns are already gone.
processed_merged_new_df.drop(['check_datetimestart', 'check_datetimestop'], axis=1, inplace=True)

In [72]:
processed_merged_new_df.columns

Index(['total_covers', 'total_gross_sales', 'total_tip', 'total_discount',
       'total_cost', 'total_stock_cost', 'total_seats', 'total_inside_seats',
       'ground_seats', '1st_floor_seats', 'bar_seats', 'outside_seats',
       'private_dining_covers', 'Area_East England', 'Area_East Midlands',
       'Area_Greater London', 'Area_Home Counties', 'Area_Ireland',
       'Area_North East', 'Area_North West', 'Area_Northern Ireland',
       'Area_Scotland', 'Area_South East', 'Area_South West', 'Area_Wales',
       'Area_West Midlands', 'Area_Yorkshire & Humberside', 'rating_A',
       'rating_B', 'rating_C', 'rating_D', 'rating_E', 'rating_F',
       'check_time_difference_in_seconds'],
      dtype='object')

In [73]:
processed_merged_new_df.isnull().any().any()

False

There are now no 'NaN' values left in the dataset, so we save it at this point.

In [74]:
processed_merged_new_df.head()

Unnamed: 0,total_covers,total_gross_sales,total_tip,total_discount,total_cost,total_stock_cost,total_seats,total_inside_seats,ground_seats,1st_floor_seats,...,Area_Wales,Area_West Midlands,Area_Yorkshire & Humberside,rating_A,rating_B,rating_C,rating_D,rating_E,rating_F,check_time_difference_in_seconds
0,0,18.92,0.0,-90.0,57.41,1.149,204.0,144.0,128.0,0.0,...,0,0,0,0,1,0,0,0,0,122
1,0,-15.87,0.0,0.0,15.87,-1.916,204.0,144.0,128.0,0.0,...,0,0,0,0,1,0,0,0,0,1
2,0,0.0,0.0,0.0,0.0,1.52,204.0,144.0,128.0,0.0,...,0,0,0,0,1,0,0,0,0,0
3,0,3.69,0.0,0.0,-3.69,0.214,204.0,144.0,128.0,0.0,...,0,0,0,0,1,0,0,0,0,12
4,0,4.99,0.0,0.0,-4.99,1.722,204.0,144.0,128.0,0.0,...,0,0,0,0,1,0,0,0,0,0


In [75]:
# Write the final feature table to the current working directory.
# The index is written as well, so the file gains an unnamed first column when read back.
processed_merged_new_df.to_csv('merged_dataset.csv')