# 01. Import libraries and data

In [2]:
# Import libraries

import pandas as pd
import numpy as np
import os
# Set path
path = r'/home/scruffy/anaconda_projects/Instacart Basket Analysis/'
# Import data to dataframe
df_ords_prods = pd.read_pickle(os.path.join(path, '02 Data','Prepared Data', 'ords_prods_cust_merge.pkl'))

In [3]:
# Check import
print(df_ords_prods.head())

   product_id                product_name  aisle_id  department_id    prices  \
0           1  Chocolate Sandwich Cookies        61             19  5.800781   
1           1  Chocolate Sandwich Cookies        61             19  5.800781   
2           1  Chocolate Sandwich Cookies        61             19  5.800781   
3           1  Chocolate Sandwich Cookies        61             19  5.800781   
4           1  Chocolate Sandwich Cookies        61             19  5.800781   

   order_id  user_id  order_number  order_dow  order_hour_of_day  ...  \
0   3139998      138            28          6                 11  ...   
1   1977647      138            30          6                 17  ...   
2    389851      709             2          0                 21  ...   
3    652770      764             1          3                 13  ...   
4   1813452      764             3          4                 17  ...   

   reordered  first_name  last_name  gender      state age date_joined  \
0     

In [4]:
print(df_ords_prods.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32404859 entries, 0 to 32404858
Data columns (total 22 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   product_id              int32  
 1   product_name            object 
 2   aisle_id                int8   
 3   department_id           int8   
 4   prices                  float16
 5   order_id                int32  
 6   user_id                 int32  
 7   order_number            int8   
 8   order_dow               int8   
 9   order_hour_of_day       int8   
 10  days_since_prior_order  float16
 11  add_to_cart_order       int32  
 12  reordered               int8   
 13  first_name              object 
 14  last_name               object 
 15  gender                  object 
 16  state                   object 
 17  age                     int8   
 18  date_joined             object 
 19  n_dependents            int8   
 20  fam_status              object 
 21  income                  int32

# 02. Create new columns

### Derive price_range column using .loc

In [7]:
df_ords_prods.loc[df_ords_prods['prices'] > 15, 'price_range'] = 'High-range product'

In [8]:
df_ords_prods.loc[(df_ords_prods['prices'] <= 15) & (df_ords_prods['prices'] > 5), 'price_range'] = 'Mid-range product'

In [9]:
df_ords_prods.loc[df_ords_prods['prices'] <= 5, 'price_range'] = 'Low-range product'

In [10]:
df_ords_prods['price_range'].value_counts(dropna=False)

price_range
Mid-range product     21860860
Low-range product     10126321
High-range product      412551
NaN                       5127
Name: count, dtype: int64

### Derive busiest_day column using for loop

In [12]:
result = []

for value in df_ords_prods["order_dow"]:
  if value in [0]:
    result.append("Busiest day")
  elif value in [4]:
    result.append("Least busy day")
  else:
    result.append("Regularly busy day")

df_ords_prods['busiest_day'] = result
df_ords_prods['busiest_day'].value_counts(dropna=False)

busiest_day
Regularly busy day    22416875
Busiest day            6204182
Least busy day         3783802
Name: count, dtype: int64

### Derive busiest_days column using .loc

In [14]:
df_ords_prods.loc[df_ords_prods['order_dow'].isin([0,1]), 'busiest_days_loc'] = 'Busiest days'

In [15]:
df_ords_prods.loc[df_ords_prods['order_dow'].isin([2,5,6]), 'busiest_days_loc'] = 'Regularly busy'

In [16]:
df_ords_prods.loc[df_ords_prods['order_dow'].isin([3,4]), 'busiest_days_loc'] = 'Least busy days'

In [17]:
# Print frequency table for busiest_days_loc
df_ords_prods['busiest_days_loc'].value_counts(dropna=False)

busiest_days_loc
Regularly busy     12916111
Busiest days       11864412
Least busy days     7624336
Name: count, dtype: int64

#### "Busiest days" count is just under double the previous "busiest day" count
#### "Least busy days" count is just over double previous "least busy day" count
#### All as expected.

### Create busiest_period_of_day column

In [20]:
# Print frequency table for order_hour_of_day
df_ords_prods['order_hour_of_day'].value_counts(dropna=False)

order_hour_of_day
10    2761760
11    2736140
14    2689136
15    2662144
13    2660954
12    2618532
16    2535202
9     2454203
17    2087654
8     1718118
18    1636502
19    1258305
20     976156
7      891054
21     795637
22     634225
23     402316
6      290493
0      218769
1      115700
5       87961
2       69375
4       53242
3       51281
Name: count, dtype: int64

In [21]:
# Define busiest and least busy periods

orders_by_hour = df_ords_prods['order_hour_of_day'].value_counts(dropna=False)

busy_hours = orders_by_hour[orders_by_hour>np.percentile(orders_by_hour, 75)].index
regular_hours = orders_by_hour[(orders_by_hour>np.percentile(orders_by_hour, 25)) & (orders_by_hour<=np.percentile(orders_by_hour, 75))].index
slow_hours = orders_by_hour[orders_by_hour<=np.percentile(orders_by_hour, 25)].index

print(busy_hours)
print(regular_hours)
print(slow_hours)

Index([10, 11, 14, 15, 13, 12], dtype='int8', name='order_hour_of_day')
Index([16, 9, 17, 8, 18, 19, 20, 7, 21, 22, 23, 6], dtype='int8', name='order_hour_of_day')
Index([0, 1, 5, 2, 4, 3], dtype='int8', name='order_hour_of_day')


### Create busiest_period_of_day column using .loc

In [23]:
df_ords_prods.loc[df_ords_prods['order_hour_of_day'].isin(busy_hours), 'busiest_period_of_day'] = 'Most orders'

In [24]:
df_ords_prods.loc[df_ords_prods['order_hour_of_day'].isin(regular_hours), 'busiest_period_of_day'] = 'Average orders'

In [25]:
df_ords_prods.loc[df_ords_prods['order_hour_of_day'].isin(slow_hours), 'busiest_period_of_day'] = 'Fewest orders'

In [26]:
# Print frequency table for busiest_period_of_day
print(df_ords_prods['busiest_period_of_day'].value_counts(dropna=False))

busiest_period_of_day
Most orders       16128666
Average orders    15679865
Fewest orders       596328
Name: count, dtype: int64


# 03. Check and export DataFrame

In [28]:
# Check updated DataFrame
print(df_ords_prods.head())

   product_id                product_name  aisle_id  department_id    prices  \
0           1  Chocolate Sandwich Cookies        61             19  5.800781   
1           1  Chocolate Sandwich Cookies        61             19  5.800781   
2           1  Chocolate Sandwich Cookies        61             19  5.800781   
3           1  Chocolate Sandwich Cookies        61             19  5.800781   
4           1  Chocolate Sandwich Cookies        61             19  5.800781   

   order_id  user_id  order_number  order_dow  order_hour_of_day  ...  \
0   3139998      138            28          6                 11  ...   
1   1977647      138            30          6                 17  ...   
2    389851      709             2          0                 21  ...   
3    652770      764             1          3                 13  ...   
4   1813452      764             3          4                 17  ...   

       state  age  date_joined n_dependents fam_status  income  \
0  Minnesota  

In [48]:
print(df_ords_prods.info())
print(df_ords_prods.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32404859 entries, 0 to 32404858
Data columns (total 26 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   product_id              int32  
 1   product_name            object 
 2   aisle_id                int8   
 3   department_id           int8   
 4   prices                  float16
 5   order_id                int32  
 6   user_id                 int32  
 7   order_number            int8   
 8   order_dow               int8   
 9   order_hour_of_day       int8   
 10  days_since_prior_order  float16
 11  add_to_cart_order       int32  
 12  reordered               int8   
 13  first_name              object 
 14  last_name               object 
 15  gender                  object 
 16  state                   object 
 17  age                     int8   
 18  date_joined             object 
 19  n_dependents            int8   
 20  fam_status              object 
 21  income                  int32

  return dtype.type(n)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  the_mean = the_sum / count if count > 0 else np.nan
  return dtype.type(n)
  return dtype.type(n)
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  the_mean = the_sum / count if count > 0 else np.nan
  return dtype.type(n)


         product_id      aisle_id  department_id        prices      order_id  \
count  3.240486e+07  3.240486e+07   3.240486e+07  3.239973e+07  3.240486e+07   
mean   2.559866e+04  6.449111e+01   9.919792e+00           NaN  1.710745e+06   
std    1.408400e+04  4.843815e+01   6.281485e+00  0.000000e+00  9.872988e+05   
min    1.000000e+00 -1.280000e+02   1.000000e+00  1.000000e+00  2.000000e+00   
25%    1.354400e+04  2.400000e+01   4.000000e+00  4.199219e+00  8.559470e+05   
50%    2.530200e+04  7.900000e+01   9.000000e+00  7.398438e+00  1.711049e+06   
75%    3.794700e+04  1.000000e+02   1.600000e+01  1.129688e+01  2.565499e+06   
max    4.968800e+04  1.270000e+02   2.100000e+01  2.500000e+01  3.421083e+06   

            user_id  order_number     order_dow  order_hour_of_day  \
count  3.240486e+07  3.240486e+07  3.240486e+07       3.240486e+07   
mean   1.029372e+05  1.714230e+01  2.738867e+00       1.342515e+01   
std    5.946610e+04  1.753532e+01  2.090077e+00       4.246380e+00   

In [403]:
# Export data to pkl
df_ords_prods.drop(['busiest_day'], axis=1, inplace=True)
df_ords_prods.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'ords_prods_added_cols.pkl'))