## 01. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os

## 02. Import Data Sets

In [2]:
path = r'D:\02.2022_Instacart Basket Analysis'

In [3]:
ords_prods_merge = pd.read_pickle(os.path.join(path,'02 Data','Prepared Data','order_product_merge_f.pkl'))

In [4]:
ords_prods_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32399732 entries, 0 to 32435058
Data columns (total 15 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   order_id                float64 
 1   user_id                 float64 
 2   order_number            float64 
 3   orders_day_of_week      float64 
 4   order_time_of_day       float64 
 5   days_since_prior_order  float16 
 6   first_order             object  
 7   product_id              float64 
 8   add_to_cart_order       float64 
 9   reordered               float64 
 10  product_name            object  
 11  aisle_id                float64 
 12  department_id           float64 
 13  prices                  float16 
 14  _merge                  category
dtypes: category(1), float16(2), float64(10), object(2)
memory usage: 3.3+ GB


In [43]:
ords_prods_merge.shape

(32399732, 15)

In [41]:
ords_prods_merge.columns

Index(['order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'order_time_of_day', 'days_since_prior_order', 'first_order',
       'product_id', 'add_to_cart_order', 'reordered', 'product_name',
       'aisle_id', 'department_id', 'prices', '_merge'],
      dtype='object')

In [44]:
ords_prods_merge = ords_prods_merge.drop(columns = ['_merge'])

In [45]:
ords_prods_merge.shape

(32399732, 14)

## 03. Exercise Walk through

### If-Statements with User-Defined Functions

In [46]:
# Create a subset
df = ords_prods_merge[:1000000]

In [47]:
df.shape

(1000000, 14)

In [48]:
# User-defined function: categorize prices in different ranges
def price_label(row):
    
    if row['prices'] <= 5:
        return 'Low-range product'
    elif(row['prices'] > 5) and (row['prices'] <= 15):
        return 'Mid-range product'
    elif(row['price'] > 15):
        return 'High-range product'
    else: return 'Not enough data'

In [49]:
# Assign the column to return value
df['price_range'] = df.apply(price_label, axis = 1) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price_range'] = df.apply(price_label, axis = 1)


### If-Statements with the loc() Functions

In [51]:
# Loc() function: categorize prices in different ranges
df.loc[df['prices'] > 15,'price_range_loc'] = 'High-range product'
df.loc[(df['prices'] <= 15) & (df['prices'] > 5), 'price_range_loc'] = 'Mid-range product'
df.loc[df['prices'] <= 5, 'price_range_loc'] = 'low-range product'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [52]:
# Check the result of loc functions 
df['price_range_loc'].value_counts(dropna = False)

Mid-range product    756450
low-range product    243550
Name: price_range_loc, dtype: int64

### If-Statements with For-Loops

In [57]:
# Check the columns of ords_prods_merge
ords_prods_merge.columns

Index(['order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'order_time_of_day', 'days_since_prior_order', 'first_order',
       'product_id', 'add_to_cart_order', 'reordered', 'product_name',
       'aisle_id', 'department_id', 'prices'],
      dtype='object')

In [58]:
# Check the frequency of ordering days
ords_prods_merge['orders_day_of_week'].value_counts(dropna = False)

0.0    6203329
1.0    5659298
6.0    4495887
2.0    4213105
5.0    4205076
3.0    3839865
4.0    3783172
Name: orders_day_of_week, dtype: int64

In [59]:
# For-Loops function: categorize ordering days based on their frequency and calculate the operating time

%%time

result = []

for value in ords_prods_merge["orders_day_of_week"]:
  if value == 0:
    result.append("Busiest day")
  elif value == 4:
    result.append("Least busy")
  else:
    result.append("Regularly busy")
    
ords_prods_merge['busiest_day'] = result  

Wall time: 12.4 s


In [62]:
# Check the result of for-loops function
ords_prods_merge['busiest_day'].value_counts(dropna = False)

Regularly busy    22413231
Busiest day        6203329
Least busy         3783172
Name: busiest_day, dtype: int64

## 04. Answer Questions

### (1.) Suppose your clients have changed their minds about the labels you created in your “busiest_day” column. Now, they want “Busiest day” to become “Busiest days” (plural). This label should correspond with the two busiest days of the week as opposed to the single busiest day. At the same time, they’d also like to know the two slowest days. Create a new column for this using a suitable method.

In [63]:
# For-Loops function: categorize ordering days based on their frequency 
result= []

for value in ords_prods_merge['orders_day_of_week']:
    if value == 0:
        result.append('Busiest days')
    elif value == 1:
        result.append('Busiest days')
    elif value == 3:
        result.append('Least busy days')
    elif value == 4:
        result.append('Least busy days')
    else:
        result.append('Regularly busy') 
        
ords_prods_merge['busiest_days'] = result

In [65]:
# Check the result of for-loops function
ords_prods_merge['busiest_days'].value_counts(dropna =False)

Regularly busy     12914068
Busiest days       11862627
Least busy days     7623037
Name: busiest_days, dtype: int64

In [66]:
ords_prods_merge.columns

Index(['order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'order_time_of_day', 'days_since_prior_order', 'first_order',
       'product_id', 'add_to_cart_order', 'reordered', 'product_name',
       'aisle_id', 'department_id', 'prices', 'busiest_day', 'busiest_days'],
      dtype='object')

In [67]:
ords_prods_merge.shape

(32399732, 16)

### (2.) Check the values of this new column for accuracy. Note any observations in markdown format.

Busiest days = Busiest day + 1 order_day_of_week
Least busy days = Least busy day + 3 orders_day_of_week
Therefore, the values of this new column are correct.

### (3.) When too many users make Instacart orders at the same time, the app freezes. The senior technical officer at Instacart wants you to identify the busiest hours of the day. Rather than by hour, they want periods of time labeled “Most orders,” “Average orders,” and “Fewest orders.” Create a new column containing these labels called “busiest_period_of_day.”

In [70]:
# Check the frequency of ordering time
ords_prods_merge['order_time_of_day'].value_counts(dropna = False)

10.0    2761333
11.0    2735694
14.0    2688728
15.0    2661718
13.0    2660570
12.0    2618104
16.0    2534744
9.0     2453842
17.0    2087273
8.0     1717863
18.0    1636226
19.0    1258076
20.0     976000
7.0      890923
21.0     795528
22.0     634159
23.0     402272
6.0      290450
0.0      218742
1.0      115683
5.0       87944
2.0       69360
4.0       53232
3.0       51268
Name: order_time_of_day, dtype: int64

In [71]:
# For-Loops function: categorize ordering times based on their frequency 
o_result= []

for value in ords_prods_merge['order_time_of_day']:
    if  10 <= value <=16:
        o_result.append('Most orders') 
    elif value <=5:
        o_result.append('Fewest orders')
    else:
        o_result.append('Average orders')
        
ords_prods_merge['busiest_period_of_day'] = o_result

In [73]:
ords_prods_merge.columns

Index(['order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'order_time_of_day', 'days_since_prior_order', 'first_order',
       'product_id', 'add_to_cart_order', 'reordered', 'product_name',
       'aisle_id', 'department_id', 'prices', 'busiest_day', 'busiest_days',
       'busiest_period_of_day'],
      dtype='object')

In [74]:
ords_prods_merge.shape

(32399732, 17)

### (4.) Print the frequency for this new column.

In [75]:
ords_prods_merge['busiest_period_of_day'].value_counts(dropna = False)

Most orders       18660891
Average orders    13142612
Fewest orders       596229
Name: busiest_period_of_day, dtype: int64

### (5.) Export your dataframe as a pickle file (since you added new columns) and store it correctly in your “Prepared Data” folder.

In [76]:
ords_prods_merge.to_pickle(os.path.join(path,'02 Data','Prepared Data','Aftermerge_busiest_dayandtime.pkl'))