# 4.7 Deriving New Variables 

### This script contains the following points 

1. User-defined Functions 
2. loc() Function
3. For-loops 
4. Task Responses
5. Exporting dataframe

In [1]:
# Import Libraries 

import pandas as pd 
import numpy as np 
import os 

In [2]:
# Create path 

path = r'/Users/tsique/Documents/Instacart Basket Analysis'

In [3]:
# Importing ords_prods_merged 

df_ords_prods_merged = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged.pkl'))

## 01. User-defined Functions 

In [4]:
# Create a subset

df = df_ords_prods_merged[:1000000]

In [5]:
# Verify shape

df.shape

(1000000, 14)

In [6]:
# Define a function 

def price_label(row):
    
    if row['prices']<=5:
        return 'Low range product'
    elif (row['prices']>5) and (row['prices']<=15):
        return 'Mid range product'
    elif row['prices']>15:
        return 'High range product'
    else: return np.nan

In [7]:
# Apply the function 

df['price_range'] = df.apply(price_label,axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price_range'] = df.apply(price_label,axis=1)


In [8]:
df['price_range'].value_counts(dropna = False)

Mid range product    631969
Low range product    368031
Name: price_range, dtype: int64

In [9]:
# Checking the max price

df['prices'].max()

14.0

## 02. loc() Function

### Due to the warning from the 'User-defined Function' loc() function will be used. 

In [10]:
# Adding High-range product to new derived column 'price_range_loc'

df_ords_prods_merged.loc[df_ords_prods_merged['prices'] > 15 , 'price_range_loc'] = 'High-range product'

In [11]:
# Adding Mid-range product to new derived column 'price_range_loc'

df_ords_prods_merged.loc[(df_ords_prods_merged['prices'] <= 15) & (df_ords_prods_merged['prices'] > 5), 'price_range_loc'] = 'Mid-range product'

In [12]:
# Adding Low-range product to new derived column 'price_range_loc'

df_ords_prods_merged.loc[df_ords_prods_merged['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

In [13]:
# Checking results for prince_range_loc

df_ords_prods_merged['price_range_loc'].value_counts(dropna = False)

Mid-range product     20462144
Low-range product      9476774
High-range product      389845
Name: price_range_loc, dtype: int64

In [14]:
# Checking output

df_ords_prods_merged

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range_loc
0,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both,Mid-range product
1,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both,Mid-range product
2,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both,Mid-range product
3,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both,Mid-range product
4,3367565,1,6,2,7,19.0,196,1,1,Soda,77,7,9.0,both,Mid-range product
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30328758,31526,202557,18,5,11,3.0,43553,2,1,Orange Energy Shots,64,7,3.7,both,Low-range product
30328759,2745165,203436,2,3,5,15.0,42338,16,1,"Zucchini Chips, Pesto",50,19,6.9,both,Mid-range product
30328760,850996,204229,12,2,3,25.0,37595,20,0,Dead Sea Minerals Eucalyptus Triple Milled Soap,25,11,13.5,both,Mid-range product
30328761,2550789,204472,6,3,15,7.0,37595,9,0,Dead Sea Minerals Eucalyptus Triple Milled Soap,25,11,13.5,both,Mid-range product


## 03. For-Loops

In [15]:
# Checking the frequency of orders by day of week

df_ords_prods_merged['orders_day_of_week'].value_counts(dropna = False)

0    5779087
1    5303718
6    4190948
5    3952326
2    3947564
3    3600589
4    3554531
Name: orders_day_of_week, dtype: int64

In [16]:
# Creating Busiest, Least, and Regularly busy days 

result = []

for value in df_ords_prods_merged["orders_day_of_week"]:
    if value == 0:
        result.append("Busiest day")
    elif value == 4:
        result.append("Least busy")
    else:
        result.append("Regularly busy")

In [17]:
result

['Regularly busy',
 'Regularly busy',
 'Least busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Regularly busy',
 'Busiest day',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Reg

In [18]:
# Creating a new column 'busiest_day' for result

df_ords_prods_merged['busiest_day'] = result

In [19]:
# Checking results for busiest_day column

df_ords_prods_merged['busiest_day'].value_counts(dropna = False)

Regularly busy    20995145
Busiest day        5779087
Least busy         3554531
Name: busiest_day, dtype: int64

In [20]:
# Checking output for busiest_day column

df_ords_prods_merged

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range_loc,busiest_day
0,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly busy
1,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly busy
2,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least busy
3,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least busy
4,3367565,1,6,2,7,19.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly busy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30328758,31526,202557,18,5,11,3.0,43553,2,1,Orange Energy Shots,64,7,3.7,both,Low-range product,Regularly busy
30328759,2745165,203436,2,3,5,15.0,42338,16,1,"Zucchini Chips, Pesto",50,19,6.9,both,Mid-range product,Regularly busy
30328760,850996,204229,12,2,3,25.0,37595,20,0,Dead Sea Minerals Eucalyptus Triple Milled Soap,25,11,13.5,both,Mid-range product,Regularly busy
30328761,2550789,204472,6,3,15,7.0,37595,9,0,Dead Sea Minerals Eucalyptus Triple Milled Soap,25,11,13.5,both,Mid-range product,Regularly busy


## 04. Task Responses

Suppose your clients have changed their minds about the labels you created in your “busiest_day” column. Now, they want “Busiest day” to become “Busiest days” (plural). This label should correspond with the two busiest days of the week as opposed to the single busiest day. At the same time, they’d also like to know the two slowest days.

In [21]:
# Changing labels for busy days of week: Busiest day to Busiest days (2 days) and two slowest days 

result = []

for value in df_ords_prods_merged["orders_day_of_week"]:
    if value == 0:
        result.append("Busiest days")
    elif value == 1:
        result.append("Busiest days")
    elif value == 4:
        result.append("Least busy")
    elif value == 3:
        result.append("Least busy")
    else:
        result.append("Regularly busy")

In [22]:
# Replacing busiest_day column with newly created result

df_ords_prods_merged['busiest_day'] = result

In [23]:
# Checking results for busiest_day

df_ords_prods_merged['busiest_day'].value_counts(dropna = False)

Regularly busy    12090838
Busiest days      11082805
Least busy         7155120
Name: busiest_day, dtype: int64

In [24]:
# Checking output

df_ords_prods_merged

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range_loc,busiest_day
0,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least busy
1,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least busy
2,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least busy
3,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least busy
4,3367565,1,6,2,7,19.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly busy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30328758,31526,202557,18,5,11,3.0,43553,2,1,Orange Energy Shots,64,7,3.7,both,Low-range product,Regularly busy
30328759,2745165,203436,2,3,5,15.0,42338,16,1,"Zucchini Chips, Pesto",50,19,6.9,both,Mid-range product,Least busy
30328760,850996,204229,12,2,3,25.0,37595,20,0,Dead Sea Minerals Eucalyptus Triple Milled Soap,25,11,13.5,both,Mid-range product,Regularly busy
30328761,2550789,204472,6,3,15,7.0,37595,9,0,Dead Sea Minerals Eucalyptus Triple Milled Soap,25,11,13.5,both,Mid-range product,Least busy


When too many users make Instacart orders at the same time, the app freezes. The senior technical officer at Instacart wants you to identify the busiest hours of the day

In [25]:
# Frequency of orders by hour 

df_ords_prods_merged['order_hour_of_day'].value_counts(dropna = False)

10    2593725
11    2564597
14    2517238
15    2487586
13    2487500
12    2445841
16    2364969
9     2311334
17    1943858
8     1622394
18    1520954
19    1169224
20     910005
7      844665
21     746254
22     592432
23     375889
6      274801
0      203460
1      108110
5       82706
2       63961
4       49400
3       47860
Name: order_hour_of_day, dtype: int64

In [26]:
# Creating periods of time labels: Most orders, Average orders, and Fewest orders 

result = []

for value in df_ords_prods_merged["order_hour_of_day"]:
    if value <= 9 and value <= 16: 
        result.append ("Most orders")
    elif value <= 17 and value <= 22: 
        result.append ("Average orders")
    elif value == 7:
        result.append ("Average orders")
    elif value == 8:
        result.append ("Average orders")
    else:
        result.append ("Fewest orders")

In [27]:
# Creating new column 'busiest_period_of_day' with result

df_ords_prods_merged['busiest_period_of_day'] = result

In [29]:
# Checking that column was created

df_ords_prods_merged.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range_loc,busiest_day,busiest_period_of_day
0,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least busy,Most orders
1,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least busy,Average orders
2,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least busy,Most orders
3,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Least busy,Average orders
4,3367565,1,6,2,7,19.0,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Most orders


In [30]:
# Frequency of orders

df_ords_prods_merged['busiest_period_of_day'].value_counts(dropna = False)

Average orders    19405314
Most orders        5608691
Fewest orders      5314758
Name: busiest_period_of_day, dtype: int64

## 05. Exporting dataframe 

In [31]:
# Export data to pickle 

df_ords_prods_merged.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_variables.pkl'))