### Python Fundamentals for Data Analysts

# 4.7: Deriving New Variables

### Content

1. Create price_range label to categorize prices into High, Mid, and Low range.


In [1]:
# Import libraries
import numpy as np
import pandas as pd
import os
from datetime import datetime

In [2]:
# Import ord_prod_combined data

# Set path to access data files & Load data file
path = r'C:\Users\tsoew\OneDrive\Desktop\InstaCart Basket Analysis'

# Loading orders_products_combined_ver2.pkl
df_combined = pd.read_pickle(os.path.join(path, 'Data', 'Prepared Data', 'ord_prod_combined_2.pkl'))

In [3]:
# Print the head to make sure all columns are imported properly
df_combined.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered,_merge
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,3.0,5,0,both
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,20.0,1,1,both
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,6.0,20,0,both
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,,10,0,both
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,9.0,11,1,both


In [4]:
# Check the shape of the data
df_combined.shape

(32404859, 14)

### Step 1: Create Flag to identify Price Range for each product

In [5]:
# Separating prices into High, Mid, and Low range

def price_label(row):

    if row['prices'] <= 5:
        return 'Low-range'
    elif (row['prices'] > 5) and (row['prices'] <= 15):
        return 'Mid-range'
    elif row['prices'] > 15:
        return 'High-range'
    else: return 'Not enough data'
    

In [6]:
# Apply the price_label list to new price_range column

print ('Start time: ' + datetime.now().strftime("%H:%M:%S"))
df_combined['price_range'] = df_combined.apply(price_label, axis=1)
print ('End time: ' + datetime.now().strftime("%H:%M:%S"))

Start time: 12:11:40
End time: 12:17:27


In [7]:
# Check the data frequency of 'price_range'
df_combined['price_range'].value_counts()

price_range
Mid-range     21860860
Low-range     10126321
High-range      417678
Name: count, dtype: int64

In [8]:
df_combined.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered
count,32404860.0,32404860.0,32404860.0,32404860.0,32404860.0,32404860.0,32404860.0,32404860.0,32404860.0,30328760.0,32404860.0,32404860.0
mean,25598.66,71.19612,9.919792,11.98023,1710745.0,102937.2,17.1423,2.738867,13.42515,11.10408,8.352547,0.5895873
std,14084.0,38.21139,6.281485,495.6554,987298.8,59466.1,17.53532,2.090077,4.24638,8.779064,7.127071,0.4919087
min,1.0,1.0,1.0,1.0,2.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
25%,13544.0,31.0,4.0,4.2,855947.0,51422.0,5.0,1.0,10.0,5.0,3.0,0.0
50%,25302.0,83.0,9.0,7.4,1711049.0,102616.0,11.0,3.0,13.0,8.0,6.0,1.0
75%,37947.0,107.0,16.0,11.3,2565499.0,154389.0,24.0,5.0,16.0,15.0,11.0,1.0
max,49688.0,134.0,21.0,99999.0,3421083.0,206209.0,99.0,6.0,23.0,30.0,145.0,1.0


In [9]:
df_combined.shape

(32404859, 15)

In [10]:
df_combined.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered,_merge,price_range
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,3.0,5,0,both,Mid-range
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,20.0,1,1,both,Mid-range
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,6.0,20,0,both,Mid-range
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,,10,0,both,Mid-range
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,9.0,11,1,both,Mid-range


### Step 2: Another way to create Price Range flag using loc function

In [13]:
# Using loc function will replace if-then statement

# Timing the process
print ('Start time: ' + datetime.now().strftime("%H:%M:%S"))

df_combined.loc[df_combined['prices'] > 15, 'price_range_loc'] = 'High-range'
df_combined.loc[(df_combined['prices'] <= 15) & (df_combined['prices'] > 5), 'price_range_loc'] = 'Mid-range' 
df_combined.loc[df_combined['prices'] <= 5, 'price_range_loc'] = 'Low-range'

print ('End time: ' + datetime.now().strftime("%H:%M:%S"))

Start time: 12:23:49
End time: 12:23:50


In [14]:
# Print the first five rows to make sure the new column is added properly
df_combined.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered,_merge,price_range,price_range_loc
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,3.0,5,0,both,Mid-range,Mid-range
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,20.0,1,1,both,Mid-range,Mid-range
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,6.0,20,0,both,Mid-range,Mid-range
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,,10,0,both,Mid-range,Mid-range
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,9.0,11,1,both,Mid-range,Mid-range


In [15]:
# Checking the data frequency in the new column

df_combined['price_range_loc'].value_counts()

price_range_loc
Mid-range     21860860
Low-range     10126321
High-range      417678
Name: count, dtype: int64

### Step 3: Create Flag to identify the busiest day of customer orders

In [16]:
# Checking the frequency of number of orders in day of the week
# 0 = Saturday, 1 = Sunday, etc
df_combined['orders_day_of_week'].value_counts() 

orders_day_of_week
0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: count, dtype: int64

In [18]:
# Running the For Loop to fill result with the new label

result = []

for value in df_combined["orders_day_of_week"]:
  if value == 0:
    result.append("Busiest day")
  elif value == 4:
    result.append("Least busy")
  else:
    result.append("Regularly busy")

In [19]:
# Appending result to a new column named busiest_day

# Timing the process
print ('Start time: ' + datetime.now().strftime("%H:%M:%S"))
df_combined['busiest_day'] = result
print ('End time: ' + datetime.now().strftime("%H:%M:%S"))

Start time: 12:30:04
End time: 12:30:05


In [20]:
df_combined.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered,_merge,price_range,price_range_loc,busiest_day
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,3.0,5,0,both,Mid-range,Mid-range,Regularly busy
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,20.0,1,1,both,Mid-range,Mid-range,Regularly busy
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,6.0,20,0,both,Mid-range,Mid-range,Busiest day
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,,10,0,both,Mid-range,Mid-range,Regularly busy
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,9.0,11,1,both,Mid-range,Mid-range,Least busy


### Step 4: Another flag to identify most 2 busiest days and 2 least busy days

In [22]:
# Assigning new variable for the new task

result2 = []

for value in df_combined["orders_day_of_week"]:
  if (value == 0) or (value==1):
    result2.append("Busiest days")
  elif (value == 4) or (value==3):
    result2.append("Least busy")
  else:
    result2.append("Regularly busy")


In [23]:
# Appending result2 to a new column named two_busiest_days, and timing the process

print ('Start time: ' + datetime.now().strftime("%H:%M:%S"))  #Start time
df_combined['two_busiest_days'] = result2
print ('End time: ' + datetime.now().strftime("%H:%M:%S"))   #Finish time

Start time: 12:35:27
End time: 12:35:28


In [24]:
# Print the first 10 rows to check result

df_combined.head(10)

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered,_merge,price_range,price_range_loc,busiest_day,two_busiest_days
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,3.0,5,0,both,Mid-range,Mid-range,Regularly busy,Regularly busy
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,20.0,1,1,both,Mid-range,Mid-range,Regularly busy,Regularly busy
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,6.0,20,0,both,Mid-range,Mid-range,Busiest day,Busiest days
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,,10,0,both,Mid-range,Mid-range,Regularly busy,Least busy
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,9.0,11,1,both,Mid-range,Mid-range,Least busy,Least busy
5,1,Chocolate Sandwich Cookies,61,19,5.8,1701441,777,16,1,7,26.0,7,0,both,Mid-range,Mid-range,Regularly busy,Busiest days
6,1,Chocolate Sandwich Cookies,61,19,5.8,1871483,825,3,2,14,30.0,2,0,both,Mid-range,Mid-range,Regularly busy,Regularly busy
7,1,Chocolate Sandwich Cookies,61,19,5.8,1290456,910,12,3,10,30.0,1,0,both,Mid-range,Mid-range,Regularly busy,Least busy
8,1,Chocolate Sandwich Cookies,61,19,5.8,369558,1052,10,1,20,19.0,1,0,both,Mid-range,Mid-range,Regularly busy,Busiest days
9,1,Chocolate Sandwich Cookies,61,19,5.8,589712,1052,15,1,12,15.0,2,1,both,Mid-range,Mid-range,Regularly busy,Busiest days


**Note**
The new column has been checked for accuracy. Whenever the value of order_dow is 0 or 1, 
the column two_busiest_days is marked 'Busiest Days'. And whenever the value of order_dow is
3 or 4, the column two_busiest_days is marked 'Least Busy'.


### Step 5: Flag to identify busiest hours (period) of the day

The senior technical officer at Instacart wants you to identify the busiest hours of the day. Rather than by hour, they want periods of time labeled “Most orders,” “Average orders,” and “Fewest orders.”

In [25]:
# Checking the frequency of orders in the hour of the day

df_combined['order_hour_of_day'].value_counts() 

order_hour_of_day
10    2761760
11    2736140
14    2689136
15    2662144
13    2660954
12    2618532
16    2535202
9     2454203
17    2087654
8     1718118
18    1636502
19    1258305
20     976156
7      891054
21     795637
22     634225
23     402316
6      290493
0      218769
1      115700
5       87961
2       69375
4       53242
3       51281
Name: count, dtype: int64

In [26]:
# Create array to represent the group of hours

most_orders = [10,11,14,15,13,12,16,9]
least_orders = [3,4,2,5,1,0,6]

# assign the outcome to result3 variable list
result3 = []

for value in df_combined["order_hour_of_day"]:
  if value in most_orders:
    result3.append("Most orders")
  elif value in least_orders:
    result3.append("Fewest orders")
  else:
    result3.append("Average orders")


In [30]:
# Appending result3 to the dataframe under a new column named 'busiest_period_of_day'
df_combined['busiest_period'] = result3

In [35]:
df_combined.drop('busiest_period_of_day', axis=1, inplace=True)

In [36]:
# Print the first ten rows to check result
df_combined.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered,_merge,price_range,price_range_loc,busiest_day,two_busiest_days,busiest_period
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,3.0,5,0,both,Mid-range,Mid-range,Regularly busy,Regularly busy,Most orders
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,20.0,1,1,both,Mid-range,Mid-range,Regularly busy,Regularly busy,Average orders
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,6.0,20,0,both,Mid-range,Mid-range,Busiest day,Busiest days,Average orders
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,,10,0,both,Mid-range,Mid-range,Regularly busy,Least busy,Most orders
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,9.0,11,1,both,Mid-range,Mid-range,Least busy,Least busy,Average orders


**Note** 
Now the set of hours with most orders has been assigned 'Most Orders' flag, the set hours with least orders with 'Least Orders' flag, and the rest with 'Average Orders'.

In [38]:
# Now export the updated dataframe to a pkl file, and name it orders_products_combined_ver3.pkl

print ('Start time: ' + datetime.now().strftime("%H:%M:%S"))
df_combined.to_pickle(os.path.join(path, 'Data','Prepared Data', 'ord_prod_combined_3.pkl'))
print ('End time: ' + datetime.now().strftime("%H:%M:%S"))

Start time: 12:43:14
End time: 12:43:38


# The End