# 4.7 DERIVING NEW VARIABLES

## CONTENTS:
1. Import libraries and data.
2. Check data.
3. Derive new variables and columns.
4. Export data set.

### 1. Import libraries and data

In [2]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [6]:
# Define the folder that the pickle files below are read from and written to.
# NOTE(review): absolute, machine-specific Windows path — adjust it before running on another machine.
path = r'C:\Users\susan\Documents\data analytics\Instacart Basket Analysis\02 Data\Prepared data'

In [29]:
# Load the ords_prods_merge data frame from its pickle file in the data folder
merged_pickle = os.path.join(path, 'ords_prods_merge.pkl')
ords_prods_merge = pd.read_pickle(merged_pickle)

### 2. Check data (and create a subset to use)

In [31]:
# Check the dimensions (rows, columns) of the imported data frame.
# The frame was loaded as `ords_prods_merge`; no `df_ords_prods_merge` is defined above.
ords_prods_merge.shape

(59, 15)

In [73]:
# Work on at most the first 1,000,000 rows to avoid memory issues.
# .copy() turns the slice into an independent frame, which avoids pandas warnings
# when new columns are added to it later on.
subset_rows = 1_000_000
df = ords_prods_merge.iloc[:subset_rows].copy()

### 3. Derive new variables and columns

In [75]:
# Define a function that assigns a price-range label to a single row.
def price_label(row):
    """Return the price-range label for one row.

    Args:
        row: A row-like object (e.g. a DataFrame row or a dict) that can be
            indexed with 'prices'.

    Returns:
        'Low-range product' for prices <= 5, 'Mid-range product' for prices
        above 5 and up to 15, 'High-range product' for prices above 15, and
        'Not enough data' when none of the comparisons hold (e.g. NaN).
    """
    price = row['prices']

    if price <= 5:
        return 'Low-range product'
    if 5 < price <= 15:
        return 'Mid-range product'
    if price > 15:
        # Same wording as the other labels and as the .loc-based labelling.
        return 'High-range product'
    # Reached only when every comparison is False, which is the case for NaN.
    return 'Not enough data'

In [77]:
# Label every row of the subset by calling price_label once per row
df['price_range'] = df.apply(price_label, axis='columns')

In [79]:
# Frequency of each label in the price_range column, missing values included
df['price_range'].value_counts(dropna=False)

price_range
Low-range product    32
Mid-range product    27
Name: count, dtype: int64

In [81]:
# Check the highest price in the subset (this returns the price itself, not the product)
df['prices'].max()

14.0

In [91]:
# Below the loc function is used to define the ranges

In [83]:
# Rows priced above 15 get the high-range label
high_price_mask = df['prices'] > 15
df.loc[high_price_mask, 'price_range_loc'] = 'High-range product'

In [85]:
# Rows priced above 5 and up to 15 get the mid-range label
mid_price_mask = (df['prices'] > 5) & (df['prices'] <= 15)
df.loc[mid_price_mask, 'price_range_loc'] = 'Mid-range product'

In [87]:
# Rows priced at 5 or below get the low-range label
low_price_mask = df['prices'] <= 5
df.loc[low_price_mask, 'price_range_loc'] = 'Low-range product'

In [89]:
# Count how often each label appears in the price_range_loc column filled with .loc above
# (price_range was already checked right after the apply step).
df['price_range_loc'].value_counts(dropna=False)

price_range
Low-range product    32
Mid-range product    27
Name: count, dtype: int64

In [93]:
# Run the loc function for the ords_prods_merge data frame

In [95]:
# Build the mask from ords_prods_merge itself: a mask taken from the df subset
# only lines up with the full frame while both have exactly the same index.
ords_prods_merge.loc[ords_prods_merge['prices'] > 15, 'price_range_loc'] = 'High-range product'

In [97]:
# Build the mask from ords_prods_merge itself: a mask taken from the df subset
# only lines up with the full frame while both have exactly the same index.
ords_prods_merge.loc[
    (ords_prods_merge['prices'] <= 15) & (ords_prods_merge['prices'] > 5),
    'price_range_loc',
] = 'Mid-range product'

In [99]:
# Build the mask from ords_prods_merge itself: a mask taken from the df subset
# only lines up with the full frame while both have exactly the same index.
ords_prods_merge.loc[ords_prods_merge['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

In [103]:
# Count how often each label appears in the price_range_loc column of ords_prods_merge,
# i.e. the column the .loc statements above have just filled.
# With dropna=False the counts add up to the number of rows in the frame.
ords_prods_merge['price_range_loc'].value_counts(dropna=False)

price_range
Low-range product    32
Mid-range product    27
Name: count, dtype: int64

In [109]:
# Frequency of each value in the orders_day_of_week column, missing values included
ords_prods_merge['orders_day_of_week'].value_counts(dropna=False)

orders_day_of_week
4    22
1    17
3    11
2     9
Name: count, dtype: int64

In [111]:
# Build one busiest-day label per row:
# day 0 -> "Busiest day", day 4 -> "Least busy", any other value -> "Regularly busy"
result = [
    "Busiest day" if day == 0 else "Least busy" if day == 4 else "Regularly busy"
    for day in ords_prods_merge["orders_day_of_week"]
]

In [113]:
# Display the full list of busiest-day labels (one entry per row of ords_prods_merge)
result

['Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Regularly busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy',
 'Least busy']

In [115]:
# Store the labels from the result list in a new ords_prods_merge column named 'busiest day'.
# The list was built by iterating over ords_prods_merge, so it has one label per row.
ords_prods_merge['busiest day'] = result

In [129]:
# Frequency of each label in the new column, missing values included.
# The counts add up to 59, the same total as in the other df.
ords_prods_merge['busiest day'].value_counts(dropna=False)

busiest day
Regularly busy    37
Least busy        22
Name: count, dtype: int64

In [149]:
# (Exercise 2) Build a "busiest days" label per row with membership tests:
# days 0 and 1 -> "Busiest days", days 6 and 7 -> "Slowest days", all other days -> "Regular days".
# NOTE(review): the earlier busiest-day cell treats day 4 as the least busy day, and the
# day values printed in this notebook never include 6 or 7 — confirm which days are slowest.
result = [
    "Busiest days" if day in (0, 1)
    else "Slowest days" if day in (6, 7)
    else "Regular days"
    for day in ords_prods_merge["orders_day_of_week"]
]

In [151]:
# (Exercise 3) Frequency of each orders_day_of_week value, to know which labels to expect
ords_prods_merge['orders_day_of_week'].value_counts(dropna=False)

orders_day_of_week
4    22
1    17
3    11
2     9
Name: count, dtype: int64

In [153]:
# Display the full list of "busiest days" labels (one entry per row of ords_prods_merge)
result

['Regular days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Regular days',
 'Regular days']

In [157]:
# Preview the first five rows (head() returns five rows by default)
ords_prods_merge.head()

Unnamed: 0,order_id,user_id,validation,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge,product_name,aisle_id,department_id,prices,price_range_loc,busiest day
0,2539329,1,prior,1,2,8,,196,1,0,both,Soda,77,7,9.0,Mid-range product,Regularly busy
1,2539329,1,prior,1,2,8,,14084,2,0,both,Organic Unsweetened Vanilla Almond Milk,91,16,12.5,Mid-range product,Regularly busy
2,2539329,1,prior,1,2,8,,12427,3,0,both,Original Beef Jerky,23,19,4.4,Low-range product,Regularly busy
3,2539329,1,prior,1,2,8,,26088,4,0,both,Aged White Cheddar Popcorn,23,19,4.7,Low-range product,Regularly busy
4,2539329,1,prior,1,2,8,,26405,5,0,both,XL Pick-A-Size Paper Towel Rolls,54,17,1.0,Low-range product,Regularly busy


In [161]:
# (Exercise 4) Frequency of each order_hour_of_day value, to know which labels to expect
ords_prods_merge['order_hour_of_day'].value_counts(dropna=False)

order_hour_of_day
7     15
8     14
15     8
14     6
16     6
12     5
9      5
Name: count, dtype: int64

In [167]:
# (Exercise 4) Build a "busiest hour of day" label per row:
# hours 7 and 8 -> "Most orders", hour 15 -> "Average orders", every other hour -> "Few orders"
result = [
    "Most orders" if hour in (7, 8)
    else "Average orders" if hour == 15
    else "Few orders"
    for hour in ords_prods_merge["order_hour_of_day"]
]

In [169]:
# Display the full list of hour-of-day labels (one entry per row of ords_prods_merge)
result

['Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Few orders',
 'Few orders',
 'Few orders',
 'Few orders',
 'Few orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Average orders',
 'Average orders',
 'Average orders',
 'Average orders',
 'Average orders',
 'Average orders',
 'Average orders',
 'Average orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Few orders',
 'Few orders',
 'Few orders',
 'Few orders',
 'Few orders',
 'Few orders',
 'Few orders',
 'Few orders',
 'Few orders',
 'Few orders',
 'Few orders',
 'Few orders',
 'Few orders',
 'Few orders',
 'Few orders',
 'Few orders',
 'Few orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders',
 'Most orders']

### 4. Export data 

In [172]:
# Export the data frame with the new columns to a pickle file in the data folder.
# The frame is named `ords_prods_merge`; no `df_ords_prods_merge` is defined above.
ords_prods_merge.to_pickle(os.path.join(path, 'ords_prods_new_columns.pkl'))