# 3.1 IC Deriving New Variables

## Contents
### Import libraries and data sets
### Create subset with 1 million rows
### User defined functions
### Using loc() function
### Loc() derivation on entire data set
### If statements with for loops
### Deriving the "busiest_days" variable
### Identify busiest hours of the day
### Export as pickle

## Import libraries and data sets

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# path to project folder
path = r'/Users/susanwang/Documents/CF_Tasks/Instacart Basket Analysis'

In [3]:
# import ords_prods_merge pickle
ords_prods_merge = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_merge.pkl'))

In [4]:
# check data
ords_prods_merge.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_the_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge
0,2539329,1,1,2,8,,True,196,1,0,Soda,77,7,9.0,both
1,2539329,1,1,2,8,,True,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91,16,12.5,both
2,2539329,1,1,2,8,,True,12427,3,0,Original Beef Jerky,23,19,4.4,both
3,2539329,1,1,2,8,,True,26088,4,0,Aged White Cheddar Popcorn,23,19,4.7,both
4,2539329,1,1,2,8,,True,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54,17,1.0,both


In [5]:
ords_prods_merge.shape

(32404859, 15)

## Create a subset with 1 million rows

In [6]:
df = ords_prods_merge[:1000000]

In [7]:
df.shape

(1000000, 15)

## User-Defined Functions

### We want to sort products into price ranges by flagging them.

In [8]:
# define function
def price_label(row):
    if row['prices'] <= 5:
        return 'Low-range product'
    elif (row['prices'] > 5) and (row['prices'] <= 15):
        return 'Mid-range product'
    elif row['prices'] > 15:
        return 'High-range product'
    else:
        return 'Not enough data'

In [9]:
df['price_range'] = df.apply(price_label, axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price_range'] = df.apply(price_label, axis = 1)


In [10]:
# check value counts
df['price_range'].value_counts(dropna=False)

price_range
Mid-range product     673183
Low-range product     314392
High-range product     12425
Name: count, dtype: int64

## Using loc() Function

In [11]:
df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-range product'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-range product'


In [12]:
df.loc[(df['prices'] <= 15) & (df['prices'] > 5), 'price_range_loc'] = 'Mid-range product'

In [13]:
df.loc[df['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

In [14]:
df['price_range_loc'].value_counts(dropna=False)

price_range_loc
Mid-range product     673183
Low-range product     314392
High-range product     12425
Name: count, dtype: int64

## Loc() Derivation on entire dataset

In [20]:
ords_prods_merge.loc[ords_prods_merge['prices'] <= 5, 'price_range'] = 'Low-range product'

In [21]:
ords_prods_merge.loc[(ords_prods_merge['prices'] <= 15) & (ords_prods_merge['prices'] > 5), 'price_range'] = 'Mid-range product'

In [22]:
ords_prods_merge.loc[ords_prods_merge['prices'] > 15, 'price_range'] = 'High-range product'

In [23]:
# check counts
price_count = ords_prods_merge['price_range'].value_counts(dropna=False)

In [24]:
price_count

price_range
Mid-range product     21860860
Low-range product     10126321
High-range product      417678
Name: count, dtype: int64

In [25]:
# copy frequency table to excel report
price_count.to_clipboard()

## If Statements with For Loops

### We want to summarize how busy each day of the week is.

In [26]:
# view frequency
ords_prods_merge['orders_day_of_the_week'].value_counts(dropna=False)

orders_day_of_the_week
0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: count, dtype: int64

In [27]:
# loop through column and append to a list
result = []

for value in ords_prods_merge["orders_day_of_the_week"]:
  if value == 0:
    result.append("Busiest day")
  elif value == 4:
    result.append("Least busy")
  else:
    result.append("Regularly busy")

In [33]:
result

In [29]:
# set a new column to list
ords_prods_merge['busiest_day'] = result

In [30]:
# check value counts
bus_day_check = ords_prods_merge['busiest_day'].value_counts(dropna=False)

In [31]:
bus_day_check

busiest_day
Regularly busy    22416875
Busiest day        6204182
Least busy         3783802
Name: count, dtype: int64

In [32]:
# copy frequency table to excel report
bus_day_check.to_clipboard()

## Deriving the "Busiest days" variable

In [76]:
# view frequency of days in the df
ords_prods_merge['orders_day_of_the_week'].value_counts(dropna=False)

orders_day_of_the_week
0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: count, dtype: int64

In [34]:
# create list
busiest = []

for value in ords_prods_merge["orders_day_of_the_week"]:
  if value <= 1:
    busiest.append("Busiest days")
  elif value == 3:
    busiest.append("Slowest days")
  elif value == 4:
    busiest.append("Slowest days")
  else:
    busiest.append("Regular days")

In [35]:
# set list as new column
ords_prods_merge['busiest_days'] = busiest

In [36]:
# check value counts of new column
check_days = ords_prods_merge['busiest_days'].value_counts(dropna=False)

In [37]:
check_days

busiest_days
Regular days    12916111
Busiest days    11864412
Slowest days     7624336
Name: count, dtype: int64

In [38]:
# copy to excel report
check_days.to_clipboard()

In [39]:
# compare to original column
ords_prods_merge['orders_day_of_the_week'].value_counts(dropna=False)

orders_day_of_the_week
0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: count, dtype: int64

The results look accurate. The two busiest days have almost as many orders as three regular days.

## Identify busiest hours of the day

In [40]:
# view order_hour_of_day frequencies
ords_prods_merge['order_hour_of_day'].value_counts(dropna=False)

order_hour_of_day
10    2761760
11    2736140
14    2689136
15    2662144
13    2660954
12    2618532
16    2535202
9     2454203
17    2087654
8     1718118
18    1636502
19    1258305
20     976156
7      891054
21     795637
22     634225
23     402316
6      290493
0      218769
1      115700
5       87961
2       69375
4       53242
3       51281
Name: count, dtype: int64

### I will divide the hours of day into three periods. The 'Most orders' will be the top 8 hours by frequency. 'Average orders' will be the middle 8 hours. 'Fewest orders' will be the last third of the hours.

In [41]:
# create lists of hours in each category
high = [10, 11, 14, 15, 13, 12, 16, 9]
mid = [17, 8, 18, 19, 20, 7, 21, 22]
low = [23, 6, 0, 1, 5, 2, 4, 3]

In [42]:
print(len(high))
print(len(mid))
print(len(low))

8
8
8


### Use a subset to practice and test

In [84]:
sub_df = ords_prods_merge[:1000]

In [85]:
sub_df.loc[sub_df['order_hour_of_day'].isin(high), 'busiest_period_of_day'] = 'Most orders'

In [86]:
sub_df.loc[sub_df['order_hour_of_day'].isin(mid), 'busiest_period_of_day'] = 'Average orders'

In [87]:
sub_df.loc[sub_df['order_hour_of_day'].isin(low), 'busiest_period_of_day'] = 'Fewest orders'

In [88]:
sub_df['busiest_period_of_day'].value_counts(dropna=False)

busiest_period_of_day
Most orders       692
Average orders    274
Fewest orders      34
Name: count, dtype: int64

### Apply to whole data set

In [43]:
ords_prods_merge.loc[ords_prods_merge['order_hour_of_day'].isin(high), 'busiest_period_of_day'] = 'Most orders'

In [44]:
ords_prods_merge.loc[ords_prods_merge['order_hour_of_day'].isin(mid), 'busiest_period_of_day'] = 'Average orders'

In [45]:
ords_prods_merge.loc[ords_prods_merge['order_hour_of_day'].isin(low), 'busiest_period_of_day'] = 'Fewest orders'

In [46]:
# check value counts and compare
check_hrs = ords_prods_merge['busiest_period_of_day'].value_counts(dropna=False)

In [47]:
check_hrs

busiest_period_of_day
Most orders       21118071
Average orders     9997651
Fewest orders      1289137
Name: count, dtype: int64

In [48]:
# copy to excel report
check_hrs.to_clipboard()

In [49]:
ords_prods_merge.shape

(32404859, 19)

In [50]:
# view head to check new columns
ords_prods_merge.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_the_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range,busiest_day,busiest_days,busiest_period_of_day
0,2539329,1,1,2,8,,True,196,1,0,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Regular days,Average orders
1,2539329,1,1,2,8,,True,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91,16,12.5,both,Mid-range product,Regularly busy,Regular days,Average orders
2,2539329,1,1,2,8,,True,12427,3,0,Original Beef Jerky,23,19,4.4,both,Low-range product,Regularly busy,Regular days,Average orders
3,2539329,1,1,2,8,,True,26088,4,0,Aged White Cheddar Popcorn,23,19,4.7,both,Low-range product,Regularly busy,Regular days,Average orders
4,2539329,1,1,2,8,,True,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54,17,1.0,both,Low-range product,Regularly busy,Regular days,Average orders


## Export as pickle

In [51]:
ords_prods_merge.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_derive.pkl'))