# 4.7 Deriving new variables

## This script contains the following points:

Create the "price_label" and "busiest_day" columns

Create a "busiest_days" column for the two busiest days

Check the values of this new column for accuracy

Create a "busiest_hours" column to identify the busiest hours of the day

Print the frequency for this new column

Ensure the notebook is clean structured

Export the dataframe as a pickle file

## 1. Create the "price_label" and "busiest_day" columns

In [27]:
# Import libraries

import pandas as pd
import numpy as np
import os

In [28]:
# Import data

path = r'C:\Users\admin\06-2024 Instacart Basket Analysis'
df_ords_prods_merge = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_merge.pkl'))

### 1-1. Create the "price_label" column

In [30]:
# Create a subset for the first one million row

df = df_ords_prods_merge[:1000000]

In [31]:
df.shape

(1000000, 19)

In [32]:
# Define a function

def price_label(row):

    if row['prices'] <= 5:
        return 'Low-range product'
    elif (row['prices'] > 5) and (row['prices'] <= 15):
        return 'Mid-range product'
    elif row['prices'] > 15:
        return 'High range'
    else: return 'Not enough data'

In [33]:
# Apply the function

df['price_range'] = df.apply(price_label, axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['price_range'] = df.apply(price_label, axis = 1)


In [34]:
df['price_range'].value_counts(dropna = False)

price_range
Mid-range product    652638
Low-range product    338018
High range             9344
Name: count, dtype: int64

## Note: An alternative way is to use loc() as following:

df.loc[df['prices'] > 15, 'price_range_loc'] = 'High-range product'

df.loc[(df['prices'] <= 15) & (df['prices'] > 5), 'price_range_loc'] = 'Mid-range product'

df.loc[df['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

In [37]:
# Apply the function on the entire dataframe using loc() (df_ords_prods_merge instead of df)

df_ords_prods_merge.loc[df_ords_prods_merge['prices'] > 15, 'price_range_loc'] = 'High-range product'
df_ords_prods_merge.loc[(df_ords_prods_merge['prices'] <= 15) & (df_ords_prods_merge['prices'] > 5), 'price_range_loc'] = 'Mid-range product'
df_ords_prods_merge.loc[df_ords_prods_merge['prices'] <= 5, 'price_range_loc'] = 'Low-range product'

In [38]:
df_ords_prods_merge['price_range_loc'].value_counts(dropna = False)

price_range_loc
Mid-range product     21860860
Low-range product     10126321
High-range product      417678
Name: count, dtype: int64

### 1-2. Create the "busiest day" column

In [40]:
# Print the frequency of the "orders_day_of_week"

df_ords_prods_merge['orders_day_of_week'].value_counts(dropna = False)

orders_day_of_week
0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: count, dtype: int64

In [41]:
# Create a for-loop

result = []

for value in df_ords_prods_merge['orders_day_of_week']:
    if value == 0:
        result.append('Busiest day')
    elif value == 4:
        result.append('Least busy')
    else:
        result.append('Regular busy')

In [42]:
result

['Regular busy',
 'Regular busy',
 'Busiest day',
 'Regular busy',
 'Least busy',
 'Regular busy',
 'Regular busy',
 'Regular busy',
 'Regular busy',
 'Regular busy',
 'Regular busy',
 'Regular busy',
 'Least busy',
 'Regular busy',
 'Regular busy',
 'Regular busy',
 'Regular busy',
 'Regular busy',
 'Regular busy',
 'Regular busy',
 'Regular busy',
 'Regular busy',
 'Regular busy',
 'Regular busy',
 'Regular busy',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Busiest day',
 'Least busy',
 'Regular busy',
 'Regular busy',
 'Busiest day',
 'Regular busy',
 'Regular busy',
 'Least busy',
 'Regular busy',
 'Busiest day',
 'Regular busy',
 'Busiest day',
 'Least busy',
 'Busiest day',
 'Regular busy',
 'Regular busy',
 'Regular busy',
 'Regular busy',
 'Regular busy',
 'Regular busy',
 'Regular busy',
 'Regular busy',
 'Regular busy',
 'Regular busy',
 'Regular busy',
 'Regular busy',
 'Busiest day',
 'Regular busy',
 'Regular busy',
 'Busiest day',
 'Least busy',
 'Re

In [43]:
df_ords_prods_merge['busiest_day'] = result

In [44]:
# Print the frequency of the "busiest_day"

df_ords_prods_merge['busiest_day'].value_counts(dropna = False)

busiest_day
Regular busy    22416875
Busiest day      6204182
Least busy       3783802
Name: count, dtype: int64

## 2. Create a "busiest_days" column for the two busiest days and the two slowest days

In [46]:
# Print the frequency of the "orders_day_of_week"

df_ords_prods_merge['orders_day_of_week'].value_counts(dropna = False)

orders_day_of_week
0    6204182
1    5660230
6    4496490
2    4213830
5    4205791
3    3840534
4    3783802
Name: count, dtype: int64

In [47]:
result = []

for value in df_ords_prods_merge["orders_day_of_week"]:
    if value == 0 or value == 1:
        result.append("Busiest days")
    elif value == 4 or value == 3:
        result.append("Least busy")
    else:
        result.append("Regular busy")

In [48]:
result

['Regular busy',
 'Regular busy',
 'Busiest days',
 'Least busy',
 'Least busy',
 'Busiest days',
 'Regular busy',
 'Least busy',
 'Busiest days',
 'Busiest days',
 'Regular busy',
 'Least busy',
 'Least busy',
 'Regular busy',
 'Least busy',
 'Regular busy',
 'Regular busy',
 'Regular busy',
 'Busiest days',
 'Busiest days',
 'Regular busy',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Least busy',
 'Regular busy',
 'Busiest days',
 'Busiest days',
 'Regular busy',
 'Regular busy',
 'Least busy',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Least busy',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Least busy',
 'Regular busy',
 'Regular busy',
 'Busiest days',
 'Busiest days',
 'Busiest days',
 'Least busy',
 'Regular busy',
 'Busiest days',
 'Regular busy',
 'Busiest days',
 'Busiest days',
 'Regular busy',
 'Busiest days',
 'Least busy',
 'Re

In [49]:
df_ords_prods_merge['busiest_days'] = result

## 3. Check the values of this new column for accuracy

In [51]:
# Print the frequency of the "busiest_day"

df_ords_prods_merge['busiest_days'].value_counts(dropna = False)

busiest_days
Regular busy    12916111
Busiest days    11864412
Least busy       7624336
Name: count, dtype: int64

In [52]:
12916111 + 11864412 + 7624336

32404859

In [53]:
df_ords_prods_merge.shape

(32404859, 22)

The number of 'Regular busy' has decreased. The number of 'Busiest days' and 'Least busy' has increased. But both of the result match the total number of rows (32404859).

## 4. Create a "busiest_period_of_day" column to identify the busiest hours of the day

In [56]:
df_ords_prods_merge.head()

Unnamed: 0,Unnamed: 0_x,product_id,product_name,aisle_id,department_id,prices,Unnamed: 0.1,Unnamed: 0_y,order_id,user_id,...,orders_day_of_week,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered,_merge,_merge2,price_range_loc,busiest_day,busiest_days
0,0,1,Chocolate Sandwich Cookies,61,19,5.8,1987,1987,3139998,138,...,6,11,3.0,5,0,both,both,Mid-range product,Regular busy,Regular busy
1,0,1,Chocolate Sandwich Cookies,61,19,5.8,1989,1989,1977647,138,...,6,17,20.0,1,1,both,both,Mid-range product,Regular busy,Regular busy
2,0,1,Chocolate Sandwich Cookies,61,19,5.8,11433,11433,389851,709,...,0,21,6.0,20,0,both,both,Mid-range product,Busiest day,Busiest days
3,0,1,Chocolate Sandwich Cookies,61,19,5.8,12198,12198,652770,764,...,3,13,11.0,10,0,both,both,Mid-range product,Regular busy,Least busy
4,0,1,Chocolate Sandwich Cookies,61,19,5.8,12200,12200,1813452,764,...,4,17,9.0,11,1,both,both,Mid-range product,Least busy,Least busy


According to the column names, I should use "order_hour_of_day" column to identify the busiest hours of the day

In [58]:
# Check for the frequency of "order_hour_of_day"

df_ords_prods_merge['order_hour_of_day'].value_counts(dropna = False)

order_hour_of_day
10    2761760
11    2736140
14    2689136
15    2662144
13    2660954
12    2618532
16    2535202
9     2454203
17    2087654
8     1718118
18    1636502
19    1258305
20     976156
7      891054
21     795637
22     634225
23     402316
6      290493
0      218769
1      115700
5       87961
2       69375
4       53242
3       51281
Name: count, dtype: int64

In [59]:
df_ords_prods_merge.loc[(df_ords_prods_merge['order_hour_of_day'] >= 9) & (df_ords_prods_merge['order_hour_of_day'] <= 16), 'busiest_period_of_day_loc'] = 'Most orders'
df_ords_prods_merge.loc[(df_ords_prods_merge['order_hour_of_day'] <= 6), 'busiest_period_of_day_loc'] = 'Fewer orders'
df_ords_prods_merge.loc[df_ords_prods_merge['order_hour_of_day'] == 23, 'busiest_period_of_day_loc'] = 'Fewer orders'
df_ords_prods_merge.loc[df_ords_prods_merge['order_hour_of_day'] == 7, 'busiest_period_of_day_loc'] = 'Average orders'
df_ords_prods_merge.loc[df_ords_prods_merge['order_hour_of_day'] == 8, 'busiest_period_of_day_loc'] = 'Average orders'
df_ords_prods_merge.loc[(df_ords_prods_merge['order_hour_of_day'] >= 17) & (df_ords_prods_merge['order_hour_of_day'] <= 22), 'busiest_period_of_day_loc'] = 'Average orders'

## 5. Print the frequency for this new column

In [61]:
df_ords_prods_merge['busiest_period_of_day_loc'].value_counts(dropna = False)

busiest_period_of_day_loc
Most orders       21118071
Average orders     9997651
Fewer orders       1289137
Name: count, dtype: int64

## 6. Ensure the notebook is clean structured

In [63]:
df_ords_prods_merge.shape

(32404859, 23)

In [64]:
df_ords_prods_merge.head()

Unnamed: 0,Unnamed: 0_x,product_id,product_name,aisle_id,department_id,prices,Unnamed: 0.1,Unnamed: 0_y,order_id,user_id,...,order_hour_of_day,days_since_prior_order,add_to_cart_order,reordered,_merge,_merge2,price_range_loc,busiest_day,busiest_days,busiest_period_of_day_loc
0,0,1,Chocolate Sandwich Cookies,61,19,5.8,1987,1987,3139998,138,...,11,3.0,5,0,both,both,Mid-range product,Regular busy,Regular busy,Most orders
1,0,1,Chocolate Sandwich Cookies,61,19,5.8,1989,1989,1977647,138,...,17,20.0,1,1,both,both,Mid-range product,Regular busy,Regular busy,Average orders
2,0,1,Chocolate Sandwich Cookies,61,19,5.8,11433,11433,389851,709,...,21,6.0,20,0,both,both,Mid-range product,Busiest day,Busiest days,Average orders
3,0,1,Chocolate Sandwich Cookies,61,19,5.8,12198,12198,652770,764,...,13,11.0,10,0,both,both,Mid-range product,Regular busy,Least busy,Most orders
4,0,1,Chocolate Sandwich Cookies,61,19,5.8,12200,12200,1813452,764,...,17,9.0,11,1,both,both,Mid-range product,Least busy,Least busy,Average orders


## 7. Export the dataframe as a pickle file

In [66]:
df_ords_prods_merge.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_merge_new_cols.pkl'))