## 4.8 Grouping Data & Aggregating Variables

### This script will contain the following:

#### Importing Libraries
#### Importing Orders Products Merged Dataframe
#### Group By & Aggregating
#### Creating a Loyalty Flag
#### Checking Spending Habits
#### Creating a Spending Flag
#### Creating a Frequent User Flag
#### Remove Unwanted Data
#### Exporting Dataframe

# 01. Importing Libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

# 02. Importing Orders Products Merged Dataframe

In [2]:
# Path Creation
path = r'/Users/tyrasmussen/Desktop/05-2023 Instacart Basket Analysis'

In [3]:
# Import Orders Products Merged Dataframe
ords_prods_merge = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_merge8ExerciseColumns.pkl'))

In [4]:
ords_prods_merge.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,...,price_range_loc,busiest_days,busiest_hours,busiest_period_of_day,max_order,loyalty_flag,mean_prices,spending_flag,median_days_since_prior_order,frequent_user_flag
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,28,6,11,...,Mid-range product,Regularly busy,Average orders,Average orders,32,Regular customer,14.790541,High spender,8.0,Frequent customer
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,30,6,17,...,Mid-range product,Regularly busy,Average orders,Average orders,32,Regular customer,14.790541,High spender,8.0,Frequent customer
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,2,0,21,...,Mid-range product,Busiest days,Average orders,Average orders,5,New customer,3.0625,Low spender,8.0,Frequent customer
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,1,3,13,...,Mid-range product,Regularly busy,Average orders,Average orders,3,New customer,2.017241,Low spender,9.0,Frequent customer
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,3,4,17,...,Mid-range product,Regularly busy,Average orders,Average orders,3,New customer,2.017241,Low spender,9.0,Frequent customer


In [5]:
ords_prods_merge.shape

(32404859, 25)

# Group By & Aggregating

In [7]:
# Find the mean of 'order_number' grouped by 'department_id'
ords_prods_merge.groupby('department_id').agg({'order_number': ['mean']})

Unnamed: 0_level_0,order_number
Unnamed: 0_level_1,mean
department_id,Unnamed: 1_level_2
1,15.457838
2,17.27792
3,17.170395
4,17.811403
5,15.215751
6,16.439806
7,17.225802
8,15.34065
9,15.895474
10,20.197148


### There are many differences between the results, for example department_id 21 had a mean of 22.902 in the entire dataframe and 25.535 in the subset of the dataframe.

## Creating a Loyalty Flag

In [8]:
# Create 'max_order' column
ords_prods_merge['max_order'] = ords_prods_merge.groupby(['user_id'])['order_number'].transform(np.max)

In [9]:
ords_prods_merge.head(15)

Unnamed: 0,Unnamed: 0_x,product_id,product_name,aisle_id,department_id,prices,Unnamed: 0.1,Unnamed: 0_y,order_id,user_id,...,add_to_cart_order,reordered,_merge,busiest_day,price_range_loc,busiest_days,busiest_hours,busiest_period_of_day,max_order,loyalty_flag
0,0,1,Chocolate Sandwich Cookies,61,19,5.8,1987,1987,3139998,138,...,5,0,both,Regularly busy,Mid-range product,Regularly busy,Average orders,Average orders,32,Regular customer
1,0,1,Chocolate Sandwich Cookies,61,19,5.8,1989,1989,1977647,138,...,1,1,both,Regularly busy,Mid-range product,Regularly busy,Average orders,Average orders,32,Regular customer
2,0,1,Chocolate Sandwich Cookies,61,19,5.8,11433,11433,389851,709,...,20,0,both,Busiest day,Mid-range product,Busiest days,Average orders,Average orders,5,New customer
3,0,1,Chocolate Sandwich Cookies,61,19,5.8,12198,12198,652770,764,...,10,0,both,Regularly busy,Mid-range product,Regularly busy,Average orders,Average orders,3,New customer
4,0,1,Chocolate Sandwich Cookies,61,19,5.8,12200,12200,1813452,764,...,11,1,both,Least busy,Mid-range product,Regularly busy,Average orders,Average orders,3,New customer
5,0,1,Chocolate Sandwich Cookies,61,19,5.8,12372,12372,1701441,777,...,7,0,both,Regularly busy,Mid-range product,Regularly busy,Average orders,Average orders,26,Regular customer
6,0,1,Chocolate Sandwich Cookies,61,19,5.8,13096,13096,1871483,825,...,2,0,both,Regularly busy,Mid-range product,Regularly busy,Average orders,Average orders,9,New customer
7,0,1,Chocolate Sandwich Cookies,61,19,5.8,14603,14603,1290456,910,...,1,0,both,Regularly busy,Mid-range product,Regularly busy,Most orders,Most orders,12,Regular customer
8,0,1,Chocolate Sandwich Cookies,61,19,5.8,17065,17065,369558,1052,...,1,0,both,Regularly busy,Mid-range product,Regularly busy,Average orders,Average orders,20,Regular customer
9,0,1,Chocolate Sandwich Cookies,61,19,5.8,17070,17070,589712,1052,...,2,1,both,Regularly busy,Mid-range product,Regularly busy,Average orders,Average orders,20,Regular customer


In [10]:
# Creating a loyalty flag
ords_prods_merge.loc[ords_prods_merge['max_order'] > 40, 'loyalty_flag'] = 'Loyal customer'

In [11]:
ords_prods_merge.loc[(ords_prods_merge['max_order'] <= 40) & (ords_prods_merge['max_order'] > 10), 'loyalty_flag'] = 'Regular customer'

In [12]:
ords_prods_merge.loc[ords_prods_merge['max_order'] <= 10, 'loyalty_flag'] = 'New customer'

In [13]:
# Check new 'loyalty_flag' columns
ords_prods_merge['loyalty_flag'].value_counts(dropna = False)

Regular customer    15876776
Loyal customer      10284093
New customer         6243990
Name: loyalty_flag, dtype: int64

## Checking Spending Habits

In [14]:
ords_prods_merge.groupby('loyalty_flag').agg({'prices': ['mean', 'min', 'max']})

Unnamed: 0_level_0,prices,prices,prices
Unnamed: 0_level_1,mean,min,max
loyalty_flag,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Loyal customer,10.386336,1.0,99999.0
New customer,13.29467,1.0,99999.0
Regular customer,12.495717,1.0,99999.0


In [15]:
ords_prods_merge.columns

Index(['Unnamed: 0_x', 'product_id', 'product_name', 'aisle_id',
       'department_id', 'prices', 'Unnamed: 0.1', 'Unnamed: 0_y', 'order_id',
       'user_id', 'order_number', 'orders_day_of_week', 'order_hour_of_day',
       'days_since_prior_order', 'add_to_cart_order', 'reordered', '_merge',
       'busiest_day', 'price_range_loc', 'busiest_days', 'busiest_hours',
       'busiest_period_of_day', 'max_order', 'loyalty_flag'],
      dtype='object')

### The min and max prices of the products purchased are the same for our 3 loyalty groups. However, the mean ranges as the highest group is 'New customer', then 'Regular customer', and the lowest group is 'Loyal customer'.

## Creating a Spending Flag

In [16]:
# Create 'mean_prices' column
ords_prods_merge['mean_prices'] = ords_prods_merge.groupby(['user_id'])['order_number'].transform(np.mean)

In [17]:
ords_prods_merge.head(10)

Unnamed: 0,Unnamed: 0_x,product_id,product_name,aisle_id,department_id,prices,Unnamed: 0.1,Unnamed: 0_y,order_id,user_id,...,reordered,_merge,busiest_day,price_range_loc,busiest_days,busiest_hours,busiest_period_of_day,max_order,loyalty_flag,mean_prices
0,0,1,Chocolate Sandwich Cookies,61,19,5.8,1987,1987,3139998,138,...,0,both,Regularly busy,Mid-range product,Regularly busy,Average orders,Average orders,32,Regular customer,14.790541
1,0,1,Chocolate Sandwich Cookies,61,19,5.8,1989,1989,1977647,138,...,1,both,Regularly busy,Mid-range product,Regularly busy,Average orders,Average orders,32,Regular customer,14.790541
2,0,1,Chocolate Sandwich Cookies,61,19,5.8,11433,11433,389851,709,...,0,both,Busiest day,Mid-range product,Busiest days,Average orders,Average orders,5,New customer,3.0625
3,0,1,Chocolate Sandwich Cookies,61,19,5.8,12198,12198,652770,764,...,0,both,Regularly busy,Mid-range product,Regularly busy,Average orders,Average orders,3,New customer,2.017241
4,0,1,Chocolate Sandwich Cookies,61,19,5.8,12200,12200,1813452,764,...,1,both,Least busy,Mid-range product,Regularly busy,Average orders,Average orders,3,New customer,2.017241
5,0,1,Chocolate Sandwich Cookies,61,19,5.8,12372,12372,1701441,777,...,0,both,Regularly busy,Mid-range product,Regularly busy,Average orders,Average orders,26,Regular customer,14.964602
6,0,1,Chocolate Sandwich Cookies,61,19,5.8,13096,13096,1871483,825,...,0,both,Regularly busy,Mid-range product,Regularly busy,Average orders,Average orders,9,New customer,4.939394
7,0,1,Chocolate Sandwich Cookies,61,19,5.8,14603,14603,1290456,910,...,0,both,Regularly busy,Mid-range product,Regularly busy,Most orders,Most orders,12,Regular customer,7.7
8,0,1,Chocolate Sandwich Cookies,61,19,5.8,17065,17065,369558,1052,...,0,both,Regularly busy,Mid-range product,Regularly busy,Average orders,Average orders,20,Regular customer,10.958333
9,0,1,Chocolate Sandwich Cookies,61,19,5.8,17070,17070,589712,1052,...,1,both,Regularly busy,Mid-range product,Regularly busy,Average orders,Average orders,20,Regular customer,10.958333


In [18]:
# Create a Spending Flag
ords_prods_merge.loc[ords_prods_merge['mean_prices'] >= 10, 'spending_flag'] = 'High spender'

In [19]:
ords_prods_merge.loc[ords_prods_merge['mean_prices'] < 10, 'spending_flag'] = 'Low spender'

In [20]:
# Check new 'spending_flag' columns
ords_prods_merge['spending_flag'].value_counts(dropna = False)

High spender    20332281
Low spender     12072578
Name: spending_flag, dtype: int64

## Creating a Frequent User Flag

In [21]:
# Create 'median_days_since_prior_order' column
ords_prods_merge['median_days_since_prior_order'] = ords_prods_merge.groupby(['user_id'])['days_since_prior_order'].transform(np.median)

In [22]:
ords_prods_merge.head(10)

Unnamed: 0,Unnamed: 0_x,product_id,product_name,aisle_id,department_id,prices,Unnamed: 0.1,Unnamed: 0_y,order_id,user_id,...,busiest_day,price_range_loc,busiest_days,busiest_hours,busiest_period_of_day,max_order,loyalty_flag,mean_prices,spending_flag,median_days_since_prior_order
0,0,1,Chocolate Sandwich Cookies,61,19,5.8,1987,1987,3139998,138,...,Regularly busy,Mid-range product,Regularly busy,Average orders,Average orders,32,Regular customer,14.790541,High spender,8.0
1,0,1,Chocolate Sandwich Cookies,61,19,5.8,1989,1989,1977647,138,...,Regularly busy,Mid-range product,Regularly busy,Average orders,Average orders,32,Regular customer,14.790541,High spender,8.0
2,0,1,Chocolate Sandwich Cookies,61,19,5.8,11433,11433,389851,709,...,Busiest day,Mid-range product,Busiest days,Average orders,Average orders,5,New customer,3.0625,Low spender,8.0
3,0,1,Chocolate Sandwich Cookies,61,19,5.8,12198,12198,652770,764,...,Regularly busy,Mid-range product,Regularly busy,Average orders,Average orders,3,New customer,2.017241,Low spender,9.0
4,0,1,Chocolate Sandwich Cookies,61,19,5.8,12200,12200,1813452,764,...,Least busy,Mid-range product,Regularly busy,Average orders,Average orders,3,New customer,2.017241,Low spender,9.0
5,0,1,Chocolate Sandwich Cookies,61,19,5.8,12372,12372,1701441,777,...,Regularly busy,Mid-range product,Regularly busy,Average orders,Average orders,26,Regular customer,14.964602,High spender,11.0
6,0,1,Chocolate Sandwich Cookies,61,19,5.8,13096,13096,1871483,825,...,Regularly busy,Mid-range product,Regularly busy,Average orders,Average orders,9,New customer,4.939394,Low spender,20.0
7,0,1,Chocolate Sandwich Cookies,61,19,5.8,14603,14603,1290456,910,...,Regularly busy,Mid-range product,Regularly busy,Most orders,Most orders,12,Regular customer,7.7,Low spender,6.0
8,0,1,Chocolate Sandwich Cookies,61,19,5.8,17065,17065,369558,1052,...,Regularly busy,Mid-range product,Regularly busy,Average orders,Average orders,20,Regular customer,10.958333,High spender,10.0
9,0,1,Chocolate Sandwich Cookies,61,19,5.8,17070,17070,589712,1052,...,Regularly busy,Mid-range product,Regularly busy,Average orders,Average orders,20,Regular customer,10.958333,High spender,10.0


In [23]:
# Creating a frequent user flag
ords_prods_merge.loc[ords_prods_merge['median_days_since_prior_order'] > 20, 'frequent_user_flag'] = 'Non-frequent customer'

In [28]:
ords_prods_merge.loc[(ords_prods_merge['median_days_since_prior_order'] > 10) & (ords_prods_merge['median_days_since_prior_order'] <= 20), 'frequent_user_flag'] = 'Regular customer'

In [24]:
ords_prods_merge.loc[ords_prods_merge['median_days_since_prior_order'] <= 10, 'frequent_user_flag'] = 'Frequent customer'

In [29]:
# Check new 'frequent_user_flag' columns
ords_prods_merge['frequent_user_flag'].value_counts(dropna = False)

Frequent customer        21559853
Regular customer          7208564
Non-frequent customer     3636437
NaN                             5
Name: frequent_user_flag, dtype: int64

In [30]:
ords_prods_merge.head()

Unnamed: 0,Unnamed: 0_x,product_id,product_name,aisle_id,department_id,prices,Unnamed: 0.1,Unnamed: 0_y,order_id,user_id,...,price_range_loc,busiest_days,busiest_hours,busiest_period_of_day,max_order,loyalty_flag,mean_prices,spending_flag,median_days_since_prior_order,frequent_user_flag
0,0,1,Chocolate Sandwich Cookies,61,19,5.8,1987,1987,3139998,138,...,Mid-range product,Regularly busy,Average orders,Average orders,32,Regular customer,14.790541,High spender,8.0,Frequent customer
1,0,1,Chocolate Sandwich Cookies,61,19,5.8,1989,1989,1977647,138,...,Mid-range product,Regularly busy,Average orders,Average orders,32,Regular customer,14.790541,High spender,8.0,Frequent customer
2,0,1,Chocolate Sandwich Cookies,61,19,5.8,11433,11433,389851,709,...,Mid-range product,Busiest days,Average orders,Average orders,5,New customer,3.0625,Low spender,8.0,Frequent customer
3,0,1,Chocolate Sandwich Cookies,61,19,5.8,12198,12198,652770,764,...,Mid-range product,Regularly busy,Average orders,Average orders,3,New customer,2.017241,Low spender,9.0,Frequent customer
4,0,1,Chocolate Sandwich Cookies,61,19,5.8,12200,12200,1813452,764,...,Mid-range product,Regularly busy,Average orders,Average orders,3,New customer,2.017241,Low spender,9.0,Frequent customer


# Remove Unwanted Data

In [6]:
ords_prods_merge.columns

Index(['product_id', 'product_name', 'aisle_id', 'department_id', 'prices',
       'order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'order_hour_of_day', 'days_since_prior_order', 'add_to_cart_order',
       'reordered', '_merge', 'busiest_day', 'price_range_loc', 'busiest_days',
       'busiest_hours', 'busiest_period_of_day', 'max_order', 'loyalty_flag',
       'mean_prices', 'spending_flag', 'median_days_since_prior_order',
       'frequent_user_flag'],
      dtype='object')

In [6]:
# Remove 'Unnamed: 0_x' column from ords_prods_merge dataframe
ords_prods_merge = ords_prods_merge.drop(columns = ['Unnamed: 0_x'])

In [7]:
# Remove 'Unnamed: 0.1' column from ords_prods_merge dataframe
ords_prods_merge = ords_prods_merge.drop(columns = ['Unnamed: 0.1'])

In [8]:
# Remove 'Unnamed: 0_y' column from ords_prods_merge dataframe
ords_prods_merge = ords_prods_merge.drop(columns = ['Unnamed: 0_y'])

In [7]:
# Check columns
ords_prods_merge.columns

Index(['product_id', 'product_name', 'aisle_id', 'department_id', 'prices',
       'order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'order_hour_of_day', 'days_since_prior_order', 'add_to_cart_order',
       'reordered', '_merge', 'busiest_day', 'price_range_loc', 'busiest_days',
       'busiest_hours', 'busiest_period_of_day', 'max_order', 'loyalty_flag',
       'mean_prices', 'spending_flag', 'median_days_since_prior_order',
       'frequent_user_flag'],
      dtype='object')

In [8]:
ords_prods_merge.isnull().sum()

product_id                             0
product_name                           0
aisle_id                               0
department_id                          0
prices                                 0
order_id                               0
user_id                                0
order_number                           0
orders_day_of_week                     0
order_hour_of_day                      0
days_since_prior_order           2076096
add_to_cart_order                      0
reordered                              0
_merge                                 0
busiest_day                            0
price_range_loc                        0
busiest_days                           0
busiest_hours                          0
busiest_period_of_day                  0
max_order                              0
loyalty_flag                           0
mean_prices                            0
spending_flag                          0
median_days_since_prior_order          5
frequent_user_fl

In [9]:
ords_prods_merge.shape

(32404859, 25)

In [10]:
# Removing missing values
ords_prods_merge_clean = ords_prods_merge[ords_prods_merge['days_since_prior_order'].isnull() == False]

In [11]:
# Removing missing values
ords_prods_merge_clean = ords_prods_merge[ords_prods_merge['median_days_since_prior_order'].isnull() == False]

In [12]:
# Removing missing values
ords_prods_merge_clean = ords_prods_merge[ords_prods_merge['frequent_user_flag'].isnull() == False]

In [13]:
ords_prods_merge_clean.shape

(32404854, 25)

# Exporting Dataframe

In [14]:
# Export data to pkl
ords_prods_merge_clean.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_merge_clean_8.pkl'))