# 4.8 GROUP DATA AND AGGREGATE VARIABLES

## CONTENTS:
1. Import libraries and data
2. Check data
3. Group data, calculate mean and create a column with flags
4. Export data

### 1. Import libraries

In [73]:
# Import libraries

import pandas as pd
import numpy as np
import os

In [75]:
# define path
path = r'C:\Users\susan\Documents\data analytics\Instacart Basket Analysis\02 Data\Prepared data'

In [77]:
# import ords_prods_merge
ords_prods_merge = pd.read_pickle(os.path.join(path, 'ords_prods_new_columns.pkl'))

In [79]:
# Create a subset with one million records
df = ords_prods_merge[:1000000]

### 2. Check data

In [81]:
# Check the dimensions
df.shape

(59, 18)

In [83]:
# Check the first columns. The newly created columns are included (price_range_loc and busiest day)
df.head(5)

Unnamed: 0,order_id,user_id,validation,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge,product_name,aisle_id,department_id,prices,price_range_loc,busiest day,busiest_hour
0,2539329,1,prior,1,2,8,,196,1,0,both,Soda,77,7,9.0,Mid-range product,Regularly busy,Most orders
1,2539329,1,prior,1,2,8,,14084,2,0,both,Organic Unsweetened Vanilla Almond Milk,91,16,12.5,Mid-range product,Regularly busy,Most orders
2,2539329,1,prior,1,2,8,,12427,3,0,both,Original Beef Jerky,23,19,4.4,Low-range product,Regularly busy,Most orders
3,2539329,1,prior,1,2,8,,26088,4,0,both,Aged White Cheddar Popcorn,23,19,4.7,Low-range product,Regularly busy,Most orders
4,2539329,1,prior,1,2,8,,26405,5,0,both,XL Pick-A-Size Paper Towel Rolls,54,17,1.0,Low-range product,Regularly busy,Most orders


In [93]:
# Check just one column. It is called busiest hour, but I would like it to be called busiest time of day.
ords_prods_merge["busiest_hour"].head(5)

0    Most orders
1    Most orders
2    Most orders
3    Most orders
4    Most orders
Name: busiest_hour, dtype: object

### 3. Group data, calculate mean and create a column with flags

In [101]:
# Group by column product name dep id, aggregate mean of order nr. (SUBSET)
df.groupby('department_id')['order_number'].mean()

department_id
4     4.400000
7     6.307692
13    3.000000
14    6.333333
16    6.923077
17    2.500000
19    5.545455
Name: order_number, dtype: float64

In [103]:
# Group by department id, aggregate mean of order number. Easier to view version. (SUBSET)
department_order_means = df.groupby('department_id')['order_number'].mean().reset_index()
print(department_order_means.head())

   department_id  order_number
0              4      4.400000
1              7      6.307692
2             13      3.000000
3             14      6.333333
4             16      6.923077


In [117]:
# Mean of order_number grouped by department_id for the entire DataFrame (ENTIRE DATAFRAME, TAKEN FROM ORDS_PRODS_MERGE)
department_order_means = ords_prods_merge.groupby('department_id')['order_number'].mean().reset_index()

department_order_means

Unnamed: 0,department_id,order_number
0,4,4.4
1,7,6.307692
2,13,3.0
3,14,6.333333
4,16,6.923077
5,17,2.5
6,19,5.545455


In [123]:
# In this case I cannot see the difference, because the subset is not filtered (???). Should I have filtered them?

In [151]:
# (4) Create a column for max order, group by user id, transform to show max orders per user (TRANSFORM FUNCTION)
ords_prods_merge['max_order'] = ords_prods_merge.groupby(['user_id'])['order_number'].transform(np.max)

  ords_prods_merge['max_order'] = ords_prods_merge.groupby(['user_id'])['order_number'].transform(np.max)


In [160]:
# Show file 
ords_prods_merge.head()

Unnamed: 0,order_id,user_id,validation,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge,product_name,aisle_id,department_id,prices,price_range_loc,busiest day,busiest_hour,max_order,loyalty_flag
0,2539329,1,prior,1,2,8,,196,1,0,both,Soda,77,7,9.0,Mid-range product,Regularly busy,Most orders,10,New customer
1,2539329,1,prior,1,2,8,,14084,2,0,both,Organic Unsweetened Vanilla Almond Milk,91,16,12.5,Mid-range product,Regularly busy,Most orders,10,New customer
2,2539329,1,prior,1,2,8,,12427,3,0,both,Original Beef Jerky,23,19,4.4,Low-range product,Regularly busy,Most orders,10,New customer
3,2539329,1,prior,1,2,8,,26088,4,0,both,Aged White Cheddar Popcorn,23,19,4.7,Low-range product,Regularly busy,Most orders,10,New customer
4,2539329,1,prior,1,2,8,,26405,5,0,both,XL Pick-A-Size Paper Towel Rolls,54,17,1.0,Low-range product,Regularly busy,Most orders,10,New customer


In [175]:
# Group by dep nr and take the mean of the order nr without the agg function
df.groupby('user_id')['prices'].mean()

user_id
1    6.367797
Name: prices, dtype: float64

In [191]:
# Create a column for average price, group by user id, transform to show average price per user
ords_prods_merge['average_price'] = ords_prods_merge.groupby(['user_id'])['prices'].transform(np.mean)

  ords_prods_merge['average_price'] = ords_prods_merge.groupby(['user_id'])['prices'].transform(np.mean)


In [193]:
ords_prods_merge.head(3)

Unnamed: 0,order_id,user_id,validation,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,...,product_name,aisle_id,department_id,prices,price_range_loc,busiest day,busiest_hour,max_order,loyalty_flag,average_price
0,2539329,1,prior,1,2,8,,196,1,0,...,Soda,77,7,9.0,Mid-range product,Regularly busy,Most orders,10,New customer,6.367797
1,2539329,1,prior,1,2,8,,14084,2,0,...,Organic Unsweetened Vanilla Almond Milk,91,16,12.5,Mid-range product,Regularly busy,Most orders,10,New customer,6.367797
2,2539329,1,prior,1,2,8,,12427,3,0,...,Original Beef Jerky,23,19,4.4,Low-range product,Regularly busy,Most orders,10,New customer,6.367797


In [196]:
# (4)Create a spending flag for the customers based on max order value (LOC FUNCTION)
ords_prods_merge.loc[ords_prods_merge['average_price'] <= 10, 'spending_flag'] = 'High spender'
ords_prods_merge.loc[ords_prods_merge['max_order'] > 10, 'loyalty_flag'] = 'Low spender'

In [200]:
# Count the values with the new flag. (Only new customers????)
ords_prods_merge['spending_flag'].value_counts(dropna = False)

spending_flag
High spender    59
Name: count, dtype: int64

In [206]:
# (4) Create a column for average price, group by user id, transform to show average price per user (TRANSFORM FUNCTION)
ords_prods_merge['avg_price'] = ords_prods_merge.groupby(['user_id'])['order_number'].transform(np.mean)

  ords_prods_merge['avg_price'] = ords_prods_merge.groupby(['user_id'])['order_number'].transform(np.mean)


In [227]:
# (6)Create a spending flag for the customers based on average price (LOC FUNCTION)
ords_prods_merge.loc[ords_prods_merge['average_price'] > 10, 'spending_flag'] = 'Low spender'
ords_prods_merge.loc[ords_prods_merge['average_price'] <= 10, 'spending_flag'] = 'High spender'

In [229]:
# Show columns and check if the column with spending flag is added and works correctly
ords_prods_merge.head(2)

Unnamed: 0,order_id,user_id,validation,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,...,prices,price_range_loc,busiest day,busiest_hour,max_order,loyalty_flag,average_price,spending_flag,avg_price,frequency_flag
0,2539329,1,prior,1,2,8,,196,1,0,...,9.0,Mid-range product,Regularly busy,Most orders,10,New customer,6.367797,High spender,5.813559,
1,2539329,1,prior,1,2,8,,14084,2,0,...,12.5,Mid-range product,Regularly busy,Most orders,10,New customer,6.367797,High spender,5.813559,


In [231]:
# (7)Create a frequency flag for the customers based on days since prior order (LOC FUNCTION)
ords_prods_merge.loc[ords_prods_merge['days_since_prior_order'] > 20, 'frequency_flag'] = 'Non frequent customer'
ords_prods_merge.loc[(ords_prods_merge['days_since_prior_order'] < 10) & (ords_prods_merge['days_since_prior_order'] >= 20), 'frequency_flag'] = 'Regular customer'
ords_prods_merge.loc[ords_prods_merge['days_since_prior_order'] <= 10, 'frequency_flag'] = 'Frequent customer'

In [233]:
# Show columns and check the frequency flag column
ords_prods_merge.head(2)

Unnamed: 0,order_id,user_id,validation,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,...,prices,price_range_loc,busiest day,busiest_hour,max_order,loyalty_flag,average_price,spending_flag,avg_price,frequency_flag
0,2539329,1,prior,1,2,8,,196,1,0,...,9.0,Mid-range product,Regularly busy,Most orders,10,New customer,6.367797,High spender,5.813559,
1,2539329,1,prior,1,2,8,,14084,2,0,...,12.5,Mid-range product,Regularly busy,Most orders,10,New customer,6.367797,High spender,5.813559,


In [238]:
# Overview of the file
df.shape

(59, 19)

### 4. Export data

In [241]:
# Export data to .pickle 
ords_prods_merge.to_pickle(os.path.join(path, 'ords_prods_newest.pkl'))

In [245]:
# Ulterior check, are all values present?
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59 entries, 0 to 58
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   order_id                59 non-null     int64   
 1   user_id                 59 non-null     int64   
 2   validation              59 non-null     object  
 3   order_number            59 non-null     int64   
 4   orders_day_of_week      59 non-null     int64   
 5   order_hour_of_day       59 non-null     int64   
 6   days_since_prior_order  54 non-null     float64 
 7   product_id              59 non-null     int64   
 8   add_to_cart_order       59 non-null     int64   
 9   reordered               59 non-null     int64   
 10  _merge                  59 non-null     category
 11  product_name            59 non-null     object  
 12  aisle_id                59 non-null     int64   
 13  department_id           59 non-null     int64   
 14  prices                  59 n