# Table of contents
## 1. Setting up
## 2. Grouping and Aggregating Data
## 3. Creating various flags

# 1. Setting up

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# creating the root data path used for later import/export
# the folder can be overridden with the INSTACART_DATA_DIR environment variable, so the notebook
# also runs on other machines; when the variable is not set the original local folder is used

path = os.environ.get(
    "INSTACART_DATA_DIR",
    r"C:\Users\Anwender\Documents\07-2023 Instacart Basket Analysis\02 Data",
)

In [7]:
# importing the merged dataframe (df_merged.pkl) from the "Prepared Data" folder
# NOTE(review): unpickling can run arbitrary code - only load pickle files you created or trust

ords_prods_merge = pd.read_pickle(
    os.path.join(path, "Prepared Data", "df_merged.pkl")
)

In [8]:
# limiting df to the first one million rows (positional slice of the full dataframe)

df = ords_prods_merge.iloc[:1_000_000]

In [9]:
# checking shape (rows, columns) of the one-million-row subset

df.shape

(1000000, 14)

In [10]:
# checking head: previewing the first five rows of the subset

df.head()

Unnamed: 0,order_id,user_id,amount_of_orders,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,both
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both


# 2. Grouping and Aggregating Data

In [11]:
# splitting the data into groups by product name
# note: groupby on its own only returns a DataFrameGroupBy object; nothing is aggregated until a function is applied

df.groupby("product_name")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000025974F15990>

In [14]:
# single aggregation: mean of amount_of_orders within each department_id
# (passing a list to agg keeps the result as a dataframe with two-level column labels)

(
    df
    .groupby("department_id")
    .agg({"amount_of_orders": ["mean"]})
)

Unnamed: 0_level_0,amount_of_orders
Unnamed: 0_level_1,mean
department_id,Unnamed: 1_level_2
4,18.82578
7,17.472355
13,17.993423
14,19.246334
16,19.463012
17,11.294069
19,19.305237
20,17.599636


In [16]:
# alternative: selecting the column and calling mean() directly gives the same values as a Series

(
    df
    .groupby("department_id")["amount_of_orders"]
    .mean()
)

department_id
4     18.825780
7     17.472355
13    17.993423
14    19.246334
16    19.463012
17    11.294069
19    19.305237
20    17.599636
Name: amount_of_orders, dtype: float64

In [17]:
# multiple aggregations in one pass: mean, min and max of amount_of_orders per department_id

(
    df
    .groupby("department_id")
    .agg({"amount_of_orders": ["mean", "min", "max"]})
)

Unnamed: 0_level_0,amount_of_orders,amount_of_orders,amount_of_orders
Unnamed: 0_level_1,mean,min,max
department_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
4,18.82578,1,99
7,17.472355,1,99
13,17.993423,1,99
14,19.246334,1,99
16,19.463012,1,99
17,11.294069,1,98
19,19.305237,1,99
20,17.599636,1,99


In [18]:
# creating max_order: the highest amount_of_orders value per user_id, written back onto every row of that user
# the aggregation is passed by name ("max") instead of np.max: the result is the same, but passing the
# NumPy callable to a groupby transform is deprecated in newer pandas and emits a FutureWarning

ords_prods_merge["max_order"] = (
    ords_prods_merge
    .groupby("user_id")["amount_of_orders"]
    .transform("max")
)

In [19]:
# checking head: confirming the new max_order column was added
ords_prods_merge.head()

Unnamed: 0,order_id,user_id,amount_of_orders,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,max_order
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,both,10
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both,10
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both,10
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both,10
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both,10


In [20]:
# raising the pandas display limit so that previews of up to 100 rows render without truncation
# (this is a display option, not a limit of head()); the limit is capped instead of set to None,
# because with no limit an accidental display of the full dataframe would try to render every row

pd.options.display.max_rows = 100

In [21]:
# previewing the first 100 rows (relies on the display row limit set in the previous cell)
ords_prods_merge.head(100)

Unnamed: 0,order_id,user_id,amount_of_orders,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,max_order
0,2539329,1,1,2,8,,196,1,0,Soda,77,7,9.0,both,10
1,2398795,1,2,3,7,15.0,196,1,1,Soda,77,7,9.0,both,10
2,473747,1,3,3,12,21.0,196,1,1,Soda,77,7,9.0,both,10
3,2254736,1,4,4,7,29.0,196,1,1,Soda,77,7,9.0,both,10
4,431534,1,5,4,15,28.0,196,1,1,Soda,77,7,9.0,both,10
5,3367565,1,6,2,7,19.0,196,1,1,Soda,77,7,9.0,both,10
6,550135,1,7,1,9,20.0,196,1,1,Soda,77,7,9.0,both,10
7,3108588,1,8,1,14,14.0,196,2,1,Soda,77,7,9.0,both,10
8,2295261,1,9,1,16,0.0,196,4,1,Soda,77,7,9.0,both,10
9,2550362,1,10,4,8,30.0,196,1,1,Soda,77,7,9.0,both,10


# 3. Creating various flags

In [22]:
# creating loyalty_flag from max_order with loc - step 1 of 3: more than 40 orders -> "Loyal customer"

ords_prods_merge.loc[
    ords_prods_merge["max_order"].gt(40),
    "loyalty_flag",
] = "Loyal customer"

In [23]:
# loyalty_flag step 2 of 3: more than 10 and at most 40 orders -> "Regular customer"
ords_prods_merge.loc[
    ords_prods_merge["max_order"].gt(10) & ords_prods_merge["max_order"].le(40),
    "loyalty_flag",
] = "Regular customer"

In [24]:
# loyalty_flag step 3 of 3: at most 10 orders -> "New customer"
ords_prods_merge.loc[
    ords_prods_merge["max_order"].le(10),
    "loyalty_flag",
] = "New customer"

In [27]:
# checking output: rows per loyalty_flag value; dropna=False also counts rows that received no flag

(
    ords_prods_merge["loyalty_flag"]
    .value_counts(dropna=False)
)

Regular customer    15876776
Loyal customer      10284093
New customer         6243990
Name: loyalty_flag, dtype: int64

In [28]:
# spot-checking the flag next to each user's order numbers (first 60 rows)
ords_prods_merge.loc[:, ["user_id", "loyalty_flag", "amount_of_orders"]].head(60)

Unnamed: 0,user_id,loyalty_flag,amount_of_orders
0,1,New customer,1
1,1,New customer,2
2,1,New customer,3
3,1,New customer,4
4,1,New customer,5
5,1,New customer,6
6,1,New customer,7
7,1,New customer,8
8,1,New customer,9
9,1,New customer,10


In [30]:
# creating average_spending: the mean of prices per user_id, written back onto every row of that user
# (this column is the basis for the spending flag created in the following cells)
# the aggregation is passed by name ("mean") instead of np.mean: the result is the same, but passing the
# NumPy callable to a groupby transform is deprecated in newer pandas and emits a FutureWarning

ords_prods_merge["average_spending"] = (
    ords_prods_merge
    .groupby("user_id")["prices"]
    .transform("mean")
)

In [32]:
# spending_flag: average_spending of 10 or more -> "high spender"
# NOTE(review): this label is lower-case while the other one is capitalised ("Low spender") - confirm which casing is intended
ords_prods_merge.loc[
    ords_prods_merge["average_spending"].ge(10),
    "spending_flag",
] = "high spender"

In [33]:
# spending_flag: average_spending below 10 -> "Low spender"
ords_prods_merge.loc[
    ords_prods_merge["average_spending"].lt(10),
    "spending_flag",
] = "Low spender"

In [39]:
# checking output: rows per spending_flag value; dropna=False also counts rows that received no flag
(
    ords_prods_merge["spending_flag"]
    .value_counts(dropna=False)
)

Low spender     31770614
high spender      634245
Name: spending_flag, dtype: int64