# This script contains the following chapters:
1. Importing data & Creating subset
2. Grouping and aggregating for the subset
3. Grouping and aggregating for the entire dataframe and comparing results to subset
4. Creating loyalty flag for the entire dataframe
5. Basic Statistics of product prices flagged by loyalty
6. Creating spending flag for each user
7. Creating order frequency flag for the entire dataframe
8. Merging new flags into dataframe and exporting

# 1. Importing data & Creating subset

In [1]:
# importing libraries
import pandas as pd
import numpy as np
import os

In [2]:
# assigning path for easier import of data
path = r'C:\Users\magia\06-2025 Instacart Basket Analysis'

In [3]:
# importing data set using path variable
ords_prods_merge = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_merge_new_columns.pkl'))

In [4]:
#creating subset of first one million entries
df = ords_prods_merge[:1000000]

In [5]:
# checking dimensions of df subset
df.shape

(1000000, 19)

In [6]:
# checking first 10 rows
df.head(10)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_between_orders,new_customer,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range,busiest_day,busiest_days,busiest_period_of_day
0,2539329,1,1,2,8,,True,196,1,0,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Regularly busy days,Most orders
1,2539329,1,1,2,8,,True,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91,16,12.5,both,Mid-range product,Regularly busy,Regularly busy days,Most orders
2,2539329,1,1,2,8,,True,12427,3,0,Original Beef Jerky,23,19,4.4,both,Low-range product,Regularly busy,Regularly busy days,Most orders
3,2539329,1,1,2,8,,True,26088,4,0,Aged White Cheddar Popcorn,23,19,4.7,both,Low-range product,Regularly busy,Regularly busy days,Most orders
4,2539329,1,1,2,8,,True,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54,17,1.0,both,Low-range product,Regularly busy,Regularly busy days,Most orders
5,2398795,1,2,3,7,15.0,False,196,1,1,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Least two busy days,Most orders
6,2398795,1,2,3,7,15.0,False,10258,2,0,Pistachios,117,19,3.0,both,Low-range product,Regularly busy,Least two busy days,Most orders
7,2398795,1,2,3,7,15.0,False,12427,3,1,Original Beef Jerky,23,19,4.4,both,Low-range product,Regularly busy,Least two busy days,Most orders
8,2398795,1,2,3,7,15.0,False,13176,4,0,Bag of Organic Bananas,24,4,10.3,both,Mid-range product,Regularly busy,Least two busy days,Most orders
9,2398795,1,2,3,7,15.0,False,26088,5,1,Aged White Cheddar Popcorn,23,19,4.7,both,Low-range product,Regularly busy,Least two busy days,Most orders


In [7]:
# grouping by 'product_name'
df.groupby('product_name')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001602BCDE480>

# 2. Grouping and aggregating for the subset

In [8]:
# grouping by 'department_id' & aggregating 'order_number' to obtain mean values
df.groupby('department_id').agg({'order_number':['mean']})

Unnamed: 0_level_0,order_number
Unnamed: 0_level_1,mean
department_id,Unnamed: 1_level_2
1,14.800024
2,17.091743
3,17.913544
4,17.893092
5,15.21427
6,15.382135
7,17.694027
8,16.458105
9,15.957363
10,20.091818


In [9]:
# aggregating with mean function and not agg() function
df.groupby('department_id')['order_number'].mean()

department_id
1     14.800024
2     17.091743
3     17.913544
4     17.893092
5     15.214270
6     15.382135
7     17.694027
8     16.458105
9     15.957363
10    20.091818
11    16.482026
12    15.615061
13    16.484023
14    17.524632
15    15.691875
16    18.014071
17    16.150593
18    19.602850
19    17.631340
20    17.138607
21    21.956893
Name: order_number, dtype: float64

In [10]:
# aggregating with dot notation
df.groupby('department_id').order_number.mean()

department_id
1     14.800024
2     17.091743
3     17.913544
4     17.893092
5     15.214270
6     15.382135
7     17.694027
8     16.458105
9     15.957363
10    20.091818
11    16.482026
12    15.615061
13    16.484023
14    17.524632
15    15.691875
16    18.014071
17    16.150593
18    19.602850
19    17.631340
20    17.138607
21    21.956893
Name: order_number, dtype: float64

In [11]:
# aggregating multiple measures
df.groupby('department_id').agg({'order_number' : ['mean', 'min', 'max']})

Unnamed: 0_level_0,order_number,order_number,order_number
Unnamed: 0_level_1,mean,min,max
department_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,14.800024,1,99
2,17.091743,1,98
3,17.913544,1,99
4,17.893092,1,99
5,15.21427,1,99
6,15.382135,1,99
7,17.694027,1,99
8,16.458105,1,91
9,15.957363,1,99
10,20.091818,1,99


# 3. Grouping and aggregating for the entire dataframe and comparing results to subset

In [12]:
# grouping by 'department_id' and aggregating mean of 'order_number' for the entire df
ords_prods_merge.groupby('department_id')['order_number'].mean()

department_id
1     15.457838
2     17.277920
3     17.170395
4     17.811403
5     15.215751
6     16.439806
7     17.225802
8     15.340650
9     15.895474
10    20.197148
11    16.170638
12    15.887671
13    16.583536
14    16.773669
15    16.165037
16    17.665606
17    15.694469
18    19.310397
19    17.177343
20    16.473447
21    22.902379
Name: order_number, dtype: float64

In [13]:
# Sorting by descending mean order_number and adding rank number- entire dataframe
ords_prods_merge.groupby('department_id')['order_number'] \
    .mean() \
    .sort_values(ascending=False) \
    .reset_index() \
    .rename(columns={'order_number': 'mean_order_number'}) \
    .reset_index() \
    .rename(columns={'index': 'Rank'})

Unnamed: 0,Rank,department_id,mean_order_number
0,0,21,22.902379
1,1,10,20.197148
2,2,18,19.310397
3,3,4,17.811403
4,4,16,17.665606
5,5,2,17.27792
6,6,7,17.225802
7,7,19,17.177343
8,8,3,17.170395
9,9,14,16.773669


In [14]:
# Sorting by descending mean order_number and adding rank number - only subset of first million rows
df.groupby('department_id')['order_number'] \
    .mean() \
    .sort_values(ascending=False) \
    .reset_index() \
    .rename(columns={'order_number': 'mean_order_number'}) \
    .reset_index() \
    .rename(columns={'index': 'Rank'})

Unnamed: 0,Rank,department_id,mean_order_number
0,0,21,21.956893
1,1,10,20.091818
2,2,18,19.60285
3,3,16,18.014071
4,4,3,17.913544
5,5,4,17.893092
6,6,7,17.694027
7,7,19,17.63134
8,8,14,17.524632
9,9,20,17.138607


**Comparison notes**: Dpt. 21 - missing keeps its pole position in both subset and dataframe, however there is a higher average in the df than the subset. It is followed by department 10 - bulk and 18 - babies in both subset and entire dataframe. Some departments fluctuate considerably: 2 - other is in 11th place in the subset and in 6th in the entire dataframe; dpt. 8 - pets is in place 14th in the subset (with an average of 16,46) and in 20th in the entire dataframe(15,34)

In [15]:
# Create variables for two means
full_means = ords_prods_merge.groupby('department_id')['order_number'].mean()
subset_means = df.groupby('department_id')['order_number'].mean()

# Combine into a DataFrame
comparison = pd.DataFrame({
    'Full Data': full_means,
    'Subset': subset_means
})

# Add difference column and sort by absolute difference
comparison['Diff'] = comparison['Subset'] - comparison['Full Data']
comparison['Abs Diff'] = comparison['Diff'].abs()

# Sort by biggest change
comparison_sorted = comparison.sort_values('Abs Diff', ascending=False)

print(comparison_sorted)

               Full Data     Subset      Diff  Abs Diff
department_id                                          
8              15.340650  16.458105  1.117455  1.117455
6              16.439806  15.382135 -1.057671  1.057671
21             22.902379  21.956893 -0.945486  0.945486
14             16.773669  17.524632  0.750963  0.750963
3              17.170395  17.913544  0.743149  0.743149
20             16.473447  17.138607  0.665159  0.665159
1              15.457838  14.800024 -0.657814  0.657814
15             16.165037  15.691875 -0.473162  0.473162
7              17.225802  17.694027  0.468224  0.468224
17             15.694469  16.150593  0.456123  0.456123
19             17.177343  17.631340  0.453997  0.453997
16             17.665606  18.014071  0.348465  0.348465
11             16.170638  16.482026  0.311388  0.311388
18             19.310397  19.602850  0.292453  0.292453
12             15.887671  15.615061 -0.272610  0.272610
2              17.277920  17.091743 -0.186177  0

**Additional comparison notes**: Departments that have fluctuated the most are 8 - pets (its average is higher in subset than entire dataframe) followed by 6 - international and 21- missing (both lower in subset than entire dataframe)

# 4. Creating loyalty flag for the entire dataframe

In [16]:
# create 'max_order' column with transform function with aggregated order numbers per user

ords_prods_merge['max_order'] = ords_prods_merge.groupby(['user_id'])['order_number'].transform(np.max)

  ords_prods_merge['max_order'] = ords_prods_merge.groupby(['user_id'])['order_number'].transform(np.max)


Checking both head and tail since we had the same user_id number in each. New column was created and shows consistent number of max orders per user (10 and 13 respectively)

In [17]:
# checking with an argument of 100
ords_prods_merge.head(100)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_between_orders,new_customer,product_id,add_to_cart_order,reordered,product_name,aisle_id,department_id,prices,_merge,price_range,busiest_day,busiest_days,busiest_period_of_day,max_order
0,2539329,1,1,2,8,,True,196,1,0,Soda,77,7,9.0,both,Mid-range product,Regularly busy,Regularly busy days,Most orders,10
1,2539329,1,1,2,8,,True,14084,2,0,Organic Unsweetened Vanilla Almond Milk,91,16,12.5,both,Mid-range product,Regularly busy,Regularly busy days,Most orders,10
2,2539329,1,1,2,8,,True,12427,3,0,Original Beef Jerky,23,19,4.4,both,Low-range product,Regularly busy,Regularly busy days,Most orders,10
3,2539329,1,1,2,8,,True,26088,4,0,Aged White Cheddar Popcorn,23,19,4.7,both,Low-range product,Regularly busy,Regularly busy days,Most orders,10
4,2539329,1,1,2,8,,True,26405,5,0,XL Pick-A-Size Paper Towel Rolls,54,17,1.0,both,Low-range product,Regularly busy,Regularly busy days,Most orders,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,738281,2,4,2,10,8.0,False,21150,13,0,Fire Grilled Steak Bowl,38,1,5.9,both,Mid-range product,Regularly busy,Regularly busy days,Most orders,14
96,1673511,2,5,3,11,8.0,False,47144,1,0,Unsweetened Original Almond Breeze Almond Milk,91,16,14.0,both,Mid-range product,Regularly busy,Least two busy days,Most orders,14
97,1673511,2,5,3,11,8.0,False,5322,2,0,Gluten Free Dark Chocolate Chunk Chewy with a ...,3,19,2.9,both,Low-range product,Regularly busy,Least two busy days,Most orders,14
98,1673511,2,5,3,11,8.0,False,17224,3,0,Oats & Honey Gluten Free Granola,3,19,1.6,both,Low-range product,Regularly busy,Least two busy days,Most orders,14


In [18]:
# creating loyalty flags with loc() function
ords_prods_merge.loc[ords_prods_merge['max_order'] > 40, 'loyalty_flag'] = 'Loyal customer'
ords_prods_merge.loc[(ords_prods_merge['max_order'] <= 40) & (ords_prods_merge['max_order'] > 10), 'loyalty_flag'] = 'Regular customer'
ords_prods_merge.loc[ords_prods_merge['max_order'] <= 10, 'loyalty_flag'] = 'New customer'

In [19]:
# checking frequency of new 'loyalty_flag' column
ords_prods_merge['loyalty_flag'].value_counts(dropna = False)

loyalty_flag
Regular customer    15876776
Loyal customer      10284093
New customer         6243990
Name: count, dtype: int64

In [20]:
# checking new column
ords_prods_merge[['user_id', 'loyalty_flag', 'order_number']].head(60)

Unnamed: 0,user_id,loyalty_flag,order_number
0,1,New customer,1
1,1,New customer,1
2,1,New customer,1
3,1,New customer,1
4,1,New customer,1
5,1,New customer,2
6,1,New customer,2
7,1,New customer,2
8,1,New customer,2
9,1,New customer,2


# 5. Basic Statistics of product prices flagged by loyalty

In [21]:
# basic statistics of product prices per loyalty flag
ords_prods_merge.groupby('loyalty_flag').agg({'prices':['mean', 'min', 'max']})

Unnamed: 0_level_0,prices,prices,prices
Unnamed: 0_level_1,mean,min,max
loyalty_flag,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Loyal customer,7.772777,1.0,25.0
New customer,7.800235,1.0,25.0
Regular customer,7.797355,1.0,25.0


**Comment**:  New customers buy, on average, the most expensive products, followed by regular customers, and finally loyal customers. Importantly, however, there is very little difference in the average product prices among these three loyalty segments.

# 6. Creating spending flag for each user

In [22]:
# grouping and aggregating data
ords_prods_merge.groupby('user_id')['prices'].mean()

user_id
1         6.367797
2         7.515897
3         8.197727
4         8.205556
5         9.189189
            ...   
206205    8.909375
206206    7.646667
206207    7.313453
206208    8.366617
206209    7.058915
Name: prices, Length: 206209, dtype: float64

In [23]:
# storing grouped result in new datafame
user_avg = ords_prods_merge.groupby('user_id')['prices'].mean()

In [24]:
# converting it into a dataframe with a column name
user_avg = user_avg.to_frame(name='avg_spending')

In [25]:
user_avg

Unnamed: 0_level_0,avg_spending
user_id,Unnamed: 1_level_1
1,6.367797
2,7.515897
3,8.197727
4,8.205556
5,9.189189
...,...
206205,8.909375
206206,7.646667
206207,7.313453
206208,8.366617


In [26]:
# creating spending_flag column
user_avg['spending_flag'] = 'Low spender'
user_avg.loc[user_avg['avg_spending'] >= 10, 'spending_flag'] = 'High spender'

In [27]:
# checking frequency of new "spending_flag" column
user_avg['spending_flag'].value_counts(dropna = False)

spending_flag
Low spender     202822
High spender      3387
Name: count, dtype: int64

In [28]:
# get percentages
user_avg['spending_flag'].value_counts(normalize=True)

spending_flag
Low spender     0.983575
High spender    0.016425
Name: proportion, dtype: float64

**Comment: Over 98% of customers are low-spenders, buying products that cost on average less than 10 USD**

# 7. Creating order frequency flag for the entire dataframe

In [29]:
# grouping and aggregating data
ords_prods_merge.groupby('user_id')['days_between_orders'].median()

user_id
1         20.5
2         13.0
3         10.0
4         20.0
5         11.0
          ... 
206205    30.0
206206     3.0
206207    16.0
206208     7.0
206209    22.0
Name: days_between_orders, Length: 206209, dtype: float32

In [30]:
# creating user_id dataframe
user_freq = ords_prods_merge.groupby('user_id')['days_between_orders'].median()
user_freq = user_freq.reset_index()
user_freq.columns = ['user_id', 'frequency']


In [31]:
# creating frequency_flag
user_freq.loc[user_freq['frequency'] <= 10, 'frequency_flag'] = 'Frequent customer'
user_freq.loc[(user_freq['frequency'] > 10) & (user_freq['frequency'] <= 20), 'frequency_flag'] = 'Regular customer'
user_freq.loc[user_freq['frequency'] > 20, 'frequency_flag'] = 'Non-frequent customer'

In [32]:
# checking frequency of frequency_flag column
user_freq['frequency_flag'].value_counts(dropna = False)

frequency_flag
Frequent customer        86596
Regular customer         59993
Non-frequent customer    59619
NaN                          1
Name: count, dtype: int64

In [33]:
# get percentages
user_freq['frequency_flag'].value_counts(normalize=True)

frequency_flag
Frequent customer        0.419945
Regular customer         0.290934
Non-frequent customer    0.289121
Name: proportion, dtype: float64

**Comment: Over 40% of users are frequent customers (up to 10 days between orders). Around 30% are regular customers (10–20 days), and the rest are non-frequent customers (over 20 days between orders).**


# 8. Merging new flags into dataframe and exporting

In [34]:
# Merge spending_flag into ords_prods_merge
ords_prods_merge = ords_prods_merge.merge(user_avg[['spending_flag']], 
                                           left_on='user_id', right_index=True, how='left')

# Merge frequency_flag into ords_prods_merge
ords_prods_merge = ords_prods_merge.merge(user_freq[['frequency_flag']], 
                                           left_on='user_id', right_index=True, how='left')


In [35]:
#checking updated dataframe
ords_prods_merge.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_between_orders,new_customer,product_id,add_to_cart_order,reordered,...,prices,_merge,price_range,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,spending_flag,frequency_flag
0,2539329,1,1,2,8,,True,196,1,0,...,9.0,both,Mid-range product,Regularly busy,Regularly busy days,Most orders,10,New customer,Low spender,Regular customer
1,2539329,1,1,2,8,,True,14084,2,0,...,12.5,both,Mid-range product,Regularly busy,Regularly busy days,Most orders,10,New customer,Low spender,Regular customer
2,2539329,1,1,2,8,,True,12427,3,0,...,4.4,both,Low-range product,Regularly busy,Regularly busy days,Most orders,10,New customer,Low spender,Regular customer
3,2539329,1,1,2,8,,True,26088,4,0,...,4.7,both,Low-range product,Regularly busy,Regularly busy days,Most orders,10,New customer,Low spender,Regular customer
4,2539329,1,1,2,8,,True,26405,5,0,...,1.0,both,Low-range product,Regularly busy,Regularly busy days,Most orders,10,New customer,Low spender,Regular customer


In [36]:
ords_prods_merge.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32404859 entries, 0 to 32404858
Data columns (total 23 columns):
 #   Column                 Dtype   
---  ------                 -----   
 0   order_id               int32   
 1   user_id                int32   
 2   order_number           int8    
 3   orders_day_of_week     int8    
 4   order_hour_of_day      int8    
 5   days_between_orders    float32 
 6   new_customer           bool    
 7   product_id             int32   
 8   add_to_cart_order      int32   
 9   reordered              int8    
 10  product_name           object  
 11  aisle_id               int8    
 12  department_id          int8    
 13  prices                 float64 
 14  _merge                 category
 15  price_range            object  
 16  busiest_day            object  
 17  busiest_days           object  
 18  busiest_period_of_day  object  
 19  max_order              int8    
 20  loyalty_flag           object  
 21  spending_flag          object

In [38]:
# exporting data frame with new columns as pickle file
ords_prods_merge.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_merge_customer_segments_new.pkl'))