# This script contains the following chapters:
1. Importing libraries and customer data 
2. Data Wrangling
3. Combine customer data with large ords_prods data and export

# 1. Importing libraries and customer data

In [1]:
# import libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [2]:
# project root used to build every data path in this notebook;
# set the INSTACART_PROJECT_DIR environment variable to run on another machine,
# otherwise the original local folder is used
path = os.environ.get('INSTACART_PROJECT_DIR', r'C:\Users\magia\06-2025 Instacart Basket Analysis')

In [3]:
# load the raw customer table from the project's original-data folder
customers_csv = os.path.join(path, '02 Data', 'Original Data', 'customers.csv')
cust = pd.read_csv(customers_csv, index_col=False)

# 2. Data Wrangling

In [4]:
# inspect the dimensions (rows, columns) of the customer table
cust.shape

(206209, 10)

In [5]:
# list column names, non-null counts and dtypes to spot missing values and wrong types
cust.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       206209 non-null  int64 
 1   First Name    194950 non-null  object
 2   Surnam        206209 non-null  object
 3   Gender        206209 non-null  object
 4   STATE         206209 non-null  object
 5   Age           206209 non-null  int64 
 6   date_joined   206209 non-null  object
 7   n_dependants  206209 non-null  int64 
 8   fam_status    206209 non-null  object
 9   income        206209 non-null  int64 
dtypes: int64(4), object(6)
memory usage: 15.7+ MB


About 11,000 first names are missing (206,209 − 194,950 = 11,259 null values in `First Name`).

In [6]:
# summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
cust.describe()

Unnamed: 0,user_id,Age,n_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


- **Age**: Ranges from 18 to 81 years, with a median of 49. The distribution is fairly spread, with approximately 15–17 year gaps between quartiles, reflecting a wide adult age range.

- **Income**: Each of the first three quartile intervals spans roughly 31k–34k USD (min ≈ 26k, Q1 ≈ 60k, median ≈ 94k, Q3 ≈ 124k), but there is a substantial jump from the 75th percentile (ca. 124k) to the maximum (ca. 594k), indicating a right-skewed distribution. Since product prices at IC max out around $25, extremely high incomes probably have no impact on purchasing behavior.

In [7]:
# preview the first five rows of the customer table
# NOTE(review): the rendered output shows customer first names and surnames;
# consider clearing it before sharing the notebook
cust.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [8]:
# preview the last five rows of the customer table
# NOTE(review): the rendered output shows customer first names and surnames;
# consider clearing it before sharing the notebook
cust.tail()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
206204,168073,Lisa,Case,Female,North Carolina,44,4/1/2020,1,married,148828
206205,49635,Jeremy,Robbins,Male,Hawaii,62,4/1/2020,3,married,168639
206206,135902,Doris,Richmond,Female,Missouri,66,4/1/2020,2,married,53374
206207,81095,Rose,Rollins,Female,California,27,4/1/2020,1,married,99799
206208,80148,Cynthia,Noble,Female,New York,55,4/1/2020,1,married,57095


In [9]:
# standardise column labels: snake_case, corrected spelling, clearer names
# NOTE(review): 'n_dependants' is relabelled 'household_size' here; the original
# name suggests a count of dependants rather than of all household members — confirm
column_renames = {
    'First Name': 'first_name',
    'Surnam': 'surname',
    'Gender': 'gender',
    'STATE': 'state',
    'n_dependants': 'household_size',
}
cust = cust.rename(columns=column_renames)

In [10]:
# confirm the renamed labels via the columns attribute
cust.columns

Index(['user_id', 'first_name', 'surname', 'gender', 'state', 'Age',
       'date_joined', 'household_size', 'fam_status', 'income'],
      dtype='object')

In [11]:
# remove personal identifiers and the family-status column, which are not used in the analysis
unused_columns = ['first_name', 'surname', 'fam_status']
cust = cust.drop(unused_columns, axis='columns')

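`fam_status` was dropped because its information is assumed to be already captured by `household_size` (the renamed `n_dependants` column):
- A value of 1 implies married (couple),
- 0 implies single,
- 2+ implies married with children.

Under this assumption `fam_status` is redundant for analysis. Note that the mapping is inferred from the sample rows shown above and is not verified in this notebook; a cross-tabulation of `fam_status` against `n_dependants` before the drop would confirm it.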

In [12]:
# count missing entries per remaining column
cust.isna().sum()

user_id           0
gender            0
state             0
Age               0
date_joined       0
household_size    0
income            0
dtype: int64

The remaining columns of the customer dataset contain no missing values (the only column with nulls, `first_name`, was dropped above).

In [13]:
# count fully duplicated rows (duplicated() compares all columns by default)
# NOTE(review): this does not by itself show that user_id is unique — confirm separately if needed
cust.duplicated().sum()

0

There are no fully duplicated rows in the customer dataset.

Age values look plausible, and we have no technical way of verifying whether the gender information is accurate, so next we check the income column for possible data-entry errors (e.g. implausibly high values).

In [14]:
# summarise customers whose income lies above the 75th percentile
q3 = cust['income'].quantile(0.75)
above_q3 = cust['income'] > q3
high_income = cust.loc[above_q3, ['income', 'Age', 'state']]

print(high_income.describe())

              income           Age
count   51551.000000  51551.000000
mean   150506.380904     60.654051
std     29544.677697     12.247085
min    124245.000000     18.000000
25%    135918.000000     50.000000
50%    147695.000000     61.000000
75%    159473.500000     71.000000
max    593901.000000     81.000000


In [16]:
# count the distinct values in the state column as a content sanity check
cust['state'].nunique()

51

In [17]:
# list every distinct state name to look for misspellings
print(cust['state'].unique())

['Missouri' 'New Mexico' 'Idaho' 'Iowa' 'Maryland' 'Kentucky' 'Montana'
 'South Carolina' 'Texas' 'Virginia' 'Nevada' 'Nebraska' 'Georgia'
 'Wyoming' 'Colorado' 'North Dakota' 'Wisconsin' 'Alaska' 'Vermont'
 'Arkansas' 'Maine' 'North Carolina' 'West Virginia' 'Indiana' 'Oregon'
 'Florida' 'California' 'Pennsylvania' 'Ohio' 'Connecticut' 'Arizona'
 'Louisiana' 'Washington' 'New York' 'Mississippi' 'Oklahoma' 'Utah'
 'New Hampshire' 'Hawaii' 'District of Columbia' 'Alabama' 'Massachusetts'
 'Rhode Island' 'Michigan' 'New Jersey' 'Kansas' 'South Dakota'
 'Minnesota' 'Illinois' 'Tennessee' 'Delaware']


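The state column contains 51 distinct, correctly spelled values (the 50 states plus the District of Columbia), so no corrections are needed.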

In [18]:
# rename the Age column to lower case so it matches the other column names
cust = cust.rename(columns= {'Age' :'age'})

In [19]:
# confirm that all column names are now lower case
cust.columns

Index(['user_id', 'gender', 'state', 'age', 'date_joined', 'household_size',
       'income'],
      dtype='object')

In [20]:
# review the dtypes of the remaining columns before converting date_joined
cust.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   user_id         206209 non-null  int64 
 1   gender          206209 non-null  object
 2   state           206209 non-null  object
 3   age             206209 non-null  int64 
 4   date_joined     206209 non-null  object
 5   household_size  206209 non-null  int64 
 6   income          206209 non-null  int64 
dtypes: int64(4), object(3)
memory usage: 11.0+ MB


In [21]:
# change data type of date_joined column from string to datetime;
# the raw values look like '1/1/2017' and are read as month/day/year, so the
# format is stated explicitly instead of leaving pandas to infer it
cust['date_joined'] = pd.to_datetime(cust['date_joined'], format='%m/%d/%Y')

In [22]:
# confirm that date_joined now has a datetime dtype
cust.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   user_id         206209 non-null  int64         
 1   gender          206209 non-null  object        
 2   state           206209 non-null  object        
 3   age             206209 non-null  int64         
 4   date_joined     206209 non-null  datetime64[ns]
 5   household_size  206209 non-null  int64         
 6   income          206209 non-null  int64         
dtypes: datetime64[ns](1), int64(4), object(2)
memory usage: 11.0+ MB


# 3. Combine customer data with large ords_prods data and export

In [23]:
# load the large prepared orders/products dataframe from its pickle file
ords_prods_pickle = os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_merge_customer_segments_new.pkl')
ords_prods = pd.read_pickle(ords_prods_pickle)

In [24]:
# check the column data types of ords_prods once more before merging the dataframes
ords_prods.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32404859 entries, 0 to 32404858
Data columns (total 23 columns):
 #   Column                 Dtype   
---  ------                 -----   
 0   order_id               int32   
 1   user_id                int32   
 2   order_number           int8    
 3   orders_day_of_week     int8    
 4   order_hour_of_day      int8    
 5   days_between_orders    float32 
 6   new_customer           bool    
 7   product_id             int32   
 8   add_to_cart_order      int32   
 9   reordered              int8    
 10  product_name           object  
 11  aisle_id               int8    
 12  department_id          int8    
 13  prices                 float64 
 14  _merge                 category
 15  price_range            object  
 16  busiest_day            object  
 17  busiest_days           object  
 18  busiest_period_of_day  object  
 19  max_order              int8    
 20  loyalty_flag           object  
 21  spending_flag          object

`user_id`, the shared key column, is int32 in `ords_prods` but int64 in `cust`. We align the two dtypes before merging.

In [25]:
# cast the customer key to int32 so it has the same dtype as user_id in ords_prods
cust = cust.astype({'user_id': 'int32'})

In [26]:
# merge the two dataframes on user_id, save in df_merged;
# how='left' keeps every row of ords_prods, and validate='many_to_one' makes pandas
# raise a MergeError if a user_id appears more than once in cust, which would
# otherwise silently duplicate order rows
df_merged = ords_prods.merge(cust, on='user_id', how='left', validate='many_to_one')

In [27]:
# inspect the merged dataframe's dimensions; the row count should equal that of ords_prods
df_merged.shape

(32404859, 29)

In [28]:
# preview the first rows to confirm the customer columns were appended
df_merged.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_between_orders,new_customer,product_id,add_to_cart_order,reordered,...,max_order,loyalty_flag,spending_flag,frequency_flag,gender,state,age,date_joined,household_size,income
0,2539329,1,1,2,8,,True,196,1,0,...,10,New customer,Low spender,Regular customer,Female,Alabama,31,2019-02-17,3,40423
1,2539329,1,1,2,8,,True,14084,2,0,...,10,New customer,Low spender,Regular customer,Female,Alabama,31,2019-02-17,3,40423
2,2539329,1,1,2,8,,True,12427,3,0,...,10,New customer,Low spender,Regular customer,Female,Alabama,31,2019-02-17,3,40423
3,2539329,1,1,2,8,,True,26088,4,0,...,10,New customer,Low spender,Regular customer,Female,Alabama,31,2019-02-17,3,40423
4,2539329,1,1,2,8,,True,26405,5,0,...,10,New customer,Low spender,Regular customer,Female,Alabama,31,2019-02-17,3,40423


Looks good, we can export.

In [29]:
# write the merged dataframe to the prepared-data folder as a pickle file
merged_pickle_path = os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_cust.pkl')
df_merged.to_pickle(merged_pickle_path)