# 3.3 IC Customer Data Preparation and Merge

## Contents
### Import libraries and customer data set
### Import customer data set with column restrictions
### Wrangling Procedures
### Data consistency checks
### Export cleaned data set
### Merge cleaned data set with ords_prods data set
### Crosstabs for NaN Values
### Test merge
### Merge into df
### Export merged df

## Import libraries and customer data set

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# path to project folder
path = r'/Users/susanwang/Documents/CF_Tasks/Instacart Basket Analysis'

Take a preliminary look at the data set, import only first 1000 rows.

In [3]:
df_check = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'customers.csv'), index_col=False, nrows=1000)

In [4]:
df_check.shape

(1000, 10)

In [5]:
df_check.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


The 'date_joined' may also not be necessary, as the 'loyalty_flag' will address the customer's status. 

For consistencies sake, all the columns should be in lower case, and 'Surnam' would be better named 'last_name'.

## Import customer data set with column restrictions

In [3]:
cust_list = ['user_id', 'First Name', 'Surnam', 'Gender', 'STATE', 'Age', 'n_dependants', 'fam_status', 'income']

In [4]:
df_cust = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'customers.csv'), usecols=cust_list, index_col=False)

In [5]:
df_cust.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1,married,40374


## Wrangling Procedures

### Rename columns

In [6]:
df_cust.rename(columns={'First Name': 'first_name', 'Surnam': 'last_name', 'Gender': 'gender', 'STATE': 'state','Age': 'age'}, inplace=True)

In [7]:
#check column names
df_cust.head()

Unnamed: 0,user_id,first_name,last_name,gender,state,age,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1,married,40374


### Explore data

In [8]:
# shape of data
df_cust.shape

(206209, 9)

In [9]:
# tail
df_cust.tail()

Unnamed: 0,user_id,first_name,last_name,gender,state,age,n_dependants,fam_status,income
206204,168073,Lisa,Case,Female,North Carolina,44,1,married,148828
206205,49635,Jeremy,Robbins,Male,Hawaii,62,3,married,168639
206206,135902,Doris,Richmond,Female,Missouri,66,2,married,53374
206207,81095,Rose,Rollins,Female,California,27,1,married,99799
206208,80148,Cynthia,Noble,Female,New York,55,1,married,57095


In [10]:
# info
df_cust.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   user_id       206209 non-null  int64 
 1   first_name    194950 non-null  object
 2   last_name     206209 non-null  object
 3   gender        206209 non-null  object
 4   state         206209 non-null  object
 5   age           206209 non-null  int64 
 6   n_dependants  206209 non-null  int64 
 7   fam_status    206209 non-null  object
 8   income        206209 non-null  int64 
dtypes: int64(4), object(5)
memory usage: 14.2+ MB


- Seems like there may be missing values in the first_name column.
- User_id should not be integer.

### Changing data type for user_id

In [11]:
df_cust['user_id'] = df_cust['user_id'].astype('str')

In [12]:
# check dtypee
df_cust['user_id'].dtype

dtype('O')

### Descriptive stats

In [18]:
df_cust.describe()

Unnamed: 0,age,n_dependants,income
count,206209.0,206209.0,206209.0
mean,49.501646,1.499823,94632.852548
std,18.480962,1.118433,42473.786988
min,18.0,0.0,25903.0
25%,33.0,0.0,59874.0
50%,49.0,1.0,93547.0
75%,66.0,3.0,124244.0
max,81.0,3.0,593901.0


## Data consistency checks

In [13]:
# check for mixed_type data
for col in df_cust.columns.to_list():
    weird = (df_cust[[col]].map(type) != df_cust[[col]].iloc[0].apply(type)).any(axis=1)
    if len(df_cust[weird]) > 0:
        print(col)
    else:
        print(f'all consistent in {col}')

all consistent in user_id
first_name
all consistent in last_name
all consistent in gender
all consistent in state
all consistent in age
all consistent in n_dependants
all consistent in fam_status
all consistent in income


In [14]:
# check for missing values
df_cust.isnull().sum()

user_id             0
first_name      11259
last_name           0
gender              0
state               0
age                 0
n_dependants        0
fam_status          0
income              0
dtype: int64

In [15]:
df_cust[df_cust['first_name'].isnull() == True]

Unnamed: 0,user_id,first_name,last_name,gender,state,age,n_dependants,fam_status,income
53,76659,,Gilbert,Male,Colorado,26,2,married,41709
73,13738,,Frost,Female,Louisiana,39,0,single,82518
82,89996,,Dawson,Female,Oregon,52,3,married,117099
99,96166,,Oconnor,Male,Oklahoma,51,1,married,155673
105,29778,,Dawson,Female,Utah,63,3,married,151819
...,...,...,...,...,...,...,...,...,...
206038,121317,,Melton,Male,Pennsylvania,28,3,married,87783
206044,200799,,Copeland,Female,Hawaii,52,2,married,108488
206090,167394,,Frost,Female,Hawaii,61,1,married,45275
206162,187532,,Floyd,Female,California,39,0,single,56325


There are many NaN values in the first_name column. As it won't affect the analysis results, nor will I be using the column for grouping or aggregating purposes, I can leave the NaN values as is, or even drop the entire column.

In [16]:
# drop first_name column
df_cust = df_cust.drop(columns=['first_name'])

In [17]:
# check head
df_cust.head()

Unnamed: 0,user_id,last_name,gender,state,age,n_dependants,fam_status,income
0,26711,Esquivel,Female,Missouri,48,3,married,165665
1,33890,Hart,Female,New Mexico,36,0,single,59285
2,65803,Farley,Male,Idaho,35,2,married,99568
3,125935,Hicks,Female,Iowa,40,0,single,42049
4,130797,Gilmore,Female,Maryland,26,1,married,40374


In [18]:
# check for missing values again
df_cust.isnull().sum()

user_id         0
last_name       0
gender          0
state           0
age             0
n_dependants    0
fam_status      0
income          0
dtype: int64

In [19]:
# check for duplicates
cust_dups = df_cust[df_cust.duplicated()]

In [20]:
cust_dups

Unnamed: 0,user_id,last_name,gender,state,age,n_dependants,fam_status,income


No duplicates found.

In [21]:
# check shape again
df_cust.shape

(206209, 8)

## Export cleaned data set to Prepared Data folder

In [22]:
df_cust.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'cust_clean.pkl'))

## Merge cleaned customer data with the ords_prods data set

In [23]:
# import ords_prods_agg data set
df_ords_prods = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_agg.pkl'))

In [24]:
# check data
df_ords_prods.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_the_week,order_hour_of_day,days_since_prior_order,first_order,product_id,add_to_cart_order,reordered,...,price_range,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,avg_price,spending_flag,median_days,frequency_flag
0,2539329,1,1,2,8,,True,196,1,0,...,Mid-range product,Regularly busy,Regular days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
1,2539329,1,1,2,8,,True,14084,2,0,...,Mid-range product,Regularly busy,Regular days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
2,2539329,1,1,2,8,,True,12427,3,0,...,Low-range product,Regularly busy,Regular days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
3,2539329,1,1,2,8,,True,26088,4,0,...,Low-range product,Regularly busy,Regular days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
4,2539329,1,1,2,8,,True,26405,5,0,...,Low-range product,Regularly busy,Regular days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer


In [25]:
df_ords_prods.shape

(32404859, 25)

## Crosstabs for NaN Values in 'days_since_prior_order'

In [6]:
# create a crosstab 
crosstab = pd.crosstab(df_ords_prods['days_since_prior_order'], df_ords_prods['order_number'], dropna=False)

In [7]:
# save result to clipboard (and then paste into Excel)
crosstab.to_clipboard()

In [32]:
# check columns
df_ords_prods.columns

Index(['order_id', 'user_id', 'order_number', 'orders_day_of_the_week',
       'order_hour_of_day', 'days_since_prior_order', 'first_order',
       'product_id', 'add_to_cart_order', 'reordered', 'product_name',
       'aisle_id', 'department_id', 'prices', '_merge', 'busiest_day',
       'busiest_days', 'busiest_period_of_day', 'price_range', 'max_order',
       'loyalty_flag', 'avg_price', 'spending_flag', 'median_days',
       'frequency_flag'],
      dtype='object')

- Common key value is user_id.
- _merge flag column must be removed before making another merge, or else I cannot use the indicator flag.
- I will drop 'first_order' because it is also not needed for analysis.

In [26]:
# drop _merge column
df_ords_prods = df_ords_prods.drop(columns=['_merge', 'first_order'])

In [27]:
# check head
df_ords_prods.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_the_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,price_range,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,avg_price,spending_flag,median_days,frequency_flag
0,2539329,1,1,2,8,,196,1,0,Soda,...,Mid-range product,Regularly busy,Regular days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
1,2539329,1,1,2,8,,14084,2,0,Organic Unsweetened Vanilla Almond Milk,...,Mid-range product,Regularly busy,Regular days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
2,2539329,1,1,2,8,,12427,3,0,Original Beef Jerky,...,Low-range product,Regularly busy,Regular days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
3,2539329,1,1,2,8,,26088,4,0,Aged White Cheddar Popcorn,...,Low-range product,Regularly busy,Regular days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer
4,2539329,1,1,2,8,,26405,5,0,XL Pick-A-Size Paper Towel Rolls,...,Low-range product,Regularly busy,Regular days,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer


## Test merge

In [35]:
# pd.merge(df_cust, df_ords_prods, on='user_id', indicator=True)

I get a warning that I am merging on integer and object, which means one of my user_id is not the right data type.

In [28]:
# check data types
df_cust['user_id'].dtype

dtype('O')

In [29]:
df_ords_prods['user_id'].dtype

dtype('int64')

In [30]:
# change data type for df_ords_prods
df_ords_prods['user_id'] = df_ords_prods['user_id'].astype('str')

In [31]:
# check again
df_ords_prods['user_id'].dtype

dtype('O')

In [32]:
pd.merge(df_cust, df_ords_prods, on='user_id', indicator=True)

KeyboardInterrupt: 

## Merge into df

In [33]:
ords_prods_all = df_cust.merge(df_ords_prods, on='user_id', indicator=True)

In [34]:
# check value counts for _merge
ords_prods_all['_merge'].value_counts(dropna=False)

_merge
both          32404859
left_only            0
right_only           0
Name: count, dtype: int64

In [35]:
# check shape
ords_prods_all.shape

(32404859, 31)

## Export merged df

In [36]:
ords_prods_all.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_all.pkl'))