# 4.9 Intro to Data Visualization with Python Part 1

### This script contains the following points:
#### 1. Import resources
#### 2. Data wrangling
#### 3. Consistency checks
#### 4. Merge dataframes

### 1. Import Resources

In [1]:
# Import libs
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [2]:
# Set a path variable for the import
# (project root folder; the data files read and written in this script are
# addressed relative to it via os.path.join)
path = r'C:\Users\zhoux\Documents\CF-Data Analyst Program\DA-Immersion\10-2023 Instacart Basket Analysis'

In [4]:
# Import the customer data set (CSV file in the 'Original Data' folder)
customers = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'customers.csv'))

In [5]:
# Import the orders/products dataframe (pickle file in the 'Prepared Data' folder)
ords_prods = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_merged_cleaned_for_viz.pkl'))

### 2. Data wrangling

##### Dataframe overview and basic descriptive statistics

In [6]:
# Preview the first 50 rows of the customer data
customers.head(50)

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374
5,133128,Cynthia,Noble,Female,Kentucky,43,1/1/2017,2,married,49643
6,152052,Chris,Walton,Male,Montana,20,1/1/2017,0,single,61746
7,168851,Joseph,Hickman,Male,South Carolina,30,1/1/2017,0,single,63712
8,69965,Jeremy,Vang,Male,Texas,47,1/1/2017,1,married,162432
9,82820,Shawn,Chung,Male,Virginia,26,1/1/2017,2,married,32072


In [7]:
# Number of rows and columns in the customer data
customers.shape

(206209, 10)

In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
customers.describe()

Unnamed: 0,user_id,Age,n_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


##### Data wrangling

Though we only need a few columns from this dataframe, it is unknown whether the clients will ask more business questions later. It is better to clean the entire dataframe than to pick the columns first and clean only those.

In [9]:
# Rename columns so that every name follows the same lowercase snake_case
# style; this also completes the truncated 'Surnam' header.
# 'date_joined' already follows that style, so it needs no entry here.
column_renames = {
    'First Name': 'first_name',
    'Surnam': 'surname',
    'Gender': 'gender',
    'STATE': 'state',
    'Age': 'age',
    'n_dependants': 'n_dependents',
}
customers.rename(columns=column_renames, inplace=True)
customers.head()

Unnamed: 0,user_id,first_name,surname,gender,state,age,date_joined,n_dependents,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [10]:
# Change the data type of 'user_id' from numeric to string and check whether the change is successful
# (after the cast, describe() should no longer list 'user_id' among the numeric columns)
customers['user_id']=customers['user_id'].astype('str')
customers.describe()

Unnamed: 0,age,n_dependents,income
count,206209.0,206209.0,206209.0
mean,49.501646,1.499823,94632.852548
std,18.480962,1.118433,42473.786988
min,18.0,0.0,25903.0
25%,33.0,0.0,59874.0
50%,49.0,1.0,93547.0
75%,66.0,3.0,124244.0
max,81.0,3.0,593901.0


### 3. Consistency checks

In [11]:
# Check duplicated rows (result: no duplicated rows)
# duplicated() flags a row as True when it repeats an earlier row in every column;
# value_counts() then shows how many rows are flagged True / False
dups = customers.duplicated()
dups.value_counts(dropna = False)

False    206209
Name: count, dtype: int64

In [12]:
# Check for mixed data types in all columns: print the name of every column
# that holds at least one value whose Python type differs from the type of
# the value in the column's first row
for col in customers.columns:
    cell_types = customers[[col]].map(type)
    first_row_type = customers[[col]].iloc[0].apply(type)
    differs_from_first = (cell_types != first_row_type).any(axis=1)
    if differs_from_first.any():
        print(col)

first_name


In [13]:
# The mixed types in 'first_name' most likely come from missing values
# (NaN is a float stored next to the str names) -- TODO confirm.
# Count them BEFORE casting: astype('str') can turn NaN into the literal
# text 'nan', after which isnull() no longer reports those rows as missing.
n_missing_first_name = customers['first_name'].isnull().sum()
print(f"Missing values in 'first_name' before the cast: {n_missing_first_name}")

# Change the data type in 'first_name' to string
customers['first_name'] = customers['first_name'].astype('str')

In [14]:
# Check whether the change was successful: repeat the mixed-type check on
# all columns; no column name should be printed any more
for col in customers.columns:
    cell_types = customers[[col]].map(type)
    first_row_type = customers[[col]].iloc[0].apply(type)
    differs_from_first = (cell_types != first_row_type).any(axis=1)
    if differs_from_first.any():
        print(col)

In [15]:
# Check for missing values (results: no missing values)
# NOTE(review): 'first_name' was cast with astype('str') before this check; if it
# held NaN, those may now be the text 'nan' and would not be counted here -- confirm.
customers.isnull().sum()

user_id         0
first_name      0
surname         0
gender          0
state           0
age             0
date_joined     0
n_dependents    0
fam_status      0
income          0
dtype: int64

##### Create a subset with only the required columns

In [16]:
# Keep only the required customer columns in a separate dataframe
required_columns = [
    'user_id',
    'gender',
    'state',
    'age',
    'n_dependents',
    'fam_status',
    'income',
]
sub_customers = customers[required_columns]
sub_customers.head()

Unnamed: 0,user_id,gender,state,age,n_dependents,fam_status,income
0,26711,Female,Missouri,48,3,married,165665
1,33890,Female,New Mexico,36,0,single,59285
2,65803,Male,Idaho,35,2,married,99568
3,125935,Female,Iowa,40,0,single,42049
4,130797,Female,Maryland,26,1,married,40374


In [17]:
# Number of rows and columns in the subset
sub_customers.shape

(206209, 7)

### 4. Merge dataframes

In [18]:
# In ords_prods, count the number of unique user_id by using groupby and .ngroups
# (grouping by 'user_id' creates one group per distinct value, so .ngroups is that count)
ords_prods.groupby('user_id').ngroups

206209

The number of unique user_id values in ords_prods matches the number of unique user_id values in sub_customers, so we can perform a full join.

In [19]:
# Change the data type of user_id in ords_prods to string.
# (user_id in customers was cast to string earlier; the merge key should have the
# same data type in both dataframes)
ords_prods['user_id']=ords_prods['user_id'].astype('str')

In [20]:
# Merge ords_prods with sub_customers on 'user_id' and add a merge flag
# ('_merge') that records where each row came from.
# how='outer' is given explicitly: merge() defaults to how='inner', which drops
# unmatched rows before the flag is written, so '_merge' could only ever be
# 'both' and checking the flag would prove nothing.
# Note: an outer join sorts the result by the join key.
df_merged = ords_prods.merge(sub_customers, on='user_id', how='outer', indicator=True)

In [21]:
# Preview the first rows of the merged dataframe
df_merged.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,...,spending_flag,median_days_since_prior_order,regularity_flag,gender,state,age,n_dependents,fam_status,income,_merge
0,2539329,1,prior,1,2,8,,196,1,0,...,Low spender,20.5,Non-frequent customer,Female,Alabama,31,3,married,40423,both
1,2398795,1,prior,2,3,7,15.0,196,1,1,...,Low spender,20.5,Non-frequent customer,Female,Alabama,31,3,married,40423,both
2,473747,1,prior,3,3,12,21.0,196,1,1,...,Low spender,20.5,Non-frequent customer,Female,Alabama,31,3,married,40423,both
3,2254736,1,prior,4,4,7,29.0,196,1,1,...,Low spender,20.5,Non-frequent customer,Female,Alabama,31,3,married,40423,both
4,431534,1,prior,5,4,15,28.0,196,1,1,...,Low spender,20.5,Non-frequent customer,Female,Alabama,31,3,married,40423,both


In [22]:
# Check the merge flags (The merge was a successful full merge)
# NOTE(review): 'left_only' / 'right_only' can only show up if the merge was run with
# how='outer'; after an inner join every row is 'both' by construction.
df_merged['_merge'].value_counts(dropna = False)

_merge
both          32404859
left_only            0
right_only           0
Name: count, dtype: int64

##### Export as .pkl

In [23]:
# Drop the '_merge' indicator column before exporting, then preview the result
df_merged = df_merged.drop(columns='_merge')
df_merged.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,...,avg_prods_price,spending_flag,median_days_since_prior_order,regularity_flag,gender,state,age,n_dependents,fam_status,income
0,2539329,1,prior,1,2,8,,196,1,0,...,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,3,married,40423
1,2398795,1,prior,2,3,7,15.0,196,1,1,...,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,3,married,40423
2,473747,1,prior,3,3,12,21.0,196,1,1,...,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,3,married,40423
3,2254736,1,prior,4,4,7,29.0,196,1,1,...,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,3,married,40423
4,431534,1,prior,5,4,15,28.0,196,1,1,...,6.367797,Low spender,20.5,Non-frequent customer,Female,Alabama,31,3,married,40423


In [24]:
# Export the merged dataframe as a pickle file to the 'Prepared Data' folder
df_merged.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'orders_products_merged_sub_customers.pkl'))