# 4.9 Data visualization with Python

## This script contains the following points:

1. Download the data

2. Create a notebook

3. Import libraries and data

4. Wrangle the data

5. Data quality and consistency checks

6. Combine the customers data with the rest of the prepared Instacart data

7. Ensure the notebook contains logical titles, section headings, and descriptive code comments

8. Export the new dataframe

## 1. Download the data

## 2. Create a notebook

## 3. Import libraries and data

In [64]:
# Import libraries

import pandas as pd
import numpy as np
import os

In [65]:
# Import data

path = r'C:\Users\admin\06-2024 Instacart Basket Analysis'
customers = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'customers.csv'), index_col = False)

## 4. Wrangle the data

In [67]:
customers.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [68]:
customers.rename(columns = {'Surnam' : 'surname'}, inplace = True)

In [69]:
customers.rename(columns = {'First Name' : 'first_name'}, inplace = True)

In [70]:
customers.rename(columns = {'Gender' : 'gender'}, inplace = True)

In [71]:
customers.rename(columns = {'STATE' : 'state'}, inplace = True)

In [72]:
customers.rename(columns = {'Age' : 'age'}, inplace = True)

In [73]:
customers.head()

Unnamed: 0,user_id,first_name,surname,gender,state,age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


## 5. Data quality and consistency checks

In [75]:
# Investigate the accuracy of the columns

customers.describe()

Unnamed: 0,user_id,age,n_dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


In [76]:
# Check for mixed-type data

for col in customers.columns.tolist():
    weird = (customers[[col]].map(type) != customers[[col]].iloc[0].apply(type)).any(axis = 1)
    if len (customers[weird]) > 0:
        print(col)

first_name


In [77]:
# Change the data type of "first_name" to string

customers['first_name'] = customers['first_name'].astype('str')

In [78]:
# Check for mixed-type data again

for col in customers.columns.tolist():
    weird = (customers[[col]].map(type) != customers[[col]].iloc[0].apply(type)).any(axis = 1)
    if len (customers[weird]) > 0:
        print(col)

In [79]:
# Check for missing values

customers.isnull().sum()

user_id         0
first_name      0
surname         0
gender          0
state           0
age             0
date_joined     0
n_dependants    0
fam_status      0
income          0
dtype: int64

No missing values are found.

In [81]:
# Check for duplicates

dups = customers[customers.duplicated()]

In [82]:
dups

Unnamed: 0,user_id,first_name,surname,gender,state,age,date_joined,n_dependants,fam_status,income


No duplicates are found.

## 6. Combine the customers data with the rest of the prepared Instacart data

In [85]:
# Import the prepared Instacart dataframe

ords_prods_merge = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_merge_new_cols_grouped_dropped.pkl'))

In [86]:
ords_prods_merge.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices,order_id,user_id,eval_set,number_of_order,orders_day_of_week,...,price_range_loc,busiest_day,busiest_days,busiest_period_of_day_loc,max_order,loyalty_flag,average_prices,spending_flag,median_days_since_prior_order,order_frequency_flag
0,1,Chocolate Sandwich Cookies,61,19,5.8,3139998,138,prior,28,6,...,Mid-range product,Regular busy,Regular busy,Most orders,32,Regular customer,6.935811,Low spender,8.0,Frequent customer
1,1,Chocolate Sandwich Cookies,61,19,5.8,1977647,138,prior,30,6,...,Mid-range product,Regular busy,Regular busy,Average orders,32,Regular customer,6.935811,Low spender,8.0,Frequent customer
2,1,Chocolate Sandwich Cookies,61,19,5.8,389851,709,prior,2,0,...,Mid-range product,Busiest day,Busiest days,Average orders,5,New customer,7.930208,Low spender,8.0,Frequent customer
3,1,Chocolate Sandwich Cookies,61,19,5.8,652770,764,prior,1,3,...,Mid-range product,Regular busy,Least busy,Most orders,3,New customer,4.972414,Low spender,11.0,Regular customer
4,1,Chocolate Sandwich Cookies,61,19,5.8,1813452,764,prior,3,4,...,Mid-range product,Least busy,Least busy,Average orders,3,New customer,4.972414,Low spender,11.0,Regular customer


In [87]:
ords_prods_merge.shape

(32404859, 26)

In [88]:
customers.head()

Unnamed: 0,user_id,first_name,surname,gender,state,age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [89]:
customers.shape

(206209, 10)

In [90]:
# Use inner join to combine the dataframe with the customer data set

df_merge = customers.merge(ords_prods_merge, on = 'user_id', indicator = '_merge3')

In [91]:
df_merge.shape

(32404859, 36)

In [92]:
df_merge.head()

Unnamed: 0,user_id,first_name,surname,gender,state,age,date_joined,n_dependants,fam_status,income,...,busiest_day,busiest_days,busiest_period_of_day_loc,max_order,loyalty_flag,average_prices,spending_flag,median_days_since_prior_order,order_frequency_flag,_merge3
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,Regular busy,Busiest days,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer,both
1,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,Regular busy,Regular busy,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer,both
2,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,Regular busy,Busiest days,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer,both
3,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,Regular busy,Regular busy,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer,both
4,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,Regular busy,Least busy,Most orders,8,New customer,7.988889,Low spender,19.0,Regular customer,both


## 7. Ensure the notebook contains logical titles, section headings, and descriptive code comments

## 8. Export the new dataframe

In [95]:
df_merge.to_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'ords_prods_merge_new_cols_grouped_dropped_merged.pkl'))