## Tannis McCartney
## May 26, 2022

### This notebook goes through importing, wrangling, and checking the customers dataframe in preparation for merging with the orders-products dataframe.

## Contents
### 01 Import libraries
### 02 Import customers data
### 03 Data wrangling customers data
### 04 Consistency checks on customers data
### 05 Change customers data types to reduce memory usage
### 06 Export customers data

# 01 Import libraries

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

# 02 Import customers data

In [2]:
# Project folder path as a string.
# The INSTACART_PROJECT_PATH environment variable, when set, overrides the
# default below so the notebook can run on another machine without editing
# this cell; otherwise the original local folder is used.
path = os.environ.get(
    'INSTACART_PROJECT_PATH',
    r'C:\Users\tmmcc\Google Drive\Data Analytics Bootcamp\4 Python Fundamentals for Data Analysts\05-2022 Instacart Basket Analysis',
)

In [3]:
# Load customers.csv from the original-data folder into df_cust
df_cust = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'customers.csv'),
    index_col=False,
)
# Preview the first rows to confirm the load.
# NOTE(review): this preview displays customer names (PII) - clear the output before sharing.
df_cust.head()

Unnamed: 0,user_id,First Name,Surnam,Gender,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665
1,33890,Patricia,Hart,Female,New Mexico,36,1/1/2017,0,single,59285
2,65803,Kenneth,Farley,Male,Idaho,35,1/1/2017,2,married,99568
3,125935,Michelle,Hicks,Female,Iowa,40,1/1/2017,0,single,42049
4,130797,Ann,Gilmore,Female,Maryland,26,1/1/2017,1,married,40374


In [4]:
# Display (rows, columns) of df_cust right after loading the CSV
df_cust.shape

(206209, 10)

# 03 Data wrangling customers data

#### The customer names and genders are not needed for this analysis, and as this information is PII, these columns will be dropped.

In [5]:
# Remove the name and gender columns (PII that the analysis does not need)
df_cust = df_cust.drop(
    labels=['First Name', 'Surnam', 'Gender'],
    axis='columns',
)
# Preview the result to confirm the columns are gone
df_cust.head()

Unnamed: 0,user_id,STATE,Age,date_joined,n_dependants,fam_status,income
0,26711,Missouri,48,1/1/2017,3,married,165665
1,33890,New Mexico,36,1/1/2017,0,single,59285
2,65803,Idaho,35,1/1/2017,2,married,99568
3,125935,Iowa,40,1/1/2017,0,single,42049
4,130797,Maryland,26,1/1/2017,1,married,40374


In [6]:
# Display (rows, columns) of df_cust after dropping the name and gender columns;
# the row count should be unchanged and the column count lower by three
df_cust.shape

(206209, 7)

In [7]:
# Count how many customers share each date_joined value; the column should
# contain many distinct dates rather than a single repeated value
df_cust['date_joined'].value_counts()

9/17/2018     213
2/10/2018     212
4/1/2019      211
9/21/2019     211
12/19/2017    210
             ... 
9/1/2018      141
1/22/2018     140
11/24/2017    139
7/18/2019     138
8/6/2018      128
Name: date_joined, Length: 1187, dtype: int64

In [8]:
# Rename columns in a single pass (one mapping instead of four separate cells):
#   STATE        -> state
#   Age          -> age
#   n_dependants -> dependants
#   fam_status   -> household_status
df_cust = df_cust.rename(columns={
    'STATE': 'state',
    'Age': 'age',
    'n_dependants': 'dependants',
    'fam_status': 'household_status',
})

In [12]:
# Preview the first rows to confirm the column names after renaming
df_cust.head()

Unnamed: 0,user_id,state,age,date_joined,dependants,household_status,income
0,26711,Missouri,48,1/1/2017,3,married,165665
1,33890,New Mexico,36,1/1/2017,0,single,59285
2,65803,Idaho,35,1/1/2017,2,married,99568
3,125935,Iowa,40,1/1/2017,0,single,42049
4,130797,Maryland,26,1/1/2017,1,married,40374


# 04 Consistency checks on customers dataframe

In [13]:
# Check for mixed types in the customers dataframe.
# A column name is printed when its values are not all of one Python type;
# no printed output means no column mixes types.
# Series.map is used per column because DataFrame.applymap is deprecated
# in pandas 2.1 and later.
for col in df_cust.columns:
    value_types = df_cust[col].map(type)
    # More than one distinct type in a column means mixed-type data.
    # nunique() also copes with an empty dataframe, where iloc[0] would raise.
    if value_types.nunique() > 1:
        print(col)

#### The customers dataframe has no mixed-type data

In [14]:
# Count missing (null) values in each column of the customers dataframe
df_cust.isnull().sum()

user_id             0
state               0
age                 0
date_joined         0
dependants          0
household_status    0
income              0
dtype: int64

#### There is no missing data in the customers dataframe

In [15]:
# Collect rows of the customers dataframe that fully duplicate an earlier row
df_dups = df_cust.loc[df_cust.duplicated()]
# An empty result means there are no full duplicates
df_dups

Unnamed: 0,user_id,state,age,date_joined,dependants,household_status,income


#### There are no duplicates in the customers dataframe

In [16]:
# Show summary statistics for the numeric columns of the customers dataframe
# so that implausible minimum, maximum or mean values can be spotted
df_cust.describe()

Unnamed: 0,user_id,age,dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


#### The descriptive statistics look reasonable.

In [17]:
# Display (rows, columns) of df_cust after the consistency checks
df_cust.shape

(206209, 7)

# 05 Change customer data types to reduce memory usage

In [18]:
# Show column dtypes and memory usage of df_cust before any dtype conversion
df_cust.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   user_id           206209 non-null  int64 
 1   state             206209 non-null  object
 2   age               206209 non-null  int64 
 3   date_joined       206209 non-null  object
 4   dependants        206209 non-null  int64 
 5   household_status  206209 non-null  object
 6   income            206209 non-null  int64 
dtypes: int64(4), object(3)
memory usage: 11.0+ MB


In [19]:
# Convert df_cust columns to smaller dtypes in one astype call:
# 64-bit integers become int32 and the text columns become pandas 'string'.
# date_joined is not listed, so it keeps its current dtype.
df_cust = df_cust.astype({
    'user_id': 'int32',
    'state': 'string',
    'age': 'int32',
    'dependants': 'int32',
    'household_status': 'string',
    'income': 'int32',
})

In [20]:
# Show column dtypes and memory usage of df_cust again, after the dtype conversion
df_cust.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 206209 entries, 0 to 206208
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   user_id           206209 non-null  int32 
 1   state             206209 non-null  string
 2   age               206209 non-null  int32 
 3   date_joined       206209 non-null  object
 4   dependants        206209 non-null  int32 
 5   household_status  206209 non-null  string
 6   income            206209 non-null  int32 
dtypes: int32(4), object(1), string(2)
memory usage: 7.9+ MB


#### The memory usage has been reduced from 11.0+ MB to 7.9+ MB

In [21]:
# Recompute summary statistics after the dtype conversion; they should match
# the describe() output produced before the conversion
df_cust.describe()

Unnamed: 0,user_id,age,dependants,income
count,206209.0,206209.0,206209.0,206209.0
mean,103105.0,49.501646,1.499823,94632.852548
std,59527.555167,18.480962,1.118433,42473.786988
min,1.0,18.0,0.0,25903.0
25%,51553.0,33.0,0.0,59874.0
50%,103105.0,49.0,1.0,93547.0
75%,154657.0,66.0,3.0,124244.0
max,206209.0,81.0,3.0,593901.0


# 06 Export customers data

In [22]:
# Save the prepared customers dataframe as a pickle file in the 'Prepared Data' folder
df_cust.to_pickle(
    os.path.join(path, '02 Data', 'Prepared Data', 'customers.pkl')
)