## Script Contents
### Part 1 of Exercise 10, action 5

In [1]:
# Import libraries and data

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [2]:
path = r'C:\Users\anon\Documents\CareerFoundry\Data Analytics Immersion\Instacart Basket Analysis'

In [3]:
high_act_cust = pd.read_pickle(os.path.join(path, 'Data', 'Prepared Data', 'high_activity_customers.pkl'))

In [4]:
high_act_cust.shape

(30964564, 28)

## 5) Create a profiling variable based on age, income, certain goods in the “department_id” column, and number of dependents

### Age Groups

In [5]:
# Determining age groups and creating flag

high_act_cust['age'].min()

18

In [6]:
high_act_cust['age'].max()

81

In [7]:
high_act_cust.loc[high_act_cust['age'] <= 29, 'age_group'] = '18-29'
high_act_cust.loc[(high_act_cust['age'] > 29) & (high_act_cust['age'] < 40), 'age_group'] = '30-39'
high_act_cust.loc[(high_act_cust['age'] > 39) & (high_act_cust['age'] < 50), 'age_group'] = '40-49'
high_act_cust.loc[(high_act_cust['age'] > 49) & (high_act_cust['age'] < 60), 'age_group'] = '50-59'
high_act_cust.loc[(high_act_cust['age'] > 59) & (high_act_cust['age'] < 70), 'age_group'] = '60-69'
high_act_cust.loc[high_act_cust['age'] > 69, 'age_group'] = '70 and over'

In [8]:
# Checking that total of column values equals total rows in pickle

high_act_cust['age_group'].value_counts(dropna = False)

18-29          5817603
70 and over    5812883
40-49          4883179
30-39          4847507
50-59          4841771
60-69          4761621
Name: age_group, dtype: int64

### Income Groups

In [9]:
# Determining income groups and creating flag

high_act_cust['income'].min()

25903

In [10]:
high_act_cust['income'].max()

593901

In [11]:
# Loosely basing income groups on US Census Bureau income and wealth data

high_act_cust.loc[high_act_cust['income'] <= 75000, 'income_group'] = 'Working'
high_act_cust.loc[(high_act_cust['income'] > 75000) & (high_act_cust['income'] < 120001), 'income_group'] = 'Middle'
high_act_cust.loc[(high_act_cust['income'] > 120000) & (high_act_cust['income'] < 400001), 'income_group'] = 'Upper'
high_act_cust.loc[high_act_cust['income'] > 400000, 'income_group'] = 'High Wealth'

In [12]:
# Checking that total of column values equals total rows in pickle

high_act_cust['income_group'].value_counts(dropna = False)

Middle         11878317
Working         9906734
Upper           9132517
High Wealth       46996
Name: income_group, dtype: int64

### Departments

In [13]:
# Importing department data

depts = pd.read_csv(os.path.join(path, 'Data', 'Prepared Data', 'departments_wrangled.csv'), index_col = False)

In [14]:
depts

Unnamed: 0,department_id,department
0,1,frozen
1,2,other
2,3,bakery
3,4,produce
4,5,alcohol
5,6,international
6,7,beverages
7,8,pets
8,9,dry goods pasta
9,10,bulk


In [15]:
high_act_cust.columns

Index(['order_id', 'user_id', 'order_number', 'orders_day_of_the_week',
       'order_hour_of_day', 'days_since_prior_order', 'product_id',
       'add_to_cart_order', 'reordered', 'product_name', 'aisle_id',
       'department_id', 'prices', 'max_order', 'loyalty_flag', 'average_price',
       'spending_flag', 'order_frequency', 'median_days', 'gender', 'state',
       'age', 'date_joined', 'num_dependents', 'fam_status', 'income',
       'region', 'customer_activity', 'age_group', 'income_group'],
      dtype='object')

In [16]:
# Merging the departments dataframe with the high activity customer dataframe

cust_depts_merged = high_act_cust.merge(depts, on = 'department_id')

In [17]:
cust_depts_merged.columns

Index(['order_id', 'user_id', 'order_number', 'orders_day_of_the_week',
       'order_hour_of_day', 'days_since_prior_order', 'product_id',
       'add_to_cart_order', 'reordered', 'product_name', 'aisle_id',
       'department_id', 'prices', 'max_order', 'loyalty_flag', 'average_price',
       'spending_flag', 'order_frequency', 'median_days', 'gender', 'state',
       'age', 'date_joined', 'num_dependents', 'fam_status', 'income',
       'region', 'customer_activity', 'age_group', 'income_group',
       'department'],
      dtype='object')

In [18]:
# Checking data and creating a flag for babies or no babies in household based on purchases from 'babies' department

cust_depts_merged['num_dependents'].min()

0

In [19]:
cust_depts_merged['num_dependents'].max()

3

In [20]:
# Creating a flag for babies in household based on purchasing from 'babies' department
# Using a lambda rather than loc because there may be rows in which customers will have >0 dependents and thave purchased from the 'babies' department, 
# and the fact that they purchased from the 'babies' department needs to be associated with their user_id, 
# so that they are correctly flagged even in rows that represent purchases from other departments

mask = (cust_depts_merged['num_dependents'] > 0) & (cust_depts_merged['department'] == 'babies')
cust_depts_merged['dependent_type'] = cust_depts_merged.groupby('user_id')['user_id'].transform(
    lambda x: 'babies in household' if any(mask[x.index]) else 'no babies in household'
)

In [21]:
cust_depts_merged.head(100)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_the_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,date_joined,num_dependents,fam_status,income,region,customer_activity,age_group,income_group,department,dependent_type
0,2539329,1,1,2,8,,196,1,0,Soda,...,2/17/2019,3,married,40423,South,high activity,30-39,Working,beverages,no babies in household
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,2/17/2019,3,married,40423,South,high activity,30-39,Working,beverages,no babies in household
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,2/17/2019,3,married,40423,South,high activity,30-39,Working,beverages,no babies in household
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,2/17/2019,3,married,40423,South,high activity,30-39,Working,beverages,no babies in household
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,2/17/2019,3,married,40423,South,high activity,30-39,Working,beverages,no babies in household
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2984916,21,6,5,6,4.0,32553,1,0,"Country Stand Juice, Medium Pulp",...,6/18/2019,0,divorced/widowed,124643,West,high activity,70 and over,Upper,beverages,no babies in household
96,2984916,21,6,5,6,4.0,12615,2,0,Sparking Apple Cider,...,6/18/2019,0,divorced/widowed,124643,West,high activity,70 and over,Upper,beverages,no babies in household
97,2984916,21,6,5,6,4.0,11187,3,0,Revel Berry Yerba Mate,...,6/18/2019,0,divorced/widowed,124643,West,high activity,70 and over,Upper,beverages,no babies in household
98,1716848,21,7,5,9,28.0,35221,3,0,Lime Sparkling Water,...,6/18/2019,0,divorced/widowed,124643,West,high activity,70 and over,Upper,beverages,no babies in household


In [22]:
# Creating a flag for young parents based on age and number of dependents
# Using a lambda in order to use an else statement since flag is binary, rather than using loc to assign 'older parent' to multiple different age groups

mask2 = (cust_depts_merged['num_dependents'] > 0) & (cust_depts_merged['age_group'] == '18-29')
cust_depts_merged['parental_age'] = cust_depts_merged.groupby('user_id')['user_id'].transform(
    lambda x: 'young parent' if any(mask2[x.index]) else 'older parent'
)

In [None]:
# Creating a flag for young parents based on age and number of dependents

cust_depts_merged.loc[(cust_depts_merged['num_dependents'] > 0) & (cust_depts_merged['age_group'] == 0), '18-29'] = 'young parent'
cust_depts_merged.loc[(cust_depts_merged['num_dependents'] > 0) & (cust_depts_merged['age_group'] == 0), '18-29'] = 'young parent'

In [23]:
cust_depts_merged['parental_age'].value_counts()

older parent    26594579
young parent     4369985
Name: parental_age, dtype: int64

In [24]:
# Creating family type groupings

cust_depts_merged.loc[(cust_depts_merged['fam_status'].isin(['divorced/widowed', 'living with parents and siblings', 'single'])) & (cust_depts_merged['num_dependents'] == 0), 'family_type'] = 'single no dependents'
cust_depts_merged.loc[(cust_depts_merged['fam_status'].isin(['divorced/widowed', 'living with parents and siblings', 'single'])) & (cust_depts_merged['num_dependents'] > 0), 'family_type'] = 'Single with dependents'
cust_depts_merged.loc[(cust_depts_merged['fam_status'] == 'married') & (cust_depts_merged['num_dependents'] == 1), 'family_type'] = 'married no dependents'
cust_depts_merged.loc[(cust_depts_merged['fam_status'] == 'married') & (cust_depts_merged['num_dependents'] > 1), 'family_type'] = 'married with dependents'

In [25]:
cust_depts_merged['family_type'].value_counts()

married with dependents    14532650
single no dependents        7739681
married no dependents       7211061
Single with dependents      1481172
Name: family_type, dtype: int64

In [26]:
cust_depts_merged.head(100)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_the_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,fam_status,income,region,customer_activity,age_group,income_group,department,dependent_type,parental_age,family_type
0,2539329,1,1,2,8,,196,1,0,Soda,...,married,40423,South,high activity,30-39,Working,beverages,no babies in household,older parent,married with dependents
1,2398795,1,2,3,7,15.0,196,1,1,Soda,...,married,40423,South,high activity,30-39,Working,beverages,no babies in household,older parent,married with dependents
2,473747,1,3,3,12,21.0,196,1,1,Soda,...,married,40423,South,high activity,30-39,Working,beverages,no babies in household,older parent,married with dependents
3,2254736,1,4,4,7,29.0,196,1,1,Soda,...,married,40423,South,high activity,30-39,Working,beverages,no babies in household,older parent,married with dependents
4,431534,1,5,4,15,28.0,196,1,1,Soda,...,married,40423,South,high activity,30-39,Working,beverages,no babies in household,older parent,married with dependents
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2984916,21,6,5,6,4.0,32553,1,0,"Country Stand Juice, Medium Pulp",...,divorced/widowed,124643,West,high activity,70 and over,Upper,beverages,no babies in household,older parent,single no dependents
96,2984916,21,6,5,6,4.0,12615,2,0,Sparking Apple Cider,...,divorced/widowed,124643,West,high activity,70 and over,Upper,beverages,no babies in household,older parent,single no dependents
97,2984916,21,6,5,6,4.0,11187,3,0,Revel Berry Yerba Mate,...,divorced/widowed,124643,West,high activity,70 and over,Upper,beverages,no babies in household,older parent,single no dependents
98,1716848,21,7,5,9,28.0,35221,3,0,Lime Sparkling Water,...,divorced/widowed,124643,West,high activity,70 and over,Upper,beverages,no babies in household,older parent,single no dependents


In [27]:
# Exporting and saving dataframe

cust_depts_merged.to_pickle(os.path.join(path, 'Data', 'Prepared Data', 'cust_depts_merged.pkl'))