## 4.3 Anonymizing and Excluding Low Activity Customers

### Contents
#### Importing Libraries
#### Importing Data
#### Addressing PII
#### Examining Spending Habits by Region
#### Exclude Low Activity Customers
#### Export as pickle

### Importing Libraries

In [3]:
# Import Libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

### 1. Importing Data

In [5]:
# Define pathway to relevant folder
path = r'/Users/sydneyjohnson/Documents/CF Data Analytics Course/07-2024 Instacart Basket Analysis'

In [6]:
# Import total_data.pkl
df = pd.read_pickle(os.path.join(path, '02 Data','Prepared Data', 'total_data.pkl'))

### 2. Addressing PII

In [8]:
# View head of dataframe
df.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_placed,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,first_name,surname,gender,state,age,date_joined,n_dependants,fam_status,income,_merge
0,2539329,1,1,2,8,,196,1,0,Soda,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
1,2539329,1,1,2,8,,14084,2,0,Organic Unsweetened Vanilla Almond Milk,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
2,2539329,1,1,2,8,,12427,3,0,Original Beef Jerky,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
3,2539329,1,1,2,8,,26088,4,0,Aged White Cheddar Popcorn,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both
4,2539329,1,1,2,8,,26405,5,0,XL Pick-A-Size Paper Towel Rolls,...,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both


In [9]:
# The data includes first and last names of customers. I am going to drop the 'first_name' column to anonymize the data.
df_anon = df.drop(columns = ['first_name'])

### 3. Examining Spending Habits by Region

#### Testing loc function on subset of the dataframe

In [12]:
# Create subset of first 10,000 rows to test code.
df_mini = df_anon[:10000]

In [13]:
# Check new dataframe
df_mini.shape

(10000, 29)

In [14]:
# Check value counts for 'state'
df['state'].value_counts()

state
Pennsylvania            667082
California              659783
Rhode Island            656913
Georgia                 656389
New Mexico              654494
Arizona                 653964
North Carolina          651900
Oklahoma                651739
Alaska                  648495
Minnesota               647825
Massachusetts           646358
Wyoming                 644255
Virginia                641421
Missouri                640732
Texas                   640394
Colorado                639280
Maine                   638583
North Dakota            638491
Alabama                 638003
Kansas                  637538
Louisiana               637482
Delaware                637024
South Carolina          636754
Oregon                  636425
Arkansas                636144
Nevada                  636139
New York                635983
Montana                 635265
South Dakota            633772
Illinois                633024
Hawaii                  632901
Washington              632852
Mi

In [15]:
# Use the loc() function to create a region column based on state in a test df.
# Northeast
df_mini = df_mini.copy()
df_mini.loc[df_mini['state'].isin([
    'Maine', 'New Hampshire', 'Vermont', 'Massachusetts', 
    'Rhode Island', 'Connecticut', 'New York', 'Pennsylvania', 
    'New Jersey'
]), 'region'] = 'Northeast'

In [16]:
df_mini = df_mini.copy()
df_mini.loc[df_mini['state'].isin([
    'Wisconsin','Michigan','Illinois','Indiana','Ohio',
    'North Dakota','South Dakota','Nebraska','Kansas','Minnesota',
    'Iowa','Missouri'
]), 'region'] = 'Midwest'

In [17]:
df_mini = df_mini.copy()
df_mini.loc[df_mini['state'].isin([
    'Delaware','Maryland','District of Columbia','Virginia',
    'West Virginia','North Carolina','South Carolina','Georgia',
    'Florida','Kentucky','Tennessee','Mississippi','Alabama',
    'Oklahoma','Texas','Arkansas','Louisiana'
]), 'region'] = 'South'

In [18]:
df_mini = df_mini.copy()
df_mini.loc[df_mini['state'].isin([
    'Idaho','Montana','Wyoming','Nevada','Utah','Colorado',
    'Arizona','New Mexico','Alaska','Washington','Oregon','California','Hawaii'
]), 'region'] = 'West'

In [19]:
# Check head of df_mini
df_mini.head()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_placed,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,...,surname,gender,state,age,date_joined,n_dependants,fam_status,income,_merge,region
0,2539329,1,1,2,8,,196,1,0,Soda,...,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both,South
1,2539329,1,1,2,8,,14084,2,0,Organic Unsweetened Vanilla Almond Milk,...,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both,South
2,2539329,1,1,2,8,,12427,3,0,Original Beef Jerky,...,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both,South
3,2539329,1,1,2,8,,26088,4,0,Aged White Cheddar Popcorn,...,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both,South
4,2539329,1,1,2,8,,26405,5,0,XL Pick-A-Size Paper Towel Rolls,...,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both,South


In [20]:
# Check value counts of 'region'
df_mini['region'].value_counts(dropna= False)

region
West         2780
South        2459
Midwest      2397
Northeast    2364
Name: count, dtype: int64

#### Applying Region Flag to Entire Dataframe 

In [None]:
# Create region flag for Northeast
df_anon.loc[df_anon['state'].isin([
    'Maine', 'New Hampshire', 'Vermont', 'Massachusetts', 
    'Rhode Island', 'Connecticut', 'New York', 'Pennsylvania', 
    'New Jersey'
]), 'region'] = 'Northeast'

# Create region flag for Midwest
df_anon.loc[df_anon['state'].isin([
    'Wisconsin','Michigan','Illinois','Indiana','Ohio',
    'North Dakota','South Dakota','Nebraska','Kansas','Minnesota',
    'Iowa','Missouri'
]), 'region'] = 'Midwest'

# Create region flag for South
df_anon.loc[df_anon['state'].isin([
    'Delaware','Maryland','District of Columbia','Virginia',
    'West Virginia','North Carolina','South Carolina','Georgia',
    'Florida','Kentucky','Tennessee','Mississippi','Alabama',
    'Oklahoma','Texas','Arkansas','Louisiana'
]), 'region'] = 'South'

# Create region flag for West
df_anon.loc[df_anon['state'].isin([
    'Idaho','Montana','Wyoming','Nevada','Utah','Colorado',
    'Arizona','New Mexico','Alaska','Washington','Oregon','California','Hawaii'
]), 'region'] = 'West'

In [26]:
# Check value counts for 'region
df_anon['region'].value_counts(dropna = False)

region
South        10791885
West          8292913
Midwest       7597325
Northeast     5722736
Name: count, dtype: int64

#### Examine Relationship between Region and Spending Flag

In [28]:
df_anon.columns

Index(['order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'order_hour_placed', 'days_since_prior_order', 'product_id',
       'add_to_cart_order', 'reordered', 'product_name', 'aisle_id',
       'department_id', 'prices', 'max_order', 'loyalty_flag', 'avg_price',
       'spending_flag', 'med_days_prior', 'frequency_flag', 'Unnamed: 0',
       'surname', 'gender', 'state', 'age', 'date_joined', 'n_dependants',
       'fam_status', 'income', '_merge', 'region'],
      dtype='object')

In [29]:
# Create crosstab for 'region' and 'spending_flag'
crosstab = pd.crosstab(df_anon['region'], df_anon['spending_flag'], dropna = False)

In [30]:
# Save crosstab to clipboard
crosstab.to_clipboard(index=True, header=True)

##### The proportion of customers who are "high spenders" in any given region is between 0.33% and 0.39%. High spenders make up the highest proportion in the midwest (0.39%) and the lowest proportion in the Northeast (0.33%). In the West and the South, the proportion of high spenders is 0.38%.

### Exclude low-activity (<5 orders) Customers

In [66]:
df_anon['max_order'].value_counts(dropna = False)

max_order
99    1171333
8      811843
6      811396
9      810213
7      803979
       ...   
97      44949
98      44587
96      40453
2           6
1           5
Name: count, Length: 99, dtype: int64

In [68]:
# Create low-activity flag for customers with fewer than 5 orders
df_anon.loc[df_anon['max_order'] < 5, 'Activity'] = 'Low'

In [72]:
df_anon.loc[df_anon['max_order'] >= 5, 'Activity'] = 'High'

In [76]:
df_anon['Activity'].value_counts(dropna = False)

Activity
High    30964564
Low      1440295
Name: count, dtype: int64

In [78]:
df_2 = df_anon.loc[df_anon['Activity'] == 'High']

In [84]:
df_2.shape

(30964564, 31)

### Export New Dataframe as Pickle

In [86]:
# Export new anonymized and high activity df
df_2.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'total_anon_8_1.pkl'))