# List of Content
## 1. Import data
## 2. Customer behavior based on geographic areas
### 2.1 Regional segmentation
### 2.2 Regional differences in spending habits
## 3. Exclusion flag
### 3.1 Subset containing active customers
### 3.2 Export data

In [1]:
# Import libraries

import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

# 1. Import data

In [2]:
path = r'C:\Users\Yan Peng\10-2020 Instacart Basket Analysis'

In [3]:
df = pd.read_pickle(os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_cust.pkl'))

In [4]:
df.shape

(32435059, 35)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32435059 entries, 0 to 32435058
Data columns (total 35 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   order_id                int64   
 1   user_id                 int64   
 2   order_number            int64   
 3   orders_day_of_week      int64   
 4   order_hour_of_day       int64   
 5   days_since_prior_order  float64 
 6   product_id              int64   
 7   add_to_cart_order       int64   
 8   reordered               int64   
 9   _merge                  category
 10  product_name            object  
 11  aisle_id                float64 
 12  department_id           float64 
 13  prices                  float64 
 14  _merge2                 category
 15  price_label             object  
 16  busiest_day             object  
 17  busiest_days            object  
 18  busiest_period_of_day   object  
 19  max_order               int64   
 20  loyalty_flag            object  
 21  avg_pr

##### Data security note: The customer information in columns 25-33 may contain personally identifiable details. To ensure privacy, we recommend excluding the first name and surname of customers from the analysis.

# 2. Customer behavior based on geographic areas

## 2.1 Regional segmentation

In [6]:
# Frequency check on "state" column

df['state'].value_counts()

state
Pennsylvania            667738
California              660428
Rhode Island            657662
Georgia                 657092
New Mexico              655188
Arizona                 654553
North Carolina          652329
Oklahoma                652275
Alaska                  649026
Minnesota               648424
Massachusetts           647020
Wyoming                 644927
Virginia                642103
Missouri                641323
Texas                   641051
Colorado                639820
Maine                   639175
North Dakota            638997
Alabama                 638650
Louisiana               638179
Kansas                  638021
Delaware                637477
South Carolina          637423
Oregon                  636966
Arkansas                636737
New York                636707
Nevada                  636697
Montana                 635838
South Dakota            634362
Illinois                633476
Hawaii                  633446
Washington              633445
Mi

In [7]:
# Create regional lists

Northwest = ['Maine', 'New Hampshire', 'Vermont', 'Massachusetts', 'Rhode Island', 'Connecticut', 'New York', 'Pennsylvania', 'New Jersey']

In [8]:
Midwest = ['Wisconsin', 'Michigan', 'Illinois', 'Indiana', 'Ohio', 'North Dakota', 'South Dakota', 'Nebraska', 'Kansas', 'Minnesota', 'Iowa', 'Missouri']

In [9]:
South = ['Delaware', 'Maryland', 'District of Columbia', 'Virginia', 'West Virginia', 'North Carolina', 'South Carolina', 'Georgia', 'Florida', 'Kentucky', 'Tennessee', 'Mississippi', 'Alabama', 'Oklahoma', 'Texas', 'Arkansas', 'Louisiana']

In [10]:
West = ['Idaho', 'Montana', 'Wyoming', 'Nevada', 'Utah', 'Colorado', 'Arizona', 'New Mexico', 'Alaska', 'Washington', 'Oregon', 'California', 'Hawaii']

In [11]:
# Create "region" column based on "state" column

df.loc[df['state'].isin(Northwest), 'region'] = 'Northwest'

In [12]:
df.loc[df['state'].isin(Midwest), 'region'] = 'Midwest'

In [13]:
df.loc[df['state'].isin(South), 'region'] = 'South'

In [14]:
df.loc[df['state'].isin(West), 'region'] = 'West'

In [15]:
# Check for empty rows in the new column

df['region'].isna().sum()

0

In [16]:
# Frequency check

df['region'].value_counts()

region
South        10801982
West          8300617
Midwest       7604016
Northwest     5728444
Name: count, dtype: int64

## 2.2 Regional differences in spending habits

In [17]:
# Count rows grouped by "region" and "spending_flag"

df.groupby(['region', 'spending_flag']).size()

region     spending_flag
Midwest    High spender       156084
           Low spender       7447932
Northwest  High spender       108276
           Low spender       5620168
South      High spender       209878
           Low spender      10592104
West       High spender       160440
           Low spender       8140177
dtype: int64

In [18]:
# Copy the results to Excel to calculate the percentages of the two spender types for each region

pivot_tbl = df.groupby(['region', 'spending_flag']).size()

In [19]:
pivot_tbl.to_clipboard()

##### All four regions have the same distribution of the two spender types: 2% are high spenders, 98% low spenders.

# 3. Exclusion flag

In [20]:
# Create "activity_flag" column based on maximum order number

df.loc[df['max_order'] >= 5, 'activity_flag'] = 'Active customers'

In [21]:
df.loc[df['max_order'] < 5, 'activity_flag'] = 'Low-activity customers'

In [22]:
df['activity_flag'].value_counts()

activity_flag
Active customers          30993489
Low-activity customers     1441570
Name: count, dtype: int64

## 3.1 Subset containing active customers

In [23]:
# Create a subset of active customers

df_2 =  df[df['activity_flag'] == 'Active customers']

In [24]:
df_2.shape

(30993489, 37)

In [25]:
pd.options.display.max_rows = None

In [26]:
pd.options.display.max_columns = None

In [27]:
df_2.head(100)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,_merge,product_name,aisle_id,department_id,prices,_merge2,price_label,busiest_day,busiest_days,busiest_period_of_day,max_order,loyalty_flag,avg_price,spending_flag,median_days_prior,order_freq_flag,first_name,surname,gender,state,age,date_joined,nr_dependants,fam_status,income,_merge3,region,activity_flag
0,2539329,1,1,2,8,,196,1,0,both,Soda,77.0,7.0,9.0,both,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both,South,Active customers
1,2539329,1,1,2,8,,14084,2,0,both,Organic Unsweetened Vanilla Almond Milk,91.0,16.0,12.5,both,Mid-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both,South,Active customers
2,2539329,1,1,2,8,,12427,3,0,both,Original Beef Jerky,23.0,19.0,4.4,both,Low-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both,South,Active customers
3,2539329,1,1,2,8,,26088,4,0,both,Aged White Cheddar Popcorn,23.0,19.0,4.7,both,Low-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both,South,Active customers
4,2539329,1,1,2,8,,26405,5,0,both,XL Pick-A-Size Paper Towel Rolls,54.0,17.0,1.0,both,Low-range product,Regularly busy,Regularly busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both,South,Active customers
5,2398795,1,2,3,7,15.0,196,1,1,both,Soda,77.0,7.0,9.0,both,Mid-range product,Regularly busy,Least busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both,South,Active customers
6,2398795,1,2,3,7,15.0,10258,2,0,both,Pistachios,117.0,19.0,3.0,both,Low-range product,Regularly busy,Least busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both,South,Active customers
7,2398795,1,2,3,7,15.0,12427,3,1,both,Original Beef Jerky,23.0,19.0,4.4,both,Low-range product,Regularly busy,Least busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both,South,Active customers
8,2398795,1,2,3,7,15.0,13176,4,0,both,Bag of Organic Bananas,24.0,4.0,10.3,both,Mid-range product,Regularly busy,Least busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both,South,Active customers
9,2398795,1,2,3,7,15.0,26088,5,1,both,Aged White Cheddar Popcorn,23.0,19.0,4.7,both,Low-range product,Regularly busy,Least busy,Average orders,10,New customer,6.367797,Low spender,20.5,Non-frequent customer,Linda,Nguyen,Female,Alabama,31,2/17/2019,3,married,40423,both,South,Active customers


## 3.2 Export data

In [28]:
df.shape

(32435059, 37)

In [29]:
df.columns

Index(['order_id', 'user_id', 'order_number', 'orders_day_of_week',
       'order_hour_of_day', 'days_since_prior_order', 'product_id',
       'add_to_cart_order', 'reordered', '_merge', 'product_name', 'aisle_id',
       'department_id', 'prices', '_merge2', 'price_label', 'busiest_day',
       'busiest_days', 'busiest_period_of_day', 'max_order', 'loyalty_flag',
       'avg_price', 'spending_flag', 'median_days_prior', 'order_freq_flag',
       'first_name', 'surname', 'gender', 'state', 'age', 'date_joined',
       'nr_dependants', 'fam_status', 'income', '_merge3', 'region',
       'activity_flag'],
      dtype='object')

In [30]:
# Export final data

df.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'orders_products_all.pkl'))

In [31]:
# Export data with exclusions

df_2.to_pickle(os.path.join(path, '02 Data','Prepared Data', 'orders_products_cust_active.pkl'))