# Excel report on IC analysis

# Table of contents
## 1. Setting up
## 2. Addressing data security concerns (PII)
## 3. Flag for regional segmentation of data
## 4. Crosstab of region and spending columns
## 5. Exclusion flag based on order activity

# 1. Setting up

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

In [2]:
# creating path for later import/export

path = r"C:\Users\Anwender\Documents\07-2023 Instacart Basket Analysis"

In [3]:
# importing df

df = pd.read_pickle(os.path.join(path, "02 Data", "Prepared Data", "orders_products_all.pkl"))

# 2. Addressing data security concerns:

In [None]:
# As the dataset contains information on customers full name, I would drop those columns along with the "date_joined", "_merge" and "aisle_id" columns.

df=df.drop(["First Name", "Surname", "_merge" ,"aisle_id", "date_joined"], axis=1)

# 3. Creating a flag for the regional segmentation of the data

In [12]:
# defining Northeastern region

northeast = ["Maine", "New Hampshire", "Vermont", "Massachusetts", "Rhode Island", "Connecticut", "New York", "Pennsylvania", "New Jersey"]

In [13]:
# creating new column Region by inserting northeast with loc.isin()

df.loc[df["State"].isin(northeast), "Region"] = "Northeast"

In [6]:
# checking output

df.loc[df["State"].isin(["Vermont"])]

Unnamed: 0,user_id,First Name,Surname,Gender,State,Age,date_joined,number_of_dependents,fam_status,income,...,busiest_days,busiest_period_of_day,max_order,loyalty_flag,average_spending,spending_flag,median_days_previous_order,order_frequency,_merge,Region
2714,17335,Patrick,Fowler,Male,Vermont,65,1/1/2017,1,married,34514,...,Regularly busy,Average orders,4,New customer,5,Low spender,7.0,Frequent customer,both,Northeast
2715,17335,Patrick,Fowler,Male,Vermont,65,1/1/2017,1,married,34514,...,Regularly busy,Average orders,4,New customer,5,Low spender,7.0,Frequent customer,both,Northeast
2716,17335,Patrick,Fowler,Male,Vermont,65,1/1/2017,1,married,34514,...,Regularly busy,Average orders,4,New customer,5,Low spender,7.0,Frequent customer,both,Northeast
2717,17335,Patrick,Fowler,Male,Vermont,65,1/1/2017,1,married,34514,...,Busiest days,Average orders,4,New customer,5,Low spender,7.0,Frequent customer,both,Northeast
2718,17335,Patrick,Fowler,Male,Vermont,65,1/1/2017,1,married,34514,...,Busiest days,Most orders,4,New customer,5,Low spender,7.0,Frequent customer,both,Northeast
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32398027,65428,Gary,Arnold,Male,Vermont,35,4/1/2020,0,single,65646,...,Regularly busy,Average orders,4,New customer,8,Low spender,15.0,Regular customer,both,Northeast
32398028,65428,Gary,Arnold,Male,Vermont,35,4/1/2020,0,single,65646,...,Regularly busy,Most orders,4,New customer,8,Low spender,15.0,Regular customer,both,Northeast
32398029,65428,Gary,Arnold,Male,Vermont,35,4/1/2020,0,single,65646,...,Busiest days,Most orders,4,New customer,8,Low spender,15.0,Regular customer,both,Northeast
32398030,65428,Gary,Arnold,Male,Vermont,35,4/1/2020,0,single,65646,...,Regularly busy,Average orders,4,New customer,8,Low spender,15.0,Regular customer,both,Northeast


In [14]:
# defining other regions

midwest = ["Wisconsin", "Michigan", "Illinois", "Indiana", "Ohio", "North Dakota", "South Dakota", "Nebraska", "Kansas", 
           "Minnesota", "Iowa", "Missouri"]

In [15]:
south = ["Delaware", "Maryland", "District of Columbia", "Virginia", "West Virginia", "North Carolina", "South Carolina", 
         "Georgia", "Florida", "Kentucky", "Tennessee", "Mississippi", "Alabama", "Oklahoma", "Texas", "Arkansas", "Louisiana"]

In [16]:
west = ["Idaho", "Montana", "Wyoming", "Nevada", "Utah", "Colorado", "Arizona", "New Mexico", "Alaska", "Washington", "Oregon",
        "California", "Hawaii"]

In [17]:
# inserting midwest, south and west into Region

df.loc[df["State"].isin(midwest), "Region"] = "Midwest"

In [18]:
df.loc[df["State"].isin(south), "Region"] = "South"

In [19]:
df.loc[df["State"].isin(west), "Region"] = "West"

In [20]:
# checking for missing values

df["Region"].value_counts(dropna = False)

South        10791885
West          8292913
Midwest       7597325
Northeast     5722736
Name: Region, dtype: int64

# 4. Examining correlation between region and spending

In [14]:
crosstab = pd.crosstab(df["Region"], df["spending_flag"])

In [15]:
crosstab

spending_flag,High spender,Low spender
Region,Unnamed: 1_level_1,Unnamed: 2_level_1
Midwest,155975,7441350
Northeast,108225,5614511
South,209691,10582194
West,160354,8132559


#### Every region sees predominantly Low spenders, the ratio of Low spenders to High Spenders is the same everywhere.
#### There is no difference in spending habits in regards of U.S. regions.

# 5. Creating an exclusion flag for customers, who ordered less than 5 times.

In [21]:
df.loc[df["max_order"] >= 5, "order_activity"] = "High activity"

In [22]:
df.loc[df["max_order"] < 5, "order_activity"] = "Low activity"

In [24]:
df.head()

Unnamed: 0,user_id,First Name,Surname,Gender,State,Age,date_joined,number_of_dependents,fam_status,income,...,busiest_period_of_day,max_order,loyalty_flag,average_spending,spending_flag,median_days_previous_order,order_frequency,_merge,Region,order_activity
0,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,Average orders,8,New customer,7,Low spender,19.0,Regular customer,both,Midwest,High activity
1,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,Most orders,8,New customer,7,Low spender,19.0,Regular customer,both,Midwest,High activity
2,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,Most orders,8,New customer,7,Low spender,19.0,Regular customer,both,Midwest,High activity
3,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,Average orders,8,New customer,7,Low spender,19.0,Regular customer,both,Midwest,High activity
4,26711,Deborah,Esquivel,Female,Missouri,48,1/1/2017,3,married,165665,...,Most orders,8,New customer,7,Low spender,19.0,Regular customer,both,Midwest,High activity


In [23]:
# Creating a subset of data with high activity status customers only

df_high_activity = df.loc[df["order_activity"].isin(["High activity"])]

In [24]:
# Checking shape

df_high_activity.shape

(30964564, 34)

In [25]:
df.shape

(32404859, 34)

In [29]:
# checking for success

df_high_activity["order_activity"].value_counts(dropna = False)

High activity    30964564
Name: order_activity, dtype: int64

In [31]:
# exporting new df as "orders_products_high.pkl"

df_high_activity.to_pickle(os.path.join(path, "02 Data", "Prepared Data", "orders_products_high.pkl"))