## Import Modules

In [1]:
# Set paths
import os
from imp import reload

# Data manipulation
import pandas as pd
import numpy as np
from scipy import stats

# Date manipulation
import datetime as dt
import calendar
calendar.setfirstweekday(calendar.SUNDAY) 

# Custom package for data preprocessing
import mytools as mt 

# Set notebook options
pd.set_option("display.max_columns", 150)
pd.set_option("display.max_rows", 10000)
pd.set_option("display.float_format", lambda x: '%.2f' % x)

# Pretty display of multiple functions in a cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### File Location

In [2]:
# Data paths
import filepaths

## Load Data

In [3]:
df_master = pd.read_csv(filepaths.master_file_data, sep=',')

df_transactions = pd.read_csv(filepaths.interim_transactions_data, sep=',')

In [4]:
mt.check_unique_no(df_master, ['customer_id'])
df_master.shape
df_master.head(2)

Data has 5647 unique customer_id


(5647, 161)

Unnamed: 0,customer_id,dob,gender,customer_status,city,state_code,state,region,division,conversion_date,last_purchase_date,number_of_unique_purchase_days,duration.min,duration.max,duration.mean,duration.last,trans_id.count,qty.min,qty.max,qty.sum,cogs.sum,total_amt.sum,assessment_date,account_age.days,account_age.years,account_age.years.group,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,mean.Flagship store,mean.MBR,mean.TeleShop,mean.e-Shop,Flagship_store_spend.prop,MBR_spend.prop,TeleShop_spend.prop,e-Shop.prop,count.Bags,count.Books,count.Clothing,count.Electronics,count.Footwear,count.Home and kitchen,sum.Bags,sum.Books,sum.Clothing,sum.Electronics,sum.Footwear,sum.Home and kitchen,mean.Bags,mean.Books,mean.Clothing,mean.Electronics,mean.Footwear,mean.Home and kitchen,Bags.prop,Books.prop,Clothing.prop,Electronics.prop,Footwear.prop,Home and kitchen.prop,count.Academic,count.Audio and video,count.Bath,count.Cameras,count.Children,count.Comics,count.Computers,count.DIY,count.Fiction,...,sum.Bath,sum.Cameras,sum.Children,sum.Comics,sum.Computers,sum.DIY,sum.Fiction,sum.Furnishing,sum.Kids,sum.Kitchen,sum.Mens,sum.Mobiles,sum.Non-Fiction,sum.Personal Appliances,sum.Tools,sum.Women,mean.Academic,mean.Audio and video,mean.Bath,mean.Cameras,mean.Children,mean.Comics,mean.Computers,mean.DIY,mean.Fiction,mean.Furnishing,mean.Kids,mean.Kitchen,mean.Mens,mean.Mobiles,mean.Non-Fiction,mean.Personal Appliances,mean.Tools,mean.Women,Academic.prop,Audio and video.prop,Bath.prop,Cameras.prop,Children.prop,Comics.prop,Computers.prop,DIY.prop,Fiction.prop,Furnishing.prop,Kids.prop,Kitchen.prop,Mens.prop,Mobiles.prop,Non-Fiction.prop,Personal Appliances.prop,Tools.prop,Women.prop,conversion_date.year,conversion_date.month_num,conversion_date.month,conversion_date.year_month,biological_age.actual,biological_age,biological_age.group,repeat_purchaser,returned_item_before,1_day_amt.avg,7_day_amt.avg,30_day_amt.avg,1_day_num.avg,7_day_num.avg,30_day_num.avg,recency,frequency,monetary,r_quartile,f_quartile,m_quartile,RFM_Score,rfm_customer_segment
0,268408,1970-02-01,M,converted,Pittsburgh,PA,Pennsylvania,Northeast,Middle Atlantic,2011-12-07,2014-01-13,11.0,4.0,323.0,99.18,323.0,11.0,-5.0,5.0,33.0,2526.93,24272.43,2014-12-02,1091.0,2.99,02_03,5.0,1.0,3.0,2.0,6217.84,6491.88,3894.02,7668.7,1243.57,6491.88,1298.01,3834.35,0.26,0.27,0.16,0.32,3.0,1.0,1.0,2.0,2.0,2.0,1064.11,1033.17,890.63,7668.7,7526.15,6089.66,354.7,1033.17,890.63,3834.35,3763.08,3044.83,0.04,0.04,0.04,0.32,0.31,0.25,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,4795.7,0.0,0.0,0.0,0.0,1033.17,0.0,1034.28,6089.66,6491.88,0.0,0.0,0.0,0.0,1954.74,0.0,2873.0,0.0,4795.7,0.0,0.0,0.0,0.0,1033.17,0.0,1034.28,3044.83,2163.96,0.0,0.0,0.0,0.0,977.37,0.0,0.12,0.0,0.2,0.0,0.0,0.0,0.0,0.04,0.0,0.04,0.25,0.27,0.0,0.0,0.0,0.0,0.08,2011.0,12.0,12.0_Dec,2011.0_12.0,44.83,45.0,40_45,yes,yes,22.25,155.74,667.44,0.03,0.21,0.91,323.0,10.0,23112.18,1.0,1.0,1.0,111.0,best customers
1,269696,1970-07-01,F,converted,Dallas,TX,Texas,South,West South Central,2011-09-18,2012-08-04,3.0,111.0,850.0,390.33,850.0,3.0,-4.0,4.0,3.0,1043.91,4488.51,2014-12-02,1171.0,3.21,03_04,0.0,3.0,0.0,0.0,0.0,4488.51,0.0,0.0,0.0,1496.17,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,4488.51,0.0,0.0,0.0,0.0,0.0,1496.17,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4488.51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4488.51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2011.0,9.0,9.0_Sep,2011.0_9.0,44.42,44.0,40_45,yes,yes,3.83,26.83,114.99,0.0,0.02,0.08,961.0,2.0,1239.81,4.0,4.0,4.0,444.0,lost cheap customers


In [5]:
mt.check_unique_no(df_transactions, ['customer_id'])
df_transactions.shape
df_transactions.head(2)

Data has 5506 unique customer_id


(20996, 32)

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,cogs,total_amt,store_type,returned_item,drop_record,prod_cat,prod_subcat,counter,assessment_date,duration,trans_date.day,trans_date.month_num,trans_date.year,trans_date.year_month,trans_date.hour,trans_date.weekday,trans_date.week_of_year,trans_date.month,trans_date.weekday_num,trans_date.week_of_month,first_purchase_date,first_purchase.cohort,purchase_date.cohort,customer_type.period,returned_item_before
0,25890929042,266783,2011-09-24,1,2,-4,-1321,554.82,-5838.82,e-Shop,yes,no,Footwear,Mens,1,2014-12-02,393.0,24,9,2011,2011_09,0,07_Saturday,38,09_Sep,7,month.week4,2011-09-23,2011_09,2011_09,existing,yes
1,98477711300,266783,2012-10-21,4,1,3,93,29.3,308.3,TeleShop,no,yes,Clothing,Mens,1,2014-12-02,122.0,21,10,2012,2012_10,0,01_Sunday,42,10_Oct,1,month.week3,2011-09-23,2011_09,2012_10,existing,no


## Sales Performance Metrics

In [6]:
df_transactions['trans_date'] = pd.to_datetime(df_transactions['trans_date'])

df_transactions['trans_date'].sort_values().min()
df_transactions['trans_date'].sort_values().max()

Timestamp('2011-01-02 00:00:00')

Timestamp('2014-12-02 00:00:00')

In [7]:
df = df_transactions[['customer_id', 'trans_date', 'trans_date.year_month', 'total_amt', 'qty']] 
df = df.sort_values(['customer_id', 'trans_date'])
df.set_index(['trans_date.year_month'], inplace=True)

mt.check_unique_no(df, ['customer_id'])
df.shape
df.head(2)

Data has 5506 unique customer_id


(20996, 4)

Unnamed: 0_level_0,customer_id,trans_date,total_amt,qty
trans_date.year_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2011_09,266783,2011-09-24,-5838.82,-4
2012_10,266783,2012-10-21,308.3,3


### Monthly Revenue

In [8]:
df_revenue = df.groupby(['trans_date.year_month'])['total_amt'].sum().reset_index()

mt.check_unique_no(df_revenue, ['trans_date.year_month'])
df_revenue.shape
df_revenue.head(2)

Data has 48 unique trans_date.year_month


(48, 2)

Unnamed: 0,trans_date.year_month,total_amt
0,2011_01,756653.17
1,2011_02,934599.06


### Monthly Revenue Growth Rate

In [9]:
df_revenue['monthly_growth'] = df_revenue['total_amt'].pct_change()
df_revenue.dropna(axis=0, how='any', inplace=True)

mt.check_unique_no(df_revenue, ['trans_date.year_month'])
df_revenue.shape
df_revenue.head(2)

Data has 47 unique trans_date.year_month


(47, 3)

Unnamed: 0,trans_date.year_month,total_amt,monthly_growth
1,2011_02,934599.06,0.24
2,2011_03,1213495.53,0.3


### Monthly Active Customers

In [10]:
df_active_customers = df.groupby('trans_date.year_month')['customer_id'].nunique().reset_index()

mt.check_unique_no(df_active_customers, ['trans_date.year_month'])
df_active_customers.shape
df_active_customers.head(2)

Data has 48 unique trans_date.year_month


(48, 2)

Unnamed: 0,trans_date.year_month,customer_id
0,2011_01,315
1,2011_02,472


### Monthly Order Count

In [11]:
df_active_customers = df.groupby('trans_date.year_month')['qty'].sum().reset_index()

mt.check_unique_no(df_active_customers, ['trans_date.year_month'])
df_active_customers.shape
df_active_customers.head(2)

Data has 48 unique trans_date.year_month


(48, 2)

Unnamed: 0,trans_date.year_month,qty
0,2011_01,856
1,2011_02,1099


### Average Revenue Per Order

In [12]:
df_avg_revenue_per_order = df.groupby('trans_date.year_month')['total_amt'].mean().reset_index()

mt.check_unique_no(df_avg_revenue_per_order, ['trans_date.year_month'])
df_avg_revenue_per_order.shape
df_avg_revenue_per_order.head(2)

Data has 48 unique trans_date.year_month


(48, 2)

Unnamed: 0,trans_date.year_month,total_amt
0,2011_01,2342.58
1,2011_02,1884.27


## Customer Growth Metrics
### New vs Existing Customers

In [13]:
df_new_vs_existing_customers = df.copy()
df_first_purchase_date = df_new_vs_existing_customers.groupby(['customer_id']).agg({'trans_date':'first'})
df_first_purchase_date.rename(columns={'trans_date':'first_purchase_date'}, inplace=True)
df_new_vs_existing_customers = pd.merge(df_new_vs_existing_customers, df_first_purchase_date, on='customer_id', how='left')
df_new_vs_existing_customers['first_purchase.cohort'] = df_new_vs_existing_customers['first_purchase_date'].dt.year.map(str) + "_" + df_new_vs_existing_customers['first_purchase_date'].dt.month.map("{:02}".format)
df_new_vs_existing_customers['purchase_date.cohort'] = df_new_vs_existing_customers['trans_date'].dt.year.map(str) + "_" + df_new_vs_existing_customers['trans_date'].dt.month.map("{:02}".format)
df_earliest_purchase_date = df_first_purchase_date.copy()
df_earliest_purchase_date.rename(columns={'first_purchase_date':'trans_date'}, inplace=True)
df_earliest_purchase_date['customer_type'] = 'new'
df_new_vs_existing_customers = pd.merge(df_new_vs_existing_customers, df_earliest_purchase_date, on=['customer_id', 'trans_date'], how='left')
df_new_vs_existing_customers['customer_type']  = np.where(df_new_vs_existing_customers['customer_type'].isnull(), 'existing', df_new_vs_existing_customers['customer_type'])

In [14]:
mt.check_unique_no(df_new_vs_existing_customers, ['customer_id'])
df_new_vs_existing_customers.shape
df_new_vs_existing_customers.head(2)

Data has 5506 unique customer_id


(20996, 8)

Unnamed: 0,customer_id,trans_date,total_amt,qty,first_purchase_date,first_purchase.cohort,purchase_date.cohort,customer_type
0,266783,2011-09-24,-5838.82,-4,2011-09-24,2011_09,2011_09,new
1,266783,2012-10-21,308.3,3,2011-09-24,2011_09,2012_10,existing


In [15]:
df_new_vs_existing_customers['returned_first_purchase'] = np.where(((df_new_vs_existing_customers['customer_type'] == 'new') & (df_new_vs_existing_customers['qty'] <0)), 'yes', 'no')

In [16]:
df_new_vs_existing_customers['returned_first_purchase'].value_counts(dropna=False)

no     20464
yes      532
Name: returned_first_purchase, dtype: int64

In [17]:
df_new_vs_existing_customers[df_new_vs_existing_customers['returned_first_purchase'] == 'yes']['customer_id'].nunique()

529

### New Customer Ratio

In [18]:
#create a dataframe that shows new user ratio - we also need to drop NA values (first month new user ratio is 0)
df_new_vs_existing_customers['trans_date'] = pd.to_datetime(df_new_vs_existing_customers['trans_date'])
df_new_vs_existing_customers['trans_date.year_month'] = df_new_vs_existing_customers['trans_date'].dt.year.map(str) + "_" + df_new_vs_existing_customers['trans_date'].dt.month.map("{:02}".format)
df_user_ratio = df_new_vs_existing_customers.query("customer_type == 'new'").groupby(['trans_date.year_month'])['customer_id'].nunique()/df_new_vs_existing_customers.query("customer_type == 'existing'").groupby(['trans_date.year_month'])['customer_id'].nunique() 
df_user_ratio = df_user_ratio.reset_index()
df_user_ratio = df_user_ratio.dropna()

df_user_ratio.head(2)

Unnamed: 0,trans_date.year_month,customer_id
0,2011_01,39.38
1,2011_02,9.29


## Customer Profitability
### Power User Curve

In [19]:
df_purchase_frequency_distribution = df_master.groupby(['frequency'], as_index=False).agg({'customer_id':'count'})

mt.check_unique_no(df_purchase_frequency_distribution, ['customer_id'])
df_purchase_frequency_distribution.shape
df_purchase_frequency_distribution.head(2)

Data has 11 unique customer_id


(11, 2)

Unnamed: 0,frequency,customer_id
0,1.0,508
1,2.0,951


### Whale Curve

In [20]:
df_master = df_master.sort_values('30_day_amt.avg',ascending=False)
df_master['30_day_amt.avg.cumsum'] = df_master['30_day_amt.avg'].cumsum()
df_master = df_master.sort_values('30_day_amt.avg.cumsum',ascending=True)
df_master['cum_count'] = df_master.reset_index().index+1
df_master['30_day_amt.avg.cumsum_pct'] = (df_master['30_day_amt.avg.cumsum']/df_master['30_day_amt.avg.cumsum'].max())#*100
df_master['cum_count_pct'] = (df_master['cum_count']/df_master['cum_count'].max())#*100

mt.check_unique_no(df_master, ['customer_id'])
df_master.shape
df_master.head(2)

Data has 5647 unique customer_id


(5647, 165)

Unnamed: 0,customer_id,dob,gender,customer_status,city,state_code,state,region,division,conversion_date,last_purchase_date,number_of_unique_purchase_days,duration.min,duration.max,duration.mean,duration.last,trans_id.count,qty.min,qty.max,qty.sum,cogs.sum,total_amt.sum,assessment_date,account_age.days,account_age.years,account_age.years.group,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,mean.Flagship store,mean.MBR,mean.TeleShop,mean.e-Shop,Flagship_store_spend.prop,MBR_spend.prop,TeleShop_spend.prop,e-Shop.prop,count.Bags,count.Books,count.Clothing,count.Electronics,count.Footwear,count.Home and kitchen,sum.Bags,sum.Books,sum.Clothing,sum.Electronics,sum.Footwear,sum.Home and kitchen,mean.Bags,mean.Books,mean.Clothing,mean.Electronics,mean.Footwear,mean.Home and kitchen,Bags.prop,Books.prop,Clothing.prop,Electronics.prop,Footwear.prop,Home and kitchen.prop,count.Academic,count.Audio and video,count.Bath,count.Cameras,count.Children,count.Comics,count.Computers,count.DIY,count.Fiction,...,sum.Computers,sum.DIY,sum.Fiction,sum.Furnishing,sum.Kids,sum.Kitchen,sum.Mens,sum.Mobiles,sum.Non-Fiction,sum.Personal Appliances,sum.Tools,sum.Women,mean.Academic,mean.Audio and video,mean.Bath,mean.Cameras,mean.Children,mean.Comics,mean.Computers,mean.DIY,mean.Fiction,mean.Furnishing,mean.Kids,mean.Kitchen,mean.Mens,mean.Mobiles,mean.Non-Fiction,mean.Personal Appliances,mean.Tools,mean.Women,Academic.prop,Audio and video.prop,Bath.prop,Cameras.prop,Children.prop,Comics.prop,Computers.prop,DIY.prop,Fiction.prop,Furnishing.prop,Kids.prop,Kitchen.prop,Mens.prop,Mobiles.prop,Non-Fiction.prop,Personal Appliances.prop,Tools.prop,Women.prop,conversion_date.year,conversion_date.month_num,conversion_date.month,conversion_date.year_month,biological_age.actual,biological_age,biological_age.group,repeat_purchaser,returned_item_before,1_day_amt.avg,7_day_amt.avg,30_day_amt.avg,1_day_num.avg,7_day_num.avg,30_day_num.avg,recency,frequency,monetary,r_quartile,f_quartile,m_quartile,RFM_Score,rfm_customer_segment,30_day_amt.avg.cumsum,cum_count,30_day_amt.avg.cumsum_pct,cum_count_pct
3951,274213,1986-03-27,M,converted,Houston,TX,Texas,South,West South Central,2014-12-01,2014-12-01,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,279.72,2943.72,2014-12-02,1.0,0.0,00_01,0.0,1.0,0.0,0.0,0.0,2943.72,0.0,0.0,0.0,2943.72,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2943.72,0.0,0.0,0.0,0.0,0.0,2943.72,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2943.72,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2943.72,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2014.0,12.0,12.0_Dec,2014.0_12.0,28.69,29.0,25_30,no,no,2943.72,20606.04,88311.6,2.0,14.0,60.0,1.0,1.0,2943.72,1.0,4.0,4.0,144.0,other,88311.6,1,0.06,0.0
1700,268801,1977-04-01,M,converted,Philadelphia,PA,Pennsylvania,Northeast,Middle Atlantic,2014-10-02,2014-10-02,1.0,61.0,61.0,61.0,61.0,1.0,4.0,4.0,4.0,359.94,3787.94,2014-12-02,61.0,0.17,00_01,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3787.94,0.0,0.0,0.0,3787.94,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3787.94,0.0,0.0,0.0,0.0,0.0,3787.94,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3787.94,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3787.94,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2014.0,10.0,10.0_Oct,2014.0_10.0,37.67,38.0,35_40,no,no,62.1,434.68,1862.92,0.07,0.46,1.97,61.0,1.0,3787.94,1.0,4.0,3.0,143.0,other,90174.52,2,0.07,0.0


## Customer Retention
### Monthly Retention Curve

In [21]:
df_customer_count_per_cohort = df_new_vs_existing_customers.groupby(['first_purchase.cohort'], as_index=False).agg({'customer_id':pd.Series.nunique})
df_customer_count_per_cohort.rename(columns={'customer_id':'unique_customer.count'}, inplace=True)

df_customer_count_per_cohort['unique_customer.cumulative_count'] = df_customer_count_per_cohort['unique_customer.count'].cumsum()

mt.check_unique_no(df_customer_count_per_cohort, ['first_purchase.cohort'])
df_customer_count_per_cohort.shape
df_customer_count_per_cohort.head(2)

Data has 47 unique first_purchase.cohort


(47, 3)

Unnamed: 0,first_purchase.cohort,unique_customer.count,unique_customer.cumulative_count
0,2011_01,315,315
1,2011_02,446,761


In [22]:
#identify which users are active by looking at their revenue per month
df_user_purchase = df.groupby(['customer_id','trans_date.year_month'])['total_amt'].sum().reset_index()

#create retention matrix with crosstab
df_retention = pd.crosstab(df_user_purchase['customer_id'], df_user_purchase['trans_date.year_month']).reset_index()
mt.check_unique_no(df_retention, ['customer_id'])
df_retention.shape
df_retention.head(2)

Data has 5506 unique customer_id


(5506, 49)

trans_date.year_month,customer_id,2011_01,2011_02,2011_03,2011_04,2011_05,2011_06,2011_07,2011_08,2011_09,2011_10,2011_11,2011_12,2012_01,2012_02,2012_03,2012_04,2012_05,2012_06,2012_07,2012_08,2012_09,2012_10,2012_11,2012_12,2013_01,2013_02,2013_03,2013_04,2013_05,2013_06,2013_07,2013_08,2013_09,2013_10,2013_11,2013_12,2014_01,2014_02,2014_03,2014_04,2014_05,2014_06,2014_07,2014_08,2014_09,2014_10,2014_11,2014_12
0,266783,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,266784,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [23]:
#create an array of dictionary which keeps Retained & Total User count for each month
months = df_retention.columns[0:] #2:
retention_array = []
for i in range(len(months)-1):
    retention_data = {}
    selected_month = months[i+1]
    prev_month = months[i]
    retention_data['trans_date.year_month'] = int(selected_month)
    retention_data['total_user_count'] = df_retention[selected_month].sum()
    retention_data['retained_user_count'] = df_retention[(df_retention[selected_month]>0) & (df_retention[prev_month]>0)][selected_month].sum()
    retention_array.append(retention_data)

In [24]:
#convert the array to dataframe and calculate Retention Rate
df_retention = pd.DataFrame(retention_array)
df_retention['new_customers.count'] = df_retention['total_user_count'] - df_retention['retained_user_count']

In [25]:
df_customer_count_per_cohort['first_purchase.cohort'].dtype
df_retention['trans_date.year_month'] = df_retention['trans_date.year_month'].astype('O')
df_retention['trans_date.year_month'].dtype

dtype('O')

dtype('O')

In [26]:
df_retention = pd.merge(df_customer_count_per_cohort, df_retention, left_on='first_purchase.cohort', right_on='trans_date.year_month', how='left')
df_retention = df_retention.drop(['trans_date.year_month'], axis=1)
df_retention['retention.rate'] = df_retention['retained_user_count']/df_retention['unique_customer.cumulative_count']
df_retention = df_retention.rename(columns={'first_purchase.cohort':'registration_cohort', 'unique_customer.count':'total_customers_registered.count', 
                                                                             'unique_customer.cumulative_count':'total_customers_registered.cumulative_count'})
df_retention.head()
df_retention.tail()

Unnamed: 0,registration_cohort,total_customers_registered.count,total_customers_registered.cumulative_count,total_user_count,retained_user_count,new_customers.count,retention.rate
0,2011_01,315,315,,,,
1,2011_02,446,761,,,,
2,2011_03,476,1237,,,,
3,2011_04,414,1651,,,,
4,2011_05,333,1984,,,,


Unnamed: 0,registration_cohort,total_customers_registered.count,total_customers_registered.cumulative_count,total_user_count,retained_user_count,new_customers.count,retention.rate
42,2014_07,1,5502,,,,
43,2014_08,1,5503,,,,
44,2014_10,1,5504,,,,
45,2014_11,1,5505,,,,
46,2014_12,1,5506,,,,


In [27]:
df_sales_cohort = df_sub.groupby(['SalesYearMonth', 'ProduceSubGroup']).agg(aggregation)

NameError: name 'df_sub' is not defined

#### Formulas
https://towardsdatascience.com/data-driven-growth-with-python-part-1-know-your-metrics-812781e66a5b
https://medium.com/data-science-at-microsoft/retain-more-customers-by-understanding-churn-ae31d9b2aa2b
https://medium.com/data-science-at-microsoft/calculating-customer-lifetime-value-a-python-solution-85aa55754b33

In [None]:
m

In [None]:
df_transactions['trans_date'] = pd.to_datetime(df_transactions['trans_date'])

df_transactions['trans_date'].sort_values().min()
df_transactions['trans_date'].sort_values().max()

In [None]:
EXTRACTION_DATE = dt.datetime(2014,12,2)

In [None]:
def categorize_customers(rfm_score):
    x = rfm_score
    if (x == '111'):
        return 'best customers'
    elif (x == '311'):
        return 'almost lost'
    elif (x == '411'):
        return 'lost customers'
    elif (x == '444'):
        return 'lost cheap customers'
    else:
        return 'other'  

In [None]:
df_rfm= df_transactions.groupby('customer_id', as_index=False).agg({'trans_date': lambda date: (EXTRACTION_DATE - date.max()).days,
                                    'trans_id': lambda num: len(num),
                                    'total_amt': lambda price: price.sum()})

In [None]:
df_rfm.rename(columns={'trans_date':'recency', 'trans_id':'frequency', 'total_amt':'monetary'}, inplace=True)

In [None]:
df_rfm['r_quartile'] = pd.qcut(df_rfm['recency'], 4, ['1','2','3','4'])
df_rfm['f_quartile'] = pd.qcut(df_rfm['frequency'], 4, ['4','3','2','1'])
df_rfm['m_quartile'] = pd.qcut(df_rfm['monetary'], 4, ['4','3','2','1'])

In [None]:
df_rfm['RFM_Score'] = df_rfm.r_quartile.astype(str)+ df_rfm.f_quartile.astype(str) + df_rfm.m_quartile.astype(str)

mt.check_unique_no(df_rfm, ['customer_id'])
df_rfm.shape

In [None]:
mt.check_unique_no(df_master, ['customer_id'])
df_master.shape

In [None]:
df_master = pd.merge(df_master, df_rfm, on='customer_id', how='left')

In [None]:
df_master['rfm_customer_segment'] = df_master['RFM_Score'].apply(categorize_customers)    

In [None]:
df_master['rfm_customer_segment'] = np.where(((df_master['rfm_customer_segment'] == 'other') & 
                                                (df_master['m_quartile'] == '1')), 'big spender',
                                           np.where(((df_master['rfm_customer_segment'] == 'other') & 
                                                     (df_master['f_quartile'] == '1')), 'loyal customers',
                                                   df_master['rfm_customer_segment']))

df_master[['customer_id', 'rfm_customer_segment']].head()

In [None]:
df_master['rfm_customer_segment'].value_counts(dropna=False)

In [None]:
mt.check_unique_no(df_master, ['customer_id'])
df_master.shape

mt.check_unique_no(df_transactions, ['customer_id'])
df_transactions.shape

## Export Data

In [None]:
df_master.to_csv(filepaths.master_file_data, index=False)
df_transactions.to_csv(filepaths.interim_transactions_data, index=False)