## Import Modules

In [1]:
# Set paths
import os
# `imp` is deprecated since Python 3.4 and removed in Python 3.12;
# `importlib.reload` is the supported equivalent and keeps `reload` in scope.
from importlib import reload

# Data manipulation
import pandas as pd
import numpy as np
from scipy import stats

# Date manipulation
import datetime as dt
import calendar
# Treat Sunday as the first day of the week for calendar-based helpers.
calendar.setfirstweekday(calendar.SUNDAY)

# Custom package for data preprocessing
import mytools as mt

# Set notebook options: show wide/long frames in full, floats with two decimals.
pd.set_option("display.max_columns", 150)
pd.set_option("display.max_rows", 10000)
pd.set_option("display.float_format", lambda x: '%.2f' % x)

# Pretty display of multiple functions in a cell:
# every top-level expression in a cell is rendered, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### File Location

In [2]:
# Data paths
import filepaths

## Load Data

In [3]:
# Read the customer-level master table and the transaction-level table.
# Both files are comma-separated, which is the read_csv default.
df_master = pd.read_csv(filepaths.master_file_data)
df_transactions = pd.read_csv(filepaths.derived_transactions_data_v1)

In [4]:
# Peek at the first two transaction rows to confirm the file loaded with the expected columns.
df_transactions.head(2)

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,tax,total_amt,store_type,prod_cat,prod_subcat,counter,product,returned_item,assessment_date,duration,trans_date.day,trans_date.month_num,trans_date.year,trans_date.year_month,trans_date.hour,trans_date.weekday,trans_date.week_of_year,trans_date.month,trans_date.weekday_num,trans_date.week_of_month,conversion_date,conversion_date_cohort,purchase_date_cohort
0,25890929042,266783,2011-09-23,1,2,0,0,0.0,0.0,e-Shop,Footwear,Mens,0,Footwear_Mens,yes,2014-12-02,394.0,23,9,2011,2011_09,0,06_Friday,38,09_Sep,6,month.week4,2011-09-23,2011_09,2011_09
1,98477711300,266783,2012-10-21,4,1,3,93,29.3,308.3,TeleShop,Clothing,Mens,1,Clothing_Mens,no,2014-12-02,122.0,21,10,2012,2012_10,0,01_Sunday,42,10_Oct,1,month.week3,2011-09-23,2011_09,2012_10


In [5]:
# NOTE(review): disabled filter, kept for reference only. It relies on a
# `drop_record` column that is not among the df_transactions columns displayed
# in this notebook, so that column must be restored before re-enabling it.
# df_transactions = df_transactions[~((df_transactions['returned_item'] == 'yes') & (df_transactions['drop_record'] == 'yes'))]

# Sanity checks: report the unique customer_id count, the table dimensions,
# and the first two rows.
mt.check_unique_no(df_transactions, ['customer_id'])
df_transactions.shape
df_transactions.head(2)

Data has 5506 unique customer_id


(20876, 30)

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,tax,total_amt,store_type,prod_cat,prod_subcat,counter,product,returned_item,assessment_date,duration,trans_date.day,trans_date.month_num,trans_date.year,trans_date.year_month,trans_date.hour,trans_date.weekday,trans_date.week_of_year,trans_date.month,trans_date.weekday_num,trans_date.week_of_month,conversion_date,conversion_date_cohort,purchase_date_cohort
0,25890929042,266783,2011-09-23,1,2,0,0,0.0,0.0,e-Shop,Footwear,Mens,0,Footwear_Mens,yes,2014-12-02,394.0,23,9,2011,2011_09,0,06_Friday,38,09_Sep,6,month.week4,2011-09-23,2011_09,2011_09
1,98477711300,266783,2012-10-21,4,1,3,93,29.3,308.3,TeleShop,Clothing,Mens,1,Clothing_Mens,no,2014-12-02,122.0,21,10,2012,2012_10,0,01_Sunday,42,10_Oct,1,month.week3,2011-09-23,2011_09,2012_10


In [6]:
# Parse the transaction dates (read from CSV as strings) into datetimes so
# that date arithmetic such as the recency calculation works downstream.
df_transactions['trans_date'] = pd.to_datetime(df_transactions['trans_date'])

# Date range covered by the transactions. min()/max() scan the column
# directly, so no prior sort is needed.
df_transactions['trans_date'].min()
df_transactions['trans_date'].max()

Timestamp('2011-01-02 00:00:00')

Timestamp('2014-12-02 00:00:00')

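## RFM Customer Segmentation

Each customer is scored on recency (days since last purchase), frequency (number of transactions) and monetary value (total amount spent). Each measure is split into quartiles labelled `1` (best) to `4` (worst), and the three labels are concatenated into an RFM score such as `111`.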

In [7]:
# Reference date against which recency (days since last purchase) is measured.
EXTRACTION_DATE = dt.datetime(year=2014, month=12, day=2)

In [8]:
def categorize_customers(rfm_score):
    """Translate an RFM score string into a named customer segment.

    Parameters
    ----------
    rfm_score : str
        Concatenated recency/frequency/monetary quartile labels, e.g. '111'.

    Returns
    -------
    str
        The segment name for the four explicitly named scores
        ('111', '311', '411', '444'); 'other' for anything else.
    """
    named_segments = {
        '111': 'best customers',
        '311': 'almost lost',
        '411': 'lost customers',
        '444': 'lost cheap customers',
    }
    # Any score without an explicit name falls into the catch-all bucket.
    return named_segments.get(rfm_score, 'other')

In [9]:
# Per-customer aggregation spec; the columns are renamed to
# recency / frequency / monetary in the following cell.
rfm_aggregations = {
    # Whole days between the extraction date and the customer's latest transaction.
    'trans_date': lambda dates: (EXTRACTION_DATE - dates.max()).days,
    # Number of transaction rows recorded for the customer.
    'trans_id': len,
    # Sum of the transaction amounts.
    'total_amt': lambda amounts: amounts.sum(),
}

df_rfm = (
    df_transactions
    .groupby('customer_id', as_index=False)
    .agg(rfm_aggregations)
)

In [10]:
# Give the aggregated columns their RFM names.
rfm_column_names = {
    'trans_date': 'recency',
    'trans_id': 'frequency',
    'total_amt': 'monetary',
}
df_rfm = df_rfm.rename(columns=rfm_column_names)

In [11]:
# Quartile labels are assigned in ascending bin order, so:
#   recency   -> smallest values (most recent purchase) are labelled '1'
#   frequency -> largest values are labelled '1'
#   monetary  -> largest values are labelled '1'
# i.e. '1' is always the best quartile and '4' the worst.
quartile_specs = (
    ('recency', 'r_quartile', ['1', '2', '3', '4']),
    ('frequency', 'f_quartile', ['4', '3', '2', '1']),
    ('monetary', 'm_quartile', ['4', '3', '2', '1']),
)
for source_col, quartile_col, bin_labels in quartile_specs:
    df_rfm[quartile_col] = pd.qcut(df_rfm[source_col], q=4, labels=bin_labels)

In [12]:
# Concatenate the three quartile labels (R, then F, then M) into one score string, e.g. '144'.
recency_label = df_rfm['r_quartile'].astype(str)
frequency_label = df_rfm['f_quartile'].astype(str)
monetary_label = df_rfm['m_quartile'].astype(str)
df_rfm['RFM_Score'] = recency_label + frequency_label + monetary_label

# Sanity checks: unique customer_id count and dimensions of the RFM table.
mt.check_unique_no(df_rfm, ['customer_id'])
df_rfm.shape

Data has 5506 unique customer_id


(5506, 8)

In [13]:
# Sanity checks before merging: unique customer_id count and dimensions of the master table.
mt.check_unique_no(df_master, ['customer_id'])
df_master.shape

Data has 5506 unique customer_id


(5506, 110)

In [14]:
# Attach the RFM features to the master table, keeping every master row (left join).
#
# RFM columns may already exist on df_master, either because this cell was
# re-run or because the master file was written by an earlier run of this
# notebook (it is exported to the same path it is loaded from). Drop them first;
# otherwise the merge would produce `_x`/`_y` suffixed duplicates and the later
# `df_master['RFM_Score']` lookup would raise a KeyError.
rfm_feature_cols = [col for col in df_rfm.columns if col != 'customer_id']
df_master = pd.merge(
    df_master.drop(columns=rfm_feature_cols, errors='ignore'),
    df_rfm,
    on='customer_id',
    how='left',
)

In [15]:
# Label each customer with the segment name that corresponds to their RFM score.
rfm_scores = df_master['RFM_Score']
df_master['rfm_customer_segment'] = rfm_scores.map(categorize_customers)

In [16]:
# Refine the catch-all 'other' segment using the individual quartiles.
unclassified = df_master['rfm_customer_segment'] == 'other'
top_monetary = df_master['m_quartile'] == '1'
top_frequency = df_master['f_quartile'] == '1'

# Apply the lower-priority label first, then let 'big spender' override it,
# so a customer in both top quartiles ends up as 'big spender'.
refined_segment = np.where(unclassified & top_frequency,
                           'loyal customers',
                           df_master['rfm_customer_segment'])
refined_segment = np.where(unclassified & top_monetary,
                           'big spender',
                           refined_segment)
df_master['rfm_customer_segment'] = refined_segment

df_master[['customer_id', 'rfm_customer_segment']].head()

Unnamed: 0,customer_id,rfm_customer_segment
0,274213,other
1,268801,other
2,267634,best customers
3,273398,big spender
4,267592,big spender


In [17]:
# Segment sizes; dropna=False so customers without a segment would show up as NaN.
df_master['rfm_customer_segment'].value_counts(dropna=False)

other                   3364
big spender              900
lost cheap customers     492
best customers           305
loyal customers          273
almost lost              131
lost customers            41
Name: rfm_customer_segment, dtype: int64

In [18]:
# Peek at the master table to confirm the RFM columns were appended.
df_master.head(2)

Unnamed: 0,customer_id,dob,gender,city,state_code,state,region,division,customer_conversion,conversion_date,last_purchase_date,total_unique_purchase_days,total_unique_transactions,total_items_purchased,total_tax_paid,total_amt_paid,total_unique_trans_not_reversed,min_duration_btwn_purchase_days,max_duration_btwn_purchase_days,avg_duration_btwn_purchase_days,time_since_last_purchase.days,median_duration_btwn_purchase_days,assessment_date,account_age.days,account_age.years,account_age.years.group,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,Flagship_store_spend.prop,MBR_spend.prop,TeleShop_spend.prop,e-Shop.prop,count.Bags_Mens,count.Bags_Women,count.Books_Academic,count.Books_Children,count.Books_Comics,count.Books_DIY,count.Books_Fiction,count.Books_Non-Fiction,count.Clothing_Kids,count.Clothing_Mens,count.Clothing_Women,count.Electronics_Audio and video,count.Electronics_Cameras,count.Electronics_Computers,count.Electronics_Mobiles,count.Electronics_Personal Appliances,count.Footwear_Kids,count.Footwear_Mens,count.Footwear_Women,count.Home and kitchen_Bath,count.Home and kitchen_Furnishing,count.Home and kitchen_Kitchen,count.Home and kitchen_Tools,sum.Bags_Mens,sum.Bags_Women,sum.Books_Academic,sum.Books_Children,sum.Books_Comics,sum.Books_DIY,sum.Books_Fiction,sum.Books_Non-Fiction,sum.Clothing_Kids,sum.Clothing_Mens,sum.Clothing_Women,sum.Electronics_Audio and video,sum.Electronics_Cameras,sum.Electronics_Computers,sum.Electronics_Mobiles,sum.Electronics_Personal Appliances,sum.Footwear_Kids,sum.Footwear_Mens,sum.Footwear_Women,sum.Home and kitchen_Bath,sum.Home and kitchen_Furnishing,sum.Home and kitchen_Kitchen,sum.Home and 
kitchen_Tools,conversion_date.year,conversion_date.month_num,conversion_date.month,conversion_date.year_month,biological_age.actual,biological_age,biological_age.group,repeat_purchaser,returned_item_before,7_day_amt.avg,14_day_amt.avg,21_day_amt.avg,30_day_amt.avg,60_day_amt.avg,90_day_amt.avg,7_day_num.avg,14_day_num.avg,21_day_num.avg,30_day_num.avg,60_day_num.avg,90_day_num.avg,number_of_unique_times_purchases_returned,30_day_amt.avg.cumsum,cum_count,30_day_amt.avg.cumsum_pct,cum_count_pct,recency,frequency,monetary,r_quartile,f_quartile,m_quartile,RFM_Score,rfm_customer_segment
0,274213,1986-03-27,M,Houston,TX,Texas,South,West South Central,converted,2014-12-01,2014-12-01,1.0,1.0,2.0,279.72,2943.72,1.0,1.0,1.0,1.0,1.0,1.0,2014-12-02,1.0,0.0,00_01,0.0,1.0,0.0,0.0,0.0,2943.72,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2943.72,0.0,2014.0,12.0,12.0_Dec,2014.0_12.0,28.69,29.0,25_30,no,no,20606.04,41212.08,61818.12,88311.6,176623.2,264934.8,14.0,28.0,42.0,60.0,120.0,180.0,0.0,88311.6,1,0.06,0.0,1,1,2943.72,1,4,4,144,other
1,268801,1977-04-01,M,Philadelphia,PA,Pennsylvania,Northeast,Middle Atlantic,converted,2014-10-02,2014-10-02,1.0,1.0,4.0,359.94,3787.94,1.0,61.0,61.0,61.0,61.0,61.0,2014-12-02,61.0,0.17,00_01,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3787.94,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3787.94,0.0,2014.0,10.0,10.0_Oct,2014.0_10.0,37.67,38.0,35_40,no,no,434.68,869.36,1304.04,1862.92,3725.84,5588.76,0.46,0.92,1.38,1.97,3.93,5.9,0.0,90174.52,2,0.07,0.0,61,1,3787.94,1,4,4,144,other


In [19]:
# Median of `avg_duration_btwn_purchase_days` per RFM segment (rows) and
# `total_unique_purchase_days` (columns), with 'All' margins.
# fill_value=0 means a 0 cell stands for "no data", not a zero-day gap.
df_master.pivot_table(
    values=['avg_duration_btwn_purchase_days'],
    index=['rfm_customer_segment'],
    columns=['total_unique_purchase_days'],
    aggfunc=['median'],
    fill_value=0,
    margins=True,
)

Unnamed: 0_level_0,median,median,median,median,median,median,median,median,median,median,median,median
Unnamed: 0_level_1,avg_duration_btwn_purchase_days,avg_duration_btwn_purchase_days,avg_duration_btwn_purchase_days,avg_duration_btwn_purchase_days,avg_duration_btwn_purchase_days,avg_duration_btwn_purchase_days,avg_duration_btwn_purchase_days,avg_duration_btwn_purchase_days,avg_duration_btwn_purchase_days,avg_duration_btwn_purchase_days,avg_duration_btwn_purchase_days,avg_duration_btwn_purchase_days
total_unique_purchase_days,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,All
rfm_customer_segment,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
almost lost,0.0,0.0,0.0,0.0,0.0,213.5,186.14,160.31,146.17,128.45,0.0,193.33
best customers,0.0,0.0,0.0,0.0,179.92,209.17,183.43,167.75,151.0,139.55,124.45,183.43
big spender,0.0,623.0,385.33,300.0,254.0,213.25,186.29,166.5,150.11,135.7,128.55,256.6
lost cheap customers,1045.0,607.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,782.0
lost customers,0.0,0.0,0.0,0.0,0.0,227.67,191.0,173.5,143.06,0.0,0.0,220.33
loyal customers,0.0,0.0,0.0,0.0,211.0,214.25,185.29,165.62,158.22,140.1,0.0,199.0
other,547.5,516.5,391.67,305.88,254.6,0.0,0.0,0.0,0.0,0.0,0.0,344.71
All,853.0,548.0,390.17,305.38,254.4,213.5,185.79,166.5,150.67,137.2,125.73,314.75


In [20]:
# Median of `min_duration_btwn_purchase_days` per RFM segment (rows) and
# `total_unique_purchase_days` (columns), with 'All' margins.
# fill_value=0 means a 0 cell stands for "no data", not a zero-day gap.
df_master.pivot_table(
    values=['min_duration_btwn_purchase_days'],
    index=['rfm_customer_segment'],
    columns=['total_unique_purchase_days'],
    aggfunc=['median'],
    fill_value=0,
    margins=True,
)

Unnamed: 0_level_0,median,median,median,median,median,median,median,median,median,median,median,median
Unnamed: 0_level_1,min_duration_btwn_purchase_days,min_duration_btwn_purchase_days,min_duration_btwn_purchase_days,min_duration_btwn_purchase_days,min_duration_btwn_purchase_days,min_duration_btwn_purchase_days,min_duration_btwn_purchase_days,min_duration_btwn_purchase_days,min_duration_btwn_purchase_days,min_duration_btwn_purchase_days,min_duration_btwn_purchase_days,min_duration_btwn_purchase_days
total_unique_purchase_days,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,All
rfm_customer_segment,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
almost lost,0.0,0.0,0,0.0,0,23,22.0,12.5,5.0,3.5,0,21.0
best customers,0.0,0.0,0,0.0,0,25,22.0,17.0,9.5,8.5,7,20.0
big spender,0.0,356.0,91,56.5,37,35,24.0,17.0,12.5,16.0,6,39.0
lost cheap customers,1045.0,209.0,0,0.0,0,0,0.0,0.0,0.0,0.0,0,782.0
lost customers,0.0,0.0,0,0.0,0,19,8.5,13.0,13.5,0.0,0,17.0
loyal customers,0.0,0.0,0,0.0,0,24,22.0,16.0,10.0,21.0,0,23.0
other,540.5,344.0,112,61.0,36,0,0.0,0.0,0.0,0.0,0,106.0
All,853.0,319.5,111,61.0,36,25,22.0,16.5,9.0,11.5,7,74.0


In [21]:
# Median of `max_duration_btwn_purchase_days` per RFM segment (rows) and
# `total_unique_purchase_days` (columns), with 'All' margins.
# fill_value=0 means a 0 cell stands for "no data", not a zero-day gap.
df_master.pivot_table(
    values=['max_duration_btwn_purchase_days'],
    index=['rfm_customer_segment'],
    columns=['total_unique_purchase_days'],
    aggfunc=['median'],
    fill_value=0,
    margins=True,
)

Unnamed: 0_level_0,median,median,median,median,median,median,median,median,median,median,median,median
Unnamed: 0_level_1,max_duration_btwn_purchase_days,max_duration_btwn_purchase_days,max_duration_btwn_purchase_days,max_duration_btwn_purchase_days,max_duration_btwn_purchase_days,max_duration_btwn_purchase_days,max_duration_btwn_purchase_days,max_duration_btwn_purchase_days,max_duration_btwn_purchase_days,max_duration_btwn_purchase_days,max_duration_btwn_purchase_days,max_duration_btwn_purchase_days
total_unique_purchase_days,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,11.0,All
rfm_customer_segment,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3
almost lost,0,0.0,0,0,0.0,571.0,559.0,540.5,570.0,509.5,0,559.0
best customers,0,0.0,0,0,309.0,420.0,365.0,356.0,371.5,359.5,352,394.0
big spender,0,773.5,617,559,514.0,442.5,429.0,422.0,400.5,412.0,387,500.5
lost cheap customers,1045,905.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,976.0
lost customers,0,0.0,0,0,0.0,781.0,749.5,715.0,699.5,0.0,0,748.0
loyal customers,0,0.0,0,0,601.0,488.0,441.0,434.0,427.0,359.0,0,470.0
other,553,648.0,634,573,515.0,0.0,0.0,0.0,0.0,0.0,0,597.5
All,855,713.5,634,571,514.5,482.0,433.5,427.0,401.0,371.5,369,582.0


In [22]:
# Final sanity checks before export: the master table should still have one
# set of unique customers and now carry the added RFM columns.
mt.check_unique_no(df_master, ['customer_id'])
df_master.shape

# The transaction table is checked the same way (unique customers, dimensions).
mt.check_unique_no(df_transactions, ['customer_id'])
df_transactions.shape

Data has 5506 unique customer_id


(5506, 118)

Data has 5506 unique customer_id


(20876, 30)

## Export Data

In [23]:
# Write both tables to CSV without the index column.
# NOTE(review): these are the same paths the notebook loads from, so the
# inputs are overwritten in place. Consider exporting to new paths so the
# notebook can be re-run from the original files.
export_targets = (
    (df_master, filepaths.master_file_data),
    (df_transactions, filepaths.derived_transactions_data_v1),
)
for frame, destination in export_targets:
    frame.to_csv(destination, index=False)