## Import Modules

In [1]:
# Set paths
import os
from imp import reload

# Data manipulation
import pandas as pd
import numpy as np
import scipy
import statistics as stats

# Custom package for data preprocessing
import mytools as mt 

# Set notebook options
pd.set_option("display.max_columns", 150)
pd.set_option("display.max_rows", 10000)
pd.set_option("display.float_format", lambda x: '%.2f' % x)

# Pretty display of multiple functions in a cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### File Location

In [2]:
# Data paths
import filepaths

### Create Project Directories and Sub-Directories

In [3]:
mt.create_directories(filepaths.project_parent_directory)

Directory  deliverables  already exists
Directory  deprecated  already exists
Directory  dictionary  already exists
Directory  visualizations  already exists
Subdirectory  data/raw  already exists
Subdirectory  data/interim  already exists
Subdirectory  data/external  already exists
Subdirectory  data/processed  already exists


## Load Data

In [4]:
df_customer = pd.read_csv(filepaths.raw_customer_data, sep=',')

In [5]:
df_transactions = pd.read_csv(filepaths.raw_transactions_data, sep=',')

In [6]:
df_products = pd.read_csv(filepaths.raw_products_data, sep=',')

## Data Audit

In [7]:
df_customer.columns = [c.lower() for c in df_customer.columns]
df_customer.shape
mt.check_unique_no(df_customer, ['customer_id'])
mt.missing_data_table(df_customer)
df_customer.head()

(5647, 137)

Data has 5647 unique customer_id
Missing data distribution:

                  Variable  Count  Proportion
                    gender      2        0.00
                 city_code      2        0.00
           conversion_date    141        0.02
        last_purchase_date    141        0.02
              duration.min    141        0.02
              duration.max    141        0.02
             duration.mean    141        0.02
             duration.last    141        0.02
            trans_id.count    141        0.02
                   qty.min    141        0.02
                   qty.max    141        0.02
                   qty.sum    141        0.02
                   tax.sum    141        0.02
             total_amt.sum    141        0.02
           assessment_date    141        0.02
              customer_age    141        0.02
        customer_age.group    141        0.02
      count.flagship store    141        0.02
                 count.mbr    141        0.02
            count.t

Unnamed: 0,customer_id,dob,gender,city_code,customer_status,conversion_date,last_purchase_date,number_of_purchase_days,duration.min,duration.max,duration.mean,duration.last,trans_id.count,qty.min,qty.max,qty.sum,tax.sum,total_amt.sum,assessment_date,customer_age,customer_age.group,count.flagship store,count.mbr,count.teleshop,count.e-shop,sum.flagship store,sum.mbr,sum.teleshop,sum.e-shop,mean.flagship store,mean.mbr,mean.teleshop,mean.e-shop,flagship_store_spend.prop,mbr_spend.prop,teleshop_spend.prop,e-shop.prop,count.bags,count.books,count.clothing,count.electronics,count.footwear,count.home and kitchen,sum.bags,sum.books,sum.clothing,sum.electronics,sum.footwear,sum.home and kitchen,mean.bags,mean.books,mean.clothing,mean.electronics,mean.footwear,mean.home and kitchen,bags.prop,books.prop,clothing.prop,electronics.prop,footwear.prop,home and kitchen.prop,count.academic,count.audio and video,count.bath,count.cameras,count.children,count.comics,count.computers,count.diy,count.fiction,count.furnishing,count.kids,count.kitchen,count.mens,count.mobiles,count.non-fiction,count.personal appliances,count.tools,count.women,sum.academic,sum.audio and video,sum.bath,sum.cameras,sum.children,sum.comics,sum.computers,sum.diy,sum.fiction,sum.furnishing,sum.kids,sum.kitchen,sum.mens,sum.mobiles,sum.non-fiction,sum.personal appliances,sum.tools,sum.women,mean.academic,mean.audio and video,mean.bath,mean.cameras,mean.children,mean.comics,mean.computers,mean.diy,mean.fiction,mean.furnishing,mean.kids,mean.kitchen,mean.mens,mean.mobiles,mean.non-fiction,mean.personal appliances,mean.tools,mean.women,academic.prop,audio and video.prop,bath.prop,cameras.prop,children.prop,comics.prop,computers.prop,diy.prop,fiction.prop,furnishing.prop,kids.prop,kitchen.prop,mens.prop,mobiles.prop,non-fiction.prop,personal appliances.prop,tools.prop,women.prop,biological_age.actual,biological_age,biological_age.group,repeat_purchaser
0,268408,1970-02-01,M,4.0,converted,2011-12-07,2014-01-13,11.0,4.0,323.0,99.18,323.0,11.0,1.0,5.0,43.0,2526.93,26592.93,2014-12-02,2.99,02_03,5.0,1.0,3.0,2.0,8538.33,6491.88,3894.02,7668.7,1707.67,6491.88,1298.01,3834.35,0.32,0.24,0.15,0.29,3.0,1.0,1.0,2.0,2.0,2.0,3384.61,1033.17,890.63,7668.7,7526.15,6089.66,1128.2,1033.17,890.63,3834.35,3763.08,3044.83,0.13,0.04,0.03,0.29,0.28,0.23,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,2.0,3.0,0.0,0.0,0.0,0.0,2.0,0.0,2873.0,0.0,4795.7,0.0,0.0,0.0,0.0,1033.17,0.0,1034.28,6089.66,8812.38,0.0,0.0,0.0,0.0,1954.74,0.0,2873.0,0.0,4795.7,0.0,0.0,0.0,0.0,1033.17,0.0,1034.28,3044.83,2937.46,0.0,0.0,0.0,0.0,977.37,0.0,0.11,0.0,0.18,0.0,0.0,0.0,0.0,0.04,0.0,0.04,0.23,0.33,0.0,0.0,0.0,0.0,0.07,44.83,45.0,40_45,yes
1,269696,1970-07-01,F,8.0,converted,2011-09-18,2012-08-04,3.0,111.0,850.0,390.33,850.0,3.0,3.0,4.0,11.0,1043.91,10985.91,2014-12-02,3.21,03_04,0.0,3.0,0.0,0.0,0.0,10985.91,0.0,0.0,0.0,3661.97,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,10985.91,0.0,0.0,0.0,0.0,0.0,3661.97,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6497.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4488.51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3248.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4488.51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.41,0.0,0.0,0.0,0.0,44.42,44.0,40_45,yes
2,268159,1970-08-01,F,8.0,converted,2012-06-02,2013-03-31,6.0,21.0,611.0,152.17,611.0,6.0,1.0,5.0,19.0,1699.85,17888.84,2014-12-02,2.5,02_03,0.0,1.0,1.0,4.0,0.0,1182.35,7458.75,9247.75,0.0,1182.35,7458.75,2311.94,0.0,0.07,0.42,0.52,1.0,1.0,2.0,0.0,0.0,2.0,779.02,1182.35,8141.64,0.0,0.0,7785.83,779.02,1182.35,4070.82,0.0,0.0,3892.91,0.04,0.07,0.46,0.0,0.0,0.44,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,327.08,0.0,1182.35,0.0,0.0,0.0,0.0,0.0,8141.64,7458.75,779.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,327.08,0.0,1182.35,0.0,0.0,0.0,0.0,0.0,4070.82,7458.75,779.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.07,0.0,0.0,0.0,0.0,0.0,0.46,0.42,0.04,0.0,0.0,0.0,0.0,0.0,44.34,44.0,40_45,yes
3,270181,1970-10-01,F,2.0,converted,2011-03-18,2014-09-01,10.0,2.0,306.0,135.5,92.0,10.0,1.0,5.0,28.0,2440.31,25681.31,2014-12-02,3.71,03_04,3.0,1.0,1.0,5.0,8428.94,408.85,617.7,16225.82,2809.65,408.85,617.7,3245.16,0.33,0.02,0.02,0.63,0.0,4.0,1.0,2.0,1.0,2.0,0.0,7232.23,5348.2,6532.76,1962.48,4605.64,0.0,1808.06,5348.2,3266.38,1962.48,2302.82,0.0,0.28,0.21,0.25,0.08,0.18,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,408.85,0.0,0.0,0.0,6823.38,0.0,0.0,0.0,0.0,4605.64,5348.2,0.0,1962.48,0.0,0.0,6532.76,0.0,0.0,408.85,0.0,0.0,0.0,2274.46,0.0,0.0,0.0,0.0,2302.82,5348.2,0.0,1962.48,0.0,0.0,3266.38,0.0,0.0,0.02,0.0,0.0,0.0,0.27,0.0,0.0,0.0,0.0,0.18,0.21,0.0,0.08,0.0,0.0,0.25,0.0,0.0,44.17,44.0,40_45,yes
4,268073,1970-11-01,M,1.0,converted,2011-11-19,2013-12-30,3.0,1.0,771.0,369.67,337.0,3.0,1.0,5.0,7.0,896.28,9432.28,2014-12-02,3.04,03_04,0.0,3.0,0.0,0.0,0.0,9432.28,0.0,0.0,0.0,3144.09,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,7956.0,0.0,0.0,1476.28,0.0,0.0,7956.0,0.0,0.0,738.14,0.0,0.0,0.84,0.0,0.0,0.16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7956.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1476.28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7956.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,738.14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.84,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.16,44.09,44.0,40_45,yes


In [8]:
df_transactions.rename(columns={'transaction_id':'trans_id', 'cust_id':'customer_id', 'tran_date':'trans_date'}, inplace=True)
df_transactions.columns = [c.lower() for c in df_transactions.columns]
df_transactions.shape
mt.check_unique_no(df_transactions, ['customer_id', 'trans_id'])
mt.missing_data_table(df_transactions)
df_transactions.head()
df_transactions.describe()

(23053, 10)

Data has 5506 unique customer_id
Data has 20878 unique trans_id
There is no missing data


Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,tax,total_amt,store_type
0,80712190438,270351,28-02-2014,1,1,-5,-772,405.3,-4265.3,e-Shop
1,29258453508,270384,27-02-2014,5,3,-5,-1497,785.92,-8270.92,e-Shop
2,51750724947,273420,24-02-2014,6,5,-2,-791,166.11,-1748.11,TeleShop
3,93274880719,271509,24-02-2014,11,6,-3,-1363,429.35,-4518.35,e-Shop
4,51750724947,273420,23-02-2014,6,5,-2,-791,166.11,-1748.11,TeleShop


Unnamed: 0,trans_id,customer_id,prod_subcat_code,prod_cat_code,qty,rate,tax,total_amt
count,23053.0,23053.0,23053.0,23053.0,23053.0,23053.0,23053.0,23053.0
mean,50073480358.45,271021.75,6.15,3.76,2.43,636.37,248.67,2107.31
std,28981936062.0,2431.69,3.73,1.68,2.27,622.36,187.18,2507.56
min,3268991.0,266783.0,1.0,1.0,-5.0,-1499.0,7.35,-8270.92
25%,24938639453.0,268935.0,3.0,2.0,1.0,312.0,98.28,762.45
50%,50093131361.0,270980.0,5.0,4.0,3.0,710.0,199.08,1754.74
75%,75329995679.0,273114.0,10.0,5.0,4.0,1109.0,365.71,3569.15
max,99987549630.0,275265.0,12.0,6.0,5.0,1500.0,787.5,8287.5


In [9]:
df_transactions[df_transactions['qty'] < 0].shape
df_transactions[df_transactions['qty'] < 0].head()
df_transactions[df_transactions['qty'] < 0]['store_type'].value_counts(dropna=False)

(2177, 10)

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,tax,total_amt,store_type
0,80712190438,270351,28-02-2014,1,1,-5,-772,405.3,-4265.3,e-Shop
1,29258453508,270384,27-02-2014,5,3,-5,-1497,785.92,-8270.92,e-Shop
2,51750724947,273420,24-02-2014,6,5,-2,-791,166.11,-1748.11,TeleShop
3,93274880719,271509,24-02-2014,11,6,-3,-1363,429.35,-4518.35,e-Shop
4,51750724947,273420,23-02-2014,6,5,-2,-791,166.11,-1748.11,TeleShop


e-Shop            882
MBR               451
Flagship store    432
TeleShop          412
Name: store_type, dtype: int64

In [10]:
df_transactions[df_transactions['qty'] > 0].shape
df_transactions[df_transactions['qty'] > 0].head()
df_transactions[df_transactions['qty'] > 0]['store_type'].value_counts(dropna=False)

(20876, 10)

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,tax,total_amt,store_type
10,29258453508,270384,20-02-2014,5,3,5,1497,785.92,8270.92,e-Shop
11,25455265351,267750,20-02-2014,12,6,3,1360,428.4,4508.4,e-Shop
12,1571002198,275023,20-02-2014,6,5,4,587,246.54,2594.54,e-Shop
14,36554696014,269345,20-02-2014,3,5,3,1253,394.69,4153.69,e-Shop
15,56814940239,268799,20-02-2014,7,5,5,368,193.2,2033.2,e-Shop


e-Shop            8429
MBR               4210
Flagship store    4145
TeleShop          4092
Name: store_type, dtype: int64

In [11]:
df_products.rename(columns={'prod_sub_cat_code':'prod_subcat_code'}, inplace=True)
df_products['counter'] = 1
mt.missing_data_table(df_products)
df_products.head()

There is no missing data


Unnamed: 0,prod_cat_code,prod_cat,prod_subcat_code,prod_subcat,counter
0,1,Clothing,4,Mens,1
1,1,Clothing,1,Women,1
2,1,Clothing,3,Kids,1
3,2,Footwear,1,Mens,1
4,2,Footwear,3,Women,1


In [12]:
df_products.groupby(['prod_cat'])['prod_subcat'].value_counts(dropna=False)

prod_cat          prod_subcat        
Bags              Mens                   1
                  Women                  1
Books             Academic               1
                  Children               1
                  Comics                 1
                  DIY                    1
                  Fiction                1
                  Non-Fiction            1
Clothing          Kids                   1
                  Mens                   1
                  Women                  1
Electronics       Audio and video        1
                  Cameras                1
                  Computers              1
                  Mobiles                1
                  Personal Appliances    1
Footwear          Kids                   1
                  Mens                   1
                  Women                  1
Home and kitchen  Bath                   1
                  Furnishing             1
                  Kitchen                1
                

In [13]:
df_products['prod_cat'].value_counts(dropna=False)

Books               6
Electronics         5
Home and kitchen    4
Footwear            3
Clothing            3
Bags                2
Name: prod_cat, dtype: int64

In [14]:
df_products['prod_subcat'].value_counts(dropna=False)

Mens                   3
Women                  3
Kids                   2
Tools                  1
Audio and video        1
Mobiles                1
Kitchen                1
Comics                 1
Children               1
Computers              1
Academic               1
Fiction                1
Furnishing             1
Cameras                1
Personal Appliances    1
DIY                    1
Non-Fiction            1
Bath                   1
Name: prod_subcat, dtype: int64

### Transactions Data

In [15]:
converted_customers = df_transactions['customer_id'].unique().tolist()

In [16]:
df_transactions['trans_date'] = pd.to_datetime(df_transactions['trans_date'])
df_transactions['trans_date'].sort_values().min()
df_transactions['trans_date'].sort_values().max()

Timestamp('2011-01-02 00:00:00')

Timestamp('2014-12-02 00:00:00')

In [17]:
df_transactions = pd.merge(df_transactions, df_products, on=['prod_cat_code', 'prod_subcat_code'], how='left')
df_transactions.head()

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,tax,total_amt,store_type,prod_cat,prod_subcat,counter
0,80712190438,270351,2014-02-28,1,1,-5,-772,405.3,-4265.3,e-Shop,Clothing,Women,1
1,29258453508,270384,2014-02-27,5,3,-5,-1497,785.92,-8270.92,e-Shop,Electronics,Computers,1
2,51750724947,273420,2014-02-24,6,5,-2,-791,166.11,-1748.11,TeleShop,Books,DIY,1
3,93274880719,271509,2014-02-24,11,6,-3,-1363,429.35,-4518.35,e-Shop,Home and kitchen,Bath,1
4,51750724947,273420,2014-02-23,6,5,-2,-791,166.11,-1748.11,TeleShop,Books,DIY,1


In [18]:
df_transactions['qty'] = np.where((df_transactions['qty'] < 0), (-1*df_transactions['qty']), df_transactions['qty'])
df_transactions['rate'] = np.where((df_transactions['rate'] < 0), (-1*df_transactions['rate']), df_transactions['rate'])
df_transactions['total_amt'] = np.where((df_transactions['total_amt'] < 0), (-1*df_transactions['total_amt']), df_transactions['total_amt'])
df_transactions.head()

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,tax,total_amt,store_type,prod_cat,prod_subcat,counter
0,80712190438,270351,2014-02-28,1,1,5,772,405.3,4265.3,e-Shop,Clothing,Women,1
1,29258453508,270384,2014-02-27,5,3,5,1497,785.92,8270.92,e-Shop,Electronics,Computers,1
2,51750724947,273420,2014-02-24,6,5,2,791,166.11,1748.11,TeleShop,Books,DIY,1
3,93274880719,271509,2014-02-24,11,6,3,1363,429.35,4518.35,e-Shop,Home and kitchen,Bath,1
4,51750724947,273420,2014-02-23,6,5,2,791,166.11,1748.11,TeleShop,Books,DIY,1


In [19]:
df_transactions['store_type'].value_counts(dropna=False)

e-Shop            9311
MBR               4661
Flagship store    4577
TeleShop          4504
Name: store_type, dtype: int64

In [20]:
df_transactions = df_transactions.sort_values(['customer_id','trans_date'])

In [21]:
df_transactions['assessment_date'] = df_transactions['trans_date'].sort_values().max()

In [22]:
df_transactions['duration'] = df_transactions.groupby(['customer_id'])['trans_date'].transform(pd.Series.diff).shift(-1)
df_transactions['duration'] = df_transactions.apply(lambda r: r['assessment_date'] - r['trans_date'] if pd.isnull(r['duration']) else r['duration'], axis=1)
df_transactions['duration'] = (df_transactions['duration']/np.timedelta64(1, 'D'))

In [23]:
df_transactions.head(7)

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,tax,total_amt,store_type,prod_cat,prod_subcat,counter,assessment_date,duration
18130,25890929042,266783,2011-09-23,1,2,4,1321,554.82,5838.82,e-Shop,Footwear,Mens,1,2014-12-02,1.0
18114,25890929042,266783,2011-09-24,1,2,4,1321,554.82,5838.82,e-Shop,Footwear,Mens,1,2014-12-02,393.0
10030,98477711300,266783,2012-10-21,4,1,3,93,29.3,308.3,TeleShop,Clothing,Mens,1,2014-12-02,122.0
7515,8410316370,266783,2013-02-20,4,1,1,869,91.25,960.25,e-Shop,Clothing,Mens,1,2014-12-02,194.0
7722,16999552161,266783,2013-09-02,10,5,2,835,175.35,1845.35,e-Shop,Books,Non-Fiction,1,2014-12-02,456.0
9055,36310127403,266784,2012-04-12,4,3,2,200,42.0,442.0,Flagship store,Electronics,Mobiles,1,2014-12-02,133.0
11286,54234600611,266784,2012-08-23,10,5,3,1291,406.67,4279.66,TeleShop,Books,Non-Fiction,1,2014-12-02,17.0


In [24]:
df_trans_overall = df_transactions.groupby(['customer_id'], as_index=False).agg({'trans_date':['first', 'last', pd.Series.nunique],
                                                                                 'duration': ['min', 'max', 'mean', 'last'],
                                                        'trans_id':'count',
                                                        'qty':['min', 'max', 'sum'],
                                                       'tax':'sum',
                                                       'total_amt':'sum' }) #, lambda x: stats.mode(x)[0][0]]|, 'prod_cat':[pd.Series.nunique],
#                                                          'prod_subcat':[pd.Series.nunique],
#                                                        'store_type':[pd.Series.nunique]

In [25]:
df_trans_overall.shape
df_trans_overall.columns = [".".join(x).strip('.') for x in df_trans_overall.columns.ravel()] 
df_trans_overall.rename(columns={'trans_date.first':'conversion_date', 'trans_date.last':'last_purchase_date',
                                'trans_date.nunique':'number_of_purchase_days'}, inplace=True)
mt.check_unique_no(df_trans_overall, ['customer_id'])
df_trans_overall.head()

(5506, 14)

Data has 5506 unique customer_id


Unnamed: 0,customer_id,conversion_date,last_purchase_date,number_of_purchase_days,duration.min,duration.max,duration.mean,duration.last,trans_id.count,qty.min,qty.max,qty.sum,tax.sum,total_amt.sum
0,266783,2011-09-23,2013-09-02,5,1.0,456.0,233.2,456.0,5,1,4,14,1405.53,14791.53
1,266784,2012-04-12,2012-09-09,3,17.0,814.0,321.33,814.0,3,2,5,10,541.07,5694.06
2,266785,2011-03-15,2013-02-13,8,5.0,657.0,169.75,657.0,8,2,5,33,3351.6,35271.6
3,266788,2011-09-13,2013-12-02,4,84.0,397.0,294.0,365.0,4,1,4,8,578.97,6092.97
4,266794,2011-03-18,2014-12-02,11,0.0,520.0,112.92,0.0,12,1,4,32,2684.74,28253.75


In [26]:
df_trans_overall['assessment_date'] = df_transactions['trans_date'].sort_values().max()

In [27]:
df_trans_overall['customer_age'] = (df_trans_overall['assessment_date'] - df_trans_overall['conversion_date'])/np.timedelta64(1,'Y')

In [28]:
age_bins =  [0, 1, 2, 3, 4]
labels = ['_<01','01_02', '02_03', '03_04']
df_trans_overall['customer_age.group'] = pd.cut(df_trans_overall['customer_age'], age_bins, labels = labels,include_lowest = True)

In [29]:
def customer_profile(_df, unique_id, value, featurelist):
    df_list = []
    for feature in featurelist:
        _df[feature] = _df[feature].astype(str)
        _df_temp = pd.crosstab(_df[unique_id], _df[feature], values=_df[value], 
                               aggfunc=['count','sum', 'mean'], dropna=False).fillna(0).reset_index()
        _df_temp.columns = [".".join(x).strip('.') for x in _df_temp.columns.ravel()] 
        df_list.append(_df_temp)
        dfs = [df.set_index(unique_id) for df in df_list]
        df = pd.concat(dfs, axis=1)
        df.reset_index(inplace=True)
    return df

In [30]:
df_product_cat = customer_profile(df_transactions, 'customer_id', 'total_amt', ['prod_cat'])
df_product_cat.head()

Unnamed: 0,customer_id,count.Bags,count.Books,count.Clothing,count.Electronics,count.Footwear,count.Home and kitchen,sum.Bags,sum.Books,sum.Clothing,sum.Electronics,sum.Footwear,sum.Home and kitchen,mean.Bags,mean.Books,mean.Clothing,mean.Electronics,mean.Footwear,mean.Home and kitchen
0,266783,0.0,1.0,2.0,0.0,2.0,0.0,0.0,1845.35,1268.54,0.0,11677.64,0.0,0.0,1845.35,634.27,0.0,5838.82,0.0
1,266784,0.0,2.0,0.0,1.0,0.0,0.0,0.0,5252.06,0.0,442.0,0.0,0.0,0.0,2626.03,0.0,442.0,0.0,0.0
2,266785,1.0,1.0,0.0,0.0,4.0,2.0,682.89,5066.43,0.0,0.0,15864.49,13657.8,682.89,5066.43,0.0,0.0,3966.12,6828.9
3,266788,1.0,1.0,0.0,0.0,2.0,0.0,1485.12,1367.99,0.0,0.0,3239.86,0.0,1485.12,1367.99,0.0,0.0,1619.93,0.0
4,266794,2.0,2.0,2.0,4.0,2.0,0.0,5692.96,8380.32,4099.55,5600.14,4480.78,0.0,2846.48,4190.16,2049.78,1400.04,2240.39,0.0


In [31]:
df_product_subcat = customer_profile(df_transactions, 'customer_id', 'total_amt', ['prod_subcat'])
df_product_subcat.head()

Unnamed: 0,customer_id,count.Academic,count.Audio and video,count.Bath,count.Cameras,count.Children,count.Comics,count.Computers,count.DIY,count.Fiction,count.Furnishing,count.Kids,count.Kitchen,count.Mens,count.Mobiles,count.Non-Fiction,count.Personal Appliances,count.Tools,count.Women,sum.Academic,sum.Audio and video,sum.Bath,sum.Cameras,sum.Children,sum.Comics,sum.Computers,sum.DIY,sum.Fiction,sum.Furnishing,sum.Kids,sum.Kitchen,sum.Mens,sum.Mobiles,sum.Non-Fiction,sum.Personal Appliances,sum.Tools,sum.Women,mean.Academic,mean.Audio and video,mean.Bath,mean.Cameras,mean.Children,mean.Comics,mean.Computers,mean.DIY,mean.Fiction,mean.Furnishing,mean.Kids,mean.Kitchen,mean.Mens,mean.Mobiles,mean.Non-Fiction,mean.Personal Appliances,mean.Tools,mean.Women
0,266783,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12946.18,0.0,1845.35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3236.55,0.0,1845.35,0.0,0.0,0.0
1,266784,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,972.4,0.0,0.0,0.0,0.0,442.0,4279.66,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,972.4,0.0,0.0,0.0,0.0,442.0,4279.66,0.0,0.0,0.0
2,266785,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5066.43,0.0,0.0,0.0,0.0,0.0,10047.76,13657.8,6499.61,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5066.43,0.0,0.0,0.0,0.0,0.0,5023.88,6828.9,2166.54,0.0,0.0,0.0,0.0,0.0
3,266788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1367.99,0.0,1927.12,0.0,2797.86,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1367.99,0.0,1927.12,0.0,1398.93,0.0,0.0,0.0,0.0,0.0
4,266794,1.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,1.0,0.0,3.0,3964.74,990.08,0.0,0.0,4415.58,0.0,0.0,0.0,0.0,0.0,4099.55,0.0,2948.14,0.0,0.0,4610.06,0.0,7225.6,3964.74,330.03,0.0,0.0,4415.58,0.0,0.0,0.0,0.0,0.0,2049.78,0.0,2948.14,0.0,0.0,4610.06,0.0,2408.53


In [32]:
df_stores = customer_profile(df_transactions, 'customer_id', 'total_amt', ['store_type'])
df_stores.head()

Unnamed: 0,customer_id,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,mean.Flagship store,mean.MBR,mean.TeleShop,mean.e-Shop
0,266783,0.0,0.0,1.0,4.0,0.0,0.0,308.3,14483.24,0.0,0.0,308.3,3620.81
1,266784,1.0,0.0,1.0,1.0,442.0,0.0,4279.66,972.4,442.0,0.0,4279.66,972.4
2,266785,4.0,0.0,3.0,1.0,19474.52,0.0,12661.09,3135.99,4868.63,0.0,4220.36,3135.99
3,266788,1.0,1.0,0.0,2.0,1367.99,1485.12,0.0,3239.86,1367.99,1485.12,0.0,1619.93
4,266794,1.0,3.0,1.0,7.0,718.25,9275.37,4610.06,13650.07,718.25,3091.79,4610.06,1950.01


In [33]:
df_trans_overall = pd.merge(df_trans_overall, df_stores, on=['customer_id'], how='left')
df_trans_overall.head()

Unnamed: 0,customer_id,conversion_date,last_purchase_date,number_of_purchase_days,duration.min,duration.max,duration.mean,duration.last,trans_id.count,qty.min,qty.max,qty.sum,tax.sum,total_amt.sum,assessment_date,customer_age,customer_age.group,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,mean.Flagship store,mean.MBR,mean.TeleShop,mean.e-Shop
0,266783,2011-09-23,2013-09-02,5,1.0,456.0,233.2,456.0,5,1,4,14,1405.53,14791.53,2014-12-02,3.19,03_04,0.0,0.0,1.0,4.0,0.0,0.0,308.3,14483.24,0.0,0.0,308.3,3620.81
1,266784,2012-04-12,2012-09-09,3,17.0,814.0,321.33,814.0,3,2,5,10,541.07,5694.06,2014-12-02,2.64,02_03,1.0,0.0,1.0,1.0,442.0,0.0,4279.66,972.4,442.0,0.0,4279.66,972.4
2,266785,2011-03-15,2013-02-13,8,5.0,657.0,169.75,657.0,8,2,5,33,3351.6,35271.6,2014-12-02,3.72,03_04,4.0,0.0,3.0,1.0,19474.52,0.0,12661.09,3135.99,4868.63,0.0,4220.36,3135.99
3,266788,2011-09-13,2013-12-02,4,84.0,397.0,294.0,365.0,4,1,4,8,578.97,6092.97,2014-12-02,3.22,03_04,1.0,1.0,0.0,2.0,1367.99,1485.12,0.0,3239.86,1367.99,1485.12,0.0,1619.93
4,266794,2011-03-18,2014-12-02,11,0.0,520.0,112.92,0.0,12,1,4,32,2684.74,28253.75,2014-12-02,3.71,03_04,1.0,3.0,1.0,7.0,718.25,9275.37,4610.06,13650.07,718.25,3091.79,4610.06,1950.01


In [34]:
df_trans_overall['Flagship_store_spend.prop'] = df_trans_overall['sum.Flagship store']/df_trans_overall['total_amt.sum']
df_trans_overall['MBR_spend.prop'] = df_trans_overall['sum.MBR']/df_trans_overall['total_amt.sum']
df_trans_overall['TeleShop_spend.prop'] = df_trans_overall['sum.TeleShop']/df_trans_overall['total_amt.sum']
df_trans_overall['e-Shop.prop'] = df_trans_overall['sum.e-Shop']/df_trans_overall['total_amt.sum']

In [35]:
df_trans_overall = pd.merge(df_trans_overall, df_product_cat, on=['customer_id'], how='left')
df_trans_overall.head()

Unnamed: 0,customer_id,conversion_date,last_purchase_date,number_of_purchase_days,duration.min,duration.max,duration.mean,duration.last,trans_id.count,qty.min,qty.max,qty.sum,tax.sum,total_amt.sum,assessment_date,customer_age,customer_age.group,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,mean.Flagship store,mean.MBR,mean.TeleShop,mean.e-Shop,Flagship_store_spend.prop,MBR_spend.prop,TeleShop_spend.prop,e-Shop.prop,count.Bags,count.Books,count.Clothing,count.Electronics,count.Footwear,count.Home and kitchen,sum.Bags,sum.Books,sum.Clothing,sum.Electronics,sum.Footwear,sum.Home and kitchen,mean.Bags,mean.Books,mean.Clothing,mean.Electronics,mean.Footwear,mean.Home and kitchen
0,266783,2011-09-23,2013-09-02,5,1.0,456.0,233.2,456.0,5,1,4,14,1405.53,14791.53,2014-12-02,3.19,03_04,0.0,0.0,1.0,4.0,0.0,0.0,308.3,14483.24,0.0,0.0,308.3,3620.81,0.0,0.0,0.02,0.98,0.0,1.0,2.0,0.0,2.0,0.0,0.0,1845.35,1268.54,0.0,11677.64,0.0,0.0,1845.35,634.27,0.0,5838.82,0.0
1,266784,2012-04-12,2012-09-09,3,17.0,814.0,321.33,814.0,3,2,5,10,541.07,5694.06,2014-12-02,2.64,02_03,1.0,0.0,1.0,1.0,442.0,0.0,4279.66,972.4,442.0,0.0,4279.66,972.4,0.08,0.0,0.75,0.17,0.0,2.0,0.0,1.0,0.0,0.0,0.0,5252.06,0.0,442.0,0.0,0.0,0.0,2626.03,0.0,442.0,0.0,0.0
2,266785,2011-03-15,2013-02-13,8,5.0,657.0,169.75,657.0,8,2,5,33,3351.6,35271.6,2014-12-02,3.72,03_04,4.0,0.0,3.0,1.0,19474.52,0.0,12661.09,3135.99,4868.63,0.0,4220.36,3135.99,0.55,0.0,0.36,0.09,1.0,1.0,0.0,0.0,4.0,2.0,682.89,5066.43,0.0,0.0,15864.49,13657.8,682.89,5066.43,0.0,0.0,3966.12,6828.9
3,266788,2011-09-13,2013-12-02,4,84.0,397.0,294.0,365.0,4,1,4,8,578.97,6092.97,2014-12-02,3.22,03_04,1.0,1.0,0.0,2.0,1367.99,1485.12,0.0,3239.86,1367.99,1485.12,0.0,1619.93,0.22,0.24,0.0,0.53,1.0,1.0,0.0,0.0,2.0,0.0,1485.12,1367.99,0.0,0.0,3239.86,0.0,1485.12,1367.99,0.0,0.0,1619.93,0.0
4,266794,2011-03-18,2014-12-02,11,0.0,520.0,112.92,0.0,12,1,4,32,2684.74,28253.75,2014-12-02,3.71,03_04,1.0,3.0,1.0,7.0,718.25,9275.37,4610.06,13650.07,718.25,3091.79,4610.06,1950.01,0.03,0.33,0.16,0.48,2.0,2.0,2.0,4.0,2.0,0.0,5692.96,8380.32,4099.55,5600.14,4480.78,0.0,2846.48,4190.16,2049.78,1400.04,2240.39,0.0


In [36]:
df_trans_overall['Bags.prop'] = df_trans_overall['sum.Bags']/df_trans_overall['total_amt.sum']
df_trans_overall['Books.prop'] = df_trans_overall['sum.Books']/df_trans_overall['total_amt.sum']
df_trans_overall['Clothing.prop'] = df_trans_overall['sum.Clothing']/df_trans_overall['total_amt.sum']
df_trans_overall['Electronics.prop'] = df_trans_overall['sum.Electronics']/df_trans_overall['total_amt.sum']
df_trans_overall['Footwear.prop'] = df_trans_overall['sum.Footwear']/df_trans_overall['total_amt.sum']
df_trans_overall['Home and kitchen.prop'] = df_trans_overall['sum.Home and kitchen']/df_trans_overall['total_amt.sum']

In [37]:
df_trans_overall = pd.merge(df_trans_overall, df_product_subcat, on=['customer_id'], how='left')
df_trans_overall.head()

Unnamed: 0,customer_id,conversion_date,last_purchase_date,number_of_purchase_days,duration.min,duration.max,duration.mean,duration.last,trans_id.count,qty.min,qty.max,qty.sum,tax.sum,total_amt.sum,assessment_date,customer_age,customer_age.group,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,mean.Flagship store,mean.MBR,mean.TeleShop,mean.e-Shop,Flagship_store_spend.prop,MBR_spend.prop,TeleShop_spend.prop,e-Shop.prop,count.Bags,count.Books,count.Clothing,count.Electronics,count.Footwear,count.Home and kitchen,sum.Bags,sum.Books,sum.Clothing,sum.Electronics,sum.Footwear,sum.Home and kitchen,mean.Bags,mean.Books,mean.Clothing,mean.Electronics,mean.Footwear,mean.Home and kitchen,Bags.prop,Books.prop,Clothing.prop,Electronics.prop,Footwear.prop,Home and kitchen.prop,count.Academic,count.Audio and video,count.Bath,count.Cameras,count.Children,count.Comics,count.Computers,count.DIY,count.Fiction,count.Furnishing,count.Kids,count.Kitchen,count.Mens,count.Mobiles,count.Non-Fiction,count.Personal Appliances,count.Tools,count.Women,sum.Academic,sum.Audio and video,sum.Bath,sum.Cameras,sum.Children,sum.Comics,sum.Computers,sum.DIY,sum.Fiction,sum.Furnishing,sum.Kids,sum.Kitchen,sum.Mens,sum.Mobiles,sum.Non-Fiction,sum.Personal Appliances,sum.Tools,sum.Women,mean.Academic,mean.Audio and video,mean.Bath,mean.Cameras,mean.Children,mean.Comics,mean.Computers,mean.DIY,mean.Fiction,mean.Furnishing,mean.Kids,mean.Kitchen,mean.Mens,mean.Mobiles,mean.Non-Fiction,mean.Personal Appliances,mean.Tools,mean.Women
0,266783,2011-09-23,2013-09-02,5,1.0,456.0,233.2,456.0,5,1,4,14,1405.53,14791.53,2014-12-02,3.19,03_04,0.0,0.0,1.0,4.0,0.0,0.0,308.3,14483.24,0.0,0.0,308.3,3620.81,0.0,0.0,0.02,0.98,0.0,1.0,2.0,0.0,2.0,0.0,0.0,1845.35,1268.54,0.0,11677.64,0.0,0.0,1845.35,634.27,0.0,5838.82,0.0,0.0,0.12,0.09,0.0,0.79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12946.18,0.0,1845.35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3236.55,0.0,1845.35,0.0,0.0,0.0
1,266784,2012-04-12,2012-09-09,3,17.0,814.0,321.33,814.0,3,2,5,10,541.07,5694.06,2014-12-02,2.64,02_03,1.0,0.0,1.0,1.0,442.0,0.0,4279.66,972.4,442.0,0.0,4279.66,972.4,0.08,0.0,0.75,0.17,0.0,2.0,0.0,1.0,0.0,0.0,0.0,5252.06,0.0,442.0,0.0,0.0,0.0,2626.03,0.0,442.0,0.0,0.0,0.0,0.92,0.0,0.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,972.4,0.0,0.0,0.0,0.0,442.0,4279.66,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,972.4,0.0,0.0,0.0,0.0,442.0,4279.66,0.0,0.0,0.0
2,266785,2011-03-15,2013-02-13,8,5.0,657.0,169.75,657.0,8,2,5,33,3351.6,35271.6,2014-12-02,3.72,03_04,4.0,0.0,3.0,1.0,19474.52,0.0,12661.09,3135.99,4868.63,0.0,4220.36,3135.99,0.55,0.0,0.36,0.09,1.0,1.0,0.0,0.0,4.0,2.0,682.89,5066.43,0.0,0.0,15864.49,13657.8,682.89,5066.43,0.0,0.0,3966.12,6828.9,0.02,0.14,0.0,0.0,0.45,0.39,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5066.43,0.0,0.0,0.0,0.0,0.0,10047.76,13657.8,6499.61,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5066.43,0.0,0.0,0.0,0.0,0.0,5023.88,6828.9,2166.54,0.0,0.0,0.0,0.0,0.0
3,266788,2011-09-13,2013-12-02,4,84.0,397.0,294.0,365.0,4,1,4,8,578.97,6092.97,2014-12-02,3.22,03_04,1.0,1.0,0.0,2.0,1367.99,1485.12,0.0,3239.86,1367.99,1485.12,0.0,1619.93,0.22,0.24,0.0,0.53,1.0,1.0,0.0,0.0,2.0,0.0,1485.12,1367.99,0.0,0.0,3239.86,0.0,1485.12,1367.99,0.0,0.0,1619.93,0.0,0.24,0.22,0.0,0.0,0.53,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1367.99,0.0,1927.12,0.0,2797.86,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1367.99,0.0,1927.12,0.0,1398.93,0.0,0.0,0.0,0.0,0.0
4,266794,2011-03-18,2014-12-02,11,0.0,520.0,112.92,0.0,12,1,4,32,2684.74,28253.75,2014-12-02,3.71,03_04,1.0,3.0,1.0,7.0,718.25,9275.37,4610.06,13650.07,718.25,3091.79,4610.06,1950.01,0.03,0.33,0.16,0.48,2.0,2.0,2.0,4.0,2.0,0.0,5692.96,8380.32,4099.55,5600.14,4480.78,0.0,2846.48,4190.16,2049.78,1400.04,2240.39,0.0,0.2,0.3,0.15,0.2,0.16,0.0,1.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,1.0,0.0,3.0,3964.74,990.08,0.0,0.0,4415.58,0.0,0.0,0.0,0.0,0.0,4099.55,0.0,2948.14,0.0,0.0,4610.06,0.0,7225.6,3964.74,330.03,0.0,0.0,4415.58,0.0,0.0,0.0,0.0,0.0,2049.78,0.0,2948.14,0.0,0.0,4610.06,0.0,2408.53


In [38]:
df_trans_overall['Academic.prop'] = df_trans_overall['sum.Academic']/df_trans_overall['total_amt.sum']
df_trans_overall['Audio and video.prop'] = df_trans_overall['sum.Audio and video']/df_trans_overall['total_amt.sum']
df_trans_overall['Bath.prop'] = df_trans_overall['sum.Bath']/df_trans_overall['total_amt.sum']
df_trans_overall['Cameras.prop'] = df_trans_overall['sum.Cameras']/df_trans_overall['total_amt.sum']
df_trans_overall['Children.prop'] = df_trans_overall['sum.Children']/df_trans_overall['total_amt.sum']
df_trans_overall['Comics.prop'] = df_trans_overall['sum.Comics']/df_trans_overall['total_amt.sum']
df_trans_overall['Computers.prop'] = df_trans_overall['sum.Computers']/df_trans_overall['total_amt.sum']
df_trans_overall['DIY.prop'] = df_trans_overall['sum.DIY']/df_trans_overall['total_amt.sum']
df_trans_overall['Fiction.prop'] = df_trans_overall['sum.Fiction']/df_trans_overall['total_amt.sum']
df_trans_overall['Furnishing.prop'] = df_trans_overall['sum.Furnishing']/df_trans_overall['total_amt.sum']
df_trans_overall['Kids.prop'] = df_trans_overall['sum.Kids']/df_trans_overall['total_amt.sum']
df_trans_overall['Kitchen.prop'] = df_trans_overall['sum.Kitchen']/df_trans_overall['total_amt.sum']
df_trans_overall['Mens.prop'] = df_trans_overall['sum.Mens']/df_trans_overall['total_amt.sum']
df_trans_overall['Mobiles.prop'] = df_trans_overall['sum.Mobiles']/df_trans_overall['total_amt.sum']
df_trans_overall['Non-Fiction.prop'] = df_trans_overall['sum.Non-Fiction']/df_trans_overall['total_amt.sum']
df_trans_overall['Personal Appliances.prop'] = df_trans_overall['sum.Personal Appliances']/df_trans_overall['total_amt.sum']
df_trans_overall['Tools.prop'] = df_trans_overall['sum.Tools']/df_trans_overall['total_amt.sum']
df_trans_overall['Women.prop'] = df_trans_overall['sum.Women']/df_trans_overall['total_amt.sum']

In [39]:
df_trans_overall.head()

Unnamed: 0,customer_id,conversion_date,last_purchase_date,number_of_purchase_days,duration.min,duration.max,duration.mean,duration.last,trans_id.count,qty.min,qty.max,qty.sum,tax.sum,total_amt.sum,assessment_date,customer_age,customer_age.group,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,mean.Flagship store,mean.MBR,mean.TeleShop,mean.e-Shop,Flagship_store_spend.prop,MBR_spend.prop,TeleShop_spend.prop,e-Shop.prop,count.Bags,count.Books,count.Clothing,count.Electronics,count.Footwear,count.Home and kitchen,sum.Bags,sum.Books,sum.Clothing,sum.Electronics,sum.Footwear,sum.Home and kitchen,mean.Bags,mean.Books,mean.Clothing,mean.Electronics,mean.Footwear,mean.Home and kitchen,Bags.prop,Books.prop,Clothing.prop,Electronics.prop,Footwear.prop,Home and kitchen.prop,count.Academic,count.Audio and video,count.Bath,count.Cameras,count.Children,count.Comics,count.Computers,count.DIY,count.Fiction,count.Furnishing,count.Kids,count.Kitchen,count.Mens,count.Mobiles,count.Non-Fiction,count.Personal Appliances,count.Tools,count.Women,sum.Academic,sum.Audio and video,sum.Bath,sum.Cameras,sum.Children,sum.Comics,sum.Computers,sum.DIY,sum.Fiction,sum.Furnishing,sum.Kids,sum.Kitchen,sum.Mens,sum.Mobiles,sum.Non-Fiction,sum.Personal Appliances,sum.Tools,sum.Women,mean.Academic,mean.Audio and video,mean.Bath,mean.Cameras,mean.Children,mean.Comics,mean.Computers,mean.DIY,mean.Fiction,mean.Furnishing,mean.Kids,mean.Kitchen,mean.Mens,mean.Mobiles,mean.Non-Fiction,mean.Personal Appliances,mean.Tools,mean.Women,Academic.prop,Audio and video.prop,Bath.prop,Cameras.prop,Children.prop,Comics.prop,Computers.prop,DIY.prop,Fiction.prop,Furnishing.prop,Kids.prop,Kitchen.prop,Mens.prop,Mobiles.prop,Non-Fiction.prop,Personal Appliances.prop,Tools.prop,Women.prop
0,266783,2011-09-23,2013-09-02,5,1.0,456.0,233.2,456.0,5,1,4,14,1405.53,14791.53,2014-12-02,3.19,03_04,0.0,0.0,1.0,4.0,0.0,0.0,308.3,14483.24,0.0,0.0,308.3,3620.81,0.0,0.0,0.02,0.98,0.0,1.0,2.0,0.0,2.0,0.0,0.0,1845.35,1268.54,0.0,11677.64,0.0,0.0,1845.35,634.27,0.0,5838.82,0.0,0.0,0.12,0.09,0.0,0.79,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12946.18,0.0,1845.35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3236.55,0.0,1845.35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.88,0.0,0.12,0.0,0.0,0.0
1,266784,2012-04-12,2012-09-09,3,17.0,814.0,321.33,814.0,3,2,5,10,541.07,5694.06,2014-12-02,2.64,02_03,1.0,0.0,1.0,1.0,442.0,0.0,4279.66,972.4,442.0,0.0,4279.66,972.4,0.08,0.0,0.75,0.17,0.0,2.0,0.0,1.0,0.0,0.0,0.0,5252.06,0.0,442.0,0.0,0.0,0.0,2626.03,0.0,442.0,0.0,0.0,0.0,0.92,0.0,0.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,972.4,0.0,0.0,0.0,0.0,442.0,4279.66,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,972.4,0.0,0.0,0.0,0.0,442.0,4279.66,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17,0.0,0.0,0.0,0.0,0.08,0.75,0.0,0.0,0.0
2,266785,2011-03-15,2013-02-13,8,5.0,657.0,169.75,657.0,8,2,5,33,3351.6,35271.6,2014-12-02,3.72,03_04,4.0,0.0,3.0,1.0,19474.52,0.0,12661.09,3135.99,4868.63,0.0,4220.36,3135.99,0.55,0.0,0.36,0.09,1.0,1.0,0.0,0.0,4.0,2.0,682.89,5066.43,0.0,0.0,15864.49,13657.8,682.89,5066.43,0.0,0.0,3966.12,6828.9,0.02,0.14,0.0,0.0,0.45,0.39,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5066.43,0.0,0.0,0.0,0.0,0.0,10047.76,13657.8,6499.61,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5066.43,0.0,0.0,0.0,0.0,0.0,5023.88,6828.9,2166.54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14,0.0,0.0,0.0,0.0,0.0,0.28,0.39,0.18,0.0,0.0,0.0,0.0,0.0
3,266788,2011-09-13,2013-12-02,4,84.0,397.0,294.0,365.0,4,1,4,8,578.97,6092.97,2014-12-02,3.22,03_04,1.0,1.0,0.0,2.0,1367.99,1485.12,0.0,3239.86,1367.99,1485.12,0.0,1619.93,0.22,0.24,0.0,0.53,1.0,1.0,0.0,0.0,2.0,0.0,1485.12,1367.99,0.0,0.0,3239.86,0.0,1485.12,1367.99,0.0,0.0,1619.93,0.0,0.24,0.22,0.0,0.0,0.53,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1367.99,0.0,1927.12,0.0,2797.86,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1367.99,0.0,1927.12,0.0,1398.93,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.22,0.0,0.32,0.0,0.46,0.0,0.0,0.0,0.0,0.0
4,266794,2011-03-18,2014-12-02,11,0.0,520.0,112.92,0.0,12,1,4,32,2684.74,28253.75,2014-12-02,3.71,03_04,1.0,3.0,1.0,7.0,718.25,9275.37,4610.06,13650.07,718.25,3091.79,4610.06,1950.01,0.03,0.33,0.16,0.48,2.0,2.0,2.0,4.0,2.0,0.0,5692.96,8380.32,4099.55,5600.14,4480.78,0.0,2846.48,4190.16,2049.78,1400.04,2240.39,0.0,0.2,0.3,0.15,0.2,0.16,0.0,1.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,1.0,0.0,3.0,3964.74,990.08,0.0,0.0,4415.58,0.0,0.0,0.0,0.0,0.0,4099.55,0.0,2948.14,0.0,0.0,4610.06,0.0,7225.6,3964.74,330.03,0.0,0.0,4415.58,0.0,0.0,0.0,0.0,0.0,2049.78,0.0,2948.14,0.0,0.0,4610.06,0.0,2408.53,0.14,0.04,0.0,0.0,0.16,0.0,0.0,0.0,0.0,0.0,0.15,0.0,0.1,0.0,0.0,0.16,0.0,0.26


### Customer Data

In [40]:
df_customer['customer_status'] = np.where((df_customer['customer_id'].isin(converted_customers)), 'converted', 'voluntarily churned - not converted')
df_customer['customer_status'].value_counts(dropna=False)

converted                              5506
voluntarily churned - not converted     141
Name: customer_status, dtype: int64

In [41]:
df_customer['dob'] = pd.to_datetime(df_customer['dob'])
df_customer['dob'].sort_values().min()
df_customer['dob'].sort_values().max()

Timestamp('1970-01-02 00:00:00')

Timestamp('1992-12-29 00:00:00')

In [42]:
df_customer.tail()

Unnamed: 0,customer_id,dob,gender,city_code,customer_status,conversion_date,last_purchase_date,number_of_purchase_days,duration.min,duration.max,duration.mean,duration.last,trans_id.count,qty.min,qty.max,qty.sum,tax.sum,total_amt.sum,assessment_date,customer_age,customer_age.group,count.flagship store,count.mbr,count.teleshop,count.e-shop,sum.flagship store,sum.mbr,sum.teleshop,sum.e-shop,mean.flagship store,mean.mbr,mean.teleshop,mean.e-shop,flagship_store_spend.prop,mbr_spend.prop,teleshop_spend.prop,e-shop.prop,count.bags,count.books,count.clothing,count.electronics,count.footwear,count.home and kitchen,sum.bags,sum.books,sum.clothing,sum.electronics,sum.footwear,sum.home and kitchen,mean.bags,mean.books,mean.clothing,mean.electronics,mean.footwear,mean.home and kitchen,bags.prop,books.prop,clothing.prop,electronics.prop,footwear.prop,home and kitchen.prop,count.academic,count.audio and video,count.bath,count.cameras,count.children,count.comics,count.computers,count.diy,count.fiction,count.furnishing,count.kids,count.kitchen,count.mens,count.mobiles,count.non-fiction,count.personal appliances,count.tools,count.women,sum.academic,sum.audio and video,sum.bath,sum.cameras,sum.children,sum.comics,sum.computers,sum.diy,sum.fiction,sum.furnishing,sum.kids,sum.kitchen,sum.mens,sum.mobiles,sum.non-fiction,sum.personal appliances,sum.tools,sum.women,mean.academic,mean.audio and video,mean.bath,mean.cameras,mean.children,mean.comics,mean.computers,mean.diy,mean.fiction,mean.furnishing,mean.kids,mean.kitchen,mean.mens,mean.mobiles,mean.non-fiction,mean.personal appliances,mean.tools,mean.women,academic.prop,audio and video.prop,bath.prop,cameras.prop,children.prop,comics.prop,computers.prop,diy.prop,fiction.prop,furnishing.prop,kids.prop,kitchen.prop,mens.prop,mobiles.prop,non-fiction.prop,personal appliances.prop,tools.prop,women.prop,biological_age.actual,biological_age,biological_age.group,repeat_purchaser
5642,274474,1992-12-19,M,2.0,converted,2011-05-08,2011-08-31,3.0,42.0,1189.0,434.67,1189.0,3.0,1.0,2.0,4.0,323.4,3403.4,2014-12-02,3.57,03_04,0.0,0.0,0.0,3.0,0.0,0.0,0.0,3403.4,0.0,0.0,0.0,1134.47,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,277.36,177.91,2948.14,0.0,0.0,0.0,277.36,177.91,2948.14,0.0,0.0,0.0,0.08,0.05,0.87,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,277.36,0.0,0.0,0.0,0.0,0.0,2948.14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,177.91,0.0,0.0,0.0,277.36,0.0,0.0,0.0,0.0,0.0,2948.14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,177.91,0.0,0.0,0.0,0.08,0.0,0.0,0.0,0.0,0.0,0.87,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,21.95,22.0,22_25,yes
5643,267666,1992-12-24,M,6.0,converted,2011-10-23,2014-07-01,5.0,118.0,342.0,227.2,154.0,5.0,2.0,5.0,18.0,1401.86,14752.85,2014-12-02,3.11,03_04,0.0,2.0,0.0,3.0,0.0,9017.91,0.0,5734.95,0.0,4508.95,0.0,1911.65,0.0,0.61,0.0,0.39,0.0,2.0,0.0,0.0,0.0,3.0,0.0,8454.35,0.0,0.0,0.0,6298.5,0.0,4227.18,0.0,0.0,0.0,2099.5,0.0,0.57,0.0,0.0,0.0,0.43,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1352.52,0.0,0.0,7337.2,0.0,0.0,1117.15,1680.7,0.0,0.0,0.0,0.0,0.0,0.0,3265.28,0.0,0.0,0.0,1352.52,0.0,0.0,7337.2,0.0,0.0,1117.15,1680.7,0.0,0.0,0.0,0.0,0.0,0.0,3265.28,0.0,0.0,0.0,0.09,0.0,0.0,0.5,0.0,0.0,0.08,0.11,0.0,0.0,0.0,0.0,0.0,0.0,0.22,0.0,21.94,22.0,22_25,yes
5644,270476,1992-12-25,F,3.0,converted,2011-09-08,2013-08-13,5.0,91.0,476.0,236.2,476.0,5.0,1.0,5.0,18.0,1601.14,16850.14,2014-12-02,3.23,03_04,2.0,1.0,1.0,1.0,11450.01,1595.62,1072.95,2731.56,5725.01,1595.62,1072.95,2731.56,0.68,0.09,0.06,0.16,1.0,0.0,0.0,2.0,1.0,1.0,2731.56,0.0,0.0,9076.47,3969.16,1072.95,2731.56,0.0,0.0,4538.24,3969.16,1072.95,0.16,0.0,0.0,0.54,0.24,0.06,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1595.62,1072.95,0.0,0.0,0.0,7480.85,0.0,0.0,0.0,3969.16,0.0,0.0,0.0,0.0,0.0,0.0,2731.56,0.0,1595.62,1072.95,0.0,0.0,0.0,7480.85,0.0,0.0,0.0,3969.16,0.0,0.0,0.0,0.0,0.0,0.0,2731.56,0.0,0.09,0.06,0.0,0.0,0.0,0.44,0.0,0.0,0.0,0.24,0.0,0.0,0.0,0.0,0.0,0.0,0.16,21.94,22.0,22_25,yes
5645,269626,1992-12-27,F,5.0,converted,2011-07-07,2011-10-18,2.0,103.0,1141.0,622.0,1141.0,2.0,1.0,3.0,4.0,406.56,4278.56,2014-12-02,3.41,03_04,0.0,1.0,0.0,1.0,0.0,3573.57,0.0,704.99,0.0,3573.57,0.0,704.99,0.0,0.84,0.0,0.16,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,3573.57,0.0,0.0,704.99,0.0,0.0,3573.57,0.0,0.0,704.99,0.0,0.0,0.84,0.0,0.0,0.16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3573.57,0.0,0.0,0.0,0.0,0.0,704.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3573.57,0.0,0.0,0.0,0.0,0.0,704.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.84,0.0,0.0,0.0,0.0,0.0,0.16,0.0,21.93,22.0,22_25,yes
5646,274308,1992-12-29,F,5.0,converted,2012-04-07,2012-10-22,3.0,32.0,771.0,323.0,771.0,3.0,2.0,5.0,11.0,865.2,9105.2,2014-12-02,2.65,02_03,0.0,1.0,0.0,2.0,0.0,1984.58,0.0,7120.62,0.0,1984.58,0.0,3560.31,0.0,0.22,0.0,0.78,0.0,1.0,0.0,0.0,2.0,0.0,0.0,3248.7,0.0,0.0,5856.5,0.0,0.0,3248.7,0.0,0.0,2928.25,0.0,0.0,0.36,0.0,0.0,0.64,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,3248.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5856.5,0.0,0.0,0.0,0.0,0.0,3248.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2928.25,0.0,0.0,0.0,0.0,0.0,0.36,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.64,0.0,0.0,0.0,0.0,0.0,21.93,22.0,22_25,yes


In [43]:
df_transactions['trans_date.day'] = df_transactions['trans_date'].dt.day
df_transactions['trans_date.month_num'] = df_transactions['trans_date'].dt.month.map("{:02}".format)
df_transactions['trans_date.year'] = df_transactions['trans_date'].dt.year
df_transactions['trans_date.year_month'] = df_transactions['trans_date'].dt.year.map(str) + '_' + df_transactions['trans_date'].dt.month.map("{:02}".format)
df_transactions['trans_date.hour'] = df_transactions['trans_date'].dt.hour
df_transactions['trans_date.weekday'] = df_transactions['trans_date'].dt.day_name()
df_transactions['trans_date.week_of_year'] = df_transactions['trans_date'].dt.week.map("{:02}".format)

In [44]:
df_transactions['trans_date.weekday'].head()

18130       Friday
18114     Saturday
10030       Sunday
7515     Wednesday
7722        Monday
Name: trans_date.weekday, dtype: object

In [45]:
month_name = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_num = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
month_dict = dict(zip(month_num, month_name))

for k, v in month_dict.items():
    mask = df_transactions['trans_date.month_num'].str.contains(k, case=True)

    df_transactions.loc[mask,'trans_date.month'] = v

In [46]:
weekday_name = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
weekday_num = ['01', '02', '03', '04', '05', '06', '07']
weekday_dict = dict(zip(weekday_name, weekday_num))

for k, v in weekday_dict.items():
    mask = df_transactions['trans_date.weekday'].str.contains(k, case=True)

    df_transactions.loc[mask, 'trans_date.weekday_num'] = v

In [47]:
def bin_week(x):
    if ( x>= 1) and (x <= 7):
        return 'month.week1'

    elif (x >= 8) and (x <= 14):
        return 'month.week2'

    elif (x >= 15) and (x <= 21):
        return 'month.week3'

    else:
        return 'month.week4'

In [48]:
df_transactions['trans_date.week_of_month'] = df_transactions['trans_date'].dt.day.apply(bin_week)

In [49]:
# df_transactions['trans_date.hour'].describe()
# df_transactions['trans_date.time_of_day']

In [50]:
df_transactions.head()

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,tax,total_amt,store_type,prod_cat,prod_subcat,counter,assessment_date,duration,trans_date.day,trans_date.month_num,trans_date.year,trans_date.year_month,trans_date.hour,trans_date.weekday,trans_date.week_of_year,trans_date.month,trans_date.weekday_num,trans_date.week_of_month
18130,25890929042,266783,2011-09-23,1,2,4,1321,554.82,5838.82,e-Shop,Footwear,Mens,1,2014-12-02,1.0,23,9,2011,2011_09,0,Friday,38,Sep,6,month.week4
18114,25890929042,266783,2011-09-24,1,2,4,1321,554.82,5838.82,e-Shop,Footwear,Mens,1,2014-12-02,393.0,24,9,2011,2011_09,0,Saturday,38,Sep,7,month.week4
10030,98477711300,266783,2012-10-21,4,1,3,93,29.3,308.3,TeleShop,Clothing,Mens,1,2014-12-02,122.0,21,10,2012,2012_10,0,Sunday,42,Oct,1,month.week3
7515,8410316370,266783,2013-02-20,4,1,1,869,91.25,960.25,e-Shop,Clothing,Mens,1,2014-12-02,194.0,20,2,2013,2013_02,0,Wednesday,8,Feb,4,month.week3
7722,16999552161,266783,2013-09-02,10,5,2,835,175.35,1845.35,e-Shop,Books,Non-Fiction,1,2014-12-02,456.0,2,9,2013,2013_09,0,Monday,36,Sep,2,month.week1


In [51]:
df_transactions[df_transactions['customer_id'] == 268624]

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,tax,total_amt,store_type,prod_cat,prod_subcat,counter,assessment_date,duration,trans_date.day,trans_date.month_num,trans_date.year,trans_date.year_month,trans_date.hour,trans_date.weekday,trans_date.week_of_year,trans_date.month,trans_date.weekday_num,trans_date.week_of_month
22899,36332303449,268624,2011-01-02,10,6,4,295,123.9,1303.9,Flagship store,Home and kitchen,Kitchen,1,2014-12-02,25.0,2,1,2011,2011_01,0,Sunday,52,Jan,1,month.week1
22998,36332303449,268624,2011-01-27,10,6,4,295,123.9,1303.9,Flagship store,Home and kitchen,Kitchen,1,2014-12-02,53.0,27,1,2011,2011_01,0,Thursday,4,Jan,5,month.week4
21919,66278698494,268624,2011-03-21,1,2,1,418,43.89,461.89,TeleShop,Footwear,Mens,1,2014-12-02,186.0,21,3,2011,2011_03,0,Monday,12,Mar,2,month.week3
18144,89509659612,268624,2011-09-23,8,3,5,560,294.0,3094.0,MBR,Electronics,Personal Appliances,1,2014-12-02,383.0,23,9,2011,2011_09,0,Friday,38,Sep,6,month.week4
10280,71442479722,268624,2012-10-10,2,6,3,1477,465.25,4896.26,Flagship store,Home and kitchen,Furnishing,1,2014-12-02,168.0,10,10,2012,2012_10,0,Wednesday,41,Oct,4,month.week2
6770,26264839788,268624,2013-03-27,1,1,2,710,149.1,1569.1,e-Shop,Clothing,Women,1,2014-12-02,112.0,27,3,2013,2013_03,0,Wednesday,13,Mar,4,month.week4
4548,39847252322,268624,2013-07-17,1,2,5,1422,746.55,7856.55,Flagship store,Footwear,Mens,1,2014-12-02,63.0,17,7,2013,2013_07,0,Wednesday,29,Jul,4,month.week3
3244,31803201960,268624,2013-09-18,11,5,5,1437,754.42,7939.43,e-Shop,Books,Children,1,2014-12-02,440.0,18,9,2013,2013_09,0,Wednesday,38,Sep,4,month.week3


### Master File

In [52]:
df_master = pd.merge(df_customer, df_trans_overall,  on=['customer_id'], how='left')
# df_master['assessment_date'] = df_transactions['trans_date'].sort_values().max()
mt.check_unique_no(df_master, ['customer_id'])
df_master.head()

Data has 5647 unique customer_id


Unnamed: 0,customer_id,dob,gender,city_code,customer_status,conversion_date_x,last_purchase_date_x,number_of_purchase_days_x,duration.min_x,duration.max_x,duration.mean_x,duration.last_x,trans_id.count_x,qty.min_x,qty.max_x,qty.sum_x,tax.sum_x,total_amt.sum_x,assessment_date_x,customer_age_x,customer_age.group_x,count.flagship store,count.mbr,count.teleshop,count.e-shop,sum.flagship store,sum.mbr,sum.teleshop,sum.e-shop,mean.flagship store,mean.mbr,mean.teleshop,mean.e-shop,flagship_store_spend.prop,mbr_spend.prop,teleshop_spend.prop,e-shop.prop,count.bags,count.books,count.clothing,count.electronics,count.footwear,count.home and kitchen,sum.bags,sum.books,sum.clothing,sum.electronics,sum.footwear,sum.home and kitchen,mean.bags,mean.books,mean.clothing,mean.electronics,mean.footwear,mean.home and kitchen,bags.prop,books.prop,clothing.prop,electronics.prop,footwear.prop,home and kitchen.prop,count.academic,count.audio and video,count.bath,count.cameras,count.children,count.comics,count.computers,count.diy,count.fiction,count.furnishing,count.kids,count.kitchen,count.mens,count.mobiles,...,Electronics.prop,Footwear.prop,Home and kitchen.prop,count.Academic,count.Audio and video,count.Bath,count.Cameras,count.Children,count.Comics,count.Computers,count.DIY,count.Fiction,count.Furnishing,count.Kids,count.Kitchen,count.Mens,count.Mobiles,count.Non-Fiction,count.Personal Appliances,count.Tools,count.Women,sum.Academic,sum.Audio and video,sum.Bath,sum.Cameras,sum.Children,sum.Comics,sum.Computers,sum.DIY,sum.Fiction,sum.Furnishing,sum.Kids,sum.Kitchen,sum.Mens,sum.Mobiles,sum.Non-Fiction,sum.Personal Appliances,sum.Tools,sum.Women,mean.Academic,mean.Audio and video,mean.Bath,mean.Cameras,mean.Children,mean.Comics,mean.Computers,mean.DIY,mean.Fiction,mean.Furnishing,mean.Kids,mean.Kitchen,mean.Mens,mean.Mobiles,mean.Non-Fiction,mean.Personal Appliances,mean.Tools,mean.Women,Academic.prop,Audio and video.prop,Bath.prop,Cameras.prop,Children.prop,Comics.prop,Computers.prop,DIY.prop,Fiction.prop,Furnishing.prop,Kids.prop,Kitchen.prop,Mens.prop,Mobiles.prop,Non-Fiction.prop,Personal Appliances.prop,Tools.prop,Women.prop
0,268408,1970-02-01,M,4.0,converted,2011-12-07,2014-01-13,11.0,4.0,323.0,99.18,323.0,11.0,1.0,5.0,43.0,2526.93,26592.93,2014-12-02,2.99,02_03,5.0,1.0,3.0,2.0,8538.33,6491.88,3894.02,7668.7,1707.67,6491.88,1298.01,3834.35,0.32,0.24,0.15,0.29,3.0,1.0,1.0,2.0,2.0,2.0,3384.61,1033.17,890.63,7668.7,7526.15,6089.66,1128.2,1033.17,890.63,3834.35,3763.08,3044.83,0.13,0.04,0.03,0.29,0.28,0.23,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,2.0,3.0,0.0,...,0.29,0.28,0.23,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,2.0,3.0,0.0,0.0,0.0,0.0,2.0,0.0,2873.0,0.0,4795.7,0.0,0.0,0.0,0.0,1033.17,0.0,1034.28,6089.66,8812.38,0.0,0.0,0.0,0.0,1954.74,0.0,2873.0,0.0,4795.7,0.0,0.0,0.0,0.0,1033.17,0.0,1034.28,3044.83,2937.46,0.0,0.0,0.0,0.0,977.37,0.0,0.11,0.0,0.18,0.0,0.0,0.0,0.0,0.04,0.0,0.04,0.23,0.33,0.0,0.0,0.0,0.0,0.07
1,269696,1970-07-01,F,8.0,converted,2011-09-18,2012-08-04,3.0,111.0,850.0,390.33,850.0,3.0,3.0,4.0,11.0,1043.91,10985.91,2014-12-02,3.21,03_04,0.0,3.0,0.0,0.0,0.0,10985.91,0.0,0.0,0.0,3661.97,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,10985.91,0.0,0.0,0.0,0.0,0.0,3661.97,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6497.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4488.51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3248.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4488.51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.41,0.0,0.0,0.0,0.0
2,268159,1970-08-01,F,8.0,converted,2012-06-02,2013-03-31,6.0,21.0,611.0,152.17,611.0,6.0,1.0,5.0,19.0,1699.85,17888.84,2014-12-02,2.5,02_03,0.0,1.0,1.0,4.0,0.0,1182.35,7458.75,9247.75,0.0,1182.35,7458.75,2311.94,0.0,0.07,0.42,0.52,1.0,1.0,2.0,0.0,0.0,2.0,779.02,1182.35,8141.64,0.0,0.0,7785.83,779.02,1182.35,4070.82,0.0,0.0,3892.91,0.04,0.07,0.46,0.0,0.0,0.44,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,0.0,...,0.0,0.0,0.44,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,327.08,0.0,1182.35,0.0,0.0,0.0,0.0,0.0,8141.64,7458.75,779.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,327.08,0.0,1182.35,0.0,0.0,0.0,0.0,0.0,4070.82,7458.75,779.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.07,0.0,0.0,0.0,0.0,0.0,0.46,0.42,0.04,0.0,0.0,0.0,0.0,0.0
3,270181,1970-10-01,F,2.0,converted,2011-03-18,2014-09-01,10.0,2.0,306.0,135.5,92.0,10.0,1.0,5.0,28.0,2440.31,25681.31,2014-12-02,3.71,03_04,3.0,1.0,1.0,5.0,8428.94,408.85,617.7,16225.82,2809.65,408.85,617.7,3245.16,0.33,0.02,0.02,0.63,0.0,4.0,1.0,2.0,1.0,2.0,0.0,7232.23,5348.2,6532.76,1962.48,4605.64,0.0,1808.06,5348.2,3266.38,1962.48,2302.82,0.0,0.28,0.21,0.25,0.08,0.18,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,1.0,0.0,...,0.25,0.08,0.18,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,408.85,0.0,0.0,0.0,6823.38,0.0,0.0,0.0,0.0,4605.64,5348.2,0.0,1962.48,0.0,0.0,6532.76,0.0,0.0,408.85,0.0,0.0,0.0,2274.46,0.0,0.0,0.0,0.0,2302.82,5348.2,0.0,1962.48,0.0,0.0,3266.38,0.0,0.0,0.02,0.0,0.0,0.0,0.27,0.0,0.0,0.0,0.0,0.18,0.21,0.0,0.08,0.0,0.0,0.25,0.0,0.0
4,268073,1970-11-01,M,1.0,converted,2011-11-19,2013-12-30,3.0,1.0,771.0,369.67,337.0,3.0,1.0,5.0,7.0,896.28,9432.28,2014-12-02,3.04,03_04,0.0,3.0,0.0,0.0,0.0,9432.28,0.0,0.0,0.0,3144.09,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,7956.0,0.0,0.0,1476.28,0.0,0.0,7956.0,0.0,0.0,738.14,0.0,0.0,0.84,0.0,0.0,0.16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7956.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1476.28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7956.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,738.14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.84,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.16


In [53]:
df_master['customer_status'].value_counts(dropna=False)

converted                              5506
voluntarily churned - not converted     141
Name: customer_status, dtype: int64

In [54]:
df_master['biological_age.actual'] = (df_master['assessment_date'] - df_master['dob'])/np.timedelta64(1, 'Y')

KeyError: 'assessment_date'

In [None]:
df_master['biological_age.actual'].describe()

In [None]:
df_master['biological_age'] = np.round(df_master['biological_age.actual'])

In [None]:
df_master['biological_age'].describe()

In [None]:
age_bins =  [21, 25, 30, 35, 40, 45]
labels = ['22_25','25_30', '30_35', '35_40', '40_45']
df_master['biological_age.group'] = pd.cut(df_master['biological_age'], age_bins, labels = labels,include_lowest = True)

In [None]:
df_master['number_of_purchase_days'].describe()

In [None]:
df_master['number_of_purchase_days'] =df_master['number_of_purchase_days'].replace(np.nan, 0)

In [None]:
df_master['repeat_purchaser'] = np.where((df_master['number_of_purchase_days'] > 1), 'yes',
                                      np.where((df_master['number_of_purchase_days']== 0),'never purchased',
                                        'no'))

In [None]:
df_master[df_master['customer_id'] == 266783]

In [None]:
df_master[df_master['customer_age'] == df_master['customer_age'].min()]

In [None]:
df_master[df_master['repeat_purchaser'] == 'yes'].head()

In [None]:
df_master['repeat_purchaser'].value_counts(dropna=False)

In [None]:
df_master['qty.sum'].sum()

In [None]:
df_master['total_amt.sum'].sum()

In [None]:
df_master['total_amt.sum'].sum() - df_master['tax.sum'].sum() 

In [None]:
df_master.to_csv(filepaths.raw_customer_data, index=False)