## Import Modules

In [1]:
# Set paths
import os
from imp import reload

# Data manipulation
import pandas as pd
import numpy as np
from scipy import stats

# Geolocation
import geonamescache

# Custom package for data preprocessing
import mytools as mt 

# Set notebook options
pd.set_option("display.max_columns", 150)
pd.set_option("display.max_rows", 10000)
pd.set_option("display.float_format", lambda x: '%.2f' % x)

# Pretty display of multiple functions in a cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### File Location

In [2]:
# Data paths
import filepaths

## Load Data

In [3]:
df_customer = pd.read_csv(filepaths.clean_customer_data_v1, sep=',')

df_cities = pd.read_csv(filepaths.clean_cities_data_v1, sep=',')

df_us_regions = pd.read_csv(filepaths.clean_us_regions_data_v1, sep=',')

df_transactions = pd.read_csv(filepaths.clean_transactions_data_v1, sep=',')

df_products = pd.read_csv(filepaths.raw_products_data, sep=',')

## Feature Engineering

### Transactions Data

In [4]:
converted_customers = df_transactions['customer_id'].unique().tolist()

In [5]:
df_transactions['trans_date'] = pd.to_datetime(df_transactions['trans_date'])

df_transactions = df_transactions.sort_values(['customer_id','trans_date'])

In [6]:
df_transactions['assessment_date'] = df_transactions['trans_date'].sort_values().max()

In [7]:
df_transactions['duration'] = df_transactions.groupby(['customer_id'])['trans_date'].transform(pd.Series.diff).shift(-1)
df_transactions['duration'] = df_transactions.apply(lambda r: r['assessment_date'] - r['trans_date'] if pd.isnull(r['duration']) else r['duration'], axis=1)
df_transactions['duration'] = (df_transactions['duration']/np.timedelta64(1, 'D'))

In [8]:
df_transactions.head(7)

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,tax,total_amt,store_type,prod_cat,prod_subcat,counter,product,returned_item,assessment_date,duration
2,25890929042,266783,2011-09-23,1,2,0,0,0.0,0.0,e-Shop,Footwear,Mens,0,Footwear_Mens,yes,2014-12-02,394.0
1,98477711300,266783,2012-10-21,4,1,3,93,29.3,308.3,TeleShop,Clothing,Mens,1,Clothing_Mens,no,2014-12-02,122.0
0,8410316370,266783,2013-02-20,4,1,1,869,91.25,960.25,e-Shop,Clothing,Mens,1,Clothing_Mens,no,2014-12-02,194.0
3,16999552161,266783,2013-09-02,10,5,2,835,175.35,1845.35,e-Shop,Books,Non-Fiction,1,Books_Non-Fiction,no,2014-12-02,456.0
5,36310127403,266784,2012-04-12,4,3,2,200,42.0,442.0,Flagship store,Electronics,Mobiles,1,Electronics_Mobiles,no,2014-12-02,133.0
4,54234600611,266784,2012-08-23,10,5,3,1291,406.67,4279.66,TeleShop,Books,Non-Fiction,1,Books_Non-Fiction,no,2014-12-02,17.0
6,26928161256,266784,2012-09-09,7,5,5,176,92.4,972.4,e-Shop,Books,Fiction,1,Books_Fiction,no,2014-12-02,814.0


In [9]:
df_trans_agg = df_transactions.groupby(['customer_id'], as_index=False).agg({'trans_date':['first', 'last', pd.Series.nunique],
                                                                                  'trans_id':'count',
                                                                                  'qty': 'sum',
                                                                                  'tax':'sum',
                                                                                  'total_amt':'sum', 'counter':'sum',
                                                                                  'duration':['min', 'max','mean', 'last', lambda x: x.median()]})

In [10]:
df_trans_agg.shape
df_trans_agg.columns = [".".join(x).strip('.') for x in df_trans_agg.columns.ravel()] 
df_trans_agg.rename(columns={'trans_date.first':'conversion_date', 'trans_date.last':'last_purchase_date',
                                'trans_date.nunique':'total_unique_purchase_days', 
                                'trans_id.count':'total_unique_transactions', 
                                'qty.sum':'total_items_purchased', 'tax.sum':'total_tax_paid', 'total_amt.sum':'total_amt_paid',
                                'counter.sum':'total_unique_trans_not_reversed', 
                                'duration.mean':'avg_duration_btwn_purchase_days', 'duration.last':'time_since_last_purchase.days',
                                'duration.min':'min_duration_btwn_purchase_days','duration.max':'max_duration_btwn_purchase_days',
                                'duration.<lambda_0>':'median_duration_btwn_purchase_days'}, inplace=True)
mt.check_unique_no(df_trans_agg, ['customer_id'])
df_trans_agg.head()

(5506, 14)

Data has 5506 unique customer_id


Unnamed: 0,customer_id,conversion_date,last_purchase_date,total_unique_purchase_days,total_unique_transactions,total_items_purchased,total_tax_paid,total_amt_paid,total_unique_trans_not_reversed,min_duration_btwn_purchase_days,max_duration_btwn_purchase_days,avg_duration_btwn_purchase_days,time_since_last_purchase.days,median_duration_btwn_purchase_days
0,266783,2011-09-23,2013-09-02,4,4,6,295.89,3113.89,3,122.0,456.0,291.5,456.0,294.0
1,266784,2012-04-12,2012-09-09,3,3,10,541.07,5694.06,3,17.0,814.0,321.33,814.0,133.0
2,266785,2011-03-15,2013-02-13,7,7,23,2053.8,21613.8,6,5.0,657.0,194.0,657.0,137.0
3,266788,2011-09-13,2013-12-02,4,4,8,578.97,6092.97,4,84.0,397.0,294.0,365.0,347.5
4,266794,2011-03-18,2014-12-02,10,11,30,2658.91,27981.92,10,0.0,521.0,123.18,0.0,116.0


In [11]:
# df_trans_agg[df_trans_agg['customer_id'] == 266852]

In [12]:
# df_transactions[df_transactions['customer_id'] == 266783]

In [13]:
# df_transactions[df_transactions['customer_id'] == 266794]

In [14]:
df_trans_agg['assessment_date'] = df_transactions['trans_date'].sort_values().max()

In [15]:
df_trans_agg['account_age.days'] = (df_trans_agg['assessment_date'] - df_trans_agg['conversion_date'])/np.timedelta64(1,'D')

In [16]:
df_trans_agg['account_age.years'] = (df_trans_agg['assessment_date'] - df_trans_agg['conversion_date'])/np.timedelta64(1,'Y')

In [17]:
age_bins =  [0, 1, 2, 3, 4]
labels = ['00_01','01_02', '02_03', '03_04']
df_trans_agg['account_age.years.group'] = pd.cut(df_trans_agg['account_age.years'], age_bins, labels = labels,include_lowest = True)

In [18]:
def customer_profile(_df, unique_id, value, featurelist):
    df_list = []
    for feature in featurelist:
        _df[feature] = _df[feature].astype(str)
        _df_temp = pd.crosstab(_df[unique_id], _df[feature], values=_df[value], 
                               aggfunc=['count','sum'], dropna=False).fillna(0).reset_index()
        _df_temp.columns = [".".join(x).strip('.') for x in _df_temp.columns.ravel()] 
        df_list.append(_df_temp)
        dfs = [df.set_index(unique_id) for df in df_list]
        df = pd.concat(dfs, axis=1)
        df.reset_index(inplace=True)
    return df

In [19]:
df_product_cat = customer_profile(df_transactions, 'customer_id', 'total_amt', ['product'])
df_product_cat.head()

Unnamed: 0,customer_id,count.Bags_Mens,count.Bags_Women,count.Books_Academic,count.Books_Children,count.Books_Comics,count.Books_DIY,count.Books_Fiction,count.Books_Non-Fiction,count.Clothing_Kids,count.Clothing_Mens,count.Clothing_Women,count.Electronics_Audio and video,count.Electronics_Cameras,count.Electronics_Computers,count.Electronics_Mobiles,count.Electronics_Personal Appliances,count.Footwear_Kids,count.Footwear_Mens,count.Footwear_Women,count.Home and kitchen_Bath,count.Home and kitchen_Furnishing,count.Home and kitchen_Kitchen,count.Home and kitchen_Tools,sum.Bags_Mens,sum.Bags_Women,sum.Books_Academic,sum.Books_Children,sum.Books_Comics,sum.Books_DIY,sum.Books_Fiction,sum.Books_Non-Fiction,sum.Clothing_Kids,sum.Clothing_Mens,sum.Clothing_Women,sum.Electronics_Audio and video,sum.Electronics_Cameras,sum.Electronics_Computers,sum.Electronics_Mobiles,sum.Electronics_Personal Appliances,sum.Footwear_Kids,sum.Footwear_Mens,sum.Footwear_Women,sum.Home and kitchen_Bath,sum.Home and kitchen_Furnishing,sum.Home and kitchen_Kitchen,sum.Home and kitchen_Tools
0,266783,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1845.35,0.0,1268.54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,266784,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,972.4,4279.66,0.0,0.0,0.0,0.0,0.0,0.0,442.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,266785,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,1.0,0.0,682.89,0.0,0.0,5066.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10047.76,5816.72,0.0,0.0,0.0,0.0,0.0
3,266788,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1485.12,0.0,0.0,0.0,0.0,0.0,1367.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1927.12,1312.74,0.0,0.0,0.0,0.0,0.0
4,266794,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2948.14,2744.82,3964.74,4415.58,0.0,0.0,0.0,0.0,4099.55,0.0,0.0,718.25,0.0,0.0,0.0,4610.06,0.0,0.0,4480.78,0.0,0.0,0.0,0.0


In [20]:
df_stores = customer_profile(df_transactions, 'customer_id', 'total_amt', ['store_type'])
df_stores.head()

Unnamed: 0,customer_id,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop
0,266783,0.0,0.0,1.0,3.0,0.0,0.0,308.3,2805.59
1,266784,1.0,0.0,1.0,1.0,442.0,0.0,4279.66,972.4
2,266785,3.0,0.0,3.0,1.0,5816.72,0.0,12661.09,3135.99
3,266788,1.0,1.0,0.0,2.0,1367.99,1485.12,0.0,3239.86
4,266794,1.0,3.0,1.0,6.0,718.25,9275.37,4610.06,13378.24


In [21]:
df_trans_agg = pd.merge(df_trans_agg, df_stores, on=['customer_id'], how='left')
df_trans_agg.head()

Unnamed: 0,customer_id,conversion_date,last_purchase_date,total_unique_purchase_days,total_unique_transactions,total_items_purchased,total_tax_paid,total_amt_paid,total_unique_trans_not_reversed,min_duration_btwn_purchase_days,max_duration_btwn_purchase_days,avg_duration_btwn_purchase_days,time_since_last_purchase.days,median_duration_btwn_purchase_days,assessment_date,account_age.days,account_age.years,account_age.years.group,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop
0,266783,2011-09-23,2013-09-02,4,4,6,295.89,3113.89,3,122.0,456.0,291.5,456.0,294.0,2014-12-02,1166.0,3.19,03_04,0.0,0.0,1.0,3.0,0.0,0.0,308.3,2805.59
1,266784,2012-04-12,2012-09-09,3,3,10,541.07,5694.06,3,17.0,814.0,321.33,814.0,133.0,2014-12-02,964.0,2.64,02_03,1.0,0.0,1.0,1.0,442.0,0.0,4279.66,972.4
2,266785,2011-03-15,2013-02-13,7,7,23,2053.8,21613.8,6,5.0,657.0,194.0,657.0,137.0,2014-12-02,1358.0,3.72,03_04,3.0,0.0,3.0,1.0,5816.72,0.0,12661.09,3135.99
3,266788,2011-09-13,2013-12-02,4,4,8,578.97,6092.97,4,84.0,397.0,294.0,365.0,347.5,2014-12-02,1176.0,3.22,03_04,1.0,1.0,0.0,2.0,1367.99,1485.12,0.0,3239.86
4,266794,2011-03-18,2014-12-02,10,11,30,2658.91,27981.92,10,0.0,521.0,123.18,0.0,116.0,2014-12-02,1355.0,3.71,03_04,1.0,3.0,1.0,6.0,718.25,9275.37,4610.06,13378.24


In [22]:
df_trans_agg['Flagship_store_spend.prop'] = df_trans_agg['sum.Flagship store']/df_trans_agg['total_amt_paid']
df_trans_agg['MBR_spend.prop'] = df_trans_agg['sum.MBR']/df_trans_agg['total_amt_paid']
df_trans_agg['TeleShop_spend.prop'] = df_trans_agg['sum.TeleShop']/df_trans_agg['total_amt_paid']
df_trans_agg['e-Shop.prop'] = df_trans_agg['sum.e-Shop']/df_trans_agg['total_amt_paid']

In [23]:
df_trans_agg = pd.merge(df_trans_agg, df_product_cat, on=['customer_id'], how='left')
df_trans_agg.head()

Unnamed: 0,customer_id,conversion_date,last_purchase_date,total_unique_purchase_days,total_unique_transactions,total_items_purchased,total_tax_paid,total_amt_paid,total_unique_trans_not_reversed,min_duration_btwn_purchase_days,max_duration_btwn_purchase_days,avg_duration_btwn_purchase_days,time_since_last_purchase.days,median_duration_btwn_purchase_days,assessment_date,account_age.days,account_age.years,account_age.years.group,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,Flagship_store_spend.prop,MBR_spend.prop,TeleShop_spend.prop,e-Shop.prop,count.Bags_Mens,count.Bags_Women,count.Books_Academic,count.Books_Children,count.Books_Comics,count.Books_DIY,count.Books_Fiction,count.Books_Non-Fiction,count.Clothing_Kids,count.Clothing_Mens,count.Clothing_Women,count.Electronics_Audio and video,count.Electronics_Cameras,count.Electronics_Computers,count.Electronics_Mobiles,count.Electronics_Personal Appliances,count.Footwear_Kids,count.Footwear_Mens,count.Footwear_Women,count.Home and kitchen_Bath,count.Home and kitchen_Furnishing,count.Home and kitchen_Kitchen,count.Home and kitchen_Tools,sum.Bags_Mens,sum.Bags_Women,sum.Books_Academic,sum.Books_Children,sum.Books_Comics,sum.Books_DIY,sum.Books_Fiction,sum.Books_Non-Fiction,sum.Clothing_Kids,sum.Clothing_Mens,sum.Clothing_Women,sum.Electronics_Audio and video,sum.Electronics_Cameras,sum.Electronics_Computers,sum.Electronics_Mobiles,sum.Electronics_Personal Appliances,sum.Footwear_Kids,sum.Footwear_Mens,sum.Footwear_Women,sum.Home and kitchen_Bath,sum.Home and kitchen_Furnishing,sum.Home and kitchen_Kitchen,sum.Home and kitchen_Tools
0,266783,2011-09-23,2013-09-02,4,4,6,295.89,3113.89,3,122.0,456.0,291.5,456.0,294.0,2014-12-02,1166.0,3.19,03_04,0.0,0.0,1.0,3.0,0.0,0.0,308.3,2805.59,0.0,0.0,0.1,0.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1845.35,0.0,1268.54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,266784,2012-04-12,2012-09-09,3,3,10,541.07,5694.06,3,17.0,814.0,321.33,814.0,133.0,2014-12-02,964.0,2.64,02_03,1.0,0.0,1.0,1.0,442.0,0.0,4279.66,972.4,0.08,0.0,0.75,0.17,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,972.4,4279.66,0.0,0.0,0.0,0.0,0.0,0.0,442.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,266785,2011-03-15,2013-02-13,7,7,23,2053.8,21613.8,6,5.0,657.0,194.0,657.0,137.0,2014-12-02,1358.0,3.72,03_04,3.0,0.0,3.0,1.0,5816.72,0.0,12661.09,3135.99,0.27,0.0,0.59,0.15,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,1.0,0.0,682.89,0.0,0.0,5066.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10047.76,5816.72,0.0,0.0,0.0,0.0,0.0
3,266788,2011-09-13,2013-12-02,4,4,8,578.97,6092.97,4,84.0,397.0,294.0,365.0,347.5,2014-12-02,1176.0,3.22,03_04,1.0,1.0,0.0,2.0,1367.99,1485.12,0.0,3239.86,0.22,0.24,0.0,0.53,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1485.12,0.0,0.0,0.0,0.0,0.0,1367.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1927.12,1312.74,0.0,0.0,0.0,0.0,0.0
4,266794,2011-03-18,2014-12-02,10,11,30,2658.91,27981.92,10,0.0,521.0,123.18,0.0,116.0,2014-12-02,1355.0,3.71,03_04,1.0,3.0,1.0,6.0,718.25,9275.37,4610.06,13378.24,0.03,0.33,0.16,0.48,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2948.14,2744.82,3964.74,4415.58,0.0,0.0,0.0,0.0,4099.55,0.0,0.0,718.25,0.0,0.0,0.0,4610.06,0.0,0.0,4480.78,0.0,0.0,0.0,0.0


In [24]:
df_transactions['trans_date.day'] = df_transactions['trans_date'].dt.day
df_transactions['trans_date.month_num'] = df_transactions['trans_date'].dt.month.map("{:02}".format)
df_transactions['trans_date.year'] = df_transactions['trans_date'].dt.year
df_transactions['trans_date.year_month'] = df_transactions['trans_date'].dt.year.map(str) + '_' + df_transactions['trans_date'].dt.month.map("{:02}".format)
df_transactions['trans_date.hour'] = df_transactions['trans_date'].dt.hour
df_transactions['trans_date.weekday'] = df_transactions['trans_date'].dt.day_name()
df_transactions['trans_date.week_of_year'] = df_transactions['trans_date'].dt.week.map("{:02}".format)

In [25]:
df_transactions['trans_date.weekday'].head()

2       Friday
1       Sunday
0    Wednesday
3       Monday
5     Thursday
Name: trans_date.weekday, dtype: object

In [26]:
month_name = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_num = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
month_dict = dict(zip(month_num, month_name))

for k, v in month_dict.items():
    mask = df_transactions['trans_date.month_num'].str.contains(k, case=True)

    df_transactions.loc[mask,'trans_date.month'] = v

In [27]:
df_transactions['trans_date.month'] = df_transactions['trans_date.month_num'] + '_' + df_transactions['trans_date.month'].map(str)

In [28]:
df_transactions['trans_date.month'].value_counts(dropna=False)

01_Jan    1869
10_Oct    1828
09_Sep    1789
03_Mar    1785
07_Jul    1746
12_Dec    1737
08_Aug    1729
11_Nov    1710
02_Feb    1687
05_May    1677
06_Jun    1666
04_Apr    1653
Name: trans_date.month, dtype: int64

In [29]:
weekday_name = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
weekday_num = ['01', '02', '03', '04', '05', '06', '07']
weekday_dict = dict(zip(weekday_name, weekday_num))

for k, v in weekday_dict.items():
    mask = df_transactions['trans_date.weekday'].str.contains(k, case=True)

    df_transactions.loc[mask, 'trans_date.weekday_num'] = v

In [30]:
def bin_week(x):
    if ( x>= 1) and (x <= 7):
        return 'month.week1'

    elif (x >= 8) and (x <= 14):
        return 'month.week2'

    elif (x >= 15) and (x <= 21):
        return 'month.week3'

    else:
        return 'month.week4'

In [31]:
df_transactions['trans_date.week_of_month'] = df_transactions['trans_date'].dt.day.apply(bin_week)

In [32]:
df_transactions['trans_date.weekday'] = df_transactions['trans_date.weekday_num'] + '_' + df_transactions['trans_date.weekday'].map(str)

In [33]:
df_transactions['trans_date.weekday'].value_counts(dropna=False)

01_Sunday       3101
05_Thursday     2998
07_Saturday     2990
03_Tuesday      2984
04_Wednesday    2948
02_Monday       2943
06_Friday       2912
Name: trans_date.weekday, dtype: int64

In [34]:
# df_transactions['trans_date.hour'].describe()
# df_transactions['trans_date.time_of_day']

In [35]:
df_transactions = df_transactions.sort_values(['customer_id', 'trans_date'])

In [36]:
df_conversion_date = df_transactions.groupby(['customer_id']).agg({'trans_date':'first'})

In [37]:
df_conversion_date.rename(columns={'trans_date':'conversion_date'}, inplace=True)

In [38]:
df_conversion_date.shape

(5506, 1)

In [39]:
df_transactions = pd.merge(df_transactions, df_conversion_date, on='customer_id', how='left')

In [40]:
df_transactions['conversion_date_cohort'] = df_transactions['conversion_date'].dt.year.map(str) + "_" + df_transactions['conversion_date'].dt.month.map("{:02}".format)

In [41]:
df_transactions['purchase_date_cohort'] = df_transactions['trans_date'].dt.year.map(str) + "_" + df_transactions['trans_date'].dt.month.map("{:02}".format)

In [42]:
df_transactions.head(2)

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,tax,total_amt,store_type,prod_cat,prod_subcat,counter,product,returned_item,assessment_date,duration,trans_date.day,trans_date.month_num,trans_date.year,trans_date.year_month,trans_date.hour,trans_date.weekday,trans_date.week_of_year,trans_date.month,trans_date.weekday_num,trans_date.week_of_month,conversion_date,conversion_date_cohort,purchase_date_cohort
0,25890929042,266783,2011-09-23,1,2,0,0,0.0,0.0,e-Shop,Footwear,Mens,0,Footwear_Mens,yes,2014-12-02,394.0,23,9,2011,2011_09,0,06_Friday,38,09_Sep,6,month.week4,2011-09-23,2011_09,2011_09
1,98477711300,266783,2012-10-21,4,1,3,93,29.3,308.3,TeleShop,Clothing,Mens,1,Clothing_Mens,no,2014-12-02,122.0,21,10,2012,2012_10,0,01_Sunday,42,10_Oct,1,month.week3,2011-09-23,2011_09,2012_10


In [43]:
df_earliest_purchase_date = df_conversion_date.copy()
df_earliest_purchase_date.rename(columns={'conversion_date':'trans_date'}, inplace=True)

In [44]:
# df_earliest_purchase_date['customer_type.period'] = 'new'

In [45]:
df_transactions = pd.merge(df_transactions, df_earliest_purchase_date, on=['customer_id', 'trans_date'], how='left')

In [46]:
# df_transactions['customer_type.period']  = np.where(df_transactions['customer_type.period'].isnull(), 'existing', df_transactions['customer_type.period'])

**Customers who have returned at least one item before**

In [47]:
customers_who_returned_items_before = list(df_transactions[df_transactions['returned_item'] == 'yes']['customer_id'].unique());
len(customers_who_returned_items_before)

1726

### City Data

In [48]:
df_cities.head(2)
df_cities['city'] = df_cities['city']

Unnamed: 0,city_code,city
0,1,Arlington
1,2,Miami


In [49]:
city_list = df_cities['city'].to_list()
len(city_list)
city_list

10

['Arlington',
 'Miami',
 'Chicago',
 'Pittsburgh',
 'Dayton',
 'Akron',
 'Philadelphia',
 'Dallas',
 'Houston',
 'Los Angeles']

In [50]:
state_list = []
len(state_list)

gc = geonamescache.GeonamesCache()

for city in city_list:
    info = gc.get_cities_by_name(city)
    if info == []:
        state_list.append(np.nan)
    else:
        for dictionary in list(info[0].values()):
            state = list(dictionary.values())
            state_list.append(state[7])
#             print(city, state[7])
df_cities['state_code'] = state_list

0

In [51]:
df_cities['state_code'].value_counts(dropna=False)

TX    3
OH    2
PA    2
FL    1
CA    1
IL    1
Name: state_code, dtype: int64

In [52]:
df_cities

Unnamed: 0,city_code,city,state_code
0,1,Arlington,TX
1,2,Miami,FL
2,3,Chicago,IL
3,4,Pittsburgh,PA
4,5,Dayton,OH
5,6,Akron,OH
6,7,Philadelphia,PA
7,8,Dallas,TX
8,9,Houston,TX
9,10,Los Angeles,CA


In [53]:
df_cities = pd.merge(df_cities, df_us_regions, left_on=['state_code'], right_on=['state code'], how='left' )
df_cities

Unnamed: 0,city_code,city,state_code,state,state code,region,division
0,1,Arlington,TX,Texas,TX,South,West South Central
1,2,Miami,FL,Florida,FL,South,South Atlantic
2,3,Chicago,IL,Illinois,IL,Midwest,East North Central
3,4,Pittsburgh,PA,Pennsylvania,PA,Northeast,Middle Atlantic
4,5,Dayton,OH,Ohio,OH,Midwest,East North Central
5,6,Akron,OH,Ohio,OH,Midwest,East North Central
6,7,Philadelphia,PA,Pennsylvania,PA,Northeast,Middle Atlantic
7,8,Dallas,TX,Texas,TX,South,West South Central
8,9,Houston,TX,Texas,TX,South,West South Central
9,10,Los Angeles,CA,California,CA,West,Pacific


In [54]:
df_customer = pd.merge(df_customer, df_cities, on=['city_code'],  how='left' )
df_customer.head(2)

Unnamed: 0,customer_id,dob,gender,city_code,city,state_code,state,state code,region,division
0,268408,02-01-1970,M,4.0,Pittsburgh,PA,Pennsylvania,PA,Northeast,Middle Atlantic
1,269696,07-01-1970,F,8.0,Dallas,TX,Texas,TX,South,West South Central


### Customer Data

In [55]:
df_customer['customer_conversion'] = np.where((df_customer['customer_id'].isin(converted_customers)), 'converted', 'not converted')
df_customer['customer_conversion'].value_counts(dropna=False)

converted        5506
not converted     141
Name: customer_conversion, dtype: int64

In [56]:
df_customer['dob'] = pd.to_datetime(df_customer['dob'])
df_customer['dob'].sort_values().min()
df_customer['dob'].sort_values().max()

Timestamp('1970-01-02 00:00:00')

Timestamp('1992-12-29 00:00:00')

In [57]:
df_customer.tail()

Unnamed: 0,customer_id,dob,gender,city_code,city,state_code,state,state code,region,division,customer_conversion
5642,274474,1992-12-19,M,2.0,Miami,FL,Florida,FL,South,South Atlantic,converted
5643,267666,1992-12-24,M,6.0,Akron,OH,Ohio,OH,Midwest,East North Central,converted
5644,270476,1992-12-25,F,3.0,Chicago,IL,Illinois,IL,Midwest,East North Central,converted
5645,269626,1992-12-27,F,5.0,Dayton,OH,Ohio,OH,Midwest,East North Central,converted
5646,274308,1992-12-29,F,5.0,Dayton,OH,Ohio,OH,Midwest,East North Central,converted


### Master File

In [58]:
df_master = pd.merge(df_customer, df_trans_agg,  on=['customer_id'], how='left')
# df_master['assessment_date'] = df_transactions['trans_date'].sort_values().max()
mt.check_unique_no(df_master, ['customer_id'])
df_master.head()

Data has 5647 unique customer_id


Unnamed: 0,customer_id,dob,gender,city_code,city,state_code,state,state code,region,division,customer_conversion,conversion_date,last_purchase_date,total_unique_purchase_days,total_unique_transactions,total_items_purchased,total_tax_paid,total_amt_paid,total_unique_trans_not_reversed,min_duration_btwn_purchase_days,max_duration_btwn_purchase_days,avg_duration_btwn_purchase_days,time_since_last_purchase.days,median_duration_btwn_purchase_days,assessment_date,account_age.days,account_age.years,account_age.years.group,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,Flagship_store_spend.prop,MBR_spend.prop,TeleShop_spend.prop,e-Shop.prop,count.Bags_Mens,count.Bags_Women,count.Books_Academic,count.Books_Children,count.Books_Comics,count.Books_DIY,count.Books_Fiction,count.Books_Non-Fiction,count.Clothing_Kids,count.Clothing_Mens,count.Clothing_Women,count.Electronics_Audio and video,count.Electronics_Cameras,count.Electronics_Computers,count.Electronics_Mobiles,count.Electronics_Personal Appliances,count.Footwear_Kids,count.Footwear_Mens,count.Footwear_Women,count.Home and kitchen_Bath,count.Home and kitchen_Furnishing,count.Home and kitchen_Kitchen,count.Home and kitchen_Tools,sum.Bags_Mens,sum.Bags_Women,sum.Books_Academic,sum.Books_Children,sum.Books_Comics,sum.Books_DIY,sum.Books_Fiction,sum.Books_Non-Fiction,sum.Clothing_Kids,sum.Clothing_Mens,sum.Clothing_Women,sum.Electronics_Audio and video,sum.Electronics_Cameras,sum.Electronics_Computers,sum.Electronics_Mobiles,sum.Electronics_Personal Appliances,sum.Footwear_Kids,sum.Footwear_Mens,sum.Footwear_Women,sum.Home and kitchen_Bath,sum.Home and kitchen_Furnishing,sum.Home and kitchen_Kitchen,sum.Home and kitchen_Tools
0,268408,1970-02-01,M,4.0,Pittsburgh,PA,Pennsylvania,PA,Northeast,Middle Atlantic,converted,2011-12-07,2014-01-13,10.0,10.0,33.0,2306.43,24272.43,9.0,10.0,323.0,109.1,323.0,91.0,2014-12-02,1091.0,2.99,02_03,4.0,1.0,3.0,2.0,6217.84,6491.88,3894.02,7668.7,0.26,0.27,0.16,0.32,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,1064.12,0.0,0.0,0.0,0.0,1033.17,0.0,0.0,0.0,890.63,2873.0,4795.7,0.0,0.0,0.0,1034.28,6491.88,0.0,0.0,0.0,6089.66,0.0
1,269696,1970-07-01,F,8.0,Dallas,TX,Texas,TX,South,West South Central,converted,2011-09-18,2012-08-04,2.0,2.0,3.0,426.51,4488.51,1.0,321.0,850.0,585.5,850.0,585.5,2014-12-02,1171.0,3.21,03_04,0.0,2.0,0.0,0.0,0.0,4488.51,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4488.51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,268159,1970-08-01,F,8.0,Dallas,TX,Texas,TX,South,West South Central,converted,2012-06-02,2013-03-31,6.0,6.0,19.0,1699.85,17888.84,6.0,21.0,611.0,152.17,611.0,43.5,2014-12-02,913.0,2.5,02_03,0.0,1.0,1.0,4.0,0.0,1182.35,7458.75,9247.74,0.0,0.07,0.42,0.52,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,779.02,0.0,0.0,1182.35,0.0,0.0,0.0,0.0,8141.64,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,327.08,0.0,7458.75,0.0
3,270181,1970-10-01,F,2.0,Miami,FL,Florida,FL,South,South Atlantic,converted,2011-03-18,2014-09-01,8.0,8.0,16.0,1412.98,14869.99,6.0,57.0,455.0,169.38,92.0,100.0,2014-12-02,1355.0,3.71,03_04,3.0,1.0,1.0,3.0,8428.94,408.85,617.7,5414.5,0.57,0.03,0.04,0.36,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,408.85,617.7,0.0,0.0,0.0,0.0,5348.2,0.0,0.0,0.0,0.0,0.0,0.0,6532.76,0.0,1962.48,0.0,0.0,0.0,0.0,0.0
4,268073,1970-11-01,M,1.0,Arlington,TX,Texas,TX,South,West South Central,converted,2011-11-19,2013-12-29,2.0,2.0,5.0,756.0,7956.0,1.0,338.0,771.0,554.5,338.0,554.5,2014-12-02,1109.0,3.04,03_04,0.0,2.0,0.0,0.0,0.0,7956.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7956.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [59]:
df_master[df_master['customer_id'] == 268159]

Unnamed: 0,customer_id,dob,gender,city_code,city,state_code,state,state code,region,division,customer_conversion,conversion_date,last_purchase_date,total_unique_purchase_days,total_unique_transactions,total_items_purchased,total_tax_paid,total_amt_paid,total_unique_trans_not_reversed,min_duration_btwn_purchase_days,max_duration_btwn_purchase_days,avg_duration_btwn_purchase_days,time_since_last_purchase.days,median_duration_btwn_purchase_days,assessment_date,account_age.days,account_age.years,account_age.years.group,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,Flagship_store_spend.prop,MBR_spend.prop,TeleShop_spend.prop,e-Shop.prop,count.Bags_Mens,count.Bags_Women,count.Books_Academic,count.Books_Children,count.Books_Comics,count.Books_DIY,count.Books_Fiction,count.Books_Non-Fiction,count.Clothing_Kids,count.Clothing_Mens,count.Clothing_Women,count.Electronics_Audio and video,count.Electronics_Cameras,count.Electronics_Computers,count.Electronics_Mobiles,count.Electronics_Personal Appliances,count.Footwear_Kids,count.Footwear_Mens,count.Footwear_Women,count.Home and kitchen_Bath,count.Home and kitchen_Furnishing,count.Home and kitchen_Kitchen,count.Home and kitchen_Tools,sum.Bags_Mens,sum.Bags_Women,sum.Books_Academic,sum.Books_Children,sum.Books_Comics,sum.Books_DIY,sum.Books_Fiction,sum.Books_Non-Fiction,sum.Clothing_Kids,sum.Clothing_Mens,sum.Clothing_Women,sum.Electronics_Audio and video,sum.Electronics_Cameras,sum.Electronics_Computers,sum.Electronics_Mobiles,sum.Electronics_Personal Appliances,sum.Footwear_Kids,sum.Footwear_Mens,sum.Footwear_Women,sum.Home and kitchen_Bath,sum.Home and kitchen_Furnishing,sum.Home and kitchen_Kitchen,sum.Home and kitchen_Tools
2,268159,1970-08-01,F,8.0,Dallas,TX,Texas,TX,South,West South Central,converted,2012-06-02,2013-03-31,6.0,6.0,19.0,1699.85,17888.84,6.0,21.0,611.0,152.17,611.0,43.5,2014-12-02,913.0,2.5,02_03,0.0,1.0,1.0,4.0,0.0,1182.35,7458.75,9247.74,0.0,0.07,0.42,0.52,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,779.02,0.0,0.0,1182.35,0.0,0.0,0.0,0.0,8141.64,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,327.08,0.0,7458.75,0.0


**Customer with customer_id 268159 has purchased a total of 19 items on 6 different occasions (without returning any) and has done this approximately every 152 days over the last 2 to 3 years**

In [60]:
df_master['customer_conversion'].value_counts(dropna=False)

converted        5506
not converted     141
Name: customer_conversion, dtype: int64

In [61]:
df_master['conversion_date.year'] = df_master['conversion_date'].dt.year

In [62]:
df_master['conversion_date.month_num'] = df_master['conversion_date'].dt.month.map("{:02}".format)

In [63]:
df_master['conversion_date.month_num'].value_counts(dropna=False)

3.0     668
2.0     614
1.0     597
4.0     557
5.0     503
6.0     461
7.0     443
8.0     405
9.0     379
10.0    344
11.0    292
12.0    243
nan     141
Name: conversion_date.month_num, dtype: int64

In [64]:
# df_master['conversion_date.month_num'] = df_master['conversion_date.month_num'].astype('O').astype('int64')

In [65]:
month_name = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_num = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']
month_dict = dict(zip(month_num, month_name))

for k, v in month_dict.items():
    mask = df_master['conversion_date.month_num'].str.contains(k, case=True)

    df_master.loc[mask,'conversion_date.month'] = v

In [66]:
df_master['conversion_date.month'] = df_master['conversion_date.month_num'] + '_' + df_master['conversion_date.month'].map(str)

In [67]:
df_master['conversion_date.month'].value_counts(dropna=False)

3.0_Mar     668
2.0_Feb     614
1.0_Jan     597
4.0_Apr     557
5.0_May     503
6.0_Jun     461
7.0_Jul     443
8.0_Aug     405
9.0_Sep     379
10.0_Oct    344
11.0_Nov    292
12.0_Dec    243
nan_nan     141
Name: conversion_date.month, dtype: int64

In [68]:
df_master['conversion_date.year_month'] = df_master['conversion_date.year'].map(str) + '_' + df_master['conversion_date.month_num'].map(str)

In [69]:
df_master['conversion_date.year_month'].value_counts(dropna=False).head()

2011.0_3.0    469
2011.0_2.0    438
2011.0_4.0    404
2011.0_5.0    337
2011.0_1.0    328
Name: conversion_date.year_month, dtype: int64

In [70]:
df_master.head(1)

Unnamed: 0,customer_id,dob,gender,city_code,city,state_code,state,state code,region,division,customer_conversion,conversion_date,last_purchase_date,total_unique_purchase_days,total_unique_transactions,total_items_purchased,total_tax_paid,total_amt_paid,total_unique_trans_not_reversed,min_duration_btwn_purchase_days,max_duration_btwn_purchase_days,avg_duration_btwn_purchase_days,time_since_last_purchase.days,median_duration_btwn_purchase_days,assessment_date,account_age.days,account_age.years,account_age.years.group,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,Flagship_store_spend.prop,MBR_spend.prop,TeleShop_spend.prop,e-Shop.prop,count.Bags_Mens,count.Bags_Women,count.Books_Academic,count.Books_Children,count.Books_Comics,count.Books_DIY,count.Books_Fiction,count.Books_Non-Fiction,count.Clothing_Kids,count.Clothing_Mens,count.Clothing_Women,count.Electronics_Audio and video,count.Electronics_Cameras,count.Electronics_Computers,count.Electronics_Mobiles,count.Electronics_Personal Appliances,count.Footwear_Kids,count.Footwear_Mens,count.Footwear_Women,count.Home and kitchen_Bath,count.Home and kitchen_Furnishing,count.Home and kitchen_Kitchen,count.Home and kitchen_Tools,sum.Bags_Mens,sum.Bags_Women,sum.Books_Academic,sum.Books_Children,sum.Books_Comics,sum.Books_DIY,sum.Books_Fiction,sum.Books_Non-Fiction,sum.Clothing_Kids,sum.Clothing_Mens,sum.Clothing_Women,sum.Electronics_Audio and video,sum.Electronics_Cameras,sum.Electronics_Computers,sum.Electronics_Mobiles,sum.Electronics_Personal Appliances,sum.Footwear_Kids,sum.Footwear_Mens,sum.Footwear_Women,sum.Home and kitchen_Bath,sum.Home and kitchen_Furnishing,sum.Home and kitchen_Kitchen,sum.Home and kitchen_Tools,conversion_date.year,conversion_date.month_num,conversion_date.month,conversion_date.year_month
0,268408,1970-02-01,M,4.0,Pittsburgh,PA,Pennsylvania,PA,Northeast,Middle Atlantic,converted,2011-12-07,2014-01-13,10.0,10.0,33.0,2306.43,24272.43,9.0,10.0,323.0,109.1,323.0,91.0,2014-12-02,1091.0,2.99,02_03,4.0,1.0,3.0,2.0,6217.84,6491.88,3894.02,7668.7,0.26,0.27,0.16,0.32,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,1064.12,0.0,0.0,0.0,0.0,1033.17,0.0,0.0,0.0,890.63,2873.0,4795.7,0.0,0.0,0.0,1034.28,6491.88,0.0,0.0,0.0,6089.66,0.0,2011.0,12.0,12.0_Dec,2011.0_12.0


In [71]:
df_master['biological_age.actual'] = (df_master['assessment_date'] - df_master['dob'])/np.timedelta64(1, 'Y')

In [72]:
df_master['biological_age.actual'].describe()

count   5506.00
mean      33.34
std        6.61
min       21.93
25%       27.52
50%       33.37
75%       39.09
max       44.92
Name: biological_age.actual, dtype: float64

In [73]:
df_master['biological_age'] = np.round(df_master['biological_age.actual'])

In [74]:
df_master['biological_age'].describe()

count   5506.00
mean      33.34
std        6.62
min       22.00
25%       28.00
50%       33.00
75%       39.00
max       45.00
Name: biological_age, dtype: float64

In [75]:
age_bins =  [21, 25, 30, 35, 40, 45]
labels = ['22_25','25_30', '30_35', '35_40', '40_45']
df_master['biological_age.group'] = pd.cut(df_master['biological_age'], age_bins, labels = labels,include_lowest = True)

In [76]:
df_master['total_unique_purchase_days'].describe()

count   5506.00
mean       3.79
std        1.83
min        1.00
25%        2.00
50%        4.00
75%        5.00
max       11.00
Name: total_unique_purchase_days, dtype: float64

In [77]:
df_master.drop(['state code', 'city_code'], axis=1, inplace=True)

In [78]:
df_master['total_unique_purchase_days'] =df_master['total_unique_purchase_days'].replace(np.nan, 0)

In [79]:
df_master['repeat_purchaser'] = np.where((df_master['total_unique_purchase_days'] > 1), 'yes',
                                      np.where((df_master['total_unique_purchase_days']== 0),'never purchased',
                                        'no'))

In [80]:
df_master['returned_item_before'] = np.where(df_master['customer_id'].isin(customers_who_returned_items_before), 'yes', 'no')

In [81]:
df_master['returned_item_before'].value_counts(dropna=False)

no     3921
yes    1726
Name: returned_item_before, dtype: int64

In [82]:
df_master[df_master['customer_id'] == 266783]

Unnamed: 0,customer_id,dob,gender,city,state_code,state,region,division,customer_conversion,conversion_date,last_purchase_date,total_unique_purchase_days,total_unique_transactions,total_items_purchased,total_tax_paid,total_amt_paid,total_unique_trans_not_reversed,min_duration_btwn_purchase_days,max_duration_btwn_purchase_days,avg_duration_btwn_purchase_days,time_since_last_purchase.days,median_duration_btwn_purchase_days,assessment_date,account_age.days,account_age.years,account_age.years.group,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,Flagship_store_spend.prop,MBR_spend.prop,TeleShop_spend.prop,e-Shop.prop,count.Bags_Mens,count.Bags_Women,count.Books_Academic,count.Books_Children,count.Books_Comics,count.Books_DIY,count.Books_Fiction,count.Books_Non-Fiction,count.Clothing_Kids,count.Clothing_Mens,count.Clothing_Women,count.Electronics_Audio and video,count.Electronics_Cameras,count.Electronics_Computers,count.Electronics_Mobiles,count.Electronics_Personal Appliances,count.Footwear_Kids,count.Footwear_Mens,count.Footwear_Women,count.Home and kitchen_Bath,count.Home and kitchen_Furnishing,count.Home and kitchen_Kitchen,count.Home and kitchen_Tools,sum.Bags_Mens,sum.Bags_Women,sum.Books_Academic,sum.Books_Children,sum.Books_Comics,sum.Books_DIY,sum.Books_Fiction,sum.Books_Non-Fiction,sum.Clothing_Kids,sum.Clothing_Mens,sum.Clothing_Women,sum.Electronics_Audio and video,sum.Electronics_Cameras,sum.Electronics_Computers,sum.Electronics_Mobiles,sum.Electronics_Personal Appliances,sum.Footwear_Kids,sum.Footwear_Mens,sum.Footwear_Women,sum.Home and kitchen_Bath,sum.Home and kitchen_Furnishing,sum.Home and kitchen_Kitchen,sum.Home and kitchen_Tools,conversion_date.year,conversion_date.month_num,conversion_date.month,conversion_date.year_month,biological_age.actual,biological_age,biological_age.group,repeat_purchaser,returned_item_before
1019,266783,1974-01-05,M,Pittsburgh,PA,Pennsylvania,Northeast,Middle Atlantic,converted,2011-09-23,2013-09-02,4.0,4.0,6.0,295.89,3113.89,3.0,122.0,456.0,291.5,456.0,294.0,2014-12-02,1166.0,3.19,03_04,0.0,0.0,1.0,3.0,0.0,0.0,308.3,2805.59,0.0,0.0,0.1,0.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1845.35,0.0,1268.54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2011.0,9.0,9.0_Sep,2011.0_9.0,40.91,41.0,40_45,yes,yes


In [83]:
df_master[df_master['account_age.years'] == df_master['account_age.years'].min()]

Unnamed: 0,customer_id,dob,gender,city,state_code,state,region,division,customer_conversion,conversion_date,last_purchase_date,total_unique_purchase_days,total_unique_transactions,total_items_purchased,total_tax_paid,total_amt_paid,total_unique_trans_not_reversed,min_duration_btwn_purchase_days,max_duration_btwn_purchase_days,avg_duration_btwn_purchase_days,time_since_last_purchase.days,median_duration_btwn_purchase_days,assessment_date,account_age.days,account_age.years,account_age.years.group,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,Flagship_store_spend.prop,MBR_spend.prop,TeleShop_spend.prop,e-Shop.prop,count.Bags_Mens,count.Bags_Women,count.Books_Academic,count.Books_Children,count.Books_Comics,count.Books_DIY,count.Books_Fiction,count.Books_Non-Fiction,count.Clothing_Kids,count.Clothing_Mens,count.Clothing_Women,count.Electronics_Audio and video,count.Electronics_Cameras,count.Electronics_Computers,count.Electronics_Mobiles,count.Electronics_Personal Appliances,count.Footwear_Kids,count.Footwear_Mens,count.Footwear_Women,count.Home and kitchen_Bath,count.Home and kitchen_Furnishing,count.Home and kitchen_Kitchen,count.Home and kitchen_Tools,sum.Bags_Mens,sum.Bags_Women,sum.Books_Academic,sum.Books_Children,sum.Books_Comics,sum.Books_DIY,sum.Books_Fiction,sum.Books_Non-Fiction,sum.Clothing_Kids,sum.Clothing_Mens,sum.Clothing_Women,sum.Electronics_Audio and video,sum.Electronics_Cameras,sum.Electronics_Computers,sum.Electronics_Mobiles,sum.Electronics_Personal Appliances,sum.Footwear_Kids,sum.Footwear_Mens,sum.Footwear_Women,sum.Home and kitchen_Bath,sum.Home and kitchen_Furnishing,sum.Home and kitchen_Kitchen,sum.Home and kitchen_Tools,conversion_date.year,conversion_date.month_num,conversion_date.month,conversion_date.year_month,biological_age.actual,biological_age,biological_age.group,repeat_purchaser,returned_item_before
3951,274213,1986-03-27,M,Houston,TX,Texas,South,West South Central,converted,2014-12-01,2014-12-01,1.0,1.0,2.0,279.72,2943.72,1.0,1.0,1.0,1.0,1.0,1.0,2014-12-02,1.0,0.0,00_01,0.0,1.0,0.0,0.0,0.0,2943.72,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2943.72,0.0,2014.0,12.0,12.0_Dec,2014.0_12.0,28.69,29.0,25_30,no,no


In [84]:
df_master['7_day_amt.avg'] = (df_master['total_amt_paid']/df_master['account_age.days']) * 7
df_master['14_day_amt.avg'] = (df_master['total_amt_paid']/df_master['account_age.days']) * 14
df_master['21_day_amt.avg'] = (df_master['total_amt_paid']/df_master['account_age.days']) * 21
df_master['30_day_amt.avg'] = (df_master['total_amt_paid']/df_master['account_age.days']) * 30
df_master['60_day_amt.avg'] = (df_master['total_amt_paid']/df_master['account_age.days']) * 60
df_master['90_day_amt.avg'] = (df_master['total_amt_paid']/df_master['account_age.days']) * 90

In [85]:
df_master[['7_day_amt.avg', '14_day_amt.avg', '21_day_amt.avg', '30_day_amt.avg', '60_day_amt.avg','90_day_amt.avg']].describe()

Unnamed: 0,7_day_amt.avg,14_day_amt.avg,21_day_amt.avg,30_day_amt.avg,60_day_amt.avg,90_day_amt.avg
count,5506.0,5506.0,5506.0,5506.0,5506.0,5506.0
mean,58.74,117.48,176.22,251.75,503.5,755.25
std,279.22,558.43,837.65,1196.64,2393.28,3589.92
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,28.81,57.61,86.42,123.46,246.91,370.37
50%,49.42,98.84,148.26,211.8,423.6,635.41
75%,75.82,151.63,227.45,324.93,649.86,974.79
max,20606.04,41212.08,61818.12,88311.6,176623.2,264934.8


In [86]:
df_master['7_day_num.avg'] = (df_master['total_items_purchased']/df_master['account_age.days']) * 7
df_master['14_day_num.avg'] = (df_master['total_items_purchased']/df_master['account_age.days']) * 14
df_master['21_day_num.avg'] = (df_master['total_items_purchased']/df_master['account_age.days']) * 21
df_master['30_day_num.avg'] = (df_master['total_items_purchased']/df_master['account_age.days']) * 30
df_master['60_day_num.avg'] = (df_master['total_items_purchased']/df_master['account_age.days']) * 60
df_master['90_day_num.avg'] = (df_master['total_items_purchased']/df_master['account_age.days']) * 90

In [87]:
df_master[['7_day_num.avg', '14_day_num.avg', '21_day_num.avg', '30_day_num.avg', '60_day_num.avg','90_day_num.avg']].describe()

Unnamed: 0,7_day_num.avg,14_day_num.avg,21_day_num.avg,30_day_num.avg,60_day_num.avg,90_day_num.avg
count,5506.0,5506.0,5506.0,5506.0,5506.0,5506.0
mean,0.07,0.13,0.2,0.28,0.57,0.85
std,0.19,0.38,0.57,0.82,1.64,2.46
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.04,0.08,0.11,0.16,0.33,0.49
50%,0.06,0.12,0.18,0.25,0.51,0.76
75%,0.08,0.17,0.25,0.36,0.72,1.08
max,14.0,28.0,42.0,60.0,120.0,180.0


In [88]:
df_master[df_master['repeat_purchaser'] == 'yes'].head()

Unnamed: 0,customer_id,dob,gender,city,state_code,state,region,division,customer_conversion,conversion_date,last_purchase_date,total_unique_purchase_days,total_unique_transactions,total_items_purchased,total_tax_paid,total_amt_paid,total_unique_trans_not_reversed,min_duration_btwn_purchase_days,max_duration_btwn_purchase_days,avg_duration_btwn_purchase_days,time_since_last_purchase.days,median_duration_btwn_purchase_days,assessment_date,account_age.days,account_age.years,account_age.years.group,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,Flagship_store_spend.prop,MBR_spend.prop,TeleShop_spend.prop,e-Shop.prop,count.Bags_Mens,count.Bags_Women,count.Books_Academic,count.Books_Children,count.Books_Comics,count.Books_DIY,count.Books_Fiction,count.Books_Non-Fiction,count.Clothing_Kids,count.Clothing_Mens,count.Clothing_Women,count.Electronics_Audio and video,count.Electronics_Cameras,count.Electronics_Computers,count.Electronics_Mobiles,count.Electronics_Personal Appliances,count.Footwear_Kids,count.Footwear_Mens,count.Footwear_Women,count.Home and kitchen_Bath,count.Home and kitchen_Furnishing,count.Home and kitchen_Kitchen,count.Home and kitchen_Tools,sum.Bags_Mens,sum.Bags_Women,sum.Books_Academic,sum.Books_Children,sum.Books_Comics,sum.Books_DIY,sum.Books_Fiction,sum.Books_Non-Fiction,sum.Clothing_Kids,sum.Clothing_Mens,sum.Clothing_Women,sum.Electronics_Audio and video,sum.Electronics_Cameras,sum.Electronics_Computers,sum.Electronics_Mobiles,sum.Electronics_Personal Appliances,sum.Footwear_Kids,sum.Footwear_Mens,sum.Footwear_Women,sum.Home and kitchen_Bath,sum.Home and kitchen_Furnishing,sum.Home and kitchen_Kitchen,sum.Home and kitchen_Tools,conversion_date.year,conversion_date.month_num,conversion_date.month,conversion_date.year_month,biological_age.actual,biological_age,biological_age.group,repeat_purchaser,returned_item_before,7_day_amt.avg,14_day_amt.avg,21_day_amt.avg,30_day_amt.avg,60_day_amt.avg,90_day_amt.avg,7_day_num.avg,14_day_num.avg,21_day_num.avg,30_day_num.avg,60_day_num.avg,90_day_num.avg
0,268408,1970-02-01,M,Pittsburgh,PA,Pennsylvania,Northeast,Middle Atlantic,converted,2011-12-07,2014-01-13,10.0,10.0,33.0,2306.43,24272.43,9.0,10.0,323.0,109.1,323.0,91.0,2014-12-02,1091.0,2.99,02_03,4.0,1.0,3.0,2.0,6217.84,6491.88,3894.02,7668.7,0.26,0.27,0.16,0.32,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,1064.12,0.0,0.0,0.0,0.0,1033.17,0.0,0.0,0.0,890.63,2873.0,4795.7,0.0,0.0,0.0,1034.28,6491.88,0.0,0.0,0.0,6089.66,0.0,2011.0,12.0,12.0_Dec,2011.0_12.0,44.83,45.0,40_45,yes,yes,155.74,311.47,467.21,667.44,1334.87,2002.31,0.21,0.42,0.64,0.91,1.81,2.72
1,269696,1970-07-01,F,Dallas,TX,Texas,South,West South Central,converted,2011-09-18,2012-08-04,2.0,2.0,3.0,426.51,4488.51,1.0,321.0,850.0,585.5,850.0,585.5,2014-12-02,1171.0,3.21,03_04,0.0,2.0,0.0,0.0,0.0,4488.51,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4488.51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2011.0,9.0,9.0_Sep,2011.0_9.0,44.42,44.0,40_45,yes,yes,26.83,53.66,80.49,114.99,229.98,344.98,0.02,0.04,0.05,0.08,0.15,0.23
2,268159,1970-08-01,F,Dallas,TX,Texas,South,West South Central,converted,2012-06-02,2013-03-31,6.0,6.0,19.0,1699.85,17888.84,6.0,21.0,611.0,152.17,611.0,43.5,2014-12-02,913.0,2.5,02_03,0.0,1.0,1.0,4.0,0.0,1182.35,7458.75,9247.74,0.0,0.07,0.42,0.52,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,779.02,0.0,0.0,1182.35,0.0,0.0,0.0,0.0,8141.64,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,327.08,0.0,7458.75,0.0,2012.0,6.0,6.0_Jun,2012.0_6.0,44.34,44.0,40_45,yes,no,137.15,274.31,411.46,587.8,1175.61,1763.41,0.15,0.29,0.44,0.62,1.25,1.87
3,270181,1970-10-01,F,Miami,FL,Florida,South,South Atlantic,converted,2011-03-18,2014-09-01,8.0,8.0,16.0,1412.98,14869.99,6.0,57.0,455.0,169.38,92.0,100.0,2014-12-02,1355.0,3.71,03_04,3.0,1.0,1.0,3.0,8428.94,408.85,617.7,5414.5,0.57,0.03,0.04,0.36,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,408.85,617.7,0.0,0.0,0.0,0.0,5348.2,0.0,0.0,0.0,0.0,0.0,0.0,6532.76,0.0,1962.48,0.0,0.0,0.0,0.0,0.0,2011.0,3.0,3.0_Mar,2011.0_3.0,44.17,44.0,40_45,yes,yes,76.82,153.64,230.46,329.22,658.45,987.67,0.08,0.17,0.25,0.35,0.71,1.06
4,268073,1970-11-01,M,Arlington,TX,Texas,South,West South Central,converted,2011-11-19,2013-12-29,2.0,2.0,5.0,756.0,7956.0,1.0,338.0,771.0,554.5,338.0,554.5,2014-12-02,1109.0,3.04,03_04,0.0,2.0,0.0,0.0,0.0,7956.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7956.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2011.0,11.0,11.0_Nov,2011.0_11.0,44.09,44.0,40_45,yes,yes,50.22,100.44,150.65,215.22,430.44,645.66,0.03,0.06,0.09,0.14,0.27,0.41


In [89]:
df_master['repeat_purchaser'].value_counts(dropna=False)

yes                4993
no                  513
never purchased     141
Name: repeat_purchaser, dtype: int64

In [90]:
df_master['number_of_unique_times_purchases_returned'] = df_master['total_unique_transactions'] - df_master['total_unique_trans_not_reversed']
df_master['number_of_unique_times_purchases_returned'].value_counts(dropna=False)

0.00    3780
1.00    1425
2.00     274
nan      141
3.00      24
4.00       3
Name: number_of_unique_times_purchases_returned, dtype: int64

### Sanity Check Data

In [91]:
mt.check_unique_no(df_master, ['customer_id'])
df_master.shape

mt.check_unique_no(df_transactions, ['customer_id'])
df_transactions.shape

Data has 5647 unique customer_id


(5647, 106)

Data has 5506 unique customer_id


(20876, 30)

In [92]:
df_transactions['total_amt'].sum()
df_transactions['tax'].sum()
df_transactions['rate'].sum()
df_transactions['qty'].sum()

48903316.474999994

4646921.475

14766262

56434

## Export Data

In [93]:
df_master.to_csv(filepaths.master_file_data, index=False)
df_transactions.to_csv(filepaths.derived_transactions_data_v1, index=False)