## Import Modules

In [1]:
# Set paths
import os
from imp import reload

# Data manipulation
import pandas as pd
import numpy as np
from scipy import stats

# Geolocation
import geonamescache

# Custom package for data preprocessing
import mytools as mt 

# Set notebook options
pd.set_option("display.max_columns", 150)
pd.set_option("display.max_rows", 10000)
pd.set_option("display.float_format", lambda x: '%.2f' % x)

# Pretty display of multiple functions in a cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### File Location

In [2]:
# Data paths
import filepaths

### Create Project Directories and Sub-Directories

In [3]:
mt.create_directories(filepaths.project_parent_directory)

Directory  deliverables  already exists
Directory  deprecated  already exists
Directory  dictionary  already exists
Directory  visualizations  already exists
Subdirectory  data/raw  already exists
Subdirectory  data/interim  already exists
Subdirectory  data/external  already exists
Subdirectory  data/processed  already exists


## Load Data

In [4]:
df_customer = pd.read_csv(filepaths.raw_customer_data, sep=',')

df_cities = pd.read_csv(filepaths.raw_city_data, sep=',')

df_us_regions = pd.read_csv(filepaths.raw_us_regions_data, sep=',')

df_transactions = pd.read_csv(filepaths.raw_transactions_data, sep=',')

df_products = pd.read_csv(filepaths.raw_products_data, sep=',')

## Standardize Feature Names

## Data Audit + Cleaning
### Customer Data

In [5]:
df_customer.columns = [c.lower() for c in df_customer.columns]
df_customer.shape
mt.check_unique_no(df_customer, ['customer_id'])
mt.missing_data_table(df_customer)
df_customer.head()

(5647, 4)

Data has 5647 unique customer_id
Missing data distribution:

  Variable  Count  Proportion
    gender      2        0.00
 city_code      2        0.00


Unnamed: 0,customer_id,dob,gender,city_code
0,268408,02-01-1970,M,4.0
1,269696,07-01-1970,F,8.0
2,268159,08-01-1970,F,8.0
3,270181,10-01-1970,F,2.0
4,268073,11-01-1970,M,1.0


In [6]:
df_customer['gender'] = df_customer['gender'].replace({np.nan:df_customer['gender'].mode()[0]})
df_customer['city_code'] = df_customer['city_code'].replace({np.nan:df_customer['city_code'].mode()[0]})

### Cities Data

In [7]:
df_cities.columns = [c.lower() for c in df_cities.columns]
df_cities.shape
mt.check_unique_no(df_cities, ['city_code'])
mt.missing_data_table(df_cities)
df_cities.head()

(10, 2)

Data has 10 unique city_code
There is no missing data


Unnamed: 0,city_code,city
0,1,Arlington
1,2,Miami
2,3,Chicago
3,4,Pittsburgh
4,5,Dayton


### USA Regions Data

In [8]:
df_us_regions.columns = [c.lower() for c in df_us_regions.columns]
df_us_regions.shape
mt.missing_data_table(df_us_regions)
df_us_regions.head()

(51, 4)

There is no missing data


Unnamed: 0,state,state code,region,division
0,Alaska,AK,West,Pacific
1,Alabama,AL,South,East South Central
2,Arkansas,AR,South,West South Central
3,Arizona,AZ,West,Mountain
4,California,CA,West,Pacific


### Transactions Data

In [9]:
df_transactions.columns = [c.lower() for c in df_transactions.columns]
df_transactions.rename(columns={'tax':'cogs', 'transaction_id':'trans_id', 'cust_id':'customer_id', 'tran_date':'trans_date'}, inplace=True)
df_transactions.shape
mt.check_unique_no(df_transactions, ['customer_id', 'trans_id'])
mt.missing_data_table(df_transactions)
df_transactions.head()
df_transactions.describe()

(23053, 10)

Data has 5506 unique customer_id
Data has 20878 unique trans_id
There is no missing data


Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,cogs,total_amt,store_type
0,80712190438,270351,28-02-2014,1,1,-5,-772,405.3,-4265.3,e-Shop
1,29258453508,270384,27-02-2014,5,3,-5,-1497,785.92,-8270.92,e-Shop
2,51750724947,273420,24-02-2014,6,5,-2,-791,166.11,-1748.11,TeleShop
3,93274880719,271509,24-02-2014,11,6,-3,-1363,429.35,-4518.35,e-Shop
4,51750724947,273420,23-02-2014,6,5,-2,-791,166.11,-1748.11,TeleShop


Unnamed: 0,trans_id,customer_id,prod_subcat_code,prod_cat_code,qty,rate,cogs,total_amt
count,23053.0,23053.0,23053.0,23053.0,23053.0,23053.0,23053.0,23053.0
mean,50073480358.45,271021.75,6.15,3.76,2.43,636.37,248.67,2107.31
std,28981936062.0,2431.69,3.73,1.68,2.27,622.36,187.18,2507.56
min,3268991.0,266783.0,1.0,1.0,-5.0,-1499.0,7.35,-8270.92
25%,24938639453.0,268935.0,3.0,2.0,1.0,312.0,98.28,762.45
50%,50093131361.0,270980.0,5.0,4.0,3.0,710.0,199.08,1754.74
75%,75329995679.0,273114.0,10.0,5.0,4.0,1109.0,365.71,3569.15
max,99987549630.0,275265.0,12.0,6.0,5.0,1500.0,787.5,8287.5


In [10]:
df_transactions['trans_id'].value_counts(dropna=False).head()

426787191      4
4170892941     4
32263938079    4
3130889793     3
42255136382    3
Name: trans_id, dtype: int64

In [11]:
df_transactions[df_transactions['trans_id'] == 426787191]

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,cogs,total_amt,store_type
12961,426787191,273106,28-05-2012,1,2,-5,-109,57.23,-602.23,e-Shop
12967,426787191,273106,28-05-2012,1,2,-5,-109,57.23,-602.23,e-Shop
13156,426787191,273106,19-05-2012,1,2,-5,-109,57.23,-602.23,e-Shop
13168,426787191,273106,18-05-2012,1,2,5,109,57.23,602.23,e-Shop


In [12]:
df_transactions[df_transactions['trans_id'] == 4170892941]

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,cogs,total_amt,store_type
16207,4170892941,266852,24-12-2011,8,3,-1,-412,43.26,-455.26,MBR
16267,4170892941,266852,21-12-2011,8,3,-1,-412,43.26,-455.26,MBR
16269,4170892941,266852,21-12-2011,8,3,-1,-412,43.26,-455.26,MBR
16288,4170892941,266852,20-12-2011,8,3,1,412,43.26,455.26,MBR


In [13]:
df_transactions[df_transactions['trans_id'] == 25890929042]

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,cogs,total_amt,store_type
18114,25890929042,266783,24-09-2011,1,2,-4,-1321,554.82,-5838.82,e-Shop
18130,25890929042,266783,23-09-2011,1,2,4,1321,554.82,5838.82,e-Shop


In [14]:
df_transactions['trans_id'].value_counts(dropna=False)[df_transactions['trans_id'].value_counts(dropna=False).values > 1].index.nunique()

2057

In [15]:
returned_item = df_transactions['trans_id'].value_counts(dropna=False)[df_transactions['trans_id'].value_counts(dropna=False).values > 1].index.unique();
len(returned_item)

2057

In [16]:
df_transactions['returned_item'] = np.where(df_transactions['trans_id'].isin(returned_item), 'yes', 'no')

In [17]:
df_transactions['drop_record'] = np.where((df_transactions['qty'] > 0), 'yes', 'no')

In [18]:
df_transactions.head(2)

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,cogs,total_amt,store_type,returned_item,drop_record
0,80712190438,270351,28-02-2014,1,1,-5,-772,405.3,-4265.3,e-Shop,yes,no
1,29258453508,270384,27-02-2014,5,3,-5,-1497,785.92,-8270.92,e-Shop,yes,no


In [19]:
df_transactions[df_transactions['qty'] < 0].shape
df_transactions[df_transactions['qty'] < 0].head()
df_transactions[df_transactions['qty'] < 0]['store_type'].value_counts(dropna=False)

(2177, 12)

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,cogs,total_amt,store_type,returned_item,drop_record
0,80712190438,270351,28-02-2014,1,1,-5,-772,405.3,-4265.3,e-Shop,yes,no
1,29258453508,270384,27-02-2014,5,3,-5,-1497,785.92,-8270.92,e-Shop,yes,no
2,51750724947,273420,24-02-2014,6,5,-2,-791,166.11,-1748.11,TeleShop,yes,no
3,93274880719,271509,24-02-2014,11,6,-3,-1363,429.35,-4518.35,e-Shop,yes,no
4,51750724947,273420,23-02-2014,6,5,-2,-791,166.11,-1748.11,TeleShop,yes,no


e-Shop            882
MBR               451
Flagship store    432
TeleShop          412
Name: store_type, dtype: int64

In [20]:
df_transactions[df_transactions['qty'] > 0].shape
df_transactions[df_transactions['qty'] > 0].head()
df_transactions[df_transactions['qty'] > 0]['store_type'].value_counts(dropna=False)

(20876, 12)

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,cogs,total_amt,store_type,returned_item,drop_record
10,29258453508,270384,20-02-2014,5,3,5,1497,785.92,8270.92,e-Shop,yes,yes
11,25455265351,267750,20-02-2014,12,6,3,1360,428.4,4508.4,e-Shop,no,yes
12,1571002198,275023,20-02-2014,6,5,4,587,246.54,2594.54,e-Shop,no,yes
14,36554696014,269345,20-02-2014,3,5,3,1253,394.69,4153.69,e-Shop,no,yes
15,56814940239,268799,20-02-2014,7,5,5,368,193.2,2033.2,e-Shop,no,yes


e-Shop            8429
MBR               4210
Flagship store    4145
TeleShop          4092
Name: store_type, dtype: int64

In [21]:
df_products.rename(columns={'prod_sub_cat_code':'prod_subcat_code'}, inplace=True)
df_products['counter'] = 1
mt.missing_data_table(df_products)
df_products.head()

There is no missing data


Unnamed: 0,prod_cat_code,prod_cat,prod_subcat_code,prod_subcat,counter
0,1,Clothing,4,Mens,1
1,1,Clothing,1,Women,1
2,1,Clothing,3,Kids,1
3,2,Footwear,1,Mens,1
4,2,Footwear,3,Women,1


In [22]:
prod_cat_dict = dict(zip(df_products['prod_cat_code'],df_products['prod_cat']))
prod_subcat_dict = dict(zip(df_products['prod_subcat_code'],df_products['prod_subcat']))

In [23]:
df_products.groupby(['prod_cat'])['prod_subcat'].value_counts(dropna=False)

prod_cat          prod_subcat        
Bags              Mens                   1
                  Women                  1
Books             Academic               1
                  Children               1
                  Comics                 1
                  DIY                    1
                  Fiction                1
                  Non-Fiction            1
Clothing          Kids                   1
                  Mens                   1
                  Women                  1
Electronics       Audio and video        1
                  Cameras                1
                  Computers              1
                  Mobiles                1
                  Personal Appliances    1
Footwear          Kids                   1
                  Mens                   1
                  Women                  1
Home and kitchen  Bath                   1
                  Furnishing             1
                  Kitchen                1
                

In [24]:
df_products['prod_cat'].value_counts(dropna=False)

Books               6
Electronics         5
Home and kitchen    4
Clothing            3
Footwear            3
Bags                2
Name: prod_cat, dtype: int64

In [25]:
df_products['prod_subcat'].value_counts(dropna=False)

Women                  3
Mens                   3
Kids                   2
Tools                  1
Computers              1
DIY                    1
Cameras                1
Audio and video        1
Academic               1
Kitchen                1
Furnishing             1
Children               1
Bath                   1
Personal Appliances    1
Fiction                1
Mobiles                1
Non-Fiction            1
Comics                 1
Name: prod_subcat, dtype: int64

### Transactions Data

In [26]:
converted_customers = df_transactions['customer_id'].unique().tolist()

In [27]:
df_transactions['trans_date'] = pd.to_datetime(df_transactions['trans_date'])
df_transactions['trans_date'].sort_values().min()
df_transactions['trans_date'].sort_values().max()

Timestamp('2011-01-02 00:00:00')

Timestamp('2014-12-02 00:00:00')

In [28]:
df_transactions = pd.merge(df_transactions, df_products, on=['prod_cat_code', 'prod_subcat_code'], how='left')
df_transactions.head()

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,cogs,total_amt,store_type,returned_item,drop_record,prod_cat,prod_subcat,counter
0,80712190438,270351,2014-02-28,1,1,-5,-772,405.3,-4265.3,e-Shop,yes,no,Clothing,Women,1
1,29258453508,270384,2014-02-27,5,3,-5,-1497,785.92,-8270.92,e-Shop,yes,no,Electronics,Computers,1
2,51750724947,273420,2014-02-24,6,5,-2,-791,166.11,-1748.11,TeleShop,yes,no,Books,DIY,1
3,93274880719,271509,2014-02-24,11,6,-3,-1363,429.35,-4518.35,e-Shop,yes,no,Home and kitchen,Bath,1
4,51750724947,273420,2014-02-23,6,5,-2,-791,166.11,-1748.11,TeleShop,yes,no,Books,DIY,1


In [29]:
# df_transactions['qty'] = np.where((df_transactions['qty'] < 0), (-1*df_transactions['qty']), df_transactions['qty'])
# df_transactions['rate'] = np.where((df_transactions['rate'] < 0), (-1*df_transactions['rate']), df_transactions['rate'])
# df_transactions['total_amt'] = np.where((df_transactions['total_amt'] < 0), (-1*df_transactions['total_amt']), df_transactions['total_amt'])
# df_transactions.head()

In [30]:
df_transactions['store_type'].value_counts(dropna=False)

e-Shop            9311
MBR               4661
Flagship store    4577
TeleShop          4504
Name: store_type, dtype: int64

In [31]:
df_transactions = df_transactions.sort_values(['customer_id','trans_date'])

In [32]:
df_transactions['assessment_date'] = df_transactions['trans_date'].sort_values().max()

In [33]:
df_transactions['duration'] = df_transactions.groupby(['customer_id'])['trans_date'].transform(pd.Series.diff).shift(-1)
df_transactions['duration'] = df_transactions.apply(lambda r: r['assessment_date'] - r['trans_date'] if pd.isnull(r['duration']) else r['duration'], axis=1)
df_transactions['duration'] = (df_transactions['duration']/np.timedelta64(1, 'D'))

In [34]:
df_transactions.head(7)

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,cogs,total_amt,store_type,returned_item,drop_record,prod_cat,prod_subcat,counter,assessment_date,duration
18130,25890929042,266783,2011-09-23,1,2,4,1321,554.82,5838.82,e-Shop,yes,yes,Footwear,Mens,1,2014-12-02,1.0
18114,25890929042,266783,2011-09-24,1,2,-4,-1321,554.82,-5838.82,e-Shop,yes,no,Footwear,Mens,1,2014-12-02,393.0
10030,98477711300,266783,2012-10-21,4,1,3,93,29.3,308.3,TeleShop,no,yes,Clothing,Mens,1,2014-12-02,122.0
7515,8410316370,266783,2013-02-20,4,1,1,869,91.25,960.25,e-Shop,no,yes,Clothing,Mens,1,2014-12-02,194.0
7722,16999552161,266783,2013-09-02,10,5,2,835,175.35,1845.35,e-Shop,no,yes,Books,Non-Fiction,1,2014-12-02,456.0
9055,36310127403,266784,2012-04-12,4,3,2,200,42.0,442.0,Flagship store,no,yes,Electronics,Mobiles,1,2014-12-02,133.0
11286,54234600611,266784,2012-08-23,10,5,3,1291,406.67,4279.66,TeleShop,no,yes,Books,Non-Fiction,1,2014-12-02,17.0


In [35]:
df_trans_overall = df_transactions.groupby(['customer_id'], as_index=False).agg({'trans_date':['first', 'last', pd.Series.nunique],
                                                                                 'duration': ['min', 'max', 'mean', 'last'],
                                                        'trans_id':'count',
                                                        'qty':['min', 'max', 'sum'],
                                                       'cogs':'sum',
                                                       'total_amt':'sum' }) #, lambda x: stats.mode(x)[0][0]]|, 'prod_cat':[pd.Series.nunique],
#                                                          'prod_subcat':[pd.Series.nunique],
#                                                        'store_type':[pd.Series.nunique]

In [36]:
df_trans_overall.shape
df_trans_overall.columns = [".".join(x).strip('.') for x in df_trans_overall.columns.ravel()] 
df_trans_overall.rename(columns={'trans_date.first':'conversion_date', 'trans_date.last':'last_purchase_date',
                                'trans_date.nunique':'number_of_unique_purchase_days'}, inplace=True)
mt.check_unique_no(df_trans_overall, ['customer_id'])
df_trans_overall.head()

(5506, 14)

Data has 5506 unique customer_id


Unnamed: 0,customer_id,conversion_date,last_purchase_date,number_of_unique_purchase_days,duration.min,duration.max,duration.mean,duration.last,trans_id.count,qty.min,qty.max,qty.sum,cogs.sum,total_amt.sum
0,266783,2011-09-23,2013-09-02,5,1.0,456.0,233.2,456.0,5,-4,4,6,1405.53,3113.89
1,266784,2012-04-12,2012-09-09,3,17.0,814.0,321.33,814.0,3,2,5,10,541.07,5694.06
2,266785,2011-03-15,2013-02-13,8,5.0,657.0,169.75,657.0,8,-5,5,23,3351.6,21613.8
3,266788,2011-09-13,2013-12-02,4,84.0,397.0,294.0,365.0,4,1,4,8,578.97,6092.97
4,266794,2011-03-18,2014-12-02,11,0.0,520.0,112.92,0.0,12,-1,4,30,2684.74,27981.92


In [37]:
df_trans_overall['assessment_date'] = df_transactions['trans_date'].sort_values().max()

In [38]:
df_trans_overall['account_age.days'] = (df_trans_overall['assessment_date'] - df_trans_overall['conversion_date'])/np.timedelta64(1,'D')

In [39]:
df_trans_overall['account_age.years'] = (df_trans_overall['assessment_date'] - df_trans_overall['conversion_date'])/np.timedelta64(1,'Y')

In [40]:
age_bins =  [0, 1, 2, 3, 4]
labels = ['00_01','01_02', '02_03', '03_04']
df_trans_overall['account_age.years.group'] = pd.cut(df_trans_overall['account_age.years'], age_bins, labels = labels,include_lowest = True)

In [41]:
def customer_profile(_df, unique_id, value, featurelist):
    df_list = []
    for feature in featurelist:
        _df[feature] = _df[feature].astype(str)
        _df_temp = pd.crosstab(_df[unique_id], _df[feature], values=_df[value], 
                               aggfunc=['count','sum', 'mean'], dropna=False).fillna(0).reset_index()
        _df_temp.columns = [".".join(x).strip('.') for x in _df_temp.columns.ravel()] 
        df_list.append(_df_temp)
        dfs = [df.set_index(unique_id) for df in df_list]
        df = pd.concat(dfs, axis=1)
        df.reset_index(inplace=True)
    return df

In [42]:
df_product_cat = customer_profile(df_transactions, 'customer_id', 'total_amt', ['prod_cat'])
df_product_cat.head()

Unnamed: 0,customer_id,count.Bags,count.Books,count.Clothing,count.Electronics,count.Footwear,count.Home and kitchen,sum.Bags,sum.Books,sum.Clothing,sum.Electronics,sum.Footwear,sum.Home and kitchen,mean.Bags,mean.Books,mean.Clothing,mean.Electronics,mean.Footwear,mean.Home and kitchen
0,266783,0.0,1.0,2.0,0.0,2.0,0.0,0.0,1845.35,1268.54,0.0,0.0,0.0,0.0,1845.35,634.27,0.0,0.0,0.0
1,266784,0.0,2.0,0.0,1.0,0.0,0.0,0.0,5252.06,0.0,442.0,0.0,0.0,0.0,2626.03,0.0,442.0,0.0,0.0
2,266785,1.0,1.0,0.0,0.0,4.0,2.0,682.89,5066.43,0.0,0.0,15864.49,0.0,682.89,5066.43,0.0,0.0,3966.12,0.0
3,266788,1.0,1.0,0.0,0.0,2.0,0.0,1485.12,1367.99,0.0,0.0,3239.86,0.0,1485.12,1367.99,0.0,0.0,1619.93,0.0
4,266794,2.0,2.0,2.0,4.0,2.0,0.0,5692.96,8380.32,4099.55,5328.31,4480.78,0.0,2846.48,4190.16,2049.78,1332.08,2240.39,0.0


In [43]:
df_product_subcat = customer_profile(df_transactions, 'customer_id', 'total_amt', ['prod_subcat'])
df_product_subcat.head()

Unnamed: 0,customer_id,count.Academic,count.Audio and video,count.Bath,count.Cameras,count.Children,count.Comics,count.Computers,count.DIY,count.Fiction,count.Furnishing,count.Kids,count.Kitchen,count.Mens,count.Mobiles,count.Non-Fiction,count.Personal Appliances,count.Tools,count.Women,sum.Academic,sum.Audio and video,sum.Bath,sum.Cameras,sum.Children,sum.Comics,sum.Computers,sum.DIY,sum.Fiction,sum.Furnishing,sum.Kids,sum.Kitchen,sum.Mens,sum.Mobiles,sum.Non-Fiction,sum.Personal Appliances,sum.Tools,sum.Women,mean.Academic,mean.Audio and video,mean.Bath,mean.Cameras,mean.Children,mean.Comics,mean.Computers,mean.DIY,mean.Fiction,mean.Furnishing,mean.Kids,mean.Kitchen,mean.Mens,mean.Mobiles,mean.Non-Fiction,mean.Personal Appliances,mean.Tools,mean.Women
0,266783,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1268.54,0.0,1845.35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,317.13,0.0,1845.35,0.0,0.0,0.0
1,266784,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,972.4,0.0,0.0,0.0,0.0,442.0,4279.66,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,972.4,0.0,0.0,0.0,0.0,442.0,4279.66,0.0,0.0,0.0
2,266785,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5066.43,0.0,0.0,0.0,0.0,0.0,10047.76,0.0,6499.61,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5066.43,0.0,0.0,0.0,0.0,0.0,5023.88,0.0,2166.54,0.0,0.0,0.0,0.0,0.0
3,266788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1367.99,0.0,1927.12,0.0,2797.86,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1367.99,0.0,1927.12,0.0,1398.93,0.0,0.0,0.0,0.0,0.0
4,266794,1.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,1.0,0.0,3.0,3964.74,718.25,0.0,0.0,4415.58,0.0,0.0,0.0,0.0,0.0,4099.55,0.0,2948.14,0.0,0.0,4610.06,0.0,7225.6,3964.74,239.42,0.0,0.0,4415.58,0.0,0.0,0.0,0.0,0.0,2049.78,0.0,2948.14,0.0,0.0,4610.06,0.0,2408.53


In [44]:
df_stores = customer_profile(df_transactions, 'customer_id', 'total_amt', ['store_type'])
df_stores.head()

Unnamed: 0,customer_id,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,mean.Flagship store,mean.MBR,mean.TeleShop,mean.e-Shop
0,266783,0.0,0.0,1.0,4.0,0.0,0.0,308.3,2805.59,0.0,0.0,308.3,701.4
1,266784,1.0,0.0,1.0,1.0,442.0,0.0,4279.66,972.4,442.0,0.0,4279.66,972.4
2,266785,4.0,0.0,3.0,1.0,5816.72,0.0,12661.09,3135.99,1454.18,0.0,4220.36,3135.99
3,266788,1.0,1.0,0.0,2.0,1367.99,1485.12,0.0,3239.86,1367.99,1485.12,0.0,1619.93
4,266794,1.0,3.0,1.0,7.0,718.25,9275.37,4610.06,13378.24,718.25,3091.79,4610.06,1911.18


In [45]:
df_trans_overall = pd.merge(df_trans_overall, df_stores, on=['customer_id'], how='left')
df_trans_overall.head()

Unnamed: 0,customer_id,conversion_date,last_purchase_date,number_of_unique_purchase_days,duration.min,duration.max,duration.mean,duration.last,trans_id.count,qty.min,qty.max,qty.sum,cogs.sum,total_amt.sum,assessment_date,account_age.days,account_age.years,account_age.years.group,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,mean.Flagship store,mean.MBR,mean.TeleShop,mean.e-Shop
0,266783,2011-09-23,2013-09-02,5,1.0,456.0,233.2,456.0,5,-4,4,6,1405.53,3113.89,2014-12-02,1166.0,3.19,03_04,0.0,0.0,1.0,4.0,0.0,0.0,308.3,2805.59,0.0,0.0,308.3,701.4
1,266784,2012-04-12,2012-09-09,3,17.0,814.0,321.33,814.0,3,2,5,10,541.07,5694.06,2014-12-02,964.0,2.64,02_03,1.0,0.0,1.0,1.0,442.0,0.0,4279.66,972.4,442.0,0.0,4279.66,972.4
2,266785,2011-03-15,2013-02-13,8,5.0,657.0,169.75,657.0,8,-5,5,23,3351.6,21613.8,2014-12-02,1358.0,3.72,03_04,4.0,0.0,3.0,1.0,5816.72,0.0,12661.09,3135.99,1454.18,0.0,4220.36,3135.99
3,266788,2011-09-13,2013-12-02,4,84.0,397.0,294.0,365.0,4,1,4,8,578.97,6092.97,2014-12-02,1176.0,3.22,03_04,1.0,1.0,0.0,2.0,1367.99,1485.12,0.0,3239.86,1367.99,1485.12,0.0,1619.93
4,266794,2011-03-18,2014-12-02,11,0.0,520.0,112.92,0.0,12,-1,4,30,2684.74,27981.92,2014-12-02,1355.0,3.71,03_04,1.0,3.0,1.0,7.0,718.25,9275.37,4610.06,13378.24,718.25,3091.79,4610.06,1911.18


In [46]:
df_trans_overall['Flagship_store_spend.prop'] = df_trans_overall['sum.Flagship store']/df_trans_overall['total_amt.sum']
df_trans_overall['MBR_spend.prop'] = df_trans_overall['sum.MBR']/df_trans_overall['total_amt.sum']
df_trans_overall['TeleShop_spend.prop'] = df_trans_overall['sum.TeleShop']/df_trans_overall['total_amt.sum']
df_trans_overall['e-Shop.prop'] = df_trans_overall['sum.e-Shop']/df_trans_overall['total_amt.sum']

In [47]:
df_trans_overall = pd.merge(df_trans_overall, df_product_cat, on=['customer_id'], how='left')
df_trans_overall.head()

Unnamed: 0,customer_id,conversion_date,last_purchase_date,number_of_unique_purchase_days,duration.min,duration.max,duration.mean,duration.last,trans_id.count,qty.min,qty.max,qty.sum,cogs.sum,total_amt.sum,assessment_date,account_age.days,account_age.years,account_age.years.group,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,mean.Flagship store,mean.MBR,mean.TeleShop,mean.e-Shop,Flagship_store_spend.prop,MBR_spend.prop,TeleShop_spend.prop,e-Shop.prop,count.Bags,count.Books,count.Clothing,count.Electronics,count.Footwear,count.Home and kitchen,sum.Bags,sum.Books,sum.Clothing,sum.Electronics,sum.Footwear,sum.Home and kitchen,mean.Bags,mean.Books,mean.Clothing,mean.Electronics,mean.Footwear,mean.Home and kitchen
0,266783,2011-09-23,2013-09-02,5,1.0,456.0,233.2,456.0,5,-4,4,6,1405.53,3113.89,2014-12-02,1166.0,3.19,03_04,0.0,0.0,1.0,4.0,0.0,0.0,308.3,2805.59,0.0,0.0,308.3,701.4,0.0,0.0,0.1,0.9,0.0,1.0,2.0,0.0,2.0,0.0,0.0,1845.35,1268.54,0.0,0.0,0.0,0.0,1845.35,634.27,0.0,0.0,0.0
1,266784,2012-04-12,2012-09-09,3,17.0,814.0,321.33,814.0,3,2,5,10,541.07,5694.06,2014-12-02,964.0,2.64,02_03,1.0,0.0,1.0,1.0,442.0,0.0,4279.66,972.4,442.0,0.0,4279.66,972.4,0.08,0.0,0.75,0.17,0.0,2.0,0.0,1.0,0.0,0.0,0.0,5252.06,0.0,442.0,0.0,0.0,0.0,2626.03,0.0,442.0,0.0,0.0
2,266785,2011-03-15,2013-02-13,8,5.0,657.0,169.75,657.0,8,-5,5,23,3351.6,21613.8,2014-12-02,1358.0,3.72,03_04,4.0,0.0,3.0,1.0,5816.72,0.0,12661.09,3135.99,1454.18,0.0,4220.36,3135.99,0.27,0.0,0.59,0.15,1.0,1.0,0.0,0.0,4.0,2.0,682.89,5066.43,0.0,0.0,15864.49,0.0,682.89,5066.43,0.0,0.0,3966.12,0.0
3,266788,2011-09-13,2013-12-02,4,84.0,397.0,294.0,365.0,4,1,4,8,578.97,6092.97,2014-12-02,1176.0,3.22,03_04,1.0,1.0,0.0,2.0,1367.99,1485.12,0.0,3239.86,1367.99,1485.12,0.0,1619.93,0.22,0.24,0.0,0.53,1.0,1.0,0.0,0.0,2.0,0.0,1485.12,1367.99,0.0,0.0,3239.86,0.0,1485.12,1367.99,0.0,0.0,1619.93,0.0
4,266794,2011-03-18,2014-12-02,11,0.0,520.0,112.92,0.0,12,-1,4,30,2684.74,27981.92,2014-12-02,1355.0,3.71,03_04,1.0,3.0,1.0,7.0,718.25,9275.37,4610.06,13378.24,718.25,3091.79,4610.06,1911.18,0.03,0.33,0.16,0.48,2.0,2.0,2.0,4.0,2.0,0.0,5692.96,8380.32,4099.55,5328.31,4480.78,0.0,2846.48,4190.16,2049.78,1332.08,2240.39,0.0


In [48]:
df_trans_overall['Bags.prop'] = df_trans_overall['sum.Bags']/df_trans_overall['total_amt.sum']
df_trans_overall['Books.prop'] = df_trans_overall['sum.Books']/df_trans_overall['total_amt.sum']
df_trans_overall['Clothing.prop'] = df_trans_overall['sum.Clothing']/df_trans_overall['total_amt.sum']
df_trans_overall['Electronics.prop'] = df_trans_overall['sum.Electronics']/df_trans_overall['total_amt.sum']
df_trans_overall['Footwear.prop'] = df_trans_overall['sum.Footwear']/df_trans_overall['total_amt.sum']
df_trans_overall['Home and kitchen.prop'] = df_trans_overall['sum.Home and kitchen']/df_trans_overall['total_amt.sum']

In [49]:
df_trans_overall = pd.merge(df_trans_overall, df_product_subcat, on=['customer_id'], how='left')
df_trans_overall.head()

Unnamed: 0,customer_id,conversion_date,last_purchase_date,number_of_unique_purchase_days,duration.min,duration.max,duration.mean,duration.last,trans_id.count,qty.min,qty.max,qty.sum,cogs.sum,total_amt.sum,assessment_date,account_age.days,account_age.years,account_age.years.group,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,mean.Flagship store,mean.MBR,mean.TeleShop,mean.e-Shop,Flagship_store_spend.prop,MBR_spend.prop,TeleShop_spend.prop,e-Shop.prop,count.Bags,count.Books,count.Clothing,count.Electronics,count.Footwear,count.Home and kitchen,sum.Bags,sum.Books,sum.Clothing,sum.Electronics,sum.Footwear,sum.Home and kitchen,mean.Bags,mean.Books,mean.Clothing,mean.Electronics,mean.Footwear,mean.Home and kitchen,Bags.prop,Books.prop,Clothing.prop,Electronics.prop,Footwear.prop,Home and kitchen.prop,count.Academic,count.Audio and video,count.Bath,count.Cameras,count.Children,count.Comics,count.Computers,count.DIY,count.Fiction,count.Furnishing,count.Kids,count.Kitchen,count.Mens,count.Mobiles,count.Non-Fiction,count.Personal Appliances,count.Tools,count.Women,sum.Academic,sum.Audio and video,sum.Bath,sum.Cameras,sum.Children,sum.Comics,sum.Computers,sum.DIY,sum.Fiction,sum.Furnishing,sum.Kids,sum.Kitchen,sum.Mens,sum.Mobiles,sum.Non-Fiction,sum.Personal Appliances,sum.Tools,sum.Women,mean.Academic,mean.Audio and video,mean.Bath,mean.Cameras,mean.Children,mean.Comics,mean.Computers,mean.DIY,mean.Fiction,mean.Furnishing,mean.Kids,mean.Kitchen,mean.Mens,mean.Mobiles,mean.Non-Fiction,mean.Personal Appliances,mean.Tools,mean.Women
0,266783,2011-09-23,2013-09-02,5,1.0,456.0,233.2,456.0,5,-4,4,6,1405.53,3113.89,2014-12-02,1166.0,3.19,03_04,0.0,0.0,1.0,4.0,0.0,0.0,308.3,2805.59,0.0,0.0,308.3,701.4,0.0,0.0,0.1,0.9,0.0,1.0,2.0,0.0,2.0,0.0,0.0,1845.35,1268.54,0.0,0.0,0.0,0.0,1845.35,634.27,0.0,0.0,0.0,0.0,0.59,0.41,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1268.54,0.0,1845.35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,317.13,0.0,1845.35,0.0,0.0,0.0
1,266784,2012-04-12,2012-09-09,3,17.0,814.0,321.33,814.0,3,2,5,10,541.07,5694.06,2014-12-02,964.0,2.64,02_03,1.0,0.0,1.0,1.0,442.0,0.0,4279.66,972.4,442.0,0.0,4279.66,972.4,0.08,0.0,0.75,0.17,0.0,2.0,0.0,1.0,0.0,0.0,0.0,5252.06,0.0,442.0,0.0,0.0,0.0,2626.03,0.0,442.0,0.0,0.0,0.0,0.92,0.0,0.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,972.4,0.0,0.0,0.0,0.0,442.0,4279.66,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,972.4,0.0,0.0,0.0,0.0,442.0,4279.66,0.0,0.0,0.0
2,266785,2011-03-15,2013-02-13,8,5.0,657.0,169.75,657.0,8,-5,5,23,3351.6,21613.8,2014-12-02,1358.0,3.72,03_04,4.0,0.0,3.0,1.0,5816.72,0.0,12661.09,3135.99,1454.18,0.0,4220.36,3135.99,0.27,0.0,0.59,0.15,1.0,1.0,0.0,0.0,4.0,2.0,682.89,5066.43,0.0,0.0,15864.49,0.0,682.89,5066.43,0.0,0.0,3966.12,0.0,0.03,0.23,0.0,0.0,0.73,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5066.43,0.0,0.0,0.0,0.0,0.0,10047.76,0.0,6499.61,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5066.43,0.0,0.0,0.0,0.0,0.0,5023.88,0.0,2166.54,0.0,0.0,0.0,0.0,0.0
3,266788,2011-09-13,2013-12-02,4,84.0,397.0,294.0,365.0,4,1,4,8,578.97,6092.97,2014-12-02,1176.0,3.22,03_04,1.0,1.0,0.0,2.0,1367.99,1485.12,0.0,3239.86,1367.99,1485.12,0.0,1619.93,0.22,0.24,0.0,0.53,1.0,1.0,0.0,0.0,2.0,0.0,1485.12,1367.99,0.0,0.0,3239.86,0.0,1485.12,1367.99,0.0,0.0,1619.93,0.0,0.24,0.22,0.0,0.0,0.53,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1367.99,0.0,1927.12,0.0,2797.86,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1367.99,0.0,1927.12,0.0,1398.93,0.0,0.0,0.0,0.0,0.0
4,266794,2011-03-18,2014-12-02,11,0.0,520.0,112.92,0.0,12,-1,4,30,2684.74,27981.92,2014-12-02,1355.0,3.71,03_04,1.0,3.0,1.0,7.0,718.25,9275.37,4610.06,13378.24,718.25,3091.79,4610.06,1911.18,0.03,0.33,0.16,0.48,2.0,2.0,2.0,4.0,2.0,0.0,5692.96,8380.32,4099.55,5328.31,4480.78,0.0,2846.48,4190.16,2049.78,1332.08,2240.39,0.0,0.2,0.3,0.15,0.19,0.16,0.0,1.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,1.0,0.0,3.0,3964.74,718.25,0.0,0.0,4415.58,0.0,0.0,0.0,0.0,0.0,4099.55,0.0,2948.14,0.0,0.0,4610.06,0.0,7225.6,3964.74,239.42,0.0,0.0,4415.58,0.0,0.0,0.0,0.0,0.0,2049.78,0.0,2948.14,0.0,0.0,4610.06,0.0,2408.53


In [50]:
df_trans_overall['Academic.prop'] = df_trans_overall['sum.Academic']/df_trans_overall['total_amt.sum']
df_trans_overall['Audio and video.prop'] = df_trans_overall['sum.Audio and video']/df_trans_overall['total_amt.sum']
df_trans_overall['Bath.prop'] = df_trans_overall['sum.Bath']/df_trans_overall['total_amt.sum']
df_trans_overall['Cameras.prop'] = df_trans_overall['sum.Cameras']/df_trans_overall['total_amt.sum']
df_trans_overall['Children.prop'] = df_trans_overall['sum.Children']/df_trans_overall['total_amt.sum']
df_trans_overall['Comics.prop'] = df_trans_overall['sum.Comics']/df_trans_overall['total_amt.sum']
df_trans_overall['Computers.prop'] = df_trans_overall['sum.Computers']/df_trans_overall['total_amt.sum']
df_trans_overall['DIY.prop'] = df_trans_overall['sum.DIY']/df_trans_overall['total_amt.sum']
df_trans_overall['Fiction.prop'] = df_trans_overall['sum.Fiction']/df_trans_overall['total_amt.sum']
df_trans_overall['Furnishing.prop'] = df_trans_overall['sum.Furnishing']/df_trans_overall['total_amt.sum']
df_trans_overall['Kids.prop'] = df_trans_overall['sum.Kids']/df_trans_overall['total_amt.sum']
df_trans_overall['Kitchen.prop'] = df_trans_overall['sum.Kitchen']/df_trans_overall['total_amt.sum']
df_trans_overall['Mens.prop'] = df_trans_overall['sum.Mens']/df_trans_overall['total_amt.sum']
df_trans_overall['Mobiles.prop'] = df_trans_overall['sum.Mobiles']/df_trans_overall['total_amt.sum']
df_trans_overall['Non-Fiction.prop'] = df_trans_overall['sum.Non-Fiction']/df_trans_overall['total_amt.sum']
df_trans_overall['Personal Appliances.prop'] = df_trans_overall['sum.Personal Appliances']/df_trans_overall['total_amt.sum']
df_trans_overall['Tools.prop'] = df_trans_overall['sum.Tools']/df_trans_overall['total_amt.sum']
df_trans_overall['Women.prop'] = df_trans_overall['sum.Women']/df_trans_overall['total_amt.sum']

In [51]:
df_trans_overall.head()

Unnamed: 0,customer_id,conversion_date,last_purchase_date,number_of_unique_purchase_days,duration.min,duration.max,duration.mean,duration.last,trans_id.count,qty.min,qty.max,qty.sum,cogs.sum,total_amt.sum,assessment_date,account_age.days,account_age.years,account_age.years.group,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,mean.Flagship store,mean.MBR,mean.TeleShop,mean.e-Shop,Flagship_store_spend.prop,MBR_spend.prop,TeleShop_spend.prop,e-Shop.prop,count.Bags,count.Books,count.Clothing,count.Electronics,count.Footwear,count.Home and kitchen,sum.Bags,sum.Books,sum.Clothing,sum.Electronics,sum.Footwear,sum.Home and kitchen,mean.Bags,mean.Books,mean.Clothing,mean.Electronics,mean.Footwear,mean.Home and kitchen,Bags.prop,Books.prop,Clothing.prop,Electronics.prop,Footwear.prop,Home and kitchen.prop,count.Academic,count.Audio and video,count.Bath,count.Cameras,count.Children,count.Comics,count.Computers,count.DIY,count.Fiction,count.Furnishing,count.Kids,count.Kitchen,count.Mens,count.Mobiles,count.Non-Fiction,count.Personal Appliances,count.Tools,count.Women,sum.Academic,sum.Audio and video,sum.Bath,sum.Cameras,sum.Children,sum.Comics,sum.Computers,sum.DIY,sum.Fiction,sum.Furnishing,sum.Kids,sum.Kitchen,sum.Mens,sum.Mobiles,sum.Non-Fiction,sum.Personal Appliances,sum.Tools,sum.Women,mean.Academic,mean.Audio and video,mean.Bath,mean.Cameras,mean.Children,mean.Comics,mean.Computers,mean.DIY,mean.Fiction,mean.Furnishing,mean.Kids,mean.Kitchen,mean.Mens,mean.Mobiles,mean.Non-Fiction,mean.Personal Appliances,mean.Tools,mean.Women,Academic.prop,Audio and video.prop,Bath.prop,Cameras.prop,Children.prop,Comics.prop,Computers.prop,DIY.prop,Fiction.prop,Furnishing.prop,Kids.prop,Kitchen.prop,Mens.prop,Mobiles.prop,Non-Fiction.prop,Personal Appliances.prop,Tools.prop,Women.prop
0,266783,2011-09-23,2013-09-02,5,1.0,456.0,233.2,456.0,5,-4,4,6,1405.53,3113.89,2014-12-02,1166.0,3.19,03_04,0.0,0.0,1.0,4.0,0.0,0.0,308.3,2805.59,0.0,0.0,308.3,701.4,0.0,0.0,0.1,0.9,0.0,1.0,2.0,0.0,2.0,0.0,0.0,1845.35,1268.54,0.0,0.0,0.0,0.0,1845.35,634.27,0.0,0.0,0.0,0.0,0.59,0.41,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1268.54,0.0,1845.35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,317.13,0.0,1845.35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.41,0.0,0.59,0.0,0.0,0.0
1,266784,2012-04-12,2012-09-09,3,17.0,814.0,321.33,814.0,3,2,5,10,541.07,5694.06,2014-12-02,964.0,2.64,02_03,1.0,0.0,1.0,1.0,442.0,0.0,4279.66,972.4,442.0,0.0,4279.66,972.4,0.08,0.0,0.75,0.17,0.0,2.0,0.0,1.0,0.0,0.0,0.0,5252.06,0.0,442.0,0.0,0.0,0.0,2626.03,0.0,442.0,0.0,0.0,0.0,0.92,0.0,0.08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,972.4,0.0,0.0,0.0,0.0,442.0,4279.66,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,972.4,0.0,0.0,0.0,0.0,442.0,4279.66,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.17,0.0,0.0,0.0,0.0,0.08,0.75,0.0,0.0,0.0
2,266785,2011-03-15,2013-02-13,8,5.0,657.0,169.75,657.0,8,-5,5,23,3351.6,21613.8,2014-12-02,1358.0,3.72,03_04,4.0,0.0,3.0,1.0,5816.72,0.0,12661.09,3135.99,1454.18,0.0,4220.36,3135.99,0.27,0.0,0.59,0.15,1.0,1.0,0.0,0.0,4.0,2.0,682.89,5066.43,0.0,0.0,15864.49,0.0,682.89,5066.43,0.0,0.0,3966.12,0.0,0.03,0.23,0.0,0.0,0.73,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5066.43,0.0,0.0,0.0,0.0,0.0,10047.76,0.0,6499.61,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5066.43,0.0,0.0,0.0,0.0,0.0,5023.88,0.0,2166.54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.23,0.0,0.0,0.0,0.0,0.0,0.46,0.0,0.3,0.0,0.0,0.0,0.0,0.0
3,266788,2011-09-13,2013-12-02,4,84.0,397.0,294.0,365.0,4,1,4,8,578.97,6092.97,2014-12-02,1176.0,3.22,03_04,1.0,1.0,0.0,2.0,1367.99,1485.12,0.0,3239.86,1367.99,1485.12,0.0,1619.93,0.22,0.24,0.0,0.53,1.0,1.0,0.0,0.0,2.0,0.0,1485.12,1367.99,0.0,0.0,3239.86,0.0,1485.12,1367.99,0.0,0.0,1619.93,0.0,0.24,0.22,0.0,0.0,0.53,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1367.99,0.0,1927.12,0.0,2797.86,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1367.99,0.0,1927.12,0.0,1398.93,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.22,0.0,0.32,0.0,0.46,0.0,0.0,0.0,0.0,0.0
4,266794,2011-03-18,2014-12-02,11,0.0,520.0,112.92,0.0,12,-1,4,30,2684.74,27981.92,2014-12-02,1355.0,3.71,03_04,1.0,3.0,1.0,7.0,718.25,9275.37,4610.06,13378.24,718.25,3091.79,4610.06,1911.18,0.03,0.33,0.16,0.48,2.0,2.0,2.0,4.0,2.0,0.0,5692.96,8380.32,4099.55,5328.31,4480.78,0.0,2846.48,4190.16,2049.78,1332.08,2240.39,0.0,0.2,0.3,0.15,0.19,0.16,0.0,1.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,1.0,0.0,3.0,3964.74,718.25,0.0,0.0,4415.58,0.0,0.0,0.0,0.0,0.0,4099.55,0.0,2948.14,0.0,0.0,4610.06,0.0,7225.6,3964.74,239.42,0.0,0.0,4415.58,0.0,0.0,0.0,0.0,0.0,2049.78,0.0,2948.14,0.0,0.0,4610.06,0.0,2408.53,0.14,0.03,0.0,0.0,0.16,0.0,0.0,0.0,0.0,0.0,0.15,0.0,0.11,0.0,0.0,0.16,0.0,0.26


### Customer Data

In [52]:
df_customer['customer_status'] = np.where((df_customer['customer_id'].isin(converted_customers)), 'converted', 'voluntarily churned - not converted')
df_customer['customer_status'].value_counts(dropna=False)

converted                              5506
voluntarily churned - not converted     141
Name: customer_status, dtype: int64

In [53]:
df_customer['dob'] = pd.to_datetime(df_customer['dob'])
df_customer['dob'].sort_values().min()
df_customer['dob'].sort_values().max()

Timestamp('1970-01-02 00:00:00')

Timestamp('1992-12-29 00:00:00')

In [54]:
df_customer.tail()

Unnamed: 0,customer_id,dob,gender,city_code,customer_status
5642,274474,1992-12-19,M,2.0,converted
5643,267666,1992-12-24,M,6.0,converted
5644,270476,1992-12-25,F,3.0,converted
5645,269626,1992-12-27,F,5.0,converted
5646,274308,1992-12-29,F,5.0,converted


### City Data

In [55]:
df_cities.head(2)
df_cities['city'] = df_cities['city']

Unnamed: 0,city_code,city
0,1,Arlington
1,2,Miami


In [56]:
city_list = df_cities['city'].to_list()
len(city_list)
city_list

10

['Arlington',
 'Miami',
 'Chicago',
 'Pittsburgh',
 'Dayton',
 'Akron',
 'Philadelphia',
 'Dallas',
 'Houston',
 'Los Angeles']

In [57]:
state_list = []
len(state_list)

gc = geonamescache.GeonamesCache()

for city in city_list:
    info = gc.get_cities_by_name(city)
    if info == []:
        state_list.append(np.nan)
    else:
        for dictionary in list(info[0].values()):
            state = list(dictionary.values())
            state_list.append(state[7])
#             print(city, state[7])
df_cities['state_code'] = state_list

0

In [58]:
df_cities['state_code'].value_counts(dropna=False)

TX    3
PA    2
OH    2
IL    1
CA    1
FL    1
Name: state_code, dtype: int64

In [59]:
df_cities

Unnamed: 0,city_code,city,state_code
0,1,Arlington,TX
1,2,Miami,FL
2,3,Chicago,IL
3,4,Pittsburgh,PA
4,5,Dayton,OH
5,6,Akron,OH
6,7,Philadelphia,PA
7,8,Dallas,TX
8,9,Houston,TX
9,10,Los Angeles,CA


In [60]:
df_cities = pd.merge(df_cities, df_us_regions, left_on=['state_code'], right_on=['state code'], how='left' )
df_cities

Unnamed: 0,city_code,city,state_code,state,state code,region,division
0,1,Arlington,TX,Texas,TX,South,West South Central
1,2,Miami,FL,Florida,FL,South,South Atlantic
2,3,Chicago,IL,Illinois,IL,Midwest,East North Central
3,4,Pittsburgh,PA,Pennsylvania,PA,Northeast,Middle Atlantic
4,5,Dayton,OH,Ohio,OH,Midwest,East North Central
5,6,Akron,OH,Ohio,OH,Midwest,East North Central
6,7,Philadelphia,PA,Pennsylvania,PA,Northeast,Middle Atlantic
7,8,Dallas,TX,Texas,TX,South,West South Central
8,9,Houston,TX,Texas,TX,South,West South Central
9,10,Los Angeles,CA,California,CA,West,Pacific


In [61]:
df_customer = pd.merge(df_customer, df_cities, on=['city_code'],  how='left' )
df_customer.head(2)

Unnamed: 0,customer_id,dob,gender,city_code,customer_status,city,state_code,state,state code,region,division
0,268408,1970-02-01,M,4.0,converted,Pittsburgh,PA,Pennsylvania,PA,Northeast,Middle Atlantic
1,269696,1970-07-01,F,8.0,converted,Dallas,TX,Texas,TX,South,West South Central


In [62]:
df_transactions['trans_date.day'] = df_transactions['trans_date'].dt.day
df_transactions['trans_date.month_num'] = df_transactions['trans_date'].dt.month.map("{:02}".format)
df_transactions['trans_date.year'] = df_transactions['trans_date'].dt.year
df_transactions['trans_date.year_month'] = df_transactions['trans_date'].dt.year.map(str) + '_' + df_transactions['trans_date'].dt.month.map("{:02}".format)
df_transactions['trans_date.hour'] = df_transactions['trans_date'].dt.hour
df_transactions['trans_date.weekday'] = df_transactions['trans_date'].dt.day_name()
df_transactions['trans_date.week_of_year'] = df_transactions['trans_date'].dt.week.map("{:02}".format)

In [63]:
df_transactions['trans_date.weekday'].head()

18130       Friday
18114     Saturday
10030       Sunday
7515     Wednesday
7722        Monday
Name: trans_date.weekday, dtype: object

In [64]:
month_name = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_num = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
month_dict = dict(zip(month_num, month_name))

for k, v in month_dict.items():
    mask = df_transactions['trans_date.month_num'].str.contains(k, case=True)

    df_transactions.loc[mask,'trans_date.month'] = v

In [65]:
df_transactions['trans_date.month'] = df_transactions['trans_date.month_num'] + '_' + df_transactions['trans_date.month'].map(str)

In [66]:
df_transactions['trans_date.month'].value_counts(dropna=False)

01_Jan    2031
10_Oct    2019
09_Sep    1986
03_Mar    1972
12_Dec    1932
08_Aug    1919
07_Jul    1917
02_Feb    1888
11_Nov    1876
05_May    1853
04_Apr    1842
06_Jun    1818
Name: trans_date.month, dtype: int64

In [67]:
weekday_name = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
weekday_num = ['01', '02', '03', '04', '05', '06', '07']
weekday_dict = dict(zip(weekday_name, weekday_num))

for k, v in weekday_dict.items():
    mask = df_transactions['trans_date.weekday'].str.contains(k, case=True)

    df_transactions.loc[mask, 'trans_date.weekday_num'] = v

In [68]:
def bin_week(x):
    if ( x>= 1) and (x <= 7):
        return 'month.week1'

    elif (x >= 8) and (x <= 14):
        return 'month.week2'

    elif (x >= 15) and (x <= 21):
        return 'month.week3'

    else:
        return 'month.week4'

In [69]:
df_transactions['trans_date.week_of_month'] = df_transactions['trans_date'].dt.day.apply(bin_week)

In [70]:
df_transactions['trans_date.weekday'] = df_transactions['trans_date.weekday_num'] + '_' + df_transactions['trans_date.weekday'].map(str)

In [71]:
df_transactions['trans_date.weekday'].value_counts(dropna=False)

01_Sunday       3417
03_Tuesday      3315
05_Thursday     3305
07_Saturday     3278
06_Friday       3250
02_Monday       3245
04_Wednesday    3243
Name: trans_date.weekday, dtype: int64

In [72]:
# df_transactions['trans_date.hour'].describe()
# df_transactions['trans_date.time_of_day']

In [73]:
df_transactions = df_transactions.sort_values(['customer_id', 'trans_date'])

In [74]:
df_first_purchase_date = df_transactions.groupby(['customer_id']).agg({'trans_date':'first'})

In [75]:
df_first_purchase_date.rename(columns={'trans_date':'first_purchase_date'}, inplace=True)

In [76]:
df_first_purchase_date.shape

(5506, 1)

In [77]:
df_transactions = pd.merge(df_transactions, df_first_purchase_date, on='customer_id', how='left')

In [78]:
df_transactions['first_purchase.cohort'] = df_transactions['first_purchase_date'].dt.year.map(str) + "_" + df_transactions['first_purchase_date'].dt.month.map("{:02}".format)

In [79]:
df_transactions['purchase_date.cohort'] = df_transactions['trans_date'].dt.year.map(str) + "_" + df_transactions['trans_date'].dt.month.map("{:02}".format)

In [80]:
df_earliest_purchase_date = df_first_purchase_date.copy()
df_earliest_purchase_date.rename(columns={'first_purchase_date':'trans_date'}, inplace=True)

In [81]:
df_earliest_purchase_date['customer_type.period'] = 'new'

In [82]:
df_transactions = pd.merge(df_transactions, df_earliest_purchase_date, on=['customer_id', 'trans_date'], how='left')

In [83]:
df_transactions['customer_type.period']  = np.where(df_transactions['customer_type.period'].isnull(), 'existing', df_transactions['customer_type.period'])

In [84]:
df_transactions['qty_negative'] = np.where((df_transactions['qty'] < 0), 'yes', 'no')

In [85]:
trans_id_returned = df_transactions[df_transactions['qty_negative'] == 'yes']['trans_id'].unique().tolist()

In [86]:
df_transactions['returned_item_before'] = np.where((df_transactions['trans_id'].isin(trans_id_returned)), 'yes', 'no')

In [87]:
df_transactions.drop(['qty_negative'], axis=1, inplace=True)

In [88]:
df_transactions.head()

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,cogs,total_amt,store_type,returned_item,drop_record,prod_cat,prod_subcat,counter,assessment_date,duration,trans_date.day,trans_date.month_num,trans_date.year,trans_date.year_month,trans_date.hour,trans_date.weekday,trans_date.week_of_year,trans_date.month,trans_date.weekday_num,trans_date.week_of_month,first_purchase_date,first_purchase.cohort,purchase_date.cohort,customer_type.period,returned_item_before
0,25890929042,266783,2011-09-23,1,2,4,1321,554.82,5838.82,e-Shop,yes,yes,Footwear,Mens,1,2014-12-02,1.0,23,9,2011,2011_09,0,06_Friday,38,09_Sep,6,month.week4,2011-09-23,2011_09,2011_09,new,yes
1,25890929042,266783,2011-09-24,1,2,-4,-1321,554.82,-5838.82,e-Shop,yes,no,Footwear,Mens,1,2014-12-02,393.0,24,9,2011,2011_09,0,07_Saturday,38,09_Sep,7,month.week4,2011-09-23,2011_09,2011_09,existing,yes
2,98477711300,266783,2012-10-21,4,1,3,93,29.3,308.3,TeleShop,no,yes,Clothing,Mens,1,2014-12-02,122.0,21,10,2012,2012_10,0,01_Sunday,42,10_Oct,1,month.week3,2011-09-23,2011_09,2012_10,existing,no
3,8410316370,266783,2013-02-20,4,1,1,869,91.25,960.25,e-Shop,no,yes,Clothing,Mens,1,2014-12-02,194.0,20,2,2013,2013_02,0,04_Wednesday,8,02_Feb,4,month.week3,2011-09-23,2011_09,2013_02,existing,no
4,16999552161,266783,2013-09-02,10,5,2,835,175.35,1845.35,e-Shop,no,yes,Books,Non-Fiction,1,2014-12-02,456.0,2,9,2013,2013_09,0,02_Monday,36,09_Sep,2,month.week1,2011-09-23,2011_09,2013_09,existing,no


In [89]:
df_transactions.tail()

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,cogs,total_amt,store_type,returned_item,drop_record,prod_cat,prod_subcat,counter,assessment_date,duration,trans_date.day,trans_date.month_num,trans_date.year,trans_date.year_month,trans_date.hour,trans_date.weekday,trans_date.week_of_year,trans_date.month,trans_date.weekday_num,trans_date.week_of_month,first_purchase_date,first_purchase.cohort,purchase_date.cohort,customer_type.period,returned_item_before
23048,94712826085,275264,2011-05-08,10,5,5,221,116.03,1221.03,TeleShop,no,yes,Books,Non-Fiction,1,2014-12-02,94.0,8,5,2011,2011_05,0,01_Sunday,18,05_May,1,month.week2,2011-05-08,2011_05,2011_05,new,no
23049,81382444243,275264,2011-08-10,12,6,4,587,246.54,2594.54,e-Shop,no,yes,Home and kitchen,Tools,1,2014-12-02,1210.0,10,8,2011,2011_08,0,04_Wednesday,32,08_Aug,4,month.week2,2011-05-08,2011_05,2011_08,existing,no
23050,7214136016,275265,2011-12-17,1,4,1,222,23.31,245.31,TeleShop,no,yes,Bags,Mens,1,2014-12-02,9.0,17,12,2011,2011_12,0,07_Saturday,50,12_Dec,7,month.week3,2011-12-17,2011_12,2011_12,new,no
23051,38961184788,275265,2011-12-26,12,5,3,188,59.22,623.22,e-Shop,no,yes,Books,Academic,1,2014-12-02,434.0,26,12,2011,2011_12,0,02_Monday,52,12_Dec,2,month.week4,2011-12-17,2011_12,2011_12,existing,no
23052,24113900219,275265,2013-03-04,2,6,3,719,226.49,2383.49,Flagship store,no,yes,Home and kitchen,Furnishing,1,2014-12-02,638.0,4,3,2013,2013_03,0,02_Monday,10,03_Mar,2,month.week1,2011-12-17,2011_12,2013_03,existing,no


In [90]:
df_transactions['returned_item'].value_counts(dropna=False)

no     18821
yes     4232
Name: returned_item, dtype: int64

In [91]:
df_transactions['returned_item_before'].value_counts(dropna=False)

no     18819
yes     4234
Name: returned_item_before, dtype: int64

In [92]:
df_transactions[(df_transactions['returned_item'] == 'yes') & (df_transactions['returned_item_before'] == 'yes')].head(2)

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,cogs,total_amt,store_type,returned_item,drop_record,prod_cat,prod_subcat,counter,assessment_date,duration,trans_date.day,trans_date.month_num,trans_date.year,trans_date.year_month,trans_date.hour,trans_date.weekday,trans_date.week_of_year,trans_date.month,trans_date.weekday_num,trans_date.week_of_month,first_purchase_date,first_purchase.cohort,purchase_date.cohort,customer_type.period,returned_item_before
0,25890929042,266783,2011-09-23,1,2,4,1321,554.82,5838.82,e-Shop,yes,yes,Footwear,Mens,1,2014-12-02,1.0,23,9,2011,2011_09,0,06_Friday,38,09_Sep,6,month.week4,2011-09-23,2011_09,2011_09,new,yes
1,25890929042,266783,2011-09-24,1,2,-4,-1321,554.82,-5838.82,e-Shop,yes,no,Footwear,Mens,1,2014-12-02,393.0,24,9,2011,2011_09,0,07_Saturday,38,09_Sep,7,month.week4,2011-09-23,2011_09,2011_09,existing,yes


In [93]:
df_transactions[df_transactions['customer_id'] == 268624]

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,cogs,total_amt,store_type,returned_item,drop_record,prod_cat,prod_subcat,counter,assessment_date,duration,trans_date.day,trans_date.month_num,trans_date.year,trans_date.year_month,trans_date.hour,trans_date.weekday,trans_date.week_of_year,trans_date.month,trans_date.weekday_num,trans_date.week_of_month,first_purchase_date,first_purchase.cohort,purchase_date.cohort,customer_type.period,returned_item_before
4849,36332303449,268624,2011-01-02,10,6,-4,-295,123.9,-1303.9,Flagship store,yes,no,Home and kitchen,Kitchen,1,2014-12-02,25.0,2,1,2011,2011_01,0,01_Sunday,52,01_Jan,1,month.week1,2011-01-02,2011_01,2011_01,new,yes
4850,36332303449,268624,2011-01-27,10,6,4,295,123.9,1303.9,Flagship store,yes,yes,Home and kitchen,Kitchen,1,2014-12-02,53.0,27,1,2011,2011_01,0,05_Thursday,4,01_Jan,5,month.week4,2011-01-02,2011_01,2011_01,existing,yes
4851,66278698494,268624,2011-03-21,1,2,1,418,43.89,461.89,TeleShop,no,yes,Footwear,Mens,1,2014-12-02,186.0,21,3,2011,2011_03,0,02_Monday,12,03_Mar,2,month.week3,2011-01-02,2011_01,2011_03,existing,no
4852,89509659612,268624,2011-09-23,8,3,5,560,294.0,3094.0,MBR,no,yes,Electronics,Personal Appliances,1,2014-12-02,383.0,23,9,2011,2011_09,0,06_Friday,38,09_Sep,6,month.week4,2011-01-02,2011_01,2011_09,existing,no
4853,71442479722,268624,2012-10-10,2,6,3,1477,465.25,4896.26,Flagship store,no,yes,Home and kitchen,Furnishing,1,2014-12-02,168.0,10,10,2012,2012_10,0,04_Wednesday,41,10_Oct,4,month.week2,2011-01-02,2011_01,2012_10,existing,no
4854,26264839788,268624,2013-03-27,1,1,2,710,149.1,1569.1,e-Shop,no,yes,Clothing,Women,1,2014-12-02,112.0,27,3,2013,2013_03,0,04_Wednesday,13,03_Mar,4,month.week4,2011-01-02,2011_01,2013_03,existing,no
4855,39847252322,268624,2013-07-17,1,2,5,1422,746.55,7856.55,Flagship store,no,yes,Footwear,Mens,1,2014-12-02,63.0,17,7,2013,2013_07,0,04_Wednesday,29,07_Jul,4,month.week3,2011-01-02,2011_01,2013_07,existing,no
4856,31803201960,268624,2013-09-18,11,5,5,1437,754.42,7939.43,e-Shop,no,yes,Books,Children,1,2014-12-02,440.0,18,9,2013,2013_09,0,04_Wednesday,38,09_Sep,4,month.week3,2011-01-02,2011_01,2013_09,existing,no


In [94]:
customers_who_returned_items_before = list(df_transactions[df_transactions['returned_item'] == 'yes']['customer_id'].unique());
len(customers_who_returned_items_before)

1726

### Master File

In [95]:
df_master = pd.merge(df_customer, df_trans_overall,  on=['customer_id'], how='left')
# df_master['assessment_date'] = df_transactions['trans_date'].sort_values().max()
mt.check_unique_no(df_master, ['customer_id'])
df_master.head()

Data has 5647 unique customer_id


Unnamed: 0,customer_id,dob,gender,city_code,customer_status,city,state_code,state,state code,region,division,conversion_date,last_purchase_date,number_of_unique_purchase_days,duration.min,duration.max,duration.mean,duration.last,trans_id.count,qty.min,qty.max,qty.sum,cogs.sum,total_amt.sum,assessment_date,account_age.days,account_age.years,account_age.years.group,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,mean.Flagship store,mean.MBR,mean.TeleShop,mean.e-Shop,Flagship_store_spend.prop,MBR_spend.prop,TeleShop_spend.prop,e-Shop.prop,count.Bags,count.Books,count.Clothing,count.Electronics,count.Footwear,count.Home and kitchen,sum.Bags,sum.Books,sum.Clothing,sum.Electronics,sum.Footwear,sum.Home and kitchen,mean.Bags,mean.Books,mean.Clothing,mean.Electronics,mean.Footwear,mean.Home and kitchen,Bags.prop,Books.prop,Clothing.prop,Electronics.prop,Footwear.prop,Home and kitchen.prop,count.Academic,count.Audio and video,count.Bath,count.Cameras,count.Children,count.Comics,count.Computers,count.DIY,count.Fiction,count.Furnishing,count.Kids,count.Kitchen,count.Mens,count.Mobiles,count.Non-Fiction,count.Personal Appliances,count.Tools,count.Women,sum.Academic,sum.Audio and video,sum.Bath,sum.Cameras,sum.Children,sum.Comics,sum.Computers,sum.DIY,sum.Fiction,sum.Furnishing,sum.Kids,sum.Kitchen,sum.Mens,sum.Mobiles,sum.Non-Fiction,sum.Personal Appliances,sum.Tools,sum.Women,mean.Academic,mean.Audio and video,mean.Bath,mean.Cameras,mean.Children,mean.Comics,mean.Computers,mean.DIY,mean.Fiction,mean.Furnishing,mean.Kids,mean.Kitchen,mean.Mens,mean.Mobiles,mean.Non-Fiction,mean.Personal Appliances,mean.Tools,mean.Women,Academic.prop,Audio and video.prop,Bath.prop,Cameras.prop,Children.prop,Comics.prop,Computers.prop,DIY.prop,Fiction.prop,Furnishing.prop,Kids.prop,Kitchen.prop,Mens.prop,Mobiles.prop,Non-Fiction.prop,Personal Appliances.prop,Tools.prop,Women.prop
0,268408,1970-02-01,M,4.0,converted,Pittsburgh,PA,Pennsylvania,PA,Northeast,Middle Atlantic,2011-12-07,2014-01-13,11.0,4.0,323.0,99.18,323.0,11.0,-5.0,5.0,33.0,2526.93,24272.43,2014-12-02,1091.0,2.99,02_03,5.0,1.0,3.0,2.0,6217.84,6491.88,3894.02,7668.7,1243.57,6491.88,1298.01,3834.35,0.26,0.27,0.16,0.32,3.0,1.0,1.0,2.0,2.0,2.0,1064.11,1033.17,890.63,7668.7,7526.15,6089.66,354.7,1033.17,890.63,3834.35,3763.08,3044.83,0.04,0.04,0.04,0.32,0.31,0.25,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,2.0,3.0,0.0,0.0,0.0,0.0,2.0,0.0,2873.0,0.0,4795.7,0.0,0.0,0.0,0.0,1033.17,0.0,1034.28,6089.66,6491.88,0.0,0.0,0.0,0.0,1954.74,0.0,2873.0,0.0,4795.7,0.0,0.0,0.0,0.0,1033.17,0.0,1034.28,3044.83,2163.96,0.0,0.0,0.0,0.0,977.37,0.0,0.12,0.0,0.2,0.0,0.0,0.0,0.0,0.04,0.0,0.04,0.25,0.27,0.0,0.0,0.0,0.0,0.08
1,269696,1970-07-01,F,8.0,converted,Dallas,TX,Texas,TX,South,West South Central,2011-09-18,2012-08-04,3.0,111.0,850.0,390.33,850.0,3.0,-4.0,4.0,3.0,1043.91,4488.51,2014-12-02,1171.0,3.21,03_04,0.0,3.0,0.0,0.0,0.0,4488.51,0.0,0.0,0.0,1496.17,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,4488.51,0.0,0.0,0.0,0.0,0.0,1496.17,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4488.51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4488.51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,268159,1970-08-01,F,8.0,converted,Dallas,TX,Texas,TX,South,West South Central,2012-06-02,2013-03-31,6.0,21.0,611.0,152.17,611.0,6.0,1.0,5.0,19.0,1699.85,17888.84,2014-12-02,913.0,2.5,02_03,0.0,1.0,1.0,4.0,0.0,1182.35,7458.75,9247.74,0.0,1182.35,7458.75,2311.94,0.0,0.07,0.42,0.52,1.0,1.0,2.0,0.0,0.0,2.0,779.02,1182.35,8141.64,0.0,0.0,7785.83,779.02,1182.35,4070.82,0.0,0.0,3892.91,0.04,0.07,0.46,0.0,0.0,0.44,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,327.08,0.0,1182.35,0.0,0.0,0.0,0.0,0.0,8141.64,7458.75,779.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,327.08,0.0,1182.35,0.0,0.0,0.0,0.0,0.0,4070.82,7458.75,779.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.07,0.0,0.0,0.0,0.0,0.0,0.46,0.42,0.04,0.0,0.0,0.0,0.0,0.0
3,270181,1970-10-01,F,2.0,converted,Miami,FL,Florida,FL,South,South Atlantic,2011-03-18,2014-09-01,10.0,2.0,306.0,135.5,92.0,10.0,-4.0,5.0,16.0,2440.31,14869.99,2014-12-02,1355.0,3.71,03_04,3.0,1.0,1.0,5.0,8428.94,408.85,617.7,5414.5,2809.65,408.85,617.7,1082.9,0.57,0.03,0.04,0.36,0.0,4.0,1.0,2.0,1.0,2.0,0.0,1026.55,5348.2,6532.76,1962.48,0.0,0.0,256.64,5348.2,3266.38,1962.48,0.0,0.0,0.07,0.36,0.44,0.13,0.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,408.85,0.0,0.0,0.0,617.7,0.0,0.0,0.0,0.0,0.0,5348.2,0.0,1962.48,0.0,0.0,6532.76,0.0,0.0,408.85,0.0,0.0,0.0,205.9,0.0,0.0,0.0,0.0,0.0,5348.2,0.0,1962.48,0.0,0.0,3266.38,0.0,0.0,0.03,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.36,0.0,0.13,0.0,0.0,0.44,0.0,0.0
4,268073,1970-11-01,M,1.0,converted,Arlington,TX,Texas,TX,South,West South Central,2011-11-19,2013-12-30,3.0,1.0,771.0,369.67,337.0,3.0,-1.0,5.0,5.0,896.28,7956.0,2014-12-02,1109.0,3.04,03_04,0.0,3.0,0.0,0.0,0.0,7956.0,0.0,0.0,0.0,2652.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,7956.0,0.0,0.0,0.0,0.0,0.0,7956.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7956.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7956.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [96]:
df_master['customer_status'].value_counts(dropna=False)

converted                              5506
voluntarily churned - not converted     141
Name: customer_status, dtype: int64

In [97]:
df_master['conversion_date.year'] = df_master['conversion_date'].dt.year

In [98]:
df_master['conversion_date.month_num'] = df_master['conversion_date'].dt.month.map("{:02}".format)

In [99]:
df_master['conversion_date.month_num'].value_counts(dropna=False)

3.0     689
2.0     648
1.0     621
4.0     568
5.0     500
6.0     453
7.0     436
8.0     387
9.0     356
10.0    334
11.0    284
12.0    230
nan     141
Name: conversion_date.month_num, dtype: int64

In [100]:
# df_master['conversion_date.month_num'] = df_master['conversion_date.month_num'].astype('O').astype('int64')

In [101]:
month_name = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_num = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']
month_dict = dict(zip(month_num, month_name))

for k, v in month_dict.items():
    mask = df_master['conversion_date.month_num'].str.contains(k, case=True)

    df_master.loc[mask,'conversion_date.month'] = v

In [102]:
df_master['conversion_date.month'] = df_master['conversion_date.month_num'] + '_' + df_master['conversion_date.month'].map(str)

In [103]:
df_master['conversion_date.month'].value_counts(dropna=False)

3.0_Mar     689
2.0_Feb     648
1.0_Jan     621
4.0_Apr     568
5.0_May     500
6.0_Jun     453
7.0_Jul     436
8.0_Aug     387
9.0_Sep     356
10.0_Oct    334
11.0_Nov    284
12.0_Dec    230
nan_nan     141
Name: conversion_date.month, dtype: int64

In [104]:
df_master['conversion_date.year_month'] = df_master['conversion_date.year'].map(str) + '_' + df_master['conversion_date.month_num'].map(str)

In [105]:
df_master['conversion_date.year_month'].value_counts(dropna=False).head()

2011.0_3.0    485
2011.0_2.0    461
2011.0_4.0    414
2011.0_1.0    341
2011.0_5.0    335
Name: conversion_date.year_month, dtype: int64

In [106]:
df_master.head(1)

Unnamed: 0,customer_id,dob,gender,city_code,customer_status,city,state_code,state,state code,region,division,conversion_date,last_purchase_date,number_of_unique_purchase_days,duration.min,duration.max,duration.mean,duration.last,trans_id.count,qty.min,qty.max,qty.sum,cogs.sum,total_amt.sum,assessment_date,account_age.days,account_age.years,account_age.years.group,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,mean.Flagship store,mean.MBR,mean.TeleShop,mean.e-Shop,Flagship_store_spend.prop,MBR_spend.prop,TeleShop_spend.prop,e-Shop.prop,count.Bags,count.Books,count.Clothing,count.Electronics,count.Footwear,count.Home and kitchen,sum.Bags,sum.Books,sum.Clothing,sum.Electronics,sum.Footwear,sum.Home and kitchen,mean.Bags,mean.Books,mean.Clothing,mean.Electronics,mean.Footwear,mean.Home and kitchen,Bags.prop,Books.prop,Clothing.prop,Electronics.prop,Footwear.prop,Home and kitchen.prop,count.Academic,count.Audio and video,count.Bath,count.Cameras,count.Children,count.Comics,count.Computers,count.DIY,count.Fiction,count.Furnishing,count.Kids,count.Kitchen,count.Mens,count.Mobiles,count.Non-Fiction,count.Personal Appliances,count.Tools,count.Women,sum.Academic,sum.Audio and video,sum.Bath,sum.Cameras,sum.Children,sum.Comics,sum.Computers,sum.DIY,sum.Fiction,sum.Furnishing,sum.Kids,sum.Kitchen,sum.Mens,sum.Mobiles,sum.Non-Fiction,sum.Personal Appliances,sum.Tools,sum.Women,mean.Academic,mean.Audio and video,mean.Bath,mean.Cameras,mean.Children,mean.Comics,mean.Computers,mean.DIY,mean.Fiction,mean.Furnishing,mean.Kids,mean.Kitchen,mean.Mens,mean.Mobiles,mean.Non-Fiction,mean.Personal Appliances,mean.Tools,mean.Women,Academic.prop,Audio and video.prop,Bath.prop,Cameras.prop,Children.prop,Comics.prop,Computers.prop,DIY.prop,Fiction.prop,Furnishing.prop,Kids.prop,Kitchen.prop,Mens.prop,Mobiles.prop,Non-Fiction.prop,Personal Appliances.prop,Tools.prop,Women.prop,conversion_date.year,conversion_date.month_num,conversion_date.month,conversion_date.year_month
0,268408,1970-02-01,M,4.0,converted,Pittsburgh,PA,Pennsylvania,PA,Northeast,Middle Atlantic,2011-12-07,2014-01-13,11.0,4.0,323.0,99.18,323.0,11.0,-5.0,5.0,33.0,2526.93,24272.43,2014-12-02,1091.0,2.99,02_03,5.0,1.0,3.0,2.0,6217.84,6491.88,3894.02,7668.7,1243.57,6491.88,1298.01,3834.35,0.26,0.27,0.16,0.32,3.0,1.0,1.0,2.0,2.0,2.0,1064.11,1033.17,890.63,7668.7,7526.15,6089.66,354.7,1033.17,890.63,3834.35,3763.08,3044.83,0.04,0.04,0.04,0.32,0.31,0.25,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,2.0,3.0,0.0,0.0,0.0,0.0,2.0,0.0,2873.0,0.0,4795.7,0.0,0.0,0.0,0.0,1033.17,0.0,1034.28,6089.66,6491.88,0.0,0.0,0.0,0.0,1954.74,0.0,2873.0,0.0,4795.7,0.0,0.0,0.0,0.0,1033.17,0.0,1034.28,3044.83,2163.96,0.0,0.0,0.0,0.0,977.37,0.0,0.12,0.0,0.2,0.0,0.0,0.0,0.0,0.04,0.0,0.04,0.25,0.27,0.0,0.0,0.0,0.0,0.08,2011.0,12.0,12.0_Dec,2011.0_12.0


In [107]:
df_master['biological_age.actual'] = (df_master['assessment_date'] - df_master['dob'])/np.timedelta64(1, 'Y')

In [108]:
df_master['biological_age.actual'].describe()

count   5506.00
mean      33.34
std        6.61
min       21.93
25%       27.52
50%       33.37
75%       39.09
max       44.92
Name: biological_age.actual, dtype: float64

In [109]:
df_master['biological_age'] = np.round(df_master['biological_age.actual'])

In [110]:
df_master['biological_age'].describe()

count   5506.00
mean      33.34
std        6.62
min       22.00
25%       28.00
50%       33.00
75%       39.00
max       45.00
Name: biological_age, dtype: float64

In [111]:
age_bins =  [21, 25, 30, 35, 40, 45]
labels = ['22_25','25_30', '30_35', '35_40', '40_45']
df_master['biological_age.group'] = pd.cut(df_master['biological_age'], age_bins, labels = labels,include_lowest = True)

In [112]:
df_master['number_of_unique_purchase_days'].describe()

count   5506.00
mean       4.18
std        2.11
min        1.00
25%        3.00
50%        4.00
75%        6.00
max       13.00
Name: number_of_unique_purchase_days, dtype: float64

In [113]:
df_master.drop(['state code', 'city_code'], axis=1, inplace=True)

In [114]:
df_master['number_of_unique_purchase_days'] =df_master['number_of_unique_purchase_days'].replace(np.nan, 0)

In [115]:
df_master['repeat_purchaser'] = np.where((df_master['number_of_unique_purchase_days'] > 1), 'yes',
                                      np.where((df_master['number_of_unique_purchase_days']== 0),'never purchased',
                                        'no'))

In [116]:
df_master['returned_item_before'] = np.where(df_master['customer_id'].isin(customers_who_returned_items_before), 'yes', 'no')

In [117]:
df_master['returned_item_before'].value_counts(dropna=False)

no     3921
yes    1726
Name: returned_item_before, dtype: int64

In [118]:
df_master[df_master['customer_id'] == 266783]

Unnamed: 0,customer_id,dob,gender,customer_status,city,state_code,state,region,division,conversion_date,last_purchase_date,number_of_unique_purchase_days,duration.min,duration.max,duration.mean,duration.last,trans_id.count,qty.min,qty.max,qty.sum,cogs.sum,total_amt.sum,assessment_date,account_age.days,account_age.years,account_age.years.group,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,mean.Flagship store,mean.MBR,mean.TeleShop,mean.e-Shop,Flagship_store_spend.prop,MBR_spend.prop,TeleShop_spend.prop,e-Shop.prop,count.Bags,count.Books,count.Clothing,count.Electronics,count.Footwear,count.Home and kitchen,sum.Bags,sum.Books,sum.Clothing,sum.Electronics,sum.Footwear,sum.Home and kitchen,mean.Bags,mean.Books,mean.Clothing,mean.Electronics,mean.Footwear,mean.Home and kitchen,Bags.prop,Books.prop,Clothing.prop,Electronics.prop,Footwear.prop,Home and kitchen.prop,count.Academic,count.Audio and video,count.Bath,count.Cameras,count.Children,count.Comics,count.Computers,count.DIY,count.Fiction,count.Furnishing,count.Kids,count.Kitchen,count.Mens,count.Mobiles,count.Non-Fiction,count.Personal Appliances,count.Tools,count.Women,sum.Academic,sum.Audio and video,sum.Bath,sum.Cameras,sum.Children,sum.Comics,sum.Computers,sum.DIY,sum.Fiction,sum.Furnishing,sum.Kids,sum.Kitchen,sum.Mens,sum.Mobiles,sum.Non-Fiction,sum.Personal Appliances,sum.Tools,sum.Women,mean.Academic,mean.Audio and video,mean.Bath,mean.Cameras,mean.Children,mean.Comics,mean.Computers,mean.DIY,mean.Fiction,mean.Furnishing,mean.Kids,mean.Kitchen,mean.Mens,mean.Mobiles,mean.Non-Fiction,mean.Personal Appliances,mean.Tools,mean.Women,Academic.prop,Audio and video.prop,Bath.prop,Cameras.prop,Children.prop,Comics.prop,Computers.prop,DIY.prop,Fiction.prop,Furnishing.prop,Kids.prop,Kitchen.prop,Mens.prop,Mobiles.prop,Non-Fiction.prop,Personal Appliances.prop,Tools.prop,Women.prop,conversion_date.year,conversion_date.month_num,conversion_date.month,conversion_date.year_month,biological_age.actual,biological_age,biological_age.group,repeat_purchaser,returned_item_before
1019,266783,1974-01-05,M,converted,Pittsburgh,PA,Pennsylvania,Northeast,Middle Atlantic,2011-09-23,2013-09-02,5.0,1.0,456.0,233.2,456.0,5.0,-4.0,4.0,6.0,1405.53,3113.89,2014-12-02,1166.0,3.19,03_04,0.0,0.0,1.0,4.0,0.0,0.0,308.3,2805.59,0.0,0.0,308.3,701.4,0.0,0.0,0.1,0.9,0.0,1.0,2.0,0.0,2.0,0.0,0.0,1845.35,1268.54,0.0,0.0,0.0,0.0,1845.35,634.27,0.0,0.0,0.0,0.0,0.59,0.41,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1268.54,0.0,1845.35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,317.13,0.0,1845.35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.41,0.0,0.59,0.0,0.0,0.0,2011.0,9.0,9.0_Sep,2011.0_9.0,40.91,41.0,40_45,yes,yes


In [119]:
df_master[df_master['account_age.years'] == df_master['account_age.years'].min()]

Unnamed: 0,customer_id,dob,gender,customer_status,city,state_code,state,region,division,conversion_date,last_purchase_date,number_of_unique_purchase_days,duration.min,duration.max,duration.mean,duration.last,trans_id.count,qty.min,qty.max,qty.sum,cogs.sum,total_amt.sum,assessment_date,account_age.days,account_age.years,account_age.years.group,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,mean.Flagship store,mean.MBR,mean.TeleShop,mean.e-Shop,Flagship_store_spend.prop,MBR_spend.prop,TeleShop_spend.prop,e-Shop.prop,count.Bags,count.Books,count.Clothing,count.Electronics,count.Footwear,count.Home and kitchen,sum.Bags,sum.Books,sum.Clothing,sum.Electronics,sum.Footwear,sum.Home and kitchen,mean.Bags,mean.Books,mean.Clothing,mean.Electronics,mean.Footwear,mean.Home and kitchen,Bags.prop,Books.prop,Clothing.prop,Electronics.prop,Footwear.prop,Home and kitchen.prop,count.Academic,count.Audio and video,count.Bath,count.Cameras,count.Children,count.Comics,count.Computers,count.DIY,count.Fiction,count.Furnishing,count.Kids,count.Kitchen,count.Mens,count.Mobiles,count.Non-Fiction,count.Personal Appliances,count.Tools,count.Women,sum.Academic,sum.Audio and video,sum.Bath,sum.Cameras,sum.Children,sum.Comics,sum.Computers,sum.DIY,sum.Fiction,sum.Furnishing,sum.Kids,sum.Kitchen,sum.Mens,sum.Mobiles,sum.Non-Fiction,sum.Personal Appliances,sum.Tools,sum.Women,mean.Academic,mean.Audio and video,mean.Bath,mean.Cameras,mean.Children,mean.Comics,mean.Computers,mean.DIY,mean.Fiction,mean.Furnishing,mean.Kids,mean.Kitchen,mean.Mens,mean.Mobiles,mean.Non-Fiction,mean.Personal Appliances,mean.Tools,mean.Women,Academic.prop,Audio and video.prop,Bath.prop,Cameras.prop,Children.prop,Comics.prop,Computers.prop,DIY.prop,Fiction.prop,Furnishing.prop,Kids.prop,Kitchen.prop,Mens.prop,Mobiles.prop,Non-Fiction.prop,Personal Appliances.prop,Tools.prop,Women.prop,conversion_date.year,conversion_date.month_num,conversion_date.month,conversion_date.year_month,biological_age.actual,biological_age,biological_age.group,repeat_purchaser,returned_item_before
3951,274213,1986-03-27,M,converted,Houston,TX,Texas,South,West South Central,2014-12-01,2014-12-01,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,279.72,2943.72,2014-12-02,1.0,0.0,00_01,0.0,1.0,0.0,0.0,0.0,2943.72,0.0,0.0,0.0,2943.72,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2943.72,0.0,0.0,0.0,0.0,0.0,2943.72,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2943.72,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2943.72,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2014.0,12.0,12.0_Dec,2014.0_12.0,28.69,29.0,25_30,no,no


In [120]:
df_master['1_day_amt.avg'] = (df_master['total_amt.sum']/df_master['account_age.days']) * 1
df_master['7_day_amt.avg'] = (df_master['total_amt.sum']/df_master['account_age.days']) * 7
df_master['30_day_amt.avg'] = (df_master['total_amt.sum']/df_master['account_age.days']) * 30

In [121]:
df_master['1_day_num.avg'] = (df_master['qty.sum']/df_master['account_age.days']) * 1
df_master['7_day_num.avg'] = (df_master['qty.sum']/df_master['account_age.days']) * 7
df_master['30_day_num.avg'] = (df_master['qty.sum']/df_master['account_age.days']) * 30

In [122]:
df_master[df_master['repeat_purchaser'] == 'yes'].head()

Unnamed: 0,customer_id,dob,gender,customer_status,city,state_code,state,region,division,conversion_date,last_purchase_date,number_of_unique_purchase_days,duration.min,duration.max,duration.mean,duration.last,trans_id.count,qty.min,qty.max,qty.sum,cogs.sum,total_amt.sum,assessment_date,account_age.days,account_age.years,account_age.years.group,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,mean.Flagship store,mean.MBR,mean.TeleShop,mean.e-Shop,Flagship_store_spend.prop,MBR_spend.prop,TeleShop_spend.prop,e-Shop.prop,count.Bags,count.Books,count.Clothing,count.Electronics,count.Footwear,count.Home and kitchen,sum.Bags,sum.Books,sum.Clothing,sum.Electronics,sum.Footwear,sum.Home and kitchen,mean.Bags,mean.Books,mean.Clothing,mean.Electronics,mean.Footwear,mean.Home and kitchen,Bags.prop,Books.prop,Clothing.prop,Electronics.prop,Footwear.prop,Home and kitchen.prop,count.Academic,count.Audio and video,count.Bath,count.Cameras,count.Children,count.Comics,count.Computers,count.DIY,count.Fiction,...,count.Mens,count.Mobiles,count.Non-Fiction,count.Personal Appliances,count.Tools,count.Women,sum.Academic,sum.Audio and video,sum.Bath,sum.Cameras,sum.Children,sum.Comics,sum.Computers,sum.DIY,sum.Fiction,sum.Furnishing,sum.Kids,sum.Kitchen,sum.Mens,sum.Mobiles,sum.Non-Fiction,sum.Personal Appliances,sum.Tools,sum.Women,mean.Academic,mean.Audio and video,mean.Bath,mean.Cameras,mean.Children,mean.Comics,mean.Computers,mean.DIY,mean.Fiction,mean.Furnishing,mean.Kids,mean.Kitchen,mean.Mens,mean.Mobiles,mean.Non-Fiction,mean.Personal Appliances,mean.Tools,mean.Women,Academic.prop,Audio and video.prop,Bath.prop,Cameras.prop,Children.prop,Comics.prop,Computers.prop,DIY.prop,Fiction.prop,Furnishing.prop,Kids.prop,Kitchen.prop,Mens.prop,Mobiles.prop,Non-Fiction.prop,Personal Appliances.prop,Tools.prop,Women.prop,conversion_date.year,conversion_date.month_num,conversion_date.month,conversion_date.year_month,biological_age.actual,biological_age,biological_age.group,repeat_purchaser,returned_item_before,1_day_amt.avg,7_day_amt.avg,30_day_amt.avg,1_day_num.avg,7_day_num.avg,30_day_num.avg
0,268408,1970-02-01,M,converted,Pittsburgh,PA,Pennsylvania,Northeast,Middle Atlantic,2011-12-07,2014-01-13,11.0,4.0,323.0,99.18,323.0,11.0,-5.0,5.0,33.0,2526.93,24272.43,2014-12-02,1091.0,2.99,02_03,5.0,1.0,3.0,2.0,6217.84,6491.88,3894.02,7668.7,1243.57,6491.88,1298.01,3834.35,0.26,0.27,0.16,0.32,3.0,1.0,1.0,2.0,2.0,2.0,1064.11,1033.17,890.63,7668.7,7526.15,6089.66,354.7,1033.17,890.63,3834.35,3763.08,3044.83,0.04,0.04,0.04,0.32,0.31,0.25,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,3.0,0.0,0.0,0.0,0.0,2.0,0.0,2873.0,0.0,4795.7,0.0,0.0,0.0,0.0,1033.17,0.0,1034.28,6089.66,6491.88,0.0,0.0,0.0,0.0,1954.74,0.0,2873.0,0.0,4795.7,0.0,0.0,0.0,0.0,1033.17,0.0,1034.28,3044.83,2163.96,0.0,0.0,0.0,0.0,977.37,0.0,0.12,0.0,0.2,0.0,0.0,0.0,0.0,0.04,0.0,0.04,0.25,0.27,0.0,0.0,0.0,0.0,0.08,2011.0,12.0,12.0_Dec,2011.0_12.0,44.83,45.0,40_45,yes,yes,22.25,155.74,667.44,0.03,0.21,0.91
1,269696,1970-07-01,F,converted,Dallas,TX,Texas,South,West South Central,2011-09-18,2012-08-04,3.0,111.0,850.0,390.33,850.0,3.0,-4.0,4.0,3.0,1043.91,4488.51,2014-12-02,1171.0,3.21,03_04,0.0,3.0,0.0,0.0,0.0,4488.51,0.0,0.0,0.0,1496.17,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,4488.51,0.0,0.0,0.0,0.0,0.0,1496.17,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4488.51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4488.51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2011.0,9.0,9.0_Sep,2011.0_9.0,44.42,44.0,40_45,yes,yes,3.83,26.83,114.99,0.0,0.02,0.08
2,268159,1970-08-01,F,converted,Dallas,TX,Texas,South,West South Central,2012-06-02,2013-03-31,6.0,21.0,611.0,152.17,611.0,6.0,1.0,5.0,19.0,1699.85,17888.84,2014-12-02,913.0,2.5,02_03,0.0,1.0,1.0,4.0,0.0,1182.35,7458.75,9247.74,0.0,1182.35,7458.75,2311.94,0.0,0.07,0.42,0.52,1.0,1.0,2.0,0.0,0.0,2.0,779.02,1182.35,8141.64,0.0,0.0,7785.83,779.02,1182.35,4070.82,0.0,0.0,3892.91,0.04,0.07,0.46,0.0,0.0,0.44,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,327.08,0.0,1182.35,0.0,0.0,0.0,0.0,0.0,8141.64,7458.75,779.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,327.08,0.0,1182.35,0.0,0.0,0.0,0.0,0.0,4070.82,7458.75,779.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.02,0.0,0.07,0.0,0.0,0.0,0.0,0.0,0.46,0.42,0.04,0.0,0.0,0.0,0.0,0.0,2012.0,6.0,6.0_Jun,2012.0_6.0,44.34,44.0,40_45,yes,no,19.59,137.15,587.8,0.02,0.15,0.62
3,270181,1970-10-01,F,converted,Miami,FL,Florida,South,South Atlantic,2011-03-18,2014-09-01,10.0,2.0,306.0,135.5,92.0,10.0,-4.0,5.0,16.0,2440.31,14869.99,2014-12-02,1355.0,3.71,03_04,3.0,1.0,1.0,5.0,8428.94,408.85,617.7,5414.5,2809.65,408.85,617.7,1082.9,0.57,0.03,0.04,0.36,0.0,4.0,1.0,2.0,1.0,2.0,0.0,1026.55,5348.2,6532.76,1962.48,0.0,0.0,256.64,5348.2,3266.38,1962.48,0.0,0.0,0.07,0.36,0.44,0.13,0.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,2.0,0.0,0.0,408.85,0.0,0.0,0.0,617.7,0.0,0.0,0.0,0.0,0.0,5348.2,0.0,1962.48,0.0,0.0,6532.76,0.0,0.0,408.85,0.0,0.0,0.0,205.9,0.0,0.0,0.0,0.0,0.0,5348.2,0.0,1962.48,0.0,0.0,3266.38,0.0,0.0,0.03,0.0,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.36,0.0,0.13,0.0,0.0,0.44,0.0,0.0,2011.0,3.0,3.0_Mar,2011.0_3.0,44.17,44.0,40_45,yes,yes,10.97,76.82,329.22,0.01,0.08,0.35
4,268073,1970-11-01,M,converted,Arlington,TX,Texas,South,West South Central,2011-11-19,2013-12-30,3.0,1.0,771.0,369.67,337.0,3.0,-1.0,5.0,5.0,896.28,7956.0,2014-12-02,1109.0,3.04,03_04,0.0,3.0,0.0,0.0,0.0,7956.0,0.0,0.0,0.0,2652.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,7956.0,0.0,0.0,0.0,0.0,0.0,7956.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7956.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7956.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2011.0,11.0,11.0_Nov,2011.0_11.0,44.09,44.0,40_45,yes,yes,7.17,50.22,215.22,0.0,0.03,0.14


In [123]:
df_master['repeat_purchaser'].value_counts(dropna=False)

yes                5041
no                  465
never purchased     141
Name: repeat_purchaser, dtype: int64

In [124]:
df_master['qty.sum'].sum()

56074.0

In [125]:
df_master['total_amt.sum'].sum()

48579771.37

In [126]:
df_master['total_amt.sum'].sum() - df_master['cogs.sum'].sum() 

42847246.599999994

## Export Data

In [127]:
df_transactions.to_csv(filepaths.interim_transactions_data, index=False)
# df_trans_overall.to_csv(filepaths.interim_transactions_data_aggregated, index=False)
df_master.to_csv(filepaths.master_file_data, index=False)