## Import Modules

In [1]:
# Set paths
import os
from imp import reload

# Data manipulation
import pandas as pd
import numpy as np
from scipy import stats
import datetime as dt

# Custom package for data preprocessing
import mytools as mt 

# Set notebook options
pd.set_option("display.max_columns", 150)
pd.set_option("display.max_rows", 10000)
pd.set_option("display.float_format", lambda x: '%.2f' % x)

# Pretty display of multiple functions in a cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### File Location

In [2]:
# Data paths
import filepaths

### Create Project Directories and Sub-Directories

In [3]:
mt.create_directories(filepaths.project_parent_directory)

Directory  deliverables  already exists
Directory  deprecated  already exists
Directory  dictionary  already exists
Directory  visualizations  already exists
Subdirectory  data/raw  already exists
Subdirectory  data/interim  already exists
Subdirectory  data/external  already exists
Subdirectory  data/processed  already exists


## Load Data

In [4]:
df_customer = pd.read_csv(filepaths.raw_customer_data, sep=',')

df_transactions = pd.read_csv(filepaths.raw_transactions_data, sep=',')

df_products = pd.read_csv(filepaths.raw_products_data, sep=',')

## Standardize Feature Names

## Data Audit + Cleaning
### Customer Data

In [5]:
df_customer.columns = [c.lower() for c in df_customer.columns]
df_customer.shape
mt.check_unique_no(df_customer, ['customer_id'])
mt.missing_data_table(df_customer)
df_customer.head()

(5647, 4)

Data has 5647 unique customer_id
Missing data distribution:

  Variable  Count  Proportion
    gender      2        0.00
 city_code      2        0.00


Unnamed: 0,customer_id,dob,gender,city_code
0,268408,02-01-1970,M,4.0
1,269696,07-01-1970,F,8.0
2,268159,08-01-1970,F,8.0
3,270181,10-01-1970,F,2.0
4,268073,11-01-1970,M,1.0


### Transactions Data

In [6]:
df_transactions.rename(columns={'transaction_id':'trans_id', 'cust_id':'customer_id', 'tran_date':'trans_date'}, inplace=True)
df_transactions.columns = [c.lower() for c in df_transactions.columns]
df_transactions.shape
mt.check_unique_no(df_transactions, ['customer_id', 'trans_id'])
mt.missing_data_table(df_transactions)
df_transactions.head()
df_transactions.describe()

(23053, 10)

Data has 5506 unique customer_id
Data has 20878 unique trans_id
There is no missing data


Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,tax,total_amt,store_type
0,80712190438,270351,28-02-2014,1,1,-5,-772,405.3,-4265.3,e-Shop
1,29258453508,270384,27-02-2014,5,3,-5,-1497,785.92,-8270.92,e-Shop
2,51750724947,273420,24-02-2014,6,5,-2,-791,166.11,-1748.11,TeleShop
3,93274880719,271509,24-02-2014,11,6,-3,-1363,429.35,-4518.35,e-Shop
4,51750724947,273420,23-02-2014,6,5,-2,-791,166.11,-1748.11,TeleShop


Unnamed: 0,trans_id,customer_id,prod_subcat_code,prod_cat_code,qty,rate,tax,total_amt
count,23053.0,23053.0,23053.0,23053.0,23053.0,23053.0,23053.0,23053.0
mean,50073480358.45,271021.75,6.15,3.76,2.43,636.37,248.67,2107.31
std,28981936062.0,2431.69,3.73,1.68,2.27,622.36,187.18,2507.56
min,3268991.0,266783.0,1.0,1.0,-5.0,-1499.0,7.35,-8270.92
25%,24938639453.0,268935.0,3.0,2.0,1.0,312.0,98.28,762.45
50%,50093131361.0,270980.0,5.0,4.0,3.0,710.0,199.08,1754.74
75%,75329995679.0,273114.0,10.0,5.0,4.0,1109.0,365.71,3569.15
max,99987549630.0,275265.0,12.0,6.0,5.0,1500.0,787.5,8287.5


In [7]:
# df_transactions['trans_id'].value_counts(dropna=False).head()

In [8]:
# df_transactions[df_transactions['trans_id'] == 426787191]

In [9]:
# df_transactions[df_transactions['trans_id'] == 4170892941]

In [10]:
# df_transactions[df_transactions['trans_id'] == 25890929042]

In [11]:
returned_item = df_transactions['trans_id'].value_counts(dropna=False)[df_transactions['trans_id'].value_counts(dropna=False).values > 1].index.unique();
len(returned_item)

2057

In [12]:
df_transactions['returned_item'] = np.where(df_transactions['trans_id'].isin(returned_item), 'yes', 'no')

In [13]:
df_transactions['drop_record'] = np.where((df_transactions['qty'] < 0), 'yes', 'no')

In [14]:
df_transactions.head(2)

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,tax,total_amt,store_type,returned_item,drop_record
0,80712190438,270351,28-02-2014,1,1,-5,-772,405.3,-4265.3,e-Shop,yes,yes
1,29258453508,270384,27-02-2014,5,3,-5,-1497,785.92,-8270.92,e-Shop,yes,yes


In [15]:
df_transactions = df_transactions[~(df_transactions['drop_record'] == 'yes')]
df_transactions.shape
df_transactions['customer_id'].nunique()

(20876, 12)

5506

In [16]:
df_transactions[df_transactions['qty'] > 0]['store_type'].value_counts(dropna=False, normalize=True)

e-Shop           0.40
MBR              0.20
Flagship store   0.20
TeleShop         0.20
Name: store_type, dtype: float64

In [17]:
df_products.rename(columns={'prod_sub_cat_code':'prod_subcat_code'}, inplace=True)
df_products['counter'] = 1
mt.missing_data_table(df_products)
# df_products.head()

There is no missing data


In [18]:
prod_cat_dict = dict(zip(df_products['prod_cat_code'],df_products['prod_cat']))
prod_subcat_dict = dict(zip(df_products['prod_subcat_code'],df_products['prod_subcat']))

In [19]:
# df_products.groupby(['prod_cat'])['prod_subcat'].value_counts(dropna=False)

In [20]:
# df_products['prod_cat'].value_counts(dropna=False)

In [21]:
# df_products['prod_subcat'].value_counts(dropna=False)

### Flatten Transactions Data

In [22]:
converted_customers = df_transactions['customer_id'].unique().tolist()

In [23]:
df_transactions['trans_date'] = pd.to_datetime(df_transactions['trans_date'])
df_transactions['trans_date'].sort_values().min()
df_transactions['trans_date'].sort_values().max()

Timestamp('2011-01-02 00:00:00')

Timestamp('2014-12-02 00:00:00')

In [24]:
df_transactions = pd.merge(df_transactions, df_products, on=['prod_cat_code', 'prod_subcat_code'], how='left')
df_transactions.head()

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,tax,total_amt,store_type,returned_item,drop_record,prod_cat,prod_subcat,counter
0,29258453508,270384,2014-02-20,5,3,5,1497,785.92,8270.92,e-Shop,yes,no,Electronics,Computers,1
1,25455265351,267750,2014-02-20,12,6,3,1360,428.4,4508.4,e-Shop,no,no,Home and kitchen,Tools,1
2,1571002198,275023,2014-02-20,6,5,4,587,246.54,2594.54,e-Shop,no,no,Books,DIY,1
3,36554696014,269345,2014-02-20,3,5,3,1253,394.69,4153.69,e-Shop,no,no,Books,Comics,1
4,56814940239,268799,2014-02-20,7,5,5,368,193.2,2033.2,e-Shop,no,no,Books,Fiction,1


In [25]:
df_transactions[df_transactions['returned_item'] == 'yes']['customer_id'].nunique()

1726

In [26]:
# df_transactions['store_type'].value_counts(dropna=False)

In [27]:
df_transactions = df_transactions.sort_values(['customer_id','trans_date'])

In [28]:
df_transactions['assessment_date'] = df_transactions['trans_date'].sort_values().max()

In [29]:
# df_transactions['duration'] = df_transactions.groupby(['customer_id'])['trans_date'].transform(pd.Series.diff).shift(-1)
# df_transactions['duration'] = df_transactions.apply(lambda r: r['assessment_date'] - r['trans_date'] if pd.isnull(r['duration']) else r['duration'], axis=1)
# df_transactions['duration'] = (df_transactions['duration']/np.timedelta64(1, 'D'))

In [30]:
# df_transactions.head(7)

In [31]:
df_transactions['product_category'] = df_transactions['prod_cat'] + "-" + df_transactions['prod_subcat']

In [32]:
df_transactions['product_category'].value_counts(dropna=False)

Home and kitchen-Tools             971
Electronics-Mobiles                947
Footwear-Women                     943
Books-Fiction                      940
Books-Children                     934
Home and kitchen-Kitchen           932
Books-Comics                       928
Home and kitchen-Bath              923
Books-Non-Fiction                  916
Footwear-Kids                      916
Home and kitchen-Furnishing        906
Books-DIY                          905
Bags-Mens                          903
Clothing-Women                     901
Clothing-Kids                      901
Electronics-Cameras                898
Electronics-Personal Appliances    887
Bags-Women                         886
Electronics-Audio and video        877
Electronics-Computers              876
Clothing-Mens                      874
Books-Academic                     863
Footwear-Mens                      849
Name: product_category, dtype: int64

In [33]:
# df_transactions.head()

In [34]:
product_features = df_transactions['product_category'].unique().tolist()

for v in product_features:
    df_transactions['product_qty.'+ v]= np.where(df_transactions['product_category'].str.contains(v,case=False),df_transactions['qty'],0) 

In [35]:
product_features = df_transactions['product_category'].unique().tolist()

for v in product_features:
    df_transactions['product_spend.'+ v]= np.where(df_transactions['product_category'].str.contains(v,case=False),df_transactions['total_amt'],0) 

In [36]:
store_features = df_transactions['store_type'].unique().tolist()

for v in store_features:
    df_transactions['store_qty.'+ v]= np.where(df_transactions['store_type'].str.contains(v,case=False),df_transactions['qty'],0) 

In [37]:
store_features = df_transactions['store_type'].unique().tolist()

for v in store_features:
    df_transactions['store_spend.'+ v]= np.where(df_transactions['store_type'].str.contains(v,case=False),df_transactions['total_amt'],0) 

In [38]:
# df_transactions.head()
# df_transactions.shape

In [39]:
trans_cols = list(df_transactions.columns)

In [40]:
matching = [s for s in trans_cols if 'product_' in s]

In [41]:
df_transactions_products_per_customer = df_transactions[matching]
df_transactions_products_per_customer.head(2)

Unnamed: 0,product_category,product_qty.Footwear-Mens,product_qty.Clothing-Mens,product_qty.Books-Non-Fiction,product_qty.Electronics-Mobiles,product_qty.Books-Fiction,product_qty.Footwear-Kids,product_qty.Books-Children,product_qty.Home and kitchen-Kitchen,product_qty.Bags-Mens,product_qty.Clothing-Kids,product_qty.Books-Academic,product_qty.Bags-Women,product_qty.Footwear-Women,product_qty.Electronics-Audio and video,product_qty.Electronics-Personal Appliances,product_qty.Electronics-Computers,product_qty.Books-DIY,product_qty.Home and kitchen-Tools,product_qty.Home and kitchen-Furnishing,product_qty.Electronics-Cameras,product_qty.Home and kitchen-Bath,product_qty.Clothing-Women,product_qty.Books-Comics,product_spend.Footwear-Mens,product_spend.Clothing-Mens,product_spend.Books-Non-Fiction,product_spend.Electronics-Mobiles,product_spend.Books-Fiction,product_spend.Footwear-Kids,product_spend.Books-Children,product_spend.Home and kitchen-Kitchen,product_spend.Bags-Mens,product_spend.Clothing-Kids,product_spend.Books-Academic,product_spend.Bags-Women,product_spend.Footwear-Women,product_spend.Electronics-Audio and video,product_spend.Electronics-Personal Appliances,product_spend.Electronics-Computers,product_spend.Books-DIY,product_spend.Home and kitchen-Tools,product_spend.Home and kitchen-Furnishing,product_spend.Electronics-Cameras,product_spend.Home and kitchen-Bath,product_spend.Clothing-Women,product_spend.Books-Comics
16438,Footwear-Mens,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5838.82,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9103,Clothing-Mens,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,308.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
df_transactions_products_per_customer = df_transactions_products_per_customer.join(df_transactions['customer_id']).reset_index(drop=True)

In [43]:
df_transactions_products_per_customer.shape

(20876, 48)

In [44]:
df_transactions_products_per_customer.head()

Unnamed: 0,product_category,product_qty.Footwear-Mens,product_qty.Clothing-Mens,product_qty.Books-Non-Fiction,product_qty.Electronics-Mobiles,product_qty.Books-Fiction,product_qty.Footwear-Kids,product_qty.Books-Children,product_qty.Home and kitchen-Kitchen,product_qty.Bags-Mens,product_qty.Clothing-Kids,product_qty.Books-Academic,product_qty.Bags-Women,product_qty.Footwear-Women,product_qty.Electronics-Audio and video,product_qty.Electronics-Personal Appliances,product_qty.Electronics-Computers,product_qty.Books-DIY,product_qty.Home and kitchen-Tools,product_qty.Home and kitchen-Furnishing,product_qty.Electronics-Cameras,product_qty.Home and kitchen-Bath,product_qty.Clothing-Women,product_qty.Books-Comics,product_spend.Footwear-Mens,product_spend.Clothing-Mens,product_spend.Books-Non-Fiction,product_spend.Electronics-Mobiles,product_spend.Books-Fiction,product_spend.Footwear-Kids,product_spend.Books-Children,product_spend.Home and kitchen-Kitchen,product_spend.Bags-Mens,product_spend.Clothing-Kids,product_spend.Books-Academic,product_spend.Bags-Women,product_spend.Footwear-Women,product_spend.Electronics-Audio and video,product_spend.Electronics-Personal Appliances,product_spend.Electronics-Computers,product_spend.Books-DIY,product_spend.Home and kitchen-Tools,product_spend.Home and kitchen-Furnishing,product_spend.Electronics-Cameras,product_spend.Home and kitchen-Bath,product_spend.Clothing-Women,product_spend.Books-Comics,customer_id
0,Footwear-Mens,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5838.82,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,266783
1,Clothing-Mens,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,308.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,266783
2,Clothing-Mens,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,960.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,266783
3,Books-Non-Fiction,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,1845.35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,266783
4,Electronics-Mobiles,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,442.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,266784


In [45]:
df_transactions_products_per_customer_agg = pd.pivot_table(df_transactions_products_per_customer, index=['customer_id'], values=matching, aggfunc=np.sum)

In [46]:
df_transactions_products_per_customer_agg.head()

Unnamed: 0_level_0,product_qty.Bags-Mens,product_qty.Bags-Women,product_qty.Books-Academic,product_qty.Books-Children,product_qty.Books-Comics,product_qty.Books-DIY,product_qty.Books-Fiction,product_qty.Books-Non-Fiction,product_qty.Clothing-Kids,product_qty.Clothing-Mens,product_qty.Clothing-Women,product_qty.Electronics-Audio and video,product_qty.Electronics-Cameras,product_qty.Electronics-Computers,product_qty.Electronics-Mobiles,product_qty.Electronics-Personal Appliances,product_qty.Footwear-Kids,product_qty.Footwear-Mens,product_qty.Footwear-Women,product_qty.Home and kitchen-Bath,product_qty.Home and kitchen-Furnishing,product_qty.Home and kitchen-Kitchen,product_qty.Home and kitchen-Tools,product_spend.Bags-Mens,product_spend.Bags-Women,product_spend.Books-Academic,product_spend.Books-Children,product_spend.Books-Comics,product_spend.Books-DIY,product_spend.Books-Fiction,product_spend.Books-Non-Fiction,product_spend.Clothing-Kids,product_spend.Clothing-Mens,product_spend.Clothing-Women,product_spend.Electronics-Audio and video,product_spend.Electronics-Cameras,product_spend.Electronics-Computers,product_spend.Electronics-Mobiles,product_spend.Electronics-Personal Appliances,product_spend.Footwear-Kids,product_spend.Footwear-Mens,product_spend.Footwear-Women,product_spend.Home and kitchen-Bath,product_spend.Home and kitchen-Furnishing,product_spend.Home and kitchen-Kitchen,product_spend.Home and kitchen-Tools
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1
266783,0,0,0,0,0,0,0,2,0,4,0,0,0,0,0,0,0,4,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1845.35,0.0,1268.54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5838.82,0.0,0.0,0.0,0.0,0.0
266784,0,0,0,0,0,0,5,3,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,972.4,4279.66,0.0,0.0,0.0,0.0,0.0,0.0,442.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
266785,3,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,7,8,0,0,0,5,0,682.89,0.0,0.0,5066.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10047.76,5816.72,0.0,0.0,0.0,6828.9,0.0
266788,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,4,1,0,0,0,0,0,1485.12,0.0,0.0,0.0,0.0,0.0,1367.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1927.12,1312.74,0.0,0.0,0.0,0.0,0.0
266794,2,3,4,3,0,0,0,0,7,0,0,3,0,0,0,4,0,0,5,0,0,0,0,2948.14,2744.82,3964.74,4415.58,0.0,0.0,0.0,0.0,4099.55,0.0,0.0,854.16,0.0,0.0,0.0,4610.06,0.0,0.0,4480.78,0.0,0.0,0.0,0.0


In [47]:
matching = [s for s in trans_cols if 'store_' in s]

In [48]:
df_transactions_stores_per_customer = df_transactions[matching]
df_transactions_stores_per_customer.head(2)

Unnamed: 0,store_type,store_qty.e-Shop,store_qty.TeleShop,store_qty.Flagship store,store_qty.MBR,store_spend.e-Shop,store_spend.TeleShop,store_spend.Flagship store,store_spend.MBR
16438,e-Shop,4,0,0,0,5838.82,0.0,0.0,0.0
9103,TeleShop,0,3,0,0,0.0,308.3,0.0,0.0


In [49]:
df_transactions_stores_per_customer = df_transactions_stores_per_customer.join(df_transactions['customer_id']).reset_index(drop=True)

In [50]:
df_transactions_stores_per_customer.shape

(20876, 10)

In [51]:
df_transactions_stores_per_customer.head()

Unnamed: 0,store_type,store_qty.e-Shop,store_qty.TeleShop,store_qty.Flagship store,store_qty.MBR,store_spend.e-Shop,store_spend.TeleShop,store_spend.Flagship store,store_spend.MBR,customer_id
0,e-Shop,4,0,0,0,5838.82,0.0,0.0,0.0,266783
1,TeleShop,0,3,0,0,0.0,308.3,0.0,0.0,266783
2,e-Shop,1,0,0,0,960.25,0.0,0.0,0.0,266783
3,e-Shop,2,0,0,0,1845.35,0.0,0.0,0.0,266783
4,Flagship store,0,0,2,0,0.0,0.0,442.0,0.0,266784


In [52]:
df_transactions_stores_per_customer_agg = pd.pivot_table(df_transactions_stores_per_customer, index=['customer_id'], values=matching, aggfunc=np.sum)
df_transactions_stores_per_customer_agg.shape

(5506, 8)

In [53]:
df_transactions_stores_per_customer_agg.head()

Unnamed: 0_level_0,store_qty.Flagship store,store_qty.MBR,store_qty.TeleShop,store_qty.e-Shop,store_spend.Flagship store,store_spend.MBR,store_spend.TeleShop,store_spend.e-Shop
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
266783,0,0,3,7,0.0,0.0,308.3,8644.41
266784,2,0,3,5,442.0,0.0,4279.66,972.4
266785,13,0,13,2,12645.62,0.0,12661.09,3135.99
266788,1,2,0,5,1367.99,1485.12,0.0,3239.86
266794,2,10,4,15,718.25,9275.37,4610.06,13514.15


In [54]:
df_transactions_total_store_visits_per_customer_agg = df_transactions[['customer_id', 'store_type', 'trans_date']].groupby(['customer_id'])['store_type'].value_counts(dropna=False).unstack().fillna(0)
df_transactions_total_store_visits_per_customer_agg.shape
df_transactions_unique_stores_visited_per_customer_agg = df_transactions_total_store_visits_per_customer_agg.copy()

(5506, 4)

In [55]:
isnum = lambda x:int(x)

count_num_of_store_visits = lambda row: isnum(row['Flagship store']) + isnum(row['MBR']) + isnum(row['TeleShop']) + isnum(row['e-Shop']) 

df_transactions_total_store_visits_per_customer_agg['total_visits.count'] = df_transactions_total_store_visits_per_customer_agg.apply(count_num_of_store_visits,axis=1)

In [56]:
df_transactions_total_store_visits_per_customer_agg.reset_index(inplace=True)
df_transactions_total_store_visits_per_customer_agg.head(2)

store_type,customer_id,Flagship store,MBR,TeleShop,e-Shop,total_visits.count
0,266783,0.0,0.0,1.0,3.0,4
1,266784,1.0,0.0,1.0,1.0,3


In [57]:
df_transactions_unique_stores_visited_per_customer_agg[df_transactions_unique_stores_visited_per_customer_agg != 0] = 'yes'

In [58]:
isY = lambda x:int(x=='yes')

unique_num_of_stores_visited = lambda row: isY(row['Flagship store']) + isY(row['MBR']) + isY(row['TeleShop']) + isY(row['e-Shop'])

df_transactions_unique_stores_visited_per_customer_agg['unique_stores_visited.count'] = df_transactions_unique_stores_visited_per_customer_agg.apply(unique_num_of_stores_visited,axis=1)

In [59]:
df_transactions_unique_stores_visited_per_customer_agg.reset_index(inplace=True)

In [60]:
df_transactions_flatten = pd.merge(df_transactions_products_per_customer_agg, df_transactions_stores_per_customer_agg, on='customer_id', how='left')

In [61]:
df_transactions_flatten = pd.merge(df_transactions_flatten, df_transactions_total_store_visits_per_customer_agg[['customer_id', 'total_visits.count']], on='customer_id', how='left')

In [62]:
df_transactions_flatten = pd.merge(df_transactions_flatten, df_transactions_unique_stores_visited_per_customer_agg[['customer_id', 'unique_stores_visited.count']], on='customer_id', how='left')

In [63]:
df_transactions_flatten.head()

Unnamed: 0,customer_id,product_qty.Bags-Mens,product_qty.Bags-Women,product_qty.Books-Academic,product_qty.Books-Children,product_qty.Books-Comics,product_qty.Books-DIY,product_qty.Books-Fiction,product_qty.Books-Non-Fiction,product_qty.Clothing-Kids,product_qty.Clothing-Mens,product_qty.Clothing-Women,product_qty.Electronics-Audio and video,product_qty.Electronics-Cameras,product_qty.Electronics-Computers,product_qty.Electronics-Mobiles,product_qty.Electronics-Personal Appliances,product_qty.Footwear-Kids,product_qty.Footwear-Mens,product_qty.Footwear-Women,product_qty.Home and kitchen-Bath,product_qty.Home and kitchen-Furnishing,product_qty.Home and kitchen-Kitchen,product_qty.Home and kitchen-Tools,product_spend.Bags-Mens,product_spend.Bags-Women,product_spend.Books-Academic,product_spend.Books-Children,product_spend.Books-Comics,product_spend.Books-DIY,product_spend.Books-Fiction,product_spend.Books-Non-Fiction,product_spend.Clothing-Kids,product_spend.Clothing-Mens,product_spend.Clothing-Women,product_spend.Electronics-Audio and video,product_spend.Electronics-Cameras,product_spend.Electronics-Computers,product_spend.Electronics-Mobiles,product_spend.Electronics-Personal Appliances,product_spend.Footwear-Kids,product_spend.Footwear-Mens,product_spend.Footwear-Women,product_spend.Home and kitchen-Bath,product_spend.Home and kitchen-Furnishing,product_spend.Home and kitchen-Kitchen,product_spend.Home and kitchen-Tools,store_qty.Flagship store,store_qty.MBR,store_qty.TeleShop,store_qty.e-Shop,store_spend.Flagship store,store_spend.MBR,store_spend.TeleShop,store_spend.e-Shop,total_visits.count,unique_stores_visited.count
0,266783,0,0,0,0,0,0,0,2,0,4,0,0,0,0,0,0,0,4,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1845.35,0.0,1268.54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5838.82,0.0,0.0,0.0,0.0,0.0,0,0,3,7,0.0,0.0,308.3,8644.41,4,2
1,266784,0,0,0,0,0,0,5,3,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,972.4,4279.66,0.0,0.0,0.0,0.0,0.0,0.0,442.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,0,3,5,442.0,0.0,4279.66,972.4,3,3
2,266785,3,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,7,8,0,0,0,5,0,682.89,0.0,0.0,5066.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10047.76,5816.72,0.0,0.0,0.0,6828.9,0.0,13,0,13,2,12645.62,0.0,12661.09,3135.99,7,3
3,266788,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,4,1,0,0,0,0,0,1485.12,0.0,0.0,0.0,0.0,0.0,1367.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1927.12,1312.74,0.0,0.0,0.0,0.0,0.0,1,2,0,5,1367.99,1485.12,0.0,3239.86,4,3
4,266794,2,3,4,3,0,0,0,0,7,0,0,3,0,0,0,4,0,0,5,0,0,0,0,2948.14,2744.82,3964.74,4415.58,0.0,0.0,0.0,0.0,4099.55,0.0,0.0,854.16,0.0,0.0,0.0,4610.06,0.0,0.0,4480.78,0.0,0.0,0.0,0.0,2,10,4,15,718.25,9275.37,4610.06,13514.15,11,4


In [64]:
df_transactions.head(2)

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,tax,total_amt,store_type,returned_item,drop_record,prod_cat,prod_subcat,counter,assessment_date,product_category,product_qty.Footwear-Mens,product_qty.Clothing-Mens,product_qty.Books-Non-Fiction,product_qty.Electronics-Mobiles,product_qty.Books-Fiction,product_qty.Footwear-Kids,product_qty.Books-Children,product_qty.Home and kitchen-Kitchen,product_qty.Bags-Mens,product_qty.Clothing-Kids,product_qty.Books-Academic,product_qty.Bags-Women,product_qty.Footwear-Women,product_qty.Electronics-Audio and video,product_qty.Electronics-Personal Appliances,product_qty.Electronics-Computers,product_qty.Books-DIY,product_qty.Home and kitchen-Tools,product_qty.Home and kitchen-Furnishing,product_qty.Electronics-Cameras,product_qty.Home and kitchen-Bath,product_qty.Clothing-Women,product_qty.Books-Comics,product_spend.Footwear-Mens,product_spend.Clothing-Mens,product_spend.Books-Non-Fiction,product_spend.Electronics-Mobiles,product_spend.Books-Fiction,product_spend.Footwear-Kids,product_spend.Books-Children,product_spend.Home and kitchen-Kitchen,product_spend.Bags-Mens,product_spend.Clothing-Kids,product_spend.Books-Academic,product_spend.Bags-Women,product_spend.Footwear-Women,product_spend.Electronics-Audio and video,product_spend.Electronics-Personal Appliances,product_spend.Electronics-Computers,product_spend.Books-DIY,product_spend.Home and kitchen-Tools,product_spend.Home and kitchen-Furnishing,product_spend.Electronics-Cameras,product_spend.Home and kitchen-Bath,product_spend.Clothing-Women,product_spend.Books-Comics,store_qty.e-Shop,store_qty.TeleShop,store_qty.Flagship store,store_qty.MBR,store_spend.e-Shop,store_spend.TeleShop,store_spend.Flagship store,store_spend.MBR
16438,25890929042,266783,2011-09-23,1,2,4,1321,554.82,5838.82,e-Shop,yes,no,Footwear,Mens,1,2014-12-02,Footwear-Mens,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5838.82,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,0,0,0,5838.82,0.0,0.0,0.0
9103,98477711300,266783,2012-10-21,4,1,3,93,29.3,308.3,TeleShop,no,no,Clothing,Mens,1,2014-12-02,Clothing-Mens,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,308.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,3,0,0,0.0,308.3,0.0,0.0


In [65]:
df_transactions_summary = df_transactions.groupby(['customer_id'], as_index=False).agg({'trans_date':'first',
                                                                                        'qty':'sum',
                                                       'tax':'sum',
                                                       'total_amt':'sum' })
df_transactions_summary.shape

(5506, 5)

In [66]:
# df_trans_overall.columns = [".".join(x).strip('.') for x in df_trans_overall.columns.ravel()] 
df_transactions_summary.rename(columns={'trans_date':'conversion_date', 'qty':'product_qty.total',
                                'total_amt':'product_spend.total', 'tax':'tax.total'}, inplace=True)
mt.check_unique_no(df_transactions_summary, ['customer_id'])
df_transactions_summary.head()

Data has 5506 unique customer_id


Unnamed: 0,customer_id,conversion_date,product_qty.total,tax.total,product_spend.total
0,266783,2011-09-23,10,850.71,8952.71
1,266784,2012-04-12,10,541.07,5694.06
2,266785,2011-03-15,28,2702.7,28442.7
3,266788,2011-09-13,8,578.97,6092.97
4,266794,2011-03-18,31,2671.83,28117.83


In [67]:
df_transactions_flatten = pd.merge(df_transactions_flatten, df_transactions_summary, on='customer_id', how='left')
df_transactions_flatten.shape
df_transactions_flatten.head(2)

(5506, 61)

Unnamed: 0,customer_id,product_qty.Bags-Mens,product_qty.Bags-Women,product_qty.Books-Academic,product_qty.Books-Children,product_qty.Books-Comics,product_qty.Books-DIY,product_qty.Books-Fiction,product_qty.Books-Non-Fiction,product_qty.Clothing-Kids,product_qty.Clothing-Mens,product_qty.Clothing-Women,product_qty.Electronics-Audio and video,product_qty.Electronics-Cameras,product_qty.Electronics-Computers,product_qty.Electronics-Mobiles,product_qty.Electronics-Personal Appliances,product_qty.Footwear-Kids,product_qty.Footwear-Mens,product_qty.Footwear-Women,product_qty.Home and kitchen-Bath,product_qty.Home and kitchen-Furnishing,product_qty.Home and kitchen-Kitchen,product_qty.Home and kitchen-Tools,product_spend.Bags-Mens,product_spend.Bags-Women,product_spend.Books-Academic,product_spend.Books-Children,product_spend.Books-Comics,product_spend.Books-DIY,product_spend.Books-Fiction,product_spend.Books-Non-Fiction,product_spend.Clothing-Kids,product_spend.Clothing-Mens,product_spend.Clothing-Women,product_spend.Electronics-Audio and video,product_spend.Electronics-Cameras,product_spend.Electronics-Computers,product_spend.Electronics-Mobiles,product_spend.Electronics-Personal Appliances,product_spend.Footwear-Kids,product_spend.Footwear-Mens,product_spend.Footwear-Women,product_spend.Home and kitchen-Bath,product_spend.Home and kitchen-Furnishing,product_spend.Home and kitchen-Kitchen,product_spend.Home and kitchen-Tools,store_qty.Flagship store,store_qty.MBR,store_qty.TeleShop,store_qty.e-Shop,store_spend.Flagship store,store_spend.MBR,store_spend.TeleShop,store_spend.e-Shop,total_visits.count,unique_stores_visited.count,conversion_date,product_qty.total,tax.total,product_spend.total
0,266783,0,0,0,0,0,0,0,2,0,4,0,0,0,0,0,0,0,4,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1845.35,0.0,1268.54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5838.82,0.0,0.0,0.0,0.0,0.0,0,0,3,7,0.0,0.0,308.3,8644.41,4,2,2011-09-23,10,850.71,8952.71
1,266784,0,0,0,0,0,0,5,3,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,972.4,4279.66,0.0,0.0,0.0,0.0,0.0,0.0,442.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,0,3,5,442.0,0.0,4279.66,972.4,3,3,2012-04-12,10,541.07,5694.06


In [71]:
ASSESSMENT_DATE = dt.datetime(2014,12,3)

In [76]:
df_transactions_summary.groupby(['customer_id'], as_index=False).agg({'conversion_date': lambda date: (ASSESSMENT_DATE - date)/np.timedelta64(1, 'D'),
                                                                     'conversion_date': lambda date: (ASSESSMENT_DATE - date)/np.timedelta64(1, 'W'),
                                                                     'conversion_date': lambda date: (ASSESSMENT_DATE - date)/np.timedelta64(1, 'Y')})

Unnamed: 0,customer_id,conversion_date
0,266783,3.2
1,266784,2.64
2,266785,3.72
3,266788,3.22
4,266794,3.71
5,266799,2.08
6,266803,2.82
7,266804,1.33
8,266805,0.93
9,266806,2.83


In [None]:
df_transactions.head(2)

In [None]:
df_account_age = df_transactions[['customer_id', 'trans_date']]

In [None]:
df_account_age = pd.merge(df_account_age, df_transactions_summary[['customer_id', 'conversion_date']], on='customer_id', how='left')

In [None]:
df_account_age.head()

In [None]:
df_account_age['assessment_date'] = df_transactions['trans_date'].sort_values().max()

In [None]:
df_trans_overall['customer_age.days'] = (df_trans_overall['assessment_date'] - df_trans_overall['conversion_date'])/np.timedelta64(1,'D')

In [None]:
df_trans_overall['customer_age.years'] = (df_trans_overall['assessment_date'] - df_trans_overall['conversion_date'])/np.timedelta64(1,'Y')

In [None]:
age_bins =  [0, 1, 2, 3, 4]
labels = ['_<01','01_02', '02_03', '03_04']
df_trans_overall['customer_age.years.group'] = pd.cut(df_trans_overall['customer_age.years'], age_bins, labels = labels,include_lowest = True)

In [None]:
df_trans_overall.head()

In [None]:
df_purchase_returns_customers = df_transactions[['customer_id', 'returned_item']].drop_duplicates(subset=['customer_id'])

In [None]:
df_trans_overall = pd.merge(df_trans_overall, df_purchase_returns_customers, on='customer_id', how='left')

### Customer Data

In [None]:
df_customer['customer_status'] = np.where((df_customer['customer_id'].isin(converted_customers)), 'converted', 'voluntarily churned - not converted')
df_customer['customer_status'].value_counts(dropna=False)

In [None]:
df_customer['dob'] = pd.to_datetime(df_customer['dob'])
df_customer['dob'].sort_values().min()
df_customer['dob'].sort_values().max()

In [None]:
df_customer.tail()

In [None]:
df_transactions['trans_date.day'] = df_transactions['trans_date'].dt.day
df_transactions['trans_date.month_num'] = df_transactions['trans_date'].dt.month.map("{:02}".format)
df_transactions['trans_date.year'] = df_transactions['trans_date'].dt.year
df_transactions['trans_date.year_month'] = df_transactions['trans_date'].dt.year.map(str) + '_' + df_transactions['trans_date'].dt.month.map("{:02}".format)
df_transactions['trans_date.hour'] = df_transactions['trans_date'].dt.hour
df_transactions['trans_date.weekday'] = df_transactions['trans_date'].dt.day_name()
df_transactions['trans_date.week_of_year'] = df_transactions['trans_date'].dt.week.map("{:02}".format)

In [None]:
df_transactions['trans_date.weekday'].head()

In [None]:
month_name = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_num = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
month_dict = dict(zip(month_num, month_name))

for k, v in month_dict.items():
    mask = df_transactions['trans_date.month_num'].str.contains(k, case=True)

    df_transactions.loc[mask,'trans_date.month'] = v

In [None]:
df_transactions['trans_date.month'] = df_transactions['trans_date.month_num'] + '_' + df_transactions['trans_date.month'].map(str)

In [None]:
df_transactions['trans_date.month'].value_counts(dropna=False)

In [None]:
weekday_name = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
weekday_num = ['01', '02', '03', '04', '05', '06', '07']
weekday_dict = dict(zip(weekday_name, weekday_num))

for k, v in weekday_dict.items():
    mask = df_transactions['trans_date.weekday'].str.contains(k, case=True)

    df_transactions.loc[mask, 'trans_date.weekday_num'] = v

In [None]:
def bin_week(x):
    if ( x>= 1) and (x <= 7):
        return 'month.week1'

    elif (x >= 8) and (x <= 14):
        return 'month.week2'

    elif (x >= 15) and (x <= 21):
        return 'month.week3'

    else:
        return 'month.week4'

In [None]:
df_transactions['trans_date.week_of_month'] = df_transactions['trans_date'].dt.day.apply(bin_week)

In [None]:
df_transactions['trans_date.weekday'] = df_transactions['trans_date.weekday_num'] + '_' + df_transactions['trans_date.weekday'].map(str)

In [None]:
df_transactions['trans_date.weekday'].value_counts(dropna=False)

In [None]:
# df_transactions['trans_date.hour'].describe()
# df_transactions['trans_date.time_of_day']

In [None]:
df_transactions = df_transactions.sort_values(['customer_id', 'trans_date'])

In [None]:
df_first_purchase_date = df_transactions.groupby(['customer_id']).agg({'trans_date':'first'})

In [None]:
df_first_purchase_date.rename(columns={'trans_date':'first_purchase_date'}, inplace=True)

In [None]:
df_first_purchase_date.shape

In [None]:
df_transactions = pd.merge(df_transactions, df_first_purchase_date, on='customer_id', how='left')

In [None]:
df_transactions['first_purchase.cohort'] = df_transactions['first_purchase_date'].dt.year.map(str) + "_" + df_transactions['first_purchase_date'].dt.month.map("{:02}".format)

In [None]:
df_transactions['purchase_date.cohort'] = df_transactions['trans_date'].dt.year.map(str) + "_" + df_transactions['trans_date'].dt.month.map("{:02}".format)

In [None]:
df_earliest_purchase_date = df_first_purchase_date.copy()
df_earliest_purchase_date.rename(columns={'first_purchase_date':'trans_date'}, inplace=True)

In [None]:
df_earliest_purchase_date['customer_type.period'] = 'new'

In [None]:
df_transactions = pd.merge(df_transactions, df_earliest_purchase_date, on=['customer_id', 'trans_date'], how='left')

In [None]:
df_transactions['customer_type.period']  = np.where(df_transactions['customer_type.period'].isnull(), 'existing', df_transactions['customer_type.period'])

In [None]:
df_transactions['qty_negative'] = np.where((df_transactions['qty'] < 0), 'yes', 'no')

In [None]:
trans_id_returned = df_transactions[df_transactions['qty_negative'] == 'yes']['trans_id'].unique().tolist()

In [None]:
df_transactions['returned'] = np.where((df_transactions['trans_id'].isin(trans_id_returned)), 'yes', 'no')

In [None]:
df_transactions.drop(['qty_negative'], axis=1, inplace=True)

In [None]:
df_transactions.head()

In [None]:
df_transactions.tail()

In [None]:
df_transactions[df_transactions['customer_id'] == 268624]

In [None]:
df_transactions_dates = df_transactions[['customer_id', 'trans_date']]
df_transactions_dates['datetime'] = pd.to_datetime(df_transactions_dates['trans_date'])
df_transactions_dates.head(2)

In [None]:
grouped_df = df_transactions_dates.sort_values('datetime', ascending=False).groupby('customer_id')['trans_date'].apply(list).apply(pd.Series).reset_index()

In [None]:
grouped_df.head()

In [None]:
grouped_df.set_index('customer_id', inplace=True)

In [None]:
grouped_df = grouped_df.rename(columns=lambda x: int(x)+1)

In [None]:
# rename each variable is tags
grouped_df = grouped_df.rename(columns = lambda x : 'trans_date' + str(x))

In [None]:
grouped_df_day = (grouped_df.diff(axis=1) * -1).apply(lambda x: x/np.timedelta64(1, 'D')).fillna(0).astype('int64')
grouped_df_day.head(2)

In [None]:
grouped_df_day[grouped_df_day < 0] = 0

In [None]:
grouped_df_week = (grouped_df.diff(axis=1) * -1).apply(lambda x: x/np.timedelta64(1, 'W')).fillna(0).astype('int64')
grouped_df_week.head(2)

In [None]:
grouped_df_week[grouped_df_week < 0] = 0

In [None]:
grouped_df.head()

In [None]:
grouped_df[grouped_df['trans_date2'] !=0 ]['trans_date2'].describe()

In [None]:
grouped_df[grouped_df['trans_date2'] !=0 ]['trans_date2'].value_counts(dropna=False)

In [None]:
df_transactions['trans_date'].sort_values().max()

In [None]:
df_transactions['trans_date'] = pd.to_datetime(df_transactions['trans_date'])
EXTRACTION_DATE = dt.datetime(2014,12,2)
    
rfm= df_transactions.groupby('customer_id', as_index=False).agg({'trans_date': lambda date: (EXTRACTION_DATE - date.max()).days,
                                    'trans_id': lambda num: len(num),
                                    'total_amt': lambda price: price.sum()})

rfm.rename(columns={'trans_date':'recency', 'trans_id':'frequency', 'total_amt':'monetary'}, inplace=True)

rfm['r_quartile'] = pd.qcut(rfm['recency'], 4, ['1','2','3','4'])
rfm['f_quartile'] = pd.qcut(rfm['frequency'], 4, ['4','3','2','1'])
rfm['m_quartile'] = pd.qcut(rfm['monetary'], 4, ['4','3','2','1'])

rfm['RFM_Score'] = rfm.r_quartile.astype(str)+ rfm.f_quartile.astype(str) + rfm.m_quartile.astype(str)

df_customer_rfm = pd.merge(rfm, df_customer, on='customer_id', how='right')

def categorize_customers(rfm_score):
    x = rfm_score
    if (x == '111'):
        return 'best customers'
    elif (x == '311'):
        return 'almost lost'
    elif (x == '411'):
        return 'lost customers'
    elif (x == '444'):
        return 'lost cheap customers'
    else:
        return 'other' 

df_customer_rfm['customer_segment'] = df_customer_rfm['RFM_Score'].apply(categorize_customers)    

df_customer_rfm['customer_segment'] = np.where(((df_customer_rfm['customer_segment'] == 'other') & 
                                                (df_customer_rfm['m_quartile'] == '1')), 'big spender',
                                           np.where(((df_customer_rfm['customer_segment'] == 'other') & 
                                                     (df_customer_rfm['f_quartile'] == '1')), 'loyal customers',
                                                   df_customer_rfm['customer_segment']))

In [None]:
df_customer_rfm.head()

### Master File

In [None]:
df_master = pd.merge(df_customer_rfm, df_trans_overall,  on=['customer_id'], how='left')
df_master = pd.merge(df_master, df_transactions_products_per_customer_agg, on=['customer_id'], how='left')
# df_master['assessment_date'] = df_transactions['trans_date'].sort_values().max()
mt.check_unique_no(df_master, ['customer_id'])
df_master.head()

In [None]:
df_master['customer_status'].value_counts(dropna=False)

In [None]:
df_master['conversion_date.month_num'] = df_master['conversion_date'].dt.month.map("{:02}".format)

In [None]:
df_master['conversion_date.month_num'].value_counts(dropna=False)

In [None]:
# df_master['conversion_date.month_num'] = df_master['conversion_date.month_num'].astype('O').astype('int64')

In [None]:
month_name = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
month_num = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']
month_dict = dict(zip(month_num, month_name))

for k, v in month_dict.items():
    mask = df_master['conversion_date.month_num'].str.contains(k, case=True)

    df_master.loc[mask,'conversion_date.month'] = v

In [None]:
df_master['conversion_date.month'] = df_master['conversion_date.month_num'] + '_' + df_master['conversion_date.month'].map(str)

In [None]:
df_master['conversion_date.month'].value_counts(dropna=False)

In [None]:
df_master.head(1)

In [None]:
df_master['biological_age.actual'] = (df_master['assessment_date'] - df_master['dob'])/np.timedelta64(1, 'Y')

In [None]:
df_master['biological_age.actual'].describe()

In [None]:
df_master['biological_age'] = np.round(df_master['biological_age.actual'])

In [None]:
df_master['biological_age'].describe()

In [None]:
age_bins =  [21, 25, 30, 35, 40, 45]
labels = ['22_25','25_30', '30_35', '35_40', '40_45']
df_master['biological_age.group'] = pd.cut(df_master['biological_age'], age_bins, labels = labels,include_lowest = True)

In [None]:
df_master['number_of_purchase_days'].describe()

In [None]:
df_master['number_of_purchase_days'] =df_master['number_of_purchase_days'].replace(np.nan, 0)

In [None]:
df_master['repeat_purchaser'] = np.where((df_master['number_of_purchase_days'] > 1), 'yes',
                                      np.where((df_master['number_of_purchase_days']== 0),'never purchased',
                                        'no'))

In [None]:
df_master[df_master['customer_id'] == 266783]

In [None]:
df_master[df_master['customer_age.years'] == df_master['customer_age.years'].min()]

In [None]:
df_master['1_day_amt.avg'] = (df_master['total_amt.sum']/df_master['customer_age.days']) * 1
df_master['7_day_amt.avg'] = (df_master['total_amt.sum']/df_master['customer_age.days']) * 7
df_master['30_day_amt.avg'] = (df_master['total_amt.sum']/df_master['customer_age.days']) * 30

In [None]:
df_master['1_day_num.avg'] = (df_master['qty.sum']/df_master['customer_age.days']) * 1
df_master['7_day_num.avg'] = (df_master['qty.sum']/df_master['customer_age.days']) * 7
df_master['30_day_num.avg'] = (df_master['qty.sum']/df_master['customer_age.days']) * 30

In [None]:
df_master[df_master['repeat_purchaser'] == 'yes'].head()

In [None]:
df_master['repeat_purchaser'].value_counts(dropna=False)

In [None]:
df_master['qty.sum'].sum()

In [None]:
df_master['total_amt.sum'].sum()

In [None]:
df_master['total_amt.sum'].sum() - df_master['tax.sum'].sum() 

### Metrics

In [None]:
purchase_frequency_distribution = df_customer_rfm.groupby(['frequency'], as_index=False).agg({'customer_id':'count'})
purchase_frequency_distribution

In [None]:
purchase_frequency_distribution = df_customer_rfm.groupby(['recency'], as_index=False).agg({'customer_id':'count'})
purchase_frequency_distribution

In [None]:
df_customer_rfm['customer_segment'].value_counts(dropna=False, normalize=True)

In [None]:
df_customer_rfm['gender'].value_counts(dropna=False, normalize=True)

In [None]:
df_customer_rfm['city_code'].value_counts(dropna=False, normalize=True)

### Drop Features

In [None]:
drop_cols = ['drop_record', 'counter']

In [None]:
df_transactions = df_transactions.drop(drop_cols, axis=1)

In [None]:
df_transactions.to_csv(filepaths.interim_transactions_data, index=False)
df_master.to_csv(filepaths.master_file_data, index=False)