## Import Modules

In [1]:
# Set paths
import os
from imp import reload

# Data manipulation
import pandas as pd
import numpy as np
from scipy import stats

# Geolocation
import geonamescache

# Custom package for data preprocessing
import mytools as mt 

# Set notebook options
pd.set_option("display.max_columns", 150)
pd.set_option("display.max_rows", 10000)
pd.set_option("display.float_format", lambda x: '%.2f' % x)

# Pretty display of multiple functions in a cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### File Location

In [2]:
# Data paths
import filepaths

## Load Data

In [3]:
df_customer = pd.read_csv(filepaths.clean_customer_data_v1, sep=',')

df_churn = pd.read_csv(filepaths.processed_churn_data, sep=',')

# df_us_regions = pd.read_csv(filepaths.clean_us_regions_data_v1, sep=',')

# df_churn = pd.read_csv(filepaths.master_file_data,  sep=',')

df_transactions = pd.read_csv(filepaths.clean_transactions_data_v1, sep=',')

# df_products = pd.read_csv(filepaths.raw_products_data, sep=',')

## Drop Customers with at Most 2 Transactions

In [4]:
df_churn.head(2)

Unnamed: 0,customer_id,gender,biological_age.group,state,region,customer_conversion,account_age.years.group,repeat_purchaser,returned_item_before,conversion_date.year_month,conversion_date,last_purchase_date,total_unique_purchase_days,total_unique_transactions,total_items_purchased,total_tax_paid,total_amt_paid,total_unique_trans_not_reversed,min_duration_btwn_purchase_days,max_duration_btwn_purchase_days,avg_duration_btwn_purchase_days,median_duration_btwn_purchase_days,time_since_last_purchase.days,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,sum.Bags_Mens,sum.Bags_Women,sum.Books_Academic,sum.Books_Children,sum.Books_Comics,sum.Books_DIY,sum.Books_Fiction,sum.Books_Non-Fiction,sum.Clothing_Kids,sum.Clothing_Mens,sum.Clothing_Women,sum.Electronics_Audio and video,sum.Electronics_Cameras,sum.Electronics_Computers,sum.Electronics_Mobiles,sum.Electronics_Personal Appliances,sum.Footwear_Kids,sum.Footwear_Mens,sum.Footwear_Women,sum.Home and kitchen_Bath,sum.Home and kitchen_Furnishing,sum.Home and kitchen_Kitchen,sum.Home and kitchen_Tools,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,count.Bags_Mens,count.Bags_Women,count.Books_Academic,count.Books_Children,count.Books_Comics,count.Books_DIY,count.Books_Fiction,count.Books_Non-Fiction,count.Clothing_Kids,count.Clothing_Mens,count.Clothing_Women,count.Electronics_Audio and video,count.Electronics_Cameras,count.Electronics_Computers,count.Electronics_Mobiles,count.Electronics_Personal Appliances,count.Footwear_Kids,count.Footwear_Mens,count.Footwear_Women,count.Home and kitchen_Bath,count.Home and kitchen_Furnishing,count.Home and kitchen_Kitchen,count.Home and kitchen_Tools,number_of_days_late_above_median_duration_factor,churned_customer
0,266783,M,40_45,Pennsylvania,Northeast,converted,03_04,yes,yes,2011.0_9.0,2011-09-23,2013-09-02,4.0,4.0,6.0,295.89,3113.89,3.0,122.0,456.0,291.5,294.0,456.0,0.0,0.0,308.3,2805.59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1845.35,0.0,1268.54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.55,no
1,266784,F,22_25,California,West,converted,02_03,yes,no,2012.0_4.0,2012-04-12,2012-09-09,3.0,3.0,10.0,541.07,5694.06,3.0,17.0,814.0,321.33,133.0,814.0,442.0,0.0,4279.66,972.4,0.0,0.0,0.0,0.0,0.0,0.0,972.4,4279.66,0.0,0.0,0.0,0.0,0.0,0.0,442.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.12,yes


In [5]:
df_churn = df_churn[~(df_churn['churned_customer'] == 'yes')]

In [6]:
df_churn['days_since_last_purchase_greater_than_median_duration_btwn_purchases'] = np.where(df_churn['time_since_last_purchase.days']>df_churn['median_duration_btwn_purchase_days'], 1, 0)

In [7]:
df_churn['days_since_last_purchase_greater_than_median_duration_btwn_purchases'].describe()

count   3908.00
mean       0.73
std        0.44
min        0.00
25%        0.00
50%        1.00
75%        1.00
max        1.00
Name: days_since_last_purchase_greater_than_median_duration_btwn_purchases, dtype: float64

In [8]:
df_churn['days_since_last_purchase_greater_than_median_duration_btwn_purchases'].value_counts(dropna=False)

1    2871
0    1037
Name: days_since_last_purchase_greater_than_median_duration_btwn_purchases, dtype: int64

In [9]:
customers_who_purchased_at_most_twice = list(df_churn[df_churn['total_unique_purchase_days'] <= 2]['customer_id'].unique());
len(customers_who_purchased_at_most_twice)

1006

In [10]:
mt.check_unique_no(df_transactions, ['customer_id'])
df_transactions.shape

Data has 5506 unique customer_id


(20876, 15)

In [11]:
df_transactions = df_transactions[~(df_transactions['customer_id'].isin(customers_who_purchased_at_most_twice))].reset_index(drop=True)

mt.check_unique_no(df_transactions, ['customer_id'])
df_transactions.shape

Data has 4500 unique customer_id


(18904, 15)

## Observation and Performance Window

In [12]:
df_transactions.head(2)

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,tax,total_amt,store_type,prod_cat,prod_subcat,counter,product,returned_item
0,8410316370,266783,2013-02-20,4,1,1,869,91.25,960.25,e-Shop,Clothing,Mens,1,Clothing_Mens,no
1,98477711300,266783,2012-10-21,4,1,3,93,29.3,308.3,TeleShop,Clothing,Mens,1,Clothing_Mens,no


In [13]:
df_transactions['trans_date'] = pd.to_datetime(df_transactions['trans_date'])

df_transactions['trans_date'].sort_values().min()
df_transactions['trans_date'].sort_values().max()

Timestamp('2011-01-02 00:00:00')

Timestamp('2014-12-02 00:00:00')

In [14]:
df_observation_window = df_transactions[(df_transactions['trans_date'] < pd.Timestamp(2014,7,1)) & (df_transactions['trans_date'] >= pd.Timestamp(2011,1,1))].reset_index(drop=True)
df_observation_window['trans_date'].sort_values().min()
df_observation_window['trans_date'].sort_values().max()

mt.check_unique_no(df_observation_window, ['customer_id'])
df_observation_window.shape

Timestamp('2011-01-02 00:00:00')

Timestamp('2014-06-02 00:00:00')

Data has 4500 unique customer_id


(18689, 15)

In [15]:
df_performance_window = df_transactions[(df_transactions['trans_date'] < pd.Timestamp(2014,12,2)) & (df_transactions['trans_date'] >= pd.Timestamp(2014,7,1))].reset_index(drop=True)
df_performance_window['trans_date'].sort_values().min()
df_performance_window['trans_date'].sort_values().max()

mt.check_unique_no(df_performance_window, ['customer_id'])
df_performance_window.shape

Timestamp('2014-07-01 00:00:00')

Timestamp('2014-12-01 00:00:00')

Data has 196 unique customer_id


(199, 15)

In [16]:
df_transactions_customers = pd.DataFrame(df_observation_window['customer_id'].unique())
df_transactions_customers.columns =['customer_id']
df_transactions_customers.head(2)
mt.check_unique_no(df_transactions_customers, ['customer_id'])
df_transactions_customers.shape

Unnamed: 0,customer_id
0,266783
1,266784


Data has 4500 unique customer_id


(4500, 1)

In [17]:
df_1st_payment_in_performance_window = df_performance_window.groupby('customer_id').trans_date.min().reset_index()
df_1st_payment_in_performance_window.columns = ['customer_id', 'min_purchase_date']

In [18]:
df_last_payment_in_observation_window = df_observation_window.groupby('customer_id').trans_date.max().reset_index()
df_last_payment_in_observation_window.columns = ['customer_id', 'max_purchase_date']

In [19]:
df_payment_dates = pd.merge(df_last_payment_in_observation_window, df_1st_payment_in_performance_window, on='customer_id', how='left')

In [20]:
df_payment_dates['next_purchase_day'] = (df_payment_dates['min_purchase_date'] - df_payment_dates['max_purchase_date']).dt.days

df_transactions_customers = pd.merge(df_transactions_customers, df_payment_dates[['customer_id', 'next_purchase_day']], on='customer_id', how='left')

## Feature Engineering

### Observation Window Data

In [21]:
converted_customers = df_transactions['customer_id'].unique().tolist()

In [22]:
df_transactions['trans_date'] = pd.to_datetime(df_transactions['trans_date'])

df_transactions = df_transactions.sort_values(['customer_id','trans_date'])

In [23]:
df_observation_window['assessment_date'] = df_observation_window['trans_date'].sort_values().max()

In [24]:
df_observation_window['duration'] = df_observation_window.groupby(['customer_id'])['trans_date'].transform(pd.Series.diff).shift(-1)
df_observation_window['duration'] = df_observation_window.apply(lambda r: r['assessment_date'] - r['trans_date'] if pd.isnull(r['duration']) else r['duration'], axis=1)
df_observation_window['duration'] = (df_observation_window['duration']/np.timedelta64(1, 'D'))

In [25]:
df_observation_window.head(7)

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,tax,total_amt,store_type,prod_cat,prod_subcat,counter,product,returned_item,assessment_date,duration
0,8410316370,266783,2013-02-20,4,1,1,869,91.25,960.25,e-Shop,Clothing,Mens,1,Clothing_Mens,no,2014-06-02,-122.0
1,98477711300,266783,2012-10-21,4,1,3,93,29.3,308.3,TeleShop,Clothing,Mens,1,Clothing_Mens,no,2014-06-02,-394.0
2,25890929042,266783,2011-09-23,1,2,0,0,0.0,0.0,e-Shop,Footwear,Mens,0,Footwear_Mens,yes,2014-06-02,710.0
3,16999552161,266783,2013-09-02,10,5,2,835,175.35,1845.35,e-Shop,Books,Non-Fiction,1,Books_Non-Fiction,no,2014-06-02,273.0
4,54234600611,266784,2012-08-23,10,5,3,1291,406.67,4279.66,TeleShop,Books,Non-Fiction,1,Books_Non-Fiction,no,2014-06-02,-133.0
5,36310127403,266784,2012-04-12,4,3,2,200,42.0,442.0,Flagship store,Electronics,Mobiles,1,Electronics_Mobiles,no,2014-06-02,150.0
6,26928161256,266784,2012-09-09,7,5,5,176,92.4,972.4,e-Shop,Books,Fiction,1,Books_Fiction,no,2014-06-02,631.0


In [26]:
df_trans_agg = df_observation_window.groupby(['customer_id'], as_index=False).agg({'trans_date':['first', 'last', pd.Series.nunique],
                                                                                  'trans_id':'count',
                                                                                  'qty': 'sum',
                                                                                  'tax':'sum',
                                                                                  'total_amt':'sum', 'counter':'sum',
                                                                                  'duration':['min', 'max','mean', 'last',lambda x: x.median()]})

In [27]:
df_trans_agg.shape
df_trans_agg.columns = [".".join(x).strip('.') for x in df_trans_agg.columns.ravel()] 
df_trans_agg.rename(columns={'trans_date.first':'conversion_date', 'trans_date.last':'last_purchase_date',
                                'trans_date.nunique':'total_unique_purchase_days', 
                                'trans_id.count':'total_unique_transactions', 
                                'qty.sum':'total_items_purchased', 'tax.sum':'total_tax_paid', 'total_amt.sum':'total_amt_paid',
                                'counter.sum':'total_unique_trans_not_reversed', 
                                'duration.mean':'avg_duration_btwn_purchase_days', 'duration.last':'time_since_last_purchase.days',
                                'duration.min':'min_duration_btwn_purchase_days','duration.max':'max_duration_btwn_purchase_days',
                                'duration.<lambda_0>':'median_duration_btwn_purchase_days'}, inplace=True)
mt.check_unique_no(df_trans_agg, ['customer_id'])
df_trans_agg.head()

(4500, 14)

Data has 4500 unique customer_id


Unnamed: 0,customer_id,conversion_date,last_purchase_date,total_unique_purchase_days,total_unique_transactions,total_items_purchased,total_tax_paid,total_amt_paid,total_unique_trans_not_reversed,min_duration_btwn_purchase_days,max_duration_btwn_purchase_days,avg_duration_btwn_purchase_days,time_since_last_purchase.days,median_duration_btwn_purchase_days
0,266783,2013-02-20,2013-09-02,4,4,6,295.89,3113.89,3,-394.0,710.0,116.75,273.0,75.5
1,266784,2012-08-23,2012-09-09,3,3,10,541.07,5694.06,3,-133.0,631.0,216.0,631.0,150.0
2,266785,2012-01-02,2011-10-24,7,7,23,2053.8,21613.8,6,-701.0,952.0,126.0,952.0,36.0
3,266788,2013-12-02,2012-10-31,4,4,8,578.97,6092.97,4,-727.0,579.0,45.5,579.0,165.0
4,266794,2011-10-06,2012-01-30,9,10,28,2591.3,27270.3,9,-576.0,854.0,97.0,854.0,17.5


In [28]:
# df_trans_agg[df_trans_agg['customer_id'] == 266852]

In [29]:
# df_transactions[df_transactions['customer_id'] == 266783]

In [30]:
# df_transactions[df_transactions['customer_id'] == 266794]

In [31]:
df_trans_agg['assessment_date'] = df_transactions['trans_date'].sort_values().max()

In [32]:
df_trans_agg['account_age.days'] = (df_trans_agg['assessment_date'] - df_trans_agg['conversion_date'])/np.timedelta64(1,'D')

In [33]:
df_trans_agg['account_age.years'] = (df_trans_agg['assessment_date'] - df_trans_agg['conversion_date'])/np.timedelta64(1,'Y')

In [34]:
age_bins =  [0, 1, 2, 3, 4]
labels = ['00_01','01_02', '02_03', '03_04']
df_trans_agg['account_age.years.group'] = pd.cut(df_trans_agg['account_age.years'], age_bins, labels = labels,include_lowest = True)

In [35]:
def customer_profile(_df, unique_id, value, featurelist):
    df_list = []
    for feature in featurelist:
        _df[feature] = _df[feature].astype(str)
        _df_temp = pd.crosstab(_df[unique_id], _df[feature], values=_df[value], 
                               aggfunc=['count','sum'], dropna=False).fillna(0).reset_index()
        _df_temp.columns = [".".join(x).strip('.') for x in _df_temp.columns.ravel()] 
        df_list.append(_df_temp)
        dfs = [df.set_index(unique_id) for df in df_list]
        df = pd.concat(dfs, axis=1)
        df.reset_index(inplace=True)
    return df

In [36]:
df_product_cat = customer_profile(df_transactions, 'customer_id', 'total_amt', ['product'])
df_product_cat.head()

Unnamed: 0,customer_id,count.Bags_Mens,count.Bags_Women,count.Books_Academic,count.Books_Children,count.Books_Comics,count.Books_DIY,count.Books_Fiction,count.Books_Non-Fiction,count.Clothing_Kids,count.Clothing_Mens,count.Clothing_Women,count.Electronics_Audio and video,count.Electronics_Cameras,count.Electronics_Computers,count.Electronics_Mobiles,count.Electronics_Personal Appliances,count.Footwear_Kids,count.Footwear_Mens,count.Footwear_Women,count.Home and kitchen_Bath,count.Home and kitchen_Furnishing,count.Home and kitchen_Kitchen,count.Home and kitchen_Tools,sum.Bags_Mens,sum.Bags_Women,sum.Books_Academic,sum.Books_Children,sum.Books_Comics,sum.Books_DIY,sum.Books_Fiction,sum.Books_Non-Fiction,sum.Clothing_Kids,sum.Clothing_Mens,sum.Clothing_Women,sum.Electronics_Audio and video,sum.Electronics_Cameras,sum.Electronics_Computers,sum.Electronics_Mobiles,sum.Electronics_Personal Appliances,sum.Footwear_Kids,sum.Footwear_Mens,sum.Footwear_Women,sum.Home and kitchen_Bath,sum.Home and kitchen_Furnishing,sum.Home and kitchen_Kitchen,sum.Home and kitchen_Tools
0,266783,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1845.35,0.0,1268.54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,266784,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,972.4,4279.66,0.0,0.0,0.0,0.0,0.0,0.0,442.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,266785,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,1.0,0.0,682.89,0.0,0.0,5066.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10047.76,5816.72,0.0,0.0,0.0,0.0,0.0
3,266788,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1485.12,0.0,0.0,0.0,0.0,0.0,1367.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1927.12,1312.74,0.0,0.0,0.0,0.0,0.0
4,266794,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2948.14,2744.82,3964.74,4415.58,0.0,0.0,0.0,0.0,4099.55,0.0,0.0,718.25,0.0,0.0,0.0,4610.06,0.0,0.0,4480.78,0.0,0.0,0.0,0.0


In [37]:
df_stores = customer_profile(df_transactions, 'customer_id', 'total_amt', ['store_type'])
df_stores.head()

Unnamed: 0,customer_id,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop
0,266783,0.0,0.0,1.0,3.0,0.0,0.0,308.3,2805.59
1,266784,1.0,0.0,1.0,1.0,442.0,0.0,4279.66,972.4
2,266785,3.0,0.0,3.0,1.0,5816.72,0.0,12661.09,3135.99
3,266788,1.0,1.0,0.0,2.0,1367.99,1485.12,0.0,3239.86
4,266794,1.0,3.0,1.0,6.0,718.25,9275.37,4610.06,13378.24


In [38]:
df_trans_agg = pd.merge(df_trans_agg, df_stores, on=['customer_id'], how='left')
df_trans_agg.head()

Unnamed: 0,customer_id,conversion_date,last_purchase_date,total_unique_purchase_days,total_unique_transactions,total_items_purchased,total_tax_paid,total_amt_paid,total_unique_trans_not_reversed,min_duration_btwn_purchase_days,max_duration_btwn_purchase_days,avg_duration_btwn_purchase_days,time_since_last_purchase.days,median_duration_btwn_purchase_days,assessment_date,account_age.days,account_age.years,account_age.years.group,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop
0,266783,2013-02-20,2013-09-02,4,4,6,295.89,3113.89,3,-394.0,710.0,116.75,273.0,75.5,2014-12-02,650.0,1.78,01_02,0.0,0.0,1.0,3.0,0.0,0.0,308.3,2805.59
1,266784,2012-08-23,2012-09-09,3,3,10,541.07,5694.06,3,-133.0,631.0,216.0,631.0,150.0,2014-12-02,831.0,2.28,02_03,1.0,0.0,1.0,1.0,442.0,0.0,4279.66,972.4
2,266785,2012-01-02,2011-10-24,7,7,23,2053.8,21613.8,6,-701.0,952.0,126.0,952.0,36.0,2014-12-02,1065.0,2.92,02_03,3.0,0.0,3.0,1.0,5816.72,0.0,12661.09,3135.99
3,266788,2013-12-02,2012-10-31,4,4,8,578.97,6092.97,4,-727.0,579.0,45.5,579.0,165.0,2014-12-02,365.0,1.0,00_01,1.0,1.0,0.0,2.0,1367.99,1485.12,0.0,3239.86
4,266794,2011-10-06,2012-01-30,9,10,28,2591.3,27270.3,9,-576.0,854.0,97.0,854.0,17.5,2014-12-02,1153.0,3.16,03_04,1.0,3.0,1.0,6.0,718.25,9275.37,4610.06,13378.24


In [39]:
df_trans_agg['Flagship_store_spend.prop'] = df_trans_agg['sum.Flagship store']/df_trans_agg['total_amt_paid']
df_trans_agg['MBR_spend.prop'] = df_trans_agg['sum.MBR']/df_trans_agg['total_amt_paid']
df_trans_agg['TeleShop_spend.prop'] = df_trans_agg['sum.TeleShop']/df_trans_agg['total_amt_paid']
df_trans_agg['e-Shop.prop'] = df_trans_agg['sum.e-Shop']/df_trans_agg['total_amt_paid']

In [40]:
df_trans_agg = pd.merge(df_trans_agg, df_product_cat, on=['customer_id'], how='left')
df_trans_agg.head()

Unnamed: 0,customer_id,conversion_date,last_purchase_date,total_unique_purchase_days,total_unique_transactions,total_items_purchased,total_tax_paid,total_amt_paid,total_unique_trans_not_reversed,min_duration_btwn_purchase_days,max_duration_btwn_purchase_days,avg_duration_btwn_purchase_days,time_since_last_purchase.days,median_duration_btwn_purchase_days,assessment_date,account_age.days,account_age.years,account_age.years.group,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,Flagship_store_spend.prop,MBR_spend.prop,TeleShop_spend.prop,e-Shop.prop,count.Bags_Mens,count.Bags_Women,count.Books_Academic,count.Books_Children,count.Books_Comics,count.Books_DIY,count.Books_Fiction,count.Books_Non-Fiction,count.Clothing_Kids,count.Clothing_Mens,count.Clothing_Women,count.Electronics_Audio and video,count.Electronics_Cameras,count.Electronics_Computers,count.Electronics_Mobiles,count.Electronics_Personal Appliances,count.Footwear_Kids,count.Footwear_Mens,count.Footwear_Women,count.Home and kitchen_Bath,count.Home and kitchen_Furnishing,count.Home and kitchen_Kitchen,count.Home and kitchen_Tools,sum.Bags_Mens,sum.Bags_Women,sum.Books_Academic,sum.Books_Children,sum.Books_Comics,sum.Books_DIY,sum.Books_Fiction,sum.Books_Non-Fiction,sum.Clothing_Kids,sum.Clothing_Mens,sum.Clothing_Women,sum.Electronics_Audio and video,sum.Electronics_Cameras,sum.Electronics_Computers,sum.Electronics_Mobiles,sum.Electronics_Personal Appliances,sum.Footwear_Kids,sum.Footwear_Mens,sum.Footwear_Women,sum.Home and kitchen_Bath,sum.Home and kitchen_Furnishing,sum.Home and kitchen_Kitchen,sum.Home and kitchen_Tools
0,266783,2013-02-20,2013-09-02,4,4,6,295.89,3113.89,3,-394.0,710.0,116.75,273.0,75.5,2014-12-02,650.0,1.78,01_02,0.0,0.0,1.0,3.0,0.0,0.0,308.3,2805.59,0.0,0.0,0.1,0.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1845.35,0.0,1268.54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,266784,2012-08-23,2012-09-09,3,3,10,541.07,5694.06,3,-133.0,631.0,216.0,631.0,150.0,2014-12-02,831.0,2.28,02_03,1.0,0.0,1.0,1.0,442.0,0.0,4279.66,972.4,0.08,0.0,0.75,0.17,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,972.4,4279.66,0.0,0.0,0.0,0.0,0.0,0.0,442.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,266785,2012-01-02,2011-10-24,7,7,23,2053.8,21613.8,6,-701.0,952.0,126.0,952.0,36.0,2014-12-02,1065.0,2.92,02_03,3.0,0.0,3.0,1.0,5816.72,0.0,12661.09,3135.99,0.27,0.0,0.59,0.15,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,1.0,0.0,682.89,0.0,0.0,5066.43,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10047.76,5816.72,0.0,0.0,0.0,0.0,0.0
3,266788,2013-12-02,2012-10-31,4,4,8,578.97,6092.97,4,-727.0,579.0,45.5,579.0,165.0,2014-12-02,365.0,1.0,00_01,1.0,1.0,0.0,2.0,1367.99,1485.12,0.0,3239.86,0.22,0.24,0.0,0.53,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1485.12,0.0,0.0,0.0,0.0,0.0,1367.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1927.12,1312.74,0.0,0.0,0.0,0.0,0.0
4,266794,2011-10-06,2012-01-30,9,10,28,2591.3,27270.3,9,-576.0,854.0,97.0,854.0,17.5,2014-12-02,1153.0,3.16,03_04,1.0,3.0,1.0,6.0,718.25,9275.37,4610.06,13378.24,0.03,0.34,0.17,0.49,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2948.14,2744.82,3964.74,4415.58,0.0,0.0,0.0,0.0,4099.55,0.0,0.0,718.25,0.0,0.0,0.0,4610.06,0.0,0.0,4480.78,0.0,0.0,0.0,0.0


In [41]:
# df_observation_window['trans_date.day'] = df_observation_window['trans_date'].dt.day
# df_observation_window['trans_date.month_num'] = df_observation_window['trans_date'].dt.month.map("{:02}".format)
# df_observation_window['trans_date.year'] = df_observation_window['trans_date'].dt.year
# df_observation_window['trans_date.year_month'] = df_observation_window['trans_date'].dt.year.map(str) + '_' + df_observation_window['trans_date'].dt.month.map("{:02}".format)
# df_observation_window['trans_date.hour'] = df_observation_window['trans_date'].dt.hour
# df_observation_window['trans_date.weekday'] = df_observation_window['trans_date'].dt.day_name()
# df_observation_window['trans_date.week_of_year'] = df_observation_window['trans_date'].dt.week.map("{:02}".format)

In [42]:
# df_observation_window['trans_date.weekday'].head()

In [43]:
# month_name = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
# month_num = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
# month_dict = dict(zip(month_num, month_name))

# for k, v in month_dict.items():
#     mask = df_observation_window['trans_date.month_num'].str.contains(k, case=True)

#     df_observation_window.loc[mask,'trans_date.month'] = v

In [44]:
# df_observation_window['trans_date.month'] = df_observation_window['trans_date.month_num'] + '_' + df_observation_window['trans_date.month'].map(str)

In [45]:
# df_observation_window['trans_date.month'].value_counts(dropna=False)

In [46]:
# weekday_name = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
# weekday_num = ['01', '02', '03', '04', '05', '06', '07']
# weekday_dict = dict(zip(weekday_name, weekday_num))

# for k, v in weekday_dict.items():
#     mask = df_observation_window['trans_date.weekday'].str.contains(k, case=True)

#     df_observation_window.loc[mask, 'trans_date.weekday_num'] = v

In [47]:
# def bin_week(x):
#     if ( x>= 1) and (x <= 7):
#         return 'month.week1'

#     elif (x >= 8) and (x <= 14):
#         return 'month.week2'

#     elif (x >= 15) and (x <= 21):
#         return 'month.week3'

#     else:
#         return 'month.week4'

In [48]:
# df_observation_window['trans_date.week_of_month'] = df_observation_window['trans_date'].dt.day.apply(bin_week)

In [49]:
# df_observation_window.head()

In [50]:
# df_observation_window['trans_date.weekday'] = df_observation_window['trans_date.weekday_num'] + '_' + df_observation_window['trans_date.weekday'].map(str)

In [51]:
# df_observation_window['trans_date.weekday'].value_counts(dropna=False)

In [52]:
# df_observation_window = df_observation_window.sort_values(['customer_id', 'trans_date'])

In [53]:
df_conversion_date = df_observation_window.groupby(['customer_id']).agg({'trans_date':'first'})

In [54]:
df_conversion_date.rename(columns={'trans_date':'conversion_date'}, inplace=True)

In [55]:
df_conversion_date.shape

(4500, 1)

In [56]:
df_observation_window = pd.merge(df_observation_window, df_conversion_date, on='customer_id', how='left')

In [57]:
df_observation_window['conversion_date_cohort'] = df_observation_window['conversion_date'].dt.year.map(str) + "_" + df_observation_window['conversion_date'].dt.month.map("{:02}".format)

In [58]:
df_observation_window['purchase_date_cohort'] = df_observation_window['trans_date'].dt.year.map(str) + "_" + df_observation_window['trans_date'].dt.month.map("{:02}".format)

In [59]:
df_observation_window.head(2)

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,tax,total_amt,store_type,prod_cat,prod_subcat,counter,product,returned_item,assessment_date,duration,conversion_date,conversion_date_cohort,purchase_date_cohort
0,8410316370,266783,2013-02-20,4,1,1,869,91.25,960.25,e-Shop,Clothing,Mens,1,Clothing_Mens,no,2014-06-02,-122.0,2013-02-20,2013_02,2013_02
1,98477711300,266783,2012-10-21,4,1,3,93,29.3,308.3,TeleShop,Clothing,Mens,1,Clothing_Mens,no,2014-06-02,-394.0,2013-02-20,2013_02,2012_10


In [60]:
df_earliest_purchase_date = df_conversion_date.copy()
df_earliest_purchase_date.rename(columns={'conversion_date':'trans_date'}, inplace=True)

In [61]:
# df_earliest_purchase_date['customer_type.period'] = 'new'

In [62]:
df_observation_window = pd.merge(df_observation_window, df_earliest_purchase_date, on=['customer_id', 'trans_date'], how='left')

In [63]:
# df_transactions['customer_type.period']  = np.where(df_transactions['customer_type.period'].isnull(), 'existing', df_transactions['customer_type.period'])

**Customers who have returned at least one item before**

In [64]:
customers_who_returned_items_before = list(df_observation_window[df_observation_window['returned_item'] == 'yes']['customer_id'].unique());
len(customers_who_returned_items_before)

1514

### Master File

In [65]:
df_churn.head(2)

Unnamed: 0,customer_id,gender,biological_age.group,state,region,customer_conversion,account_age.years.group,repeat_purchaser,returned_item_before,conversion_date.year_month,conversion_date,last_purchase_date,total_unique_purchase_days,total_unique_transactions,total_items_purchased,total_tax_paid,total_amt_paid,total_unique_trans_not_reversed,min_duration_btwn_purchase_days,max_duration_btwn_purchase_days,avg_duration_btwn_purchase_days,median_duration_btwn_purchase_days,time_since_last_purchase.days,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,sum.Bags_Mens,sum.Bags_Women,sum.Books_Academic,sum.Books_Children,sum.Books_Comics,sum.Books_DIY,sum.Books_Fiction,sum.Books_Non-Fiction,sum.Clothing_Kids,sum.Clothing_Mens,sum.Clothing_Women,sum.Electronics_Audio and video,sum.Electronics_Cameras,sum.Electronics_Computers,sum.Electronics_Mobiles,sum.Electronics_Personal Appliances,sum.Footwear_Kids,sum.Footwear_Mens,sum.Footwear_Women,sum.Home and kitchen_Bath,sum.Home and kitchen_Furnishing,sum.Home and kitchen_Kitchen,sum.Home and kitchen_Tools,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,count.Bags_Mens,count.Bags_Women,count.Books_Academic,count.Books_Children,count.Books_Comics,count.Books_DIY,count.Books_Fiction,count.Books_Non-Fiction,count.Clothing_Kids,count.Clothing_Mens,count.Clothing_Women,count.Electronics_Audio and video,count.Electronics_Cameras,count.Electronics_Computers,count.Electronics_Mobiles,count.Electronics_Personal Appliances,count.Footwear_Kids,count.Footwear_Mens,count.Footwear_Women,count.Home and kitchen_Bath,count.Home and kitchen_Furnishing,count.Home and kitchen_Kitchen,count.Home and kitchen_Tools,number_of_days_late_above_median_duration_factor,churned_customer,days_since_last_purchase_greater_than_median_duration_btwn_purchases
0,266783,M,40_45,Pennsylvania,Northeast,converted,03_04,yes,yes,2011.0_9.0,2011-09-23,2013-09-02,4.0,4.0,6.0,295.89,3113.89,3.0,122.0,456.0,291.5,294.0,456.0,0.0,0.0,308.3,2805.59,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1845.35,0.0,1268.54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.55,no,1
3,266788,F,40_45,Florida,South,converted,03_04,yes,no,2011.0_9.0,2011-09-13,2013-12-02,4.0,4.0,8.0,578.97,6092.97,4.0,84.0,397.0,294.0,347.5,365.0,1367.99,1485.12,0.0,3239.86,1485.12,0.0,0.0,0.0,0.0,0.0,1367.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1927.12,1312.74,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.05,no,1


In [66]:
print(list(df_churn))

['customer_id', 'gender', 'biological_age.group', 'state', 'region', 'customer_conversion', 'account_age.years.group', 'repeat_purchaser', 'returned_item_before', 'conversion_date.year_month', 'conversion_date', 'last_purchase_date', 'total_unique_purchase_days', 'total_unique_transactions', 'total_items_purchased', 'total_tax_paid', 'total_amt_paid', 'total_unique_trans_not_reversed', 'min_duration_btwn_purchase_days', 'max_duration_btwn_purchase_days', 'avg_duration_btwn_purchase_days', 'median_duration_btwn_purchase_days', 'time_since_last_purchase.days', 'sum.Flagship store', 'sum.MBR', 'sum.TeleShop', 'sum.e-Shop', 'sum.Bags_Mens', 'sum.Bags_Women', 'sum.Books_Academic', 'sum.Books_Children', 'sum.Books_Comics', 'sum.Books_DIY', 'sum.Books_Fiction', 'sum.Books_Non-Fiction', 'sum.Clothing_Kids', 'sum.Clothing_Mens', 'sum.Clothing_Women', 'sum.Electronics_Audio and video', 'sum.Electronics_Cameras', 'sum.Electronics_Computers', 'sum.Electronics_Mobiles', 'sum.Electronics_Personal Ap

In [67]:
df_churn_observation_window = df_churn[['customer_id', 'gender', 'biological_age.group', 'state', 'region', 'customer_conversion',  'repeat_purchaser', 'returned_item_before', 'conversion_date.year_month']]

In [68]:
df_churn_observation_window.isnull().sum()

customer_id                   0
gender                        0
biological_age.group          0
state                         0
region                        0
customer_conversion           0
repeat_purchaser              0
returned_item_before          0
conversion_date.year_month    0
dtype: int64

In [69]:
df_trans_agg.head(2)

Unnamed: 0,customer_id,conversion_date,last_purchase_date,total_unique_purchase_days,total_unique_transactions,total_items_purchased,total_tax_paid,total_amt_paid,total_unique_trans_not_reversed,min_duration_btwn_purchase_days,max_duration_btwn_purchase_days,avg_duration_btwn_purchase_days,time_since_last_purchase.days,median_duration_btwn_purchase_days,assessment_date,account_age.days,account_age.years,account_age.years.group,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,Flagship_store_spend.prop,MBR_spend.prop,TeleShop_spend.prop,e-Shop.prop,count.Bags_Mens,count.Bags_Women,count.Books_Academic,count.Books_Children,count.Books_Comics,count.Books_DIY,count.Books_Fiction,count.Books_Non-Fiction,count.Clothing_Kids,count.Clothing_Mens,count.Clothing_Women,count.Electronics_Audio and video,count.Electronics_Cameras,count.Electronics_Computers,count.Electronics_Mobiles,count.Electronics_Personal Appliances,count.Footwear_Kids,count.Footwear_Mens,count.Footwear_Women,count.Home and kitchen_Bath,count.Home and kitchen_Furnishing,count.Home and kitchen_Kitchen,count.Home and kitchen_Tools,sum.Bags_Mens,sum.Bags_Women,sum.Books_Academic,sum.Books_Children,sum.Books_Comics,sum.Books_DIY,sum.Books_Fiction,sum.Books_Non-Fiction,sum.Clothing_Kids,sum.Clothing_Mens,sum.Clothing_Women,sum.Electronics_Audio and video,sum.Electronics_Cameras,sum.Electronics_Computers,sum.Electronics_Mobiles,sum.Electronics_Personal Appliances,sum.Footwear_Kids,sum.Footwear_Mens,sum.Footwear_Women,sum.Home and kitchen_Bath,sum.Home and kitchen_Furnishing,sum.Home and kitchen_Kitchen,sum.Home and kitchen_Tools
0,266783,2013-02-20,2013-09-02,4,4,6,295.89,3113.89,3,-394.0,710.0,116.75,273.0,75.5,2014-12-02,650.0,1.78,01_02,0.0,0.0,1.0,3.0,0.0,0.0,308.3,2805.59,0.0,0.0,0.1,0.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1845.35,0.0,1268.54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,266784,2012-08-23,2012-09-09,3,3,10,541.07,5694.06,3,-133.0,631.0,216.0,631.0,150.0,2014-12-02,831.0,2.28,02_03,1.0,0.0,1.0,1.0,442.0,0.0,4279.66,972.4,0.08,0.0,0.75,0.17,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,972.4,4279.66,0.0,0.0,0.0,0.0,0.0,0.0,442.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [70]:
df_churn_observation_window = pd.merge(df_churn_observation_window, df_trans_agg,  on=['customer_id'], how='right')
# df_churn['assessment_date'] = df_transactions['trans_date'].sort_values().max()
mt.check_unique_no(df_churn_observation_window, ['customer_id'])
df_churn_observation_window.head()

Data has 4500 unique customer_id


Unnamed: 0,customer_id,gender,biological_age.group,state,region,customer_conversion,repeat_purchaser,returned_item_before,conversion_date.year_month,conversion_date,last_purchase_date,total_unique_purchase_days,total_unique_transactions,total_items_purchased,total_tax_paid,total_amt_paid,total_unique_trans_not_reversed,min_duration_btwn_purchase_days,max_duration_btwn_purchase_days,avg_duration_btwn_purchase_days,time_since_last_purchase.days,median_duration_btwn_purchase_days,assessment_date,account_age.days,account_age.years,account_age.years.group,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,Flagship_store_spend.prop,MBR_spend.prop,TeleShop_spend.prop,e-Shop.prop,count.Bags_Mens,count.Bags_Women,count.Books_Academic,count.Books_Children,count.Books_Comics,count.Books_DIY,count.Books_Fiction,count.Books_Non-Fiction,count.Clothing_Kids,count.Clothing_Mens,count.Clothing_Women,count.Electronics_Audio and video,count.Electronics_Cameras,count.Electronics_Computers,count.Electronics_Mobiles,count.Electronics_Personal Appliances,count.Footwear_Kids,count.Footwear_Mens,count.Footwear_Women,count.Home and kitchen_Bath,count.Home and kitchen_Furnishing,count.Home and kitchen_Kitchen,count.Home and kitchen_Tools,sum.Bags_Mens,sum.Bags_Women,sum.Books_Academic,sum.Books_Children,sum.Books_Comics,sum.Books_DIY,sum.Books_Fiction,sum.Books_Non-Fiction,sum.Clothing_Kids,sum.Clothing_Mens,sum.Clothing_Women,sum.Electronics_Audio and video,sum.Electronics_Cameras,sum.Electronics_Computers,sum.Electronics_Mobiles,sum.Electronics_Personal Appliances,sum.Footwear_Kids,sum.Footwear_Mens,sum.Footwear_Women,sum.Home and kitchen_Bath,sum.Home and kitchen_Furnishing,sum.Home and kitchen_Kitchen,sum.Home and kitchen_Tools
0,266783,M,40_45,Pennsylvania,Northeast,converted,yes,yes,2011.0_9.0,2013-02-20,2013-09-02,4,4,6,295.89,3113.89,3,-394.0,710.0,116.75,273.0,75.5,2014-12-02,650.0,1.78,01_02,0.0,0.0,1.0,3.0,0.0,0.0,308.3,2805.59,0.0,0.0,0.1,0.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1845.35,0.0,1268.54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,266788,F,40_45,Florida,South,converted,yes,no,2011.0_9.0,2013-12-02,2012-10-31,4,4,8,578.97,6092.97,4,-727.0,579.0,45.5,579.0,165.0,2014-12-02,365.0,1.0,00_01,1.0,1.0,0.0,2.0,1367.99,1485.12,0.0,3239.86,0.22,0.24,0.0,0.53,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1485.12,0.0,0.0,0.0,0.0,0.0,1367.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1927.12,1312.74,0.0,0.0,0.0,0.0,0.0
2,266794,F,40_45,Texas,South,converted,yes,yes,2011.0_3.0,2011-10-06,2012-01-30,9,10,28,2591.3,27270.3,9,-576.0,854.0,97.0,854.0,17.5,2014-12-02,1153.0,3.16,03_04,1.0,3.0,1.0,6.0,718.25,9275.37,4610.06,13378.24,0.03,0.34,0.17,0.49,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2948.14,2744.82,3964.74,4415.58,0.0,0.0,0.0,0.0,4099.55,0.0,0.0,718.25,0.0,0.0,0.0,4610.06,0.0,0.0,4480.78,0.0,0.0,0.0,0.0
3,266799,F,40_45,Ohio,Midwest,converted,yes,yes,2012.0_11.0,2012-11-03,2013-09-24,2,2,7,721.98,7597.98,2,251.0,325.0,288.0,251.0,288.0,2014-12-02,759.0,2.08,02_03,2.0,0.0,0.0,1.0,7149.35,0.0,0.0,448.63,0.94,0.0,0.0,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,448.63,0.0,0.0,0.0,0.0,0.0,7149.35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,266806,F,22_25,California,West,converted,yes,no,2012.0_2.0,2012-10-18,2012-09-09,6,6,23,1922.24,20229.24,6,-565.0,631.0,98.67,631.0,200.0,2014-12-02,775.0,2.12,02_03,3.0,0.0,0.0,3.0,14423.56,0.0,0.0,5805.67,0.71,0.0,0.0,0.29,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,3248.7,0.0,0.0,0.0,0.0,0.0,7392.45,0.0,923.78,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4364.75,0.0,0.0,3782.41,0.0,517.14


In [71]:
df_churn_observation_window[df_churn_observation_window['customer_id'] == 268159]

Unnamed: 0,customer_id,gender,biological_age.group,state,region,customer_conversion,repeat_purchaser,returned_item_before,conversion_date.year_month,conversion_date,last_purchase_date,total_unique_purchase_days,total_unique_transactions,total_items_purchased,total_tax_paid,total_amt_paid,total_unique_trans_not_reversed,min_duration_btwn_purchase_days,max_duration_btwn_purchase_days,avg_duration_btwn_purchase_days,time_since_last_purchase.days,median_duration_btwn_purchase_days,assessment_date,account_age.days,account_age.years,account_age.years.group,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,Flagship_store_spend.prop,MBR_spend.prop,TeleShop_spend.prop,e-Shop.prop,count.Bags_Mens,count.Bags_Women,count.Books_Academic,count.Books_Children,count.Books_Comics,count.Books_DIY,count.Books_Fiction,count.Books_Non-Fiction,count.Clothing_Kids,count.Clothing_Mens,count.Clothing_Women,count.Electronics_Audio and video,count.Electronics_Cameras,count.Electronics_Computers,count.Electronics_Mobiles,count.Electronics_Personal Appliances,count.Footwear_Kids,count.Footwear_Mens,count.Footwear_Women,count.Home and kitchen_Bath,count.Home and kitchen_Furnishing,count.Home and kitchen_Kitchen,count.Home and kitchen_Tools,sum.Bags_Mens,sum.Bags_Women,sum.Books_Academic,sum.Books_Children,sum.Books_Comics,sum.Books_DIY,sum.Books_Fiction,sum.Books_Non-Fiction,sum.Clothing_Kids,sum.Clothing_Mens,sum.Clothing_Women,sum.Electronics_Audio and video,sum.Electronics_Cameras,sum.Electronics_Computers,sum.Electronics_Mobiles,sum.Electronics_Personal Appliances,sum.Footwear_Kids,sum.Footwear_Mens,sum.Footwear_Women,sum.Home and kitchen_Bath,sum.Home and kitchen_Furnishing,sum.Home and kitchen_Kitchen,sum.Home and kitchen_Tools
3160,268159,,,,,,,,,2012-07-20,2012-06-02,6,6,19,1699.85,17888.85,6,-302.0,730.0,113.67,730.0,68.0,2014-12-02,865.0,2.37,02_03,0.0,1.0,1.0,4.0,0.0,1182.35,7458.75,9247.74,0.0,0.07,0.42,0.52,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,779.02,0.0,0.0,1182.35,0.0,0.0,0.0,0.0,8141.64,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,327.08,0.0,7458.75,0.0


**Customer with customer_id 268159 has purchased a total of 19 items on 6 different occasions (without returning any) and has done this approximately every 152 days over the last 2 to 3 years**

In [72]:
df_churn_observation_window['customer_conversion'].value_counts(dropna=False)

converted    2902
NaN          1598
Name: customer_conversion, dtype: int64

In [73]:
df_churn_observation_window['conversion_date_cohort'] = df_churn_observation_window['conversion_date'].dt.year.map(str) + "_" + df_churn_observation_window['conversion_date'].dt.month.map("{:02}".format)

In [74]:
# df_churn_observation_window['conversion_date.year'] = df_churn_observation_window['conversion_date'].dt.year

In [75]:
# df_churn_observation_window['conversion_date.month_num'] = df_churn_observation_window['conversion_date'].dt.month.map("{:02}".format)

In [76]:
# df_churn_observation_window['conversion_date.month_num'].value_counts(dropna=False)

In [77]:
# df_churn['conversion_date.month_num'] = df_churn['conversion_date.month_num'].astype('O').astype('int64')

In [78]:
# month_name = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
# month_num = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12']
# month_dict = dict(zip(month_num, month_name))

# for k, v in month_dict.items():
#     mask = df_churn_observation_window['conversion_date.month_num'].str.contains(k, case=True)

#     df_churn_observation_window.loc[mask,'conversion_date.month'] = v

In [79]:
# df_churn_observation_window['conversion_date.month'] = df_churn_observation_window['conversion_date.month_num'] + '_' + df_churn_observation_window['conversion_date.month'].map(str)

In [80]:
# df_churn_observation_window['conversion_date.month'].value_counts(dropna=False)

In [81]:
# df_churn_observation_window['conversion_date.year_month'] = df_churn_observation_window['conversion_date.year'].map(str) + '_' + df_churn_observation_window['conversion_date.month_num'].map(str)

In [82]:
# df_churn_observation_window['conversion_date.year_month'].value_counts(dropna=False).head()

In [83]:
# df_churn_observation_window.head(1)

In [84]:
# df_churn_observation_window['dob'] = pd.to_datetime(df_churn_observation_window['dob'])

In [85]:
# df_churn_observation_window['biological_age.actual'] = (df_churn_observation_window['assessment_date'] - df_churn_observation_window['dob'])/np.timedelta64(1, 'Y')

In [86]:
# df_churn_observation_window['biological_age.actual'].describe()

In [87]:
# df_churn_observation_window['biological_age'] = np.round(df_churn_observation_window['biological_age.actual'])

In [88]:
# df_churn_observation_window['biological_age'].describe()

In [89]:
# age_bins =  [21, 25, 30, 35, 40, 45]
# labels = ['22_25','25_30', '30_35', '35_40', '40_45']
# df_churn_observation_window['biological_age.group'] = pd.cut(df_churn_observation_window['biological_age'], age_bins, labels = labels,include_lowest = True)

In [90]:
df_churn_observation_window['total_unique_purchase_days'].describe()

count   4500.00
mean       4.15
std        1.77
min        1.00
25%        3.00
50%        4.00
75%        5.00
max       11.00
Name: total_unique_purchase_days, dtype: float64

In [91]:
df_churn_observation_window['total_unique_purchase_days'] =df_churn_observation_window['total_unique_purchase_days'].replace(np.nan, 0)

In [92]:
df_churn_observation_window['repeat_purchaser'] = np.where((df_churn_observation_window['total_unique_purchase_days'] > 1), 'yes',
                                      np.where((df_churn_observation_window['total_unique_purchase_days']== 0),'never purchased',
                                        'no'))

In [93]:
df_churn_observation_window['returned_item_before'] = np.where(df_churn_observation_window['customer_id'].isin(customers_who_returned_items_before), 'yes', 'no')

In [94]:
df_churn_observation_window['returned_item_before'].value_counts(dropna=False)

no     2986
yes    1514
Name: returned_item_before, dtype: int64

In [95]:
df_churn_observation_window[df_churn_observation_window['customer_id'] == 266783]

Unnamed: 0,customer_id,gender,biological_age.group,state,region,customer_conversion,repeat_purchaser,returned_item_before,conversion_date.year_month,conversion_date,last_purchase_date,total_unique_purchase_days,total_unique_transactions,total_items_purchased,total_tax_paid,total_amt_paid,total_unique_trans_not_reversed,min_duration_btwn_purchase_days,max_duration_btwn_purchase_days,avg_duration_btwn_purchase_days,time_since_last_purchase.days,median_duration_btwn_purchase_days,assessment_date,account_age.days,account_age.years,account_age.years.group,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,Flagship_store_spend.prop,MBR_spend.prop,TeleShop_spend.prop,e-Shop.prop,count.Bags_Mens,count.Bags_Women,count.Books_Academic,count.Books_Children,count.Books_Comics,count.Books_DIY,count.Books_Fiction,count.Books_Non-Fiction,count.Clothing_Kids,count.Clothing_Mens,count.Clothing_Women,count.Electronics_Audio and video,count.Electronics_Cameras,count.Electronics_Computers,count.Electronics_Mobiles,count.Electronics_Personal Appliances,count.Footwear_Kids,count.Footwear_Mens,count.Footwear_Women,count.Home and kitchen_Bath,count.Home and kitchen_Furnishing,count.Home and kitchen_Kitchen,count.Home and kitchen_Tools,sum.Bags_Mens,sum.Bags_Women,sum.Books_Academic,sum.Books_Children,sum.Books_Comics,sum.Books_DIY,sum.Books_Fiction,sum.Books_Non-Fiction,sum.Clothing_Kids,sum.Clothing_Mens,sum.Clothing_Women,sum.Electronics_Audio and video,sum.Electronics_Cameras,sum.Electronics_Computers,sum.Electronics_Mobiles,sum.Electronics_Personal Appliances,sum.Footwear_Kids,sum.Footwear_Mens,sum.Footwear_Women,sum.Home and kitchen_Bath,sum.Home and kitchen_Furnishing,sum.Home and kitchen_Kitchen,sum.Home and kitchen_Tools,conversion_date_cohort
0,266783,M,40_45,Pennsylvania,Northeast,converted,yes,yes,2011.0_9.0,2013-02-20,2013-09-02,4,4,6,295.89,3113.89,3,-394.0,710.0,116.75,273.0,75.5,2014-12-02,650.0,1.78,01_02,0.0,0.0,1.0,3.0,0.0,0.0,308.3,2805.59,0.0,0.0,0.1,0.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1845.35,0.0,1268.54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2013_02


In [96]:
df_churn_observation_window[df_churn_observation_window['account_age.years'] == df_churn_observation_window['account_age.years'].min()]

Unnamed: 0,customer_id,gender,biological_age.group,state,region,customer_conversion,repeat_purchaser,returned_item_before,conversion_date.year_month,conversion_date,last_purchase_date,total_unique_purchase_days,total_unique_transactions,total_items_purchased,total_tax_paid,total_amt_paid,total_unique_trans_not_reversed,min_duration_btwn_purchase_days,max_duration_btwn_purchase_days,avg_duration_btwn_purchase_days,time_since_last_purchase.days,median_duration_btwn_purchase_days,assessment_date,account_age.days,account_age.years,account_age.years.group,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,Flagship_store_spend.prop,MBR_spend.prop,TeleShop_spend.prop,e-Shop.prop,count.Bags_Mens,count.Bags_Women,count.Books_Academic,count.Books_Children,count.Books_Comics,count.Books_DIY,count.Books_Fiction,count.Books_Non-Fiction,count.Clothing_Kids,count.Clothing_Mens,count.Clothing_Women,count.Electronics_Audio and video,count.Electronics_Cameras,count.Electronics_Computers,count.Electronics_Mobiles,count.Electronics_Personal Appliances,count.Footwear_Kids,count.Footwear_Mens,count.Footwear_Women,count.Home and kitchen_Bath,count.Home and kitchen_Furnishing,count.Home and kitchen_Kitchen,count.Home and kitchen_Tools,sum.Bags_Mens,sum.Bags_Women,sum.Books_Academic,sum.Books_Children,sum.Books_Comics,sum.Books_DIY,sum.Books_Fiction,sum.Books_Non-Fiction,sum.Clothing_Kids,sum.Clothing_Mens,sum.Clothing_Women,sum.Electronics_Audio and video,sum.Electronics_Cameras,sum.Electronics_Computers,sum.Electronics_Mobiles,sum.Electronics_Personal Appliances,sum.Footwear_Kids,sum.Footwear_Mens,sum.Footwear_Women,sum.Home and kitchen_Bath,sum.Home and kitchen_Furnishing,sum.Home and kitchen_Kitchen,sum.Home and kitchen_Tools,conversion_date_cohort
2108,272998,M,35_40,Texas,South,converted,yes,no,2012.0_6.0,2014-03-01,2013-09-02,3,3,8,829.82,8732.82,3,-631.0,451.0,31.0,273.0,273.0,2014-12-02,276.0,0.76,00_01,2.0,0.0,0.0,1.0,8396.9,0.0,0.0,335.92,0.96,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2954.77,0.0,0.0,5442.12,0.0,0.0,335.92,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2014_03


In [97]:
# df_churn_observation_window['7_day_amt.avg'] = (df_churn_observation_window['total_amt_paid']/df_churn_observation_window['account_age.days']) * 7
# df_churn_observation_window['14_day_amt.avg'] = (df_churn_observation_window['total_amt_paid']/df_churn_observation_window['account_age.days']) * 14
# df_churn_observation_window['21_day_amt.avg'] = (df_churn_observation_window['total_amt_paid']/df_churn_observation_window['account_age.days']) * 21
# df_churn_observation_window['30_day_amt.avg'] = (df_churn_observation_window['total_amt_paid']/df_churn_observation_window['account_age.days']) * 30
# df_churn_observation_window['60_day_amt.avg'] = (df_churn_observation_window['total_amt_paid']/df_churn_observation_window['account_age.days']) * 60
# df_churn_observation_window['90_day_amt.avg'] = (df_churn_observation_window['total_amt_paid']/df_churn_observation_window['account_age.days']) * 90

In [98]:
# df_churn_observation_window[['7_day_amt.avg', '14_day_amt.avg', '21_day_amt.avg', '30_day_amt.avg', '60_day_amt.avg','90_day_amt.avg']].describe()

In [99]:
# df_churn_observation_window['7_day_num.avg'] = (df_churn_observation_window['total_items_purchased']/df_churn_observation_window['account_age.days']) * 7
# df_churn_observation_window['14_day_num.avg'] = (df_churn_observation_window['total_items_purchased']/df_churn_observation_window['account_age.days']) * 14
# df_churn_observation_window['21_day_num.avg'] = (df_churn_observation_window['total_items_purchased']/df_churn_observation_window['account_age.days']) * 21
# df_churn_observation_window['30_day_num.avg'] = (df_churn_observation_window['total_items_purchased']/df_churn_observation_window['account_age.days']) * 30
# df_churn_observation_window['60_day_num.avg'] = (df_churn_observation_window['total_items_purchased']/df_churn_observation_window['account_age.days']) * 60
# df_churn_observation_window['90_day_num.avg'] = (df_churn_observation_window['total_items_purchased']/df_churn_observation_window['account_age.days']) * 90

In [100]:
# df_churn_observation_window[['7_day_num.avg', '14_day_num.avg', '21_day_num.avg', '30_day_num.avg', '60_day_num.avg','90_day_num.avg']].describe()

In [101]:
df_churn_observation_window[df_churn_observation_window['repeat_purchaser'] == 'yes'].head()

Unnamed: 0,customer_id,gender,biological_age.group,state,region,customer_conversion,repeat_purchaser,returned_item_before,conversion_date.year_month,conversion_date,last_purchase_date,total_unique_purchase_days,total_unique_transactions,total_items_purchased,total_tax_paid,total_amt_paid,total_unique_trans_not_reversed,min_duration_btwn_purchase_days,max_duration_btwn_purchase_days,avg_duration_btwn_purchase_days,time_since_last_purchase.days,median_duration_btwn_purchase_days,assessment_date,account_age.days,account_age.years,account_age.years.group,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,Flagship_store_spend.prop,MBR_spend.prop,TeleShop_spend.prop,e-Shop.prop,count.Bags_Mens,count.Bags_Women,count.Books_Academic,count.Books_Children,count.Books_Comics,count.Books_DIY,count.Books_Fiction,count.Books_Non-Fiction,count.Clothing_Kids,count.Clothing_Mens,count.Clothing_Women,count.Electronics_Audio and video,count.Electronics_Cameras,count.Electronics_Computers,count.Electronics_Mobiles,count.Electronics_Personal Appliances,count.Footwear_Kids,count.Footwear_Mens,count.Footwear_Women,count.Home and kitchen_Bath,count.Home and kitchen_Furnishing,count.Home and kitchen_Kitchen,count.Home and kitchen_Tools,sum.Bags_Mens,sum.Bags_Women,sum.Books_Academic,sum.Books_Children,sum.Books_Comics,sum.Books_DIY,sum.Books_Fiction,sum.Books_Non-Fiction,sum.Clothing_Kids,sum.Clothing_Mens,sum.Clothing_Women,sum.Electronics_Audio and video,sum.Electronics_Cameras,sum.Electronics_Computers,sum.Electronics_Mobiles,sum.Electronics_Personal Appliances,sum.Footwear_Kids,sum.Footwear_Mens,sum.Footwear_Women,sum.Home and kitchen_Bath,sum.Home and kitchen_Furnishing,sum.Home and kitchen_Kitchen,sum.Home and kitchen_Tools,conversion_date_cohort
0,266783,M,40_45,Pennsylvania,Northeast,converted,yes,yes,2011.0_9.0,2013-02-20,2013-09-02,4,4,6,295.89,3113.89,3,-394.0,710.0,116.75,273.0,75.5,2014-12-02,650.0,1.78,01_02,0.0,0.0,1.0,3.0,0.0,0.0,308.3,2805.59,0.0,0.0,0.1,0.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1845.35,0.0,1268.54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2013_02
1,266788,F,40_45,Florida,South,converted,yes,no,2011.0_9.0,2013-12-02,2012-10-31,4,4,8,578.97,6092.97,4,-727.0,579.0,45.5,579.0,165.0,2014-12-02,365.0,1.0,00_01,1.0,1.0,0.0,2.0,1367.99,1485.12,0.0,3239.86,0.22,0.24,0.0,0.53,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1485.12,0.0,0.0,0.0,0.0,0.0,1367.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1927.12,1312.74,0.0,0.0,0.0,0.0,0.0,2013_12
2,266794,F,40_45,Texas,South,converted,yes,yes,2011.0_3.0,2011-10-06,2012-01-30,9,10,28,2591.3,27270.3,9,-576.0,854.0,97.0,854.0,17.5,2014-12-02,1153.0,3.16,03_04,1.0,3.0,1.0,6.0,718.25,9275.37,4610.06,13378.24,0.03,0.34,0.17,0.49,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2948.14,2744.82,3964.74,4415.58,0.0,0.0,0.0,0.0,4099.55,0.0,0.0,718.25,0.0,0.0,0.0,4610.06,0.0,0.0,4480.78,0.0,0.0,0.0,0.0,2011_10
3,266799,F,40_45,Ohio,Midwest,converted,yes,no,2012.0_11.0,2012-11-03,2013-09-24,2,2,7,721.98,7597.98,2,251.0,325.0,288.0,251.0,288.0,2014-12-02,759.0,2.08,02_03,2.0,0.0,0.0,1.0,7149.35,0.0,0.0,448.63,0.94,0.0,0.0,0.06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,448.63,0.0,0.0,0.0,0.0,0.0,7149.35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2012_11
4,266806,F,22_25,California,West,converted,yes,no,2012.0_2.0,2012-10-18,2012-09-09,6,6,23,1922.24,20229.24,6,-565.0,631.0,98.67,631.0,200.0,2014-12-02,775.0,2.12,02_03,3.0,0.0,0.0,3.0,14423.56,0.0,0.0,5805.67,0.71,0.0,0.0,0.29,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,3248.7,0.0,0.0,0.0,0.0,0.0,7392.45,0.0,923.78,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4364.75,0.0,0.0,3782.41,0.0,517.14,2012_10


In [102]:
df_churn_observation_window['repeat_purchaser'].value_counts(dropna=False)

yes    4029
no      471
Name: repeat_purchaser, dtype: int64

In [103]:
df_churn_observation_window['number_of_unique_times_purchases_returned'] = df_churn_observation_window['total_unique_transactions'] - df_churn_observation_window['total_unique_trans_not_reversed']
df_churn_observation_window['number_of_unique_times_purchases_returned'].value_counts(dropna=False)

0    2986
1    1230
2     257
3      24
4       3
Name: number_of_unique_times_purchases_returned, dtype: int64

In [104]:
df_churn_observation_window.head(2)

Unnamed: 0,customer_id,gender,biological_age.group,state,region,customer_conversion,repeat_purchaser,returned_item_before,conversion_date.year_month,conversion_date,last_purchase_date,total_unique_purchase_days,total_unique_transactions,total_items_purchased,total_tax_paid,total_amt_paid,total_unique_trans_not_reversed,min_duration_btwn_purchase_days,max_duration_btwn_purchase_days,avg_duration_btwn_purchase_days,time_since_last_purchase.days,median_duration_btwn_purchase_days,assessment_date,account_age.days,account_age.years,account_age.years.group,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,Flagship_store_spend.prop,MBR_spend.prop,TeleShop_spend.prop,e-Shop.prop,count.Bags_Mens,count.Bags_Women,count.Books_Academic,count.Books_Children,count.Books_Comics,count.Books_DIY,count.Books_Fiction,count.Books_Non-Fiction,count.Clothing_Kids,count.Clothing_Mens,count.Clothing_Women,count.Electronics_Audio and video,count.Electronics_Cameras,count.Electronics_Computers,count.Electronics_Mobiles,count.Electronics_Personal Appliances,count.Footwear_Kids,count.Footwear_Mens,count.Footwear_Women,count.Home and kitchen_Bath,count.Home and kitchen_Furnishing,count.Home and kitchen_Kitchen,count.Home and kitchen_Tools,sum.Bags_Mens,sum.Bags_Women,sum.Books_Academic,sum.Books_Children,sum.Books_Comics,sum.Books_DIY,sum.Books_Fiction,sum.Books_Non-Fiction,sum.Clothing_Kids,sum.Clothing_Mens,sum.Clothing_Women,sum.Electronics_Audio and video,sum.Electronics_Cameras,sum.Electronics_Computers,sum.Electronics_Mobiles,sum.Electronics_Personal Appliances,sum.Footwear_Kids,sum.Footwear_Mens,sum.Footwear_Women,sum.Home and kitchen_Bath,sum.Home and kitchen_Furnishing,sum.Home and kitchen_Kitchen,sum.Home and kitchen_Tools,conversion_date_cohort,number_of_unique_times_purchases_returned
0,266783,M,40_45,Pennsylvania,Northeast,converted,yes,yes,2011.0_9.0,2013-02-20,2013-09-02,4,4,6,295.89,3113.89,3,-394.0,710.0,116.75,273.0,75.5,2014-12-02,650.0,1.78,01_02,0.0,0.0,1.0,3.0,0.0,0.0,308.3,2805.59,0.0,0.0,0.1,0.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1845.35,0.0,1268.54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2013_02,1
1,266788,F,40_45,Florida,South,converted,yes,no,2011.0_9.0,2013-12-02,2012-10-31,4,4,8,578.97,6092.97,4,-727.0,579.0,45.5,579.0,165.0,2014-12-02,365.0,1.0,00_01,1.0,1.0,0.0,2.0,1367.99,1485.12,0.0,3239.86,0.22,0.24,0.0,0.53,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1485.12,0.0,0.0,0.0,0.0,0.0,1367.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1927.12,1312.74,0.0,0.0,0.0,0.0,0.0,2013_12,0


## Data for Next Purchase Day

In [105]:
df_next_purchase_date = pd.merge(df_churn_observation_window, df_transactions_customers, on='customer_id', how='left')
mt.check_unique_no(df_next_purchase_date, ['customer_id'])
df_next_purchase_date.shape

Data has 4500 unique customer_id


(4500, 87)

In [106]:
df_next_purchase_date.head(2)

Unnamed: 0,customer_id,gender,biological_age.group,state,region,customer_conversion,repeat_purchaser,returned_item_before,conversion_date.year_month,conversion_date,last_purchase_date,total_unique_purchase_days,total_unique_transactions,total_items_purchased,total_tax_paid,total_amt_paid,total_unique_trans_not_reversed,min_duration_btwn_purchase_days,max_duration_btwn_purchase_days,avg_duration_btwn_purchase_days,time_since_last_purchase.days,median_duration_btwn_purchase_days,assessment_date,account_age.days,account_age.years,account_age.years.group,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,Flagship_store_spend.prop,MBR_spend.prop,TeleShop_spend.prop,e-Shop.prop,count.Bags_Mens,count.Bags_Women,count.Books_Academic,count.Books_Children,count.Books_Comics,count.Books_DIY,count.Books_Fiction,count.Books_Non-Fiction,count.Clothing_Kids,count.Clothing_Mens,count.Clothing_Women,count.Electronics_Audio and video,count.Electronics_Cameras,count.Electronics_Computers,count.Electronics_Mobiles,count.Electronics_Personal Appliances,count.Footwear_Kids,count.Footwear_Mens,count.Footwear_Women,count.Home and kitchen_Bath,count.Home and kitchen_Furnishing,count.Home and kitchen_Kitchen,count.Home and kitchen_Tools,sum.Bags_Mens,sum.Bags_Women,sum.Books_Academic,sum.Books_Children,sum.Books_Comics,sum.Books_DIY,sum.Books_Fiction,sum.Books_Non-Fiction,sum.Clothing_Kids,sum.Clothing_Mens,sum.Clothing_Women,sum.Electronics_Audio and video,sum.Electronics_Cameras,sum.Electronics_Computers,sum.Electronics_Mobiles,sum.Electronics_Personal Appliances,sum.Footwear_Kids,sum.Footwear_Mens,sum.Footwear_Women,sum.Home and kitchen_Bath,sum.Home and kitchen_Furnishing,sum.Home and kitchen_Kitchen,sum.Home and kitchen_Tools,conversion_date_cohort,number_of_unique_times_purchases_returned,next_purchase_day
0,266783,M,40_45,Pennsylvania,Northeast,converted,yes,yes,2011.0_9.0,2013-02-20,2013-09-02,4,4,6,295.89,3113.89,3,-394.0,710.0,116.75,273.0,75.5,2014-12-02,650.0,1.78,01_02,0.0,0.0,1.0,3.0,0.0,0.0,308.3,2805.59,0.0,0.0,0.1,0.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1845.35,0.0,1268.54,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2013_02,1,
1,266788,F,40_45,Florida,South,converted,yes,no,2011.0_9.0,2013-12-02,2012-10-31,4,4,8,578.97,6092.97,4,-727.0,579.0,45.5,579.0,165.0,2014-12-02,365.0,1.0,00_01,1.0,1.0,0.0,2.0,1367.99,1485.12,0.0,3239.86,0.22,0.24,0.0,0.53,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1485.12,0.0,0.0,0.0,0.0,0.0,1367.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1927.12,1312.74,0.0,0.0,0.0,0.0,0.0,2013_12,0,


In [107]:
df_next_purchase_date['next_purchase_day'] = df_next_purchase_date['next_purchase_day'].fillna(999)

In [108]:
df_next_purchase_date['next_purchase_day'].describe()

count   4500.00
mean     975.56
std      119.43
min      150.00
25%      999.00
50%      999.00
75%      999.00
max     1179.00
Name: next_purchase_day, dtype: float64

In [109]:
df_next_purchase_date['next_purchase_day.group'] = np.where((df_next_purchase_date['next_purchase_day'] > df_next_purchase_date['median_duration_btwn_purchase_days']), 1, 0) 

In [110]:
df_next_purchase_date['next_purchase_day.group'].value_counts(dropna=False, normalize=True)

1   0.97
0   0.03
Name: next_purchase_day.group, dtype: float64

### Sanity Check Data

In [111]:
mt.check_unique_no(df_churn_observation_window, ['customer_id'])
df_churn_observation_window.shape

mt.check_unique_no(df_observation_window, ['customer_id'])
df_observation_window.shape

Data has 4500 unique customer_id


(4500, 86)

Data has 4500 unique customer_id


(18689, 20)

In [112]:
df_observation_window['total_amt'].sum()
df_observation_window['tax'].sum()
df_observation_window['rate'].sum()
df_observation_window['qty'].sum()

43875917.864999995

4169204.865

13243451

50572

## Export Data

In [113]:
# df_churn.to_csv(filepaths.master_file_data, index=False)
# df_transactions.to_csv(filepaths.derived_transactions_data_v1, index=False)