## Import Modules

In [1]:
# Set paths
import os
from imp import reload

# Data manipulation
import pandas as pd
import numpy as np
from scipy import stats

# Custom package for data preprocessing
import mytools as mt 

# Set notebook options
pd.set_option("display.max_columns", 150)
pd.set_option("display.max_rows", 10000)
pd.set_option("display.float_format", lambda x: '%.2f' % x)

# Pretty display of multiple functions in a cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### File Location

In [2]:
# Data paths
import filepaths

## Load Data

In [3]:
df_master = pd.read_csv(filepaths.master_file_data, sep=',')

df_transactions = pd.read_csv(filepaths.interim_transactions_data, sep=',')

df_transactions_aggregated = pd.read_csv(filepaths.interim_transactions_data_aggregated, sep=',')

In [4]:
df_transactions.head(2)

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,cogs,total_amt,store_type,returned_item,drop_record,prod_cat,prod_subcat,counter,assessment_date,duration,trans_date.day,trans_date.month_num,trans_date.year,trans_date.year_month,trans_date.hour,trans_date.weekday,trans_date.week_of_year,trans_date.month,trans_date.weekday_num,trans_date.week_of_month,first_purchase_date,first_purchase.cohort,purchase_date.cohort,customer_type.period,returned_item_before
0,25890929042,266783,2011-09-23,1,2,4,1321,554.82,5838.82,e-Shop,yes,yes,Footwear,Mens,1,2014-12-02,1.0,23,9,2011,2011_09,0,06_Friday,38,09_Sep,6,month.week4,2011-09-23,2011_09,2011_09,new,yes
1,25890929042,266783,2011-09-24,1,2,-4,-1321,554.82,-5838.82,e-Shop,yes,no,Footwear,Mens,1,2014-12-02,393.0,24,9,2011,2011_09,0,07_Saturday,38,09_Sep,7,month.week4,2011-09-23,2011_09,2011_09,existing,yes


In [5]:
df_transactions = df_transactions[~((df_transactions['returned_item'] == 'yes') & (df_transactions['drop_record'] == 'yes'))]

In [6]:
df_transactions_expanded = df_transactions[['customer_id', 'purchase_date.cohort', 'total_amt']]
mt.check_unique_no(df_transactions_expanded, ['customer_id'])
df_transactions_expanded.shape
df_transactions_expanded.head(2)

Data has 5506 unique customer_id


(20996, 3)

Unnamed: 0,customer_id,purchase_date.cohort,total_amt
1,266783,2011_09,-5838.82
2,266783,2012_10,308.3


In [7]:
df_transactions_agg = df_transactions_expanded.groupby(['customer_id', 'purchase_date.cohort'], as_index=False).agg({'total_amt':'sum'})
mt.check_unique_no(df_transactions_agg, ['customer_id'])
df_transactions_agg.shape
df_transactions_agg.head(2)

Data has 5506 unique customer_id


(19976, 3)

Unnamed: 0,customer_id,purchase_date.cohort,total_amt
0,266783,2011_09,-5838.82
1,266783,2012_10,308.3


In [8]:
df_transactions_agg['purchase_date.cohort'].dtype

dtype('O')

In [9]:
df_transactions_agg['purchase_date.month_start'] = df_transactions_agg['purchase_date.cohort'].str.replace('_', '-')
df_transactions_agg.head(2)

Unnamed: 0,customer_id,purchase_date.cohort,total_amt,purchase_date.month_start
0,266783,2011_09,-5838.82,2011-09
1,266783,2012_10,308.3,2012-10


In [10]:
df_transactions_agg['purchase_date.month_start'] = df_transactions_agg['purchase_date.month_start'] + '-01'
df_transactions_agg.head(2)

Unnamed: 0,customer_id,purchase_date.cohort,total_amt,purchase_date.month_start
0,266783,2011_09,-5838.82,2011-09-01
1,266783,2012_10,308.3,2012-10-01


In [11]:
df_transactions_agg.drop(['purchase_date.cohort'], axis=1, inplace=True)

In [12]:
df_transactions_agg.rename(columns={'purchase_date.month_start':'trans_date', 'total_amt':'trans_amt'}, inplace=True)
df_transactions_agg['trans_date'] = pd.to_datetime(df_transactions_agg['trans_date'])

df_transactions_agg['trans_date_month_num'] = df_transactions_agg['trans_date'].dt.month
df_transactions_agg['trans_date_year'] = df_transactions_agg['trans_date'].dt.year
df_transactions_agg.head(2)

Unnamed: 0,customer_id,trans_amt,trans_date,trans_date_month_num,trans_date_year
0,266783,-5838.82,2011-09-01,9,2011
1,266783,308.3,2012-10-01,10,2012


In [13]:
df_transactions_agg = df_transactions_agg.sort_values(['customer_id', 'trans_date'])
df_transactions_agg.head(2)

Unnamed: 0,customer_id,trans_amt,trans_date,trans_date_month_num,trans_date_year
0,266783,-5838.82,2011-09-01,9,2011
1,266783,308.3,2012-10-01,10,2012


In [14]:
df = df_transactions_agg.copy()

In [15]:
unique_customer_no = df['customer_id'].unique()

MONTH_NUMS = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
MONTHS = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
MONTH_NAMES = {n: name for n, name in zip(MONTH_NUMS, MONTHS)}


def _add_rows(copy, months, year):
    global df
    _rows = []
    for _month in months:
        _row = dict(copy)

        _row.update(dict(
            trans_date_year=year,
            trans_date_month_num=_month,
            trans_date_month=MONTH_NAMES[_month],
            trans_amt_month_total=0
        ))

        _rows.append(_row)

    df = df.append(_rows, ignore_index=True)


In [16]:
%%time
for customer_no in unique_customer_no:
    _df = df[df['customer_id'] == customer_no]

    if _df.shape[0] < 2:
        continue

    _df = _df.sort_values(by=['trans_date_year', 'trans_date_month_num'], ascending=True)

    _prev = None
    for _, row in _df.iterrows():
        if _prev is None:
            _prev = row
            continue

        ys, ms = _prev['trans_date_year'], _prev['trans_date_month_num']
        ye, me = row['trans_date_year'], row['trans_date_month_num']

        if ys == ye:
            if ms < me - 1:
                # Fill missing months same year
                _add_rows(_prev, range(ms + 1, me), ys)
        else:
            if ms < 12:
                # Fill in months up to dec
                _add_rows(_prev, range(ms + 1, 13), ys)

            if me > 1:
                # Fill in months starting from jan
                _add_rows(_prev, range(1, me), ye)

        _prev = row

Wall time: 2min 25s


In [17]:
df['trans_amt_month_total'] = np.where(df['trans_amt_month_total'].isnull(), df['trans_amt'], df['trans_amt_month_total'])
df.drop(['trans_date', 'trans_amt', 'trans_date_month'], axis=1, inplace=True)

In [18]:
df['trans_date_month_num'] = df['trans_date_month_num'].map("{:02}".format)

In [19]:
MONTH_LENGTH = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
MONTH_NUMS = ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']

MONTH_DICT = dict(zip(MONTH_NUMS, MONTH_LENGTH))

for k, v in MONTH_DICT.items():
    mask = df['trans_date_month_num'].str.contains(k, case=True)

    df.loc[mask,'trans_date_month_len'] = v

In [20]:
df['trans_date_month_len'] = df['trans_date_month_len'].astype('int64')

In [21]:
df['month_end_date'] = df['trans_date_year'].map(str) + '-' + df['trans_date_month_num'] + '-' + df['trans_date_month_len'].map(str)

In [22]:
df = df[['customer_id', 'trans_amt_month_total', 'month_end_date']]
df['month_end_date'] = pd.to_datetime(df['month_end_date'])

df = df.sort_values(['customer_id', 'month_end_date']).reset_index(drop=True)
mt.check_unique_no(df, ['customer_id'])
df.shape
df.head(25)

df_copy = df.copy()

Data has 5506 unique customer_id


(98934, 3)

Unnamed: 0,customer_id,trans_amt_month_total,month_end_date
0,266783,-5838.82,2011-09-30
1,266783,0.0,2011-10-31
2,266783,0.0,2011-11-30
3,266783,0.0,2011-12-31
4,266783,0.0,2012-01-31
5,266783,0.0,2012-02-28
6,266783,0.0,2012-03-31
7,266783,0.0,2012-04-30
8,266783,0.0,2012-05-31
9,266783,0.0,2012-06-30


In [23]:
df_copy['trans_amt_month_total.clean'] = np.where(df_copy['trans_amt_month_total'] > 0, df_copy['trans_amt_month_total'], 0 )

In [24]:
# df_first_purchase_month = df.groupby(['customer_id'], as_index=False).agg({'month_end_date':'first'})
# df_first_purchase_month.head(2)

In [25]:
# df = pd.merge(df, df_first_purchase_month, on=['customer_id'], how='left')
# mt.check_unique_no(df, ['customer_id'])
# df.shape
# df.head(2)

In [26]:
# df['revenue_type'] = np.where(df['month_end_date_x'] == df['month_end_date_y'], 'new', 'existing')

In [27]:
df_copy.head(2)

Unnamed: 0,customer_id,trans_amt_month_total,month_end_date,trans_amt_month_total.clean
0,266783,-5838.82,2011-09-30,0.0
1,266783,0.0,2011-10-31,0.0


In [28]:
# df['month_revenue_change'] = df.apply(lambda r: r['trans_amt_month_total'] if (r['revenue_type'] == 'new') else r['trans_amt_month_total'].transform(pd.Series.diff), axis=1)

In [29]:
df_copy['prev_trans_amt_month_total.clean'] = df_copy.groupby(['customer_id'])['trans_amt_month_total.clean'].shift(1)

In [30]:
# df_copy['prev_trans_amt_month_total.clean'] = np.where(df_copy['prev_trans_amt_month_total.clean'].isnull(),0, df_copy['prev_trans_amt_month_total.clean'])

In [31]:
# df_copy['month_revenue_change'] = df_copy.groupby(['customer_id'])['trans_amt_month_total.clean'].transform(pd.Series.diff)

In [32]:
df_copy['month_revenue_change'] = df_copy['trans_amt_month_total.clean'] - df_copy['prev_trans_amt_month_total.clean']

In [33]:
# %%time
# df_status = []
# for customer_no in unique_customer_no:
#     _df = df_copy[df_copy['customer_id'] == customer_no]

# #     if _df.shape[0] < 2:
# #         continue

#     _df = _df.sort_values(by=['month_end_date'], ascending=True)

#     _prev = None
#     for _, row in _df.iterrows():
#         if _prev is None:
#             _prev = row
#             continue

#     previous = _prev['trans_amt_month_total.clean']
#     current = row['trans_amt_month_total.clean']

#     if (previous == 0) & (current == 0):
#         _df['revenue_type'] = 'hibernating'
#     else:
#         _df['revenue_type'] = 'not hibernating'

# #         if ys == ye:
# #             if ms < me - 1:
# #                 # Fill missing months same year
# #                 _add_rows(_prev, range(ms + 1, me), ys)
# #         else:
# #             if ms < 12:
# #                 # Fill in months up to dec
# #                 _add_rows(_prev, range(ms + 1, 13), ys)

# #             if me > 1:
# #                 # Fill in months starting from jan
# #                 _add_rows(_prev, range(1, me), ye)

#         df_status.append(_df)
    
# df_new = pd.concat(df_status,ignore_index=True)

In [34]:
df_copy[df_copy['customer_id'] == 275264]

Unnamed: 0,customer_id,trans_amt_month_total,month_end_date,trans_amt_month_total.clean,prev_trans_amt_month_total.clean,month_revenue_change
98926,275264,1221.03,2011-05-31,1221.03,,
98927,275264,0.0,2011-06-30,0.0,1221.03,-1221.03
98928,275264,0.0,2011-07-31,0.0,0.0,0.0
98929,275264,2594.54,2011-08-31,2594.54,0.0,2594.54


In [35]:
df_copy['revenue_type'] = np.where((df_copy['trans_amt_month_total.clean'] == 0) & (df_copy['prev_trans_amt_month_total.clean'] > 0), 'lapsed',
                                      np.where((df_copy['trans_amt_month_total.clean'] == 0) & (df_copy['prev_trans_amt_month_total.clean'] == 0),'hibernating',
                                               np.where((df_copy['trans_amt_month_total.clean'] > 0) & (df_copy['prev_trans_amt_month_total.clean'] == 0),'reactivation',
                                                np.where((df_copy['trans_amt_month_total.clean'] > df_copy['prev_trans_amt_month_total.clean']),'expansion',
                                                          np.where((df_copy['trans_amt_month_total.clean'] < df_copy['prev_trans_amt_month_total.clean']),'contraction',
                                                                   np.where((df_copy['trans_amt_month_total.clean'] == df_copy['prev_trans_amt_month_total.clean']),'same',
                                                                   np.where((df_copy['trans_amt_month_total.clean'] >= 0) & (df_copy['prev_trans_amt_month_total.clean'].isnull()), 'new',
                                     'unknown')))))))

In [36]:
df_copy[df_copy['customer_id'] == 275264]

Unnamed: 0,customer_id,trans_amt_month_total,month_end_date,trans_amt_month_total.clean,prev_trans_amt_month_total.clean,month_revenue_change,revenue_type
98926,275264,1221.03,2011-05-31,1221.03,,,new
98927,275264,0.0,2011-06-30,0.0,1221.03,-1221.03,lapsed
98928,275264,0.0,2011-07-31,0.0,0.0,0.0,hibernating
98929,275264,2594.54,2011-08-31,2594.54,0.0,2594.54,reactivation


In [37]:
df_copy[df_copy['customer_id'] == 268720]

Unnamed: 0,customer_id,trans_amt_month_total,month_end_date,trans_amt_month_total.clean,prev_trans_amt_month_total.clean,month_revenue_change,revenue_type
22088,268720,468.52,2011-01-31,468.52,,,new
22089,268720,0.0,2011-02-28,0.0,468.52,-468.52,lapsed
22090,268720,0.0,2011-03-31,0.0,0.0,0.0,hibernating
22091,268720,0.0,2011-04-30,0.0,0.0,0.0,hibernating
22092,268720,0.0,2011-05-31,0.0,0.0,0.0,hibernating
22093,268720,0.0,2011-06-30,0.0,0.0,0.0,hibernating
22094,268720,0.0,2011-07-31,0.0,0.0,0.0,hibernating
22095,268720,0.0,2011-08-31,0.0,0.0,0.0,hibernating
22096,268720,0.0,2011-09-30,0.0,0.0,0.0,hibernating
22097,268720,0.0,2011-10-31,0.0,0.0,0.0,hibernating


In [38]:
df_copy['customer_id'].value_counts(dropna=False).head(2)

268720    48
271380    48
Name: customer_id, dtype: int64

In [39]:
df_copy['revenue_type'].value_counts(dropna=False)

hibernating     68909
lapsed          11539
reactivation    11513
new              5506
contraction       733
expansion         732
same                2
Name: revenue_type, dtype: int64

In [40]:
df_copy[df_copy['revenue_type'] == 'same'].head(2)

Unnamed: 0,customer_id,trans_amt_month_total,month_end_date,trans_amt_month_total.clean,prev_trans_amt_month_total.clean,month_revenue_change,revenue_type
36248,269886,468.52,2012-08-31,468.52,468.52,0.0,same
36663,269921,636.48,2012-09-30,636.48,636.48,0.0,same


In [41]:
df_copy[df_copy['customer_id'] == 269886]

Unnamed: 0,customer_id,trans_amt_month_total,month_end_date,trans_amt_month_total.clean,prev_trans_amt_month_total.clean,month_revenue_change,revenue_type
36247,269886,468.52,2012-07-31,468.52,,,new
36248,269886,468.52,2012-08-31,468.52,468.52,0.0,same
36249,269886,1737.06,2012-09-30,1737.06,468.52,1268.54,expansion
36250,269886,1163.57,2012-10-31,1163.57,1737.06,-573.49,contraction
36251,269886,0.0,2012-11-30,0.0,1163.57,-1163.57,lapsed
36252,269886,0.0,2012-12-31,0.0,0.0,0.0,hibernating
36253,269886,0.0,2013-01-31,0.0,0.0,0.0,hibernating
36254,269886,0.0,2013-02-28,0.0,0.0,0.0,hibernating
36255,269886,0.0,2013-03-31,0.0,0.0,0.0,hibernating
36256,269886,0.0,2013-04-30,0.0,0.0,0.0,hibernating


In [42]:
df_copy[df_copy['customer_id'] == 266784]

Unnamed: 0,customer_id,trans_amt_month_total,month_end_date,trans_amt_month_total.clean,prev_trans_amt_month_total.clean,month_revenue_change,revenue_type
25,266784,442.0,2012-04-30,442.0,,,new
26,266784,0.0,2012-05-31,0.0,442.0,-442.0,lapsed
27,266784,0.0,2012-06-30,0.0,0.0,0.0,hibernating
28,266784,0.0,2012-07-31,0.0,0.0,0.0,hibernating
29,266784,4279.66,2012-08-31,4279.66,0.0,4279.66,reactivation
30,266784,972.4,2012-09-30,972.4,4279.66,-3307.26,contraction


In [43]:
df_copy[df_copy['revenue_type'] == 'unknown'].head(2)

Unnamed: 0,customer_id,trans_amt_month_total,month_end_date,trans_amt_month_total.clean,prev_trans_amt_month_total.clean,month_revenue_change,revenue_type


In [44]:
df_copy[df_copy['customer_id'] == 266833]

Unnamed: 0,customer_id,trans_amt_month_total,month_end_date,trans_amt_month_total.clean,prev_trans_amt_month_total.clean,month_revenue_change,revenue_type
562,266833,-185.64,2011-10-31,0.0,,,new
563,266833,0.0,2011-11-30,0.0,0.0,0.0,hibernating
564,266833,0.0,2011-12-31,0.0,0.0,0.0,hibernating
565,266833,1949.22,2012-01-31,1949.22,0.0,1949.22,reactivation


In [45]:
df_copy = df_copy.sort_values(['customer_id','month_end_date'])
df_copy['assessment_date'] = df_copy['month_end_date'].sort_values().max()
df_copy['duration'] = df_copy.groupby(['customer_id'])['month_end_date'].transform(pd.Series.diff).shift(-1)
df_copy['duration'] = df_copy.apply(lambda r: r['assessment_date'] - r['month_end_date'] if pd.isnull(r['duration']) else r['duration'], axis=1)
df_copy['duration'] = (df_copy['duration']/np.timedelta64(1, 'D'))

In [46]:
df_copy[df_copy['customer_id'] == 266833]

Unnamed: 0,customer_id,trans_amt_month_total,month_end_date,trans_amt_month_total.clean,prev_trans_amt_month_total.clean,month_revenue_change,revenue_type,assessment_date,duration
562,266833,-185.64,2011-10-31,0.0,,,new,2014-12-31,30.0
563,266833,0.0,2011-11-30,0.0,0.0,0.0,hibernating,2014-12-31,31.0
564,266833,0.0,2011-12-31,0.0,0.0,0.0,hibernating,2014-12-31,31.0
565,266833,1949.22,2012-01-31,1949.22,0.0,1949.22,reactivation,2014-12-31,1065.0


In [47]:
df_copy['duration'].describe()

count   98934.00
mean       64.02
std       141.27
min         0.00
25%        30.00
50%        31.00
75%        31.00
max      1430.00
Name: duration, dtype: float64

In [48]:
df_copy['customer_status'] = np.where((df_copy['revenue_type'] == 'new') | (df_copy['revenue_type'] == 'expansion') | (df_copy['revenue_type'] == 'reactivation') | (df_copy['revenue_type'] == 'same'), 'active',
                                np.where((df_copy['revenue_type'] == 'contraction') | (df_copy['revenue_type'] == 'lapsed') | (df_copy['revenue_type'] == 'hibernating'), 'at-risk',
                            'unknown'))

In [49]:
df_copy = df_copy.sort_values(['customer_id', 'month_end_date'])

In [50]:
result = [x.reset_index(drop=True) for _, x in df_copy.groupby(['customer_id'])] 

In [51]:
# NUM_DAYS_SINCE_LAST_PURCHASE = (365 * 2)
df_res =[]
for item in result:
    df = item.copy()

    for i, (index, row) in enumerate(df.iterrows()):
        if (i == len(df) - 1) and ((df.loc[index, 'revenue_type'] == 'hibernating')):
            df['churned_customer'] = 'yes'
        else:
            df['churned_customer'] = 'no'


    df_res.append(df)

df_copy = pd.concat(df_res,ignore_index=True)
df_copy.shape

(98934, 11)

In [52]:
df_copy['churned_customer'].value_counts(dropna=False)

no     90094
yes     8840
Name: churned_customer, dtype: int64

In [53]:
NUM_DAYS_SINCE_LAST_PURCHASE = (365 * 2)
# df_res =[]
# for item in result:
#     df = item.copy()

#     for i, (index, row) in enumerate(df.iterrows()):
#         if (i == len(df) - 1) and ((df.loc[index, 'duration'] > NUM_DAYS_SINCE_LAST_PURCHASE)):
#             df['churned_customer'] = 'yes'
#         else:
#             df['churned_customer'] = 'no'


#     df_res.append(df)

# df_copy = pd.concat(df_res,ignore_index=True)
# df_copy.shape

In [54]:
# df_copy['churned_customer'].value_counts(dropna=False)

In [55]:
df_copy[df_copy['customer_id'] == 266794]

Unnamed: 0,customer_id,trans_amt_month_total,month_end_date,trans_amt_month_total.clean,prev_trans_amt_month_total.clean,month_revenue_change,revenue_type,assessment_date,duration,customer_status,churned_customer
83,266794,2565.81,2011-03-31,2565.81,,,new,2014-12-31,30.0,active,no
84,266794,11125.14,2011-04-30,11125.14,2565.81,8559.33,expansion,2014-12-31,31.0,active,no
85,266794,0.0,2011-05-31,0.0,11125.14,-11125.14,lapsed,2014-12-31,30.0,at-risk,no
86,266794,0.0,2011-06-30,0.0,0.0,0.0,hibernating,2014-12-31,31.0,at-risk,no
87,266794,0.0,2011-07-31,0.0,0.0,0.0,hibernating,2014-12-31,31.0,at-risk,no
88,266794,0.0,2011-08-31,0.0,0.0,0.0,hibernating,2014-12-31,30.0,at-risk,no
89,266794,0.0,2011-09-30,0.0,0.0,0.0,hibernating,2014-12-31,31.0,at-risk,no
90,266794,2948.14,2011-10-31,2948.14,0.0,2948.14,reactivation,2014-12-31,30.0,active,no
91,266794,0.0,2011-11-30,0.0,2948.14,-2948.14,lapsed,2014-12-31,31.0,at-risk,no
92,266794,0.0,2011-12-31,0.0,0.0,0.0,hibernating,2014-12-31,31.0,at-risk,no


In [56]:
# def classify_as_churned(df, num_days_dormant):
#     churned_customers = list(df[df['duration'] > num_days_dormant]['customer_id'].unique())
#     df['churned_customer'] = np.where(df['customer_id'].isin(churned_customers), 'yes', 'no')
#     return df

In [57]:
# df_copy = classify_as_churned(df_copy, 365) #730

In [58]:
df_copy['churned_customer'].value_counts(dropna=False)

no     90094
yes     8840
Name: churned_customer, dtype: int64

In [59]:
df_copy['churned_customer'].value_counts(dropna=False, normalize=True)

no    0.91
yes   0.09
Name: churned_customer, dtype: float64

In [60]:
df_copy[df_copy['churned_customer'] == 'no'].head(2)

Unnamed: 0,customer_id,trans_amt_month_total,month_end_date,trans_amt_month_total.clean,prev_trans_amt_month_total.clean,month_revenue_change,revenue_type,assessment_date,duration,customer_status,churned_customer
0,266783,-5838.82,2011-09-30,0.0,,,new,2014-12-31,31.0,active,no
1,266783,0.0,2011-10-31,0.0,0.0,0.0,hibernating,2014-12-31,30.0,at-risk,no


In [61]:
df_copy[df_copy['customer_id'] == 266783]

Unnamed: 0,customer_id,trans_amt_month_total,month_end_date,trans_amt_month_total.clean,prev_trans_amt_month_total.clean,month_revenue_change,revenue_type,assessment_date,duration,customer_status,churned_customer
0,266783,-5838.82,2011-09-30,0.0,,,new,2014-12-31,31.0,active,no
1,266783,0.0,2011-10-31,0.0,0.0,0.0,hibernating,2014-12-31,30.0,at-risk,no
2,266783,0.0,2011-11-30,0.0,0.0,0.0,hibernating,2014-12-31,31.0,at-risk,no
3,266783,0.0,2011-12-31,0.0,0.0,0.0,hibernating,2014-12-31,31.0,at-risk,no
4,266783,0.0,2012-01-31,0.0,0.0,0.0,hibernating,2014-12-31,28.0,at-risk,no
5,266783,0.0,2012-02-28,0.0,0.0,0.0,hibernating,2014-12-31,32.0,at-risk,no
6,266783,0.0,2012-03-31,0.0,0.0,0.0,hibernating,2014-12-31,30.0,at-risk,no
7,266783,0.0,2012-04-30,0.0,0.0,0.0,hibernating,2014-12-31,31.0,at-risk,no
8,266783,0.0,2012-05-31,0.0,0.0,0.0,hibernating,2014-12-31,30.0,at-risk,no
9,266783,0.0,2012-06-30,0.0,0.0,0.0,hibernating,2014-12-31,31.0,at-risk,no


In [62]:
# df_copy[(df_copy['churned_customer'] == 'yes') & (df_copy['churn'] == 'no')].tail(2)

In [63]:
df_copy[df_copy['customer_id'] == 275233].tail()

Unnamed: 0,customer_id,trans_amt_month_total,month_end_date,trans_amt_month_total.clean,prev_trans_amt_month_total.clean,month_revenue_change,revenue_type,assessment_date,duration,customer_status,churned_customer
98653,275233,216.58,2013-09-30,216.58,0.0,216.58,reactivation,2014-12-31,31.0,active,no
98654,275233,0.0,2013-10-31,0.0,216.58,-216.58,lapsed,2014-12-31,30.0,at-risk,no
98655,275233,0.0,2013-11-30,0.0,0.0,0.0,hibernating,2014-12-31,31.0,at-risk,no
98656,275233,0.0,2013-12-31,0.0,0.0,0.0,hibernating,2014-12-31,31.0,at-risk,no
98657,275233,1564.68,2014-01-31,1564.68,0.0,1564.68,reactivation,2014-12-31,334.0,active,no


In [64]:
df_copy[df_copy['customer_id'] == 266794].head()

Unnamed: 0,customer_id,trans_amt_month_total,month_end_date,trans_amt_month_total.clean,prev_trans_amt_month_total.clean,month_revenue_change,revenue_type,assessment_date,duration,customer_status,churned_customer
83,266794,2565.81,2011-03-31,2565.81,,,new,2014-12-31,30.0,active,no
84,266794,11125.14,2011-04-30,11125.14,2565.81,8559.33,expansion,2014-12-31,31.0,active,no
85,266794,0.0,2011-05-31,0.0,11125.14,-11125.14,lapsed,2014-12-31,30.0,at-risk,no
86,266794,0.0,2011-06-30,0.0,0.0,0.0,hibernating,2014-12-31,31.0,at-risk,no
87,266794,0.0,2011-07-31,0.0,0.0,0.0,hibernating,2014-12-31,31.0,at-risk,no


In [65]:
# df_copy_duration_since_last_purchase = df_copy[['customer_id', 'duration']].groupby(['customer_id'], as_index=False).agg({'duration':'last'})

In [66]:
# df_copy_duration_since_last_purchase.head(2)

In [67]:
df_copy['counter'] = 1

In [68]:
def count_active_status(x):
    #x = value
    if x == 'active':
        return 1
    else:
        return 0
    
def count_at_risk_status(x):
    #x = value
    if x == 'at-risk':
        return 1
    else:
        return 0   
    
def count_new_revenue(x):
    #x = value
    if x == 'new':
        return 1
    else:
        return 0
    
def count_expansion_revenue(x):
    #x = value
    if x == 'expansion':
        return 1
    else:
        return 0  
    

def count_reactivation_revenue(x):
    #x = value
    if x == 'reactivation':
        return 1
    else:
        return 0
    
def count_same_revenue(x):
    #x = value
    if x == 'same':
        return 1
    else:
        return 0
    
def count_lapsed_revenue(x):
    #x = value
    if x == 'lapsed':
        return 1
    else:
        return 0   
    
def count_hibernating_revenue(x):
    #x = value
    if x == 'hibernating':
        return 1
    else:
        return 0
    
def count_contraction_revenue(x):
    #x = value
    if x == 'contraction':
        return 1
    else:
        return 0  

In [69]:
df_copy['customer_status.active'] = df_copy['customer_status'].apply(count_active_status)
df_copy['customer_status.at_risk'] = df_copy['customer_status'].apply(count_at_risk_status)
df_copy['revenue_type.new'] = df_copy['revenue_type'].apply(count_new_revenue)
df_copy['revenue_type.expansion'] = df_copy['revenue_type'].apply(count_expansion_revenue)
df_copy['revenue_type.reactivation'] = df_copy['revenue_type'].apply(count_reactivation_revenue)
df_copy['revenue_type.same'] = df_copy['revenue_type'].apply(count_same_revenue)
df_copy['revenue_type.lapsed'] = df_copy['revenue_type'].apply(count_lapsed_revenue)
df_copy['revenue_type.hibernating'] = df_copy['revenue_type'].apply(count_hibernating_revenue)
df_copy['revenue_type.contraction'] = df_copy['revenue_type'].apply(count_contraction_revenue)

In [70]:
customer_status_data = [] #array to store customer status data

for customerid in df_copy['customer_id'].unique():
    #print(customerid)
    cdf = df_copy.loc[df_copy['customer_id'] == customerid].sort_values(by=['month_end_date'], ascending=False)
    
    #print("date: {}, 28 days earlier: {}".format(cdf.iloc[0].date, cdf.iloc[0].date + pd.DateOffset(days=-28)))
    
    prior_df = cdf.loc[cdf['month_end_date']>cdf.iloc[-1]['month_end_date'] + pd.DateOffset(months=-9)] #only select transactions from 28 days earlier
    
#     #calculate the total deposits for each customer 28 days earlier using prior_df to minimize data amount
#     total_at_risk = df_copy.loc[df_copy['customer_status']=='at-risk'].shape[0]
#     total_at_risk_count = df_copy.loc[df_copy['customer_status']=='at-risk'].agg({'amount': 'sum'}).amount
    
#     print("total acc credits: {}, total_deposited:{}".format(total_deposits , total_amount_deposited))
#     customer_data.append([customerid, total_deposits, total_amount_deposited])
    
    #print(index)

In [71]:
df_copy.iloc[-1]['month_end_date'] + pd.DateOffset(months=-9)

Timestamp('2012-06-30 00:00:00')

In [72]:
df_copy.tail()

Unnamed: 0,customer_id,trans_amt_month_total,month_end_date,trans_amt_month_total.clean,prev_trans_amt_month_total.clean,month_revenue_change,revenue_type,assessment_date,duration,customer_status,churned_customer,counter,customer_status.active,customer_status.at_risk,revenue_type.new,revenue_type.expansion,revenue_type.reactivation,revenue_type.same,revenue_type.lapsed,revenue_type.hibernating,revenue_type.contraction
98929,275264,2594.54,2011-08-31,2594.54,0.0,2594.54,reactivation,2014-12-31,1218.0,active,no,1,1,0,0,0,1,0,0,0,0
98930,275265,868.53,2011-12-31,868.53,,,new,2014-12-31,397.0,active,no,1,1,0,1,0,0,0,0,0,0
98931,275265,0.0,2013-01-31,0.0,868.53,-868.53,lapsed,2014-12-31,28.0,at-risk,no,1,0,1,0,0,0,0,1,0,0
98932,275265,0.0,2013-02-28,0.0,0.0,0.0,hibernating,2014-12-31,31.0,at-risk,no,1,0,1,0,0,0,0,0,1,0
98933,275265,2383.49,2013-03-31,2383.49,0.0,2383.49,reactivation,2014-12-31,640.0,active,no,1,1,0,0,0,1,0,0,0,0


In [73]:
df_copy[df_copy['customer_id']==266783].shape

(25, 21)

In [74]:
df_copy[df_copy['customer_id']==266783]

Unnamed: 0,customer_id,trans_amt_month_total,month_end_date,trans_amt_month_total.clean,prev_trans_amt_month_total.clean,month_revenue_change,revenue_type,assessment_date,duration,customer_status,churned_customer,counter,customer_status.active,customer_status.at_risk,revenue_type.new,revenue_type.expansion,revenue_type.reactivation,revenue_type.same,revenue_type.lapsed,revenue_type.hibernating,revenue_type.contraction
0,266783,-5838.82,2011-09-30,0.0,,,new,2014-12-31,31.0,active,no,1,1,0,1,0,0,0,0,0,0
1,266783,0.0,2011-10-31,0.0,0.0,0.0,hibernating,2014-12-31,30.0,at-risk,no,1,0,1,0,0,0,0,0,1,0
2,266783,0.0,2011-11-30,0.0,0.0,0.0,hibernating,2014-12-31,31.0,at-risk,no,1,0,1,0,0,0,0,0,1,0
3,266783,0.0,2011-12-31,0.0,0.0,0.0,hibernating,2014-12-31,31.0,at-risk,no,1,0,1,0,0,0,0,0,1,0
4,266783,0.0,2012-01-31,0.0,0.0,0.0,hibernating,2014-12-31,28.0,at-risk,no,1,0,1,0,0,0,0,0,1,0
5,266783,0.0,2012-02-28,0.0,0.0,0.0,hibernating,2014-12-31,32.0,at-risk,no,1,0,1,0,0,0,0,0,1,0
6,266783,0.0,2012-03-31,0.0,0.0,0.0,hibernating,2014-12-31,30.0,at-risk,no,1,0,1,0,0,0,0,0,1,0
7,266783,0.0,2012-04-30,0.0,0.0,0.0,hibernating,2014-12-31,31.0,at-risk,no,1,0,1,0,0,0,0,0,1,0
8,266783,0.0,2012-05-31,0.0,0.0,0.0,hibernating,2014-12-31,30.0,at-risk,no,1,0,1,0,0,0,0,0,1,0
9,266783,0.0,2012-06-30,0.0,0.0,0.0,hibernating,2014-12-31,31.0,at-risk,no,1,0,1,0,0,0,0,0,1,0


In [75]:
prior_df[prior_df['customer_id']==266783].shape

(0, 21)

In [76]:
prior_df[prior_df['customer_id']==266783]

Unnamed: 0,customer_id,trans_amt_month_total,month_end_date,trans_amt_month_total.clean,prev_trans_amt_month_total.clean,month_revenue_change,revenue_type,assessment_date,duration,customer_status,churned_customer,counter,customer_status.active,customer_status.at_risk,revenue_type.new,revenue_type.expansion,revenue_type.reactivation,revenue_type.same,revenue_type.lapsed,revenue_type.hibernating,revenue_type.contraction


In [77]:
prior_df.head()

Unnamed: 0,customer_id,trans_amt_month_total,month_end_date,trans_amt_month_total.clean,prev_trans_amt_month_total.clean,month_revenue_change,revenue_type,assessment_date,duration,customer_status,churned_customer,counter,customer_status.active,customer_status.at_risk,revenue_type.new,revenue_type.expansion,revenue_type.reactivation,revenue_type.same,revenue_type.lapsed,revenue_type.hibernating,revenue_type.contraction
98933,275265,2383.49,2013-03-31,2383.49,0.0,2383.49,reactivation,2014-12-31,640.0,active,no,1,1,0,0,0,1,0,0,0,0
98932,275265,0.0,2013-02-28,0.0,0.0,0.0,hibernating,2014-12-31,31.0,at-risk,no,1,0,1,0,0,0,0,0,1,0
98931,275265,0.0,2013-01-31,0.0,868.53,-868.53,lapsed,2014-12-31,28.0,at-risk,no,1,0,1,0,0,0,0,1,0,0
98930,275265,868.53,2011-12-31,868.53,,,new,2014-12-31,397.0,active,no,1,1,0,1,0,0,0,0,0,0


### Aggregate Data

In [78]:
df_copy_agg = df_copy.groupby(['customer_id'], as_index=False).agg({'trans_amt_month_total.clean':'sum',
                                                                    'counter':'sum',
                                                     'duration':['min', 'max', 'mean', 'last'],
                                                     'customer_status.active':'sum', 
                                                     'customer_status.at_risk':'sum',
                                                     'revenue_type.new':'sum',
                                                     'revenue_type.expansion':'sum',
                                                     'revenue_type.reactivation':'sum',
                                                     'revenue_type.same':'sum',
                                                     'revenue_type.lapsed':'sum',
                                                     'revenue_type.hibernating':'sum',
                                                     'revenue_type.contraction':'sum',
                                                                   'churned_customer':'last'})

In [79]:
df_copy_agg.columns = [".".join(x).strip('.') for x in df_copy_agg.columns.ravel()]

In [80]:
df_copy_agg.head()

Unnamed: 0,customer_id,trans_amt_month_total.clean.sum,counter.sum,duration.min,duration.max,duration.mean,duration.last,customer_status.active.sum,customer_status.at_risk.sum,revenue_type.new.sum,revenue_type.expansion.sum,revenue_type.reactivation.sum,revenue_type.same.sum,revenue_type.lapsed.sum,revenue_type.hibernating.sum,revenue_type.contraction.sum,churned_customer.last
0,266783,3113.89,25,28.0,457.0,47.52,457.0,4,21,1,0,3,0,2,19,0,no
1,266784,5694.06,6,30.0,822.0,162.5,822.0,2,4,1,0,1,0,1,2,1,no
2,266785,21613.8,24,28.0,671.0,57.12,671.0,5,19,1,1,3,0,3,16,0,no
3,266788,6092.97,28,28.0,365.0,42.43,365.0,4,24,1,0,3,0,3,21,0,no
4,266794,27981.92,46,0.0,32.0,29.8,0.0,7,39,1,1,5,0,5,34,0,no


In [81]:
df_copy[df_copy['customer_id'] == 266785]

Unnamed: 0,customer_id,trans_amt_month_total,month_end_date,trans_amt_month_total.clean,prev_trans_amt_month_total.clean,month_revenue_change,revenue_type,assessment_date,duration,customer_status,churned_customer,counter,customer_status.active,customer_status.at_risk,revenue_type.new,revenue_type.expansion,revenue_type.reactivation,revenue_type.same,revenue_type.lapsed,revenue_type.hibernating,revenue_type.contraction
31,266785,6911.77,2011-03-31,6911.77,,,new,2014-12-31,30.0,active,no,1,1,0,1,0,0,0,0,0,0
32,266785,0.0,2011-04-30,0.0,6911.77,-6911.77,lapsed,2014-12-31,31.0,at-risk,no,1,0,1,0,0,0,0,1,0,0
33,266785,0.0,2011-05-31,0.0,0.0,0.0,hibernating,2014-12-31,30.0,at-risk,no,1,0,1,0,0,0,0,0,1,0
34,266785,0.0,2011-06-30,0.0,0.0,0.0,hibernating,2014-12-31,31.0,at-risk,no,1,0,1,0,0,0,0,0,1,0
35,266785,0.0,2011-07-31,0.0,0.0,0.0,hibernating,2014-12-31,31.0,at-risk,no,1,0,1,0,0,0,0,0,1,0
36,266785,0.0,2011-08-31,0.0,0.0,0.0,hibernating,2014-12-31,30.0,at-risk,no,1,0,1,0,0,0,0,0,1,0
37,266785,0.0,2011-09-30,0.0,0.0,0.0,hibernating,2014-12-31,31.0,at-risk,no,1,0,1,0,0,0,0,0,1,0
38,266785,3135.99,2011-10-31,3135.99,0.0,3135.99,reactivation,2014-12-31,30.0,active,no,1,1,0,0,0,1,0,0,0,0
39,266785,0.0,2011-11-30,0.0,3135.99,-3135.99,lapsed,2014-12-31,31.0,at-risk,no,1,0,1,0,0,0,0,1,0,0
40,266785,0.0,2011-12-31,0.0,0.0,0.0,hibernating,2014-12-31,31.0,at-risk,no,1,0,1,0,0,0,0,0,1,0


In [82]:
df_master.head(2)

Unnamed: 0,customer_id,dob,gender,customer_status,city,state_code,state,region,division,conversion_date,last_purchase_date,number_of_unique_purchase_days,duration.min,duration.max,duration.mean,duration.last,trans_id.count,qty.min,qty.max,qty.sum,cogs.sum,total_amt.sum,assessment_date,account_age.days,account_age.years,account_age.years.group,count.Flagship store,count.MBR,count.TeleShop,count.e-Shop,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,mean.Flagship store,mean.MBR,mean.TeleShop,mean.e-Shop,Flagship_store_spend.prop,MBR_spend.prop,TeleShop_spend.prop,e-Shop.prop,count.Bags,count.Books,count.Clothing,count.Electronics,count.Footwear,count.Home and kitchen,sum.Bags,sum.Books,sum.Clothing,sum.Electronics,sum.Footwear,sum.Home and kitchen,mean.Bags,mean.Books,mean.Clothing,mean.Electronics,mean.Footwear,mean.Home and kitchen,Bags.prop,Books.prop,Clothing.prop,Electronics.prop,Footwear.prop,Home and kitchen.prop,count.Academic,count.Audio and video,count.Bath,count.Cameras,count.Children,count.Comics,count.Computers,count.DIY,count.Fiction,...,count.Mens,count.Mobiles,count.Non-Fiction,count.Personal Appliances,count.Tools,count.Women,sum.Academic,sum.Audio and video,sum.Bath,sum.Cameras,sum.Children,sum.Comics,sum.Computers,sum.DIY,sum.Fiction,sum.Furnishing,sum.Kids,sum.Kitchen,sum.Mens,sum.Mobiles,sum.Non-Fiction,sum.Personal Appliances,sum.Tools,sum.Women,mean.Academic,mean.Audio and video,mean.Bath,mean.Cameras,mean.Children,mean.Comics,mean.Computers,mean.DIY,mean.Fiction,mean.Furnishing,mean.Kids,mean.Kitchen,mean.Mens,mean.Mobiles,mean.Non-Fiction,mean.Personal Appliances,mean.Tools,mean.Women,Academic.prop,Audio and video.prop,Bath.prop,Cameras.prop,Children.prop,Comics.prop,Computers.prop,DIY.prop,Fiction.prop,Furnishing.prop,Kids.prop,Kitchen.prop,Mens.prop,Mobiles.prop,Non-Fiction.prop,Personal Appliances.prop,Tools.prop,Women.prop,conversion_date.year,conversion_date.month_num,conversion_date.month,conversion_date.year_month,biological_age.actual,biological_age,biological_age.group,repeat_purchaser,returned_item_before,1_day_amt.avg,7_day_amt.avg,30_day_amt.avg,1_day_num.avg,7_day_num.avg,30_day_num.avg
0,268408,1970-02-01,M,converted,Pittsburgh,PA,Pennsylvania,Northeast,Middle Atlantic,2011-12-07,2014-01-13,11.0,4.0,323.0,99.18,323.0,11.0,-5.0,5.0,33.0,2526.93,24272.43,2014-12-02,1091.0,2.99,02_03,5.0,1.0,3.0,2.0,6217.84,6491.88,3894.02,7668.7,1243.57,6491.88,1298.01,3834.35,0.26,0.27,0.16,0.32,3.0,1.0,1.0,2.0,2.0,2.0,1064.11,1033.17,890.63,7668.7,7526.15,6089.66,354.7,1033.17,890.63,3834.35,3763.08,3044.83,0.04,0.04,0.04,0.32,0.31,0.25,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,3.0,0.0,0.0,0.0,0.0,2.0,0.0,2873.0,0.0,4795.7,0.0,0.0,0.0,0.0,1033.17,0.0,1034.28,6089.66,6491.88,0.0,0.0,0.0,0.0,1954.74,0.0,2873.0,0.0,4795.7,0.0,0.0,0.0,0.0,1033.17,0.0,1034.28,3044.83,2163.96,0.0,0.0,0.0,0.0,977.37,0.0,0.12,0.0,0.2,0.0,0.0,0.0,0.0,0.04,0.0,0.04,0.25,0.27,0.0,0.0,0.0,0.0,0.08,2011.0,12.0,12.0_Dec,2011.0_12.0,44.83,45.0,40_45,yes,yes,22.25,155.74,667.44,0.03,0.21,0.91
1,269696,1970-07-01,F,converted,Dallas,TX,Texas,South,West South Central,2011-09-18,2012-08-04,3.0,111.0,850.0,390.33,850.0,3.0,-4.0,4.0,3.0,1043.91,4488.51,2014-12-02,1171.0,3.21,03_04,0.0,3.0,0.0,0.0,0.0,4488.51,0.0,0.0,0.0,1496.17,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,4488.51,0.0,0.0,0.0,0.0,0.0,1496.17,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4488.51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4488.51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2011.0,9.0,9.0_Sep,2011.0_9.0,44.42,44.0,40_45,yes,yes,3.83,26.83,114.99,0.0,0.02,0.08


In [83]:
print(list(df_master))

['customer_id', 'dob', 'gender', 'customer_status', 'city', 'state_code', 'state', 'region', 'division', 'conversion_date', 'last_purchase_date', 'number_of_unique_purchase_days', 'duration.min', 'duration.max', 'duration.mean', 'duration.last', 'trans_id.count', 'qty.min', 'qty.max', 'qty.sum', 'cogs.sum', 'total_amt.sum', 'assessment_date', 'account_age.days', 'account_age.years', 'account_age.years.group', 'count.Flagship store', 'count.MBR', 'count.TeleShop', 'count.e-Shop', 'sum.Flagship store', 'sum.MBR', 'sum.TeleShop', 'sum.e-Shop', 'mean.Flagship store', 'mean.MBR', 'mean.TeleShop', 'mean.e-Shop', 'Flagship_store_spend.prop', 'MBR_spend.prop', 'TeleShop_spend.prop', 'e-Shop.prop', 'count.Bags', 'count.Books', 'count.Clothing', 'count.Electronics', 'count.Footwear', 'count.Home and kitchen', 'sum.Bags', 'sum.Books', 'sum.Clothing', 'sum.Electronics', 'sum.Footwear', 'sum.Home and kitchen', 'mean.Bags', 'mean.Books', 'mean.Clothing', 'mean.Electronics', 'mean.Footwear', 'mean.Ho

In [84]:
# count_cols = list(df_master.columns[df_master.columns.str.startswith('count')])
# other_cols = ['customer_id', 'gender', 'city_code', 'customer_status', 'number_of_unique_purchase_days', 'tax.sum',  'account_age.years.group', 'biological_age.group', 'repeat_purchaser', 'returned_item_before']

sum_cols = list(df_master.columns[df_master.columns.str.startswith('sum')])
avg_spend_days = ['1_day_amt.avg', '7_day_amt.avg', '30_day_amt.avg']
day_cols = ['number_of_unique_purchase_days']
identifier = ['customer_id']
account_cols = ['customer_status', 'account_age.years.group', 'repeat_purchaser', 'returned_item_before', 'conversion_date.year_month']
demographic_cols = ['gender', 'biological_age.group', 'state', 'region']

In [85]:
select_cols =  identifier + demographic_cols + account_cols + day_cols + sum_cols + avg_spend_days

In [86]:
df_master_sub = df_master[select_cols]
mt.check_unique_no(df_master_sub, ['customer_id'])
df_master_sub.shape

Data has 5647 unique customer_id


(5647, 42)

In [87]:
mt.check_unique_no(df_copy_agg, ['customer_id'])
df_copy_agg.shape

Data has 5506 unique customer_id


(5506, 17)

In [88]:
df_master_sub = pd.merge(df_master_sub, df_copy_agg[['customer_id', 'churned_customer.last']], on='customer_id', how='left', suffixes=['.master', '.agg'])
mt.check_unique_no(df_master_sub, ['customer_id'])
df_master_sub.shape

Data has 5647 unique customer_id


(5647, 43)

In [89]:
df_master_sub.head(2)

Unnamed: 0,customer_id,gender,biological_age.group,state,region,customer_status,account_age.years.group,repeat_purchaser,returned_item_before,conversion_date.year_month,number_of_unique_purchase_days,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,sum.Bags,sum.Books,sum.Clothing,sum.Electronics,sum.Footwear,sum.Home and kitchen,sum.Academic,sum.Audio and video,sum.Bath,sum.Cameras,sum.Children,sum.Comics,sum.Computers,sum.DIY,sum.Fiction,sum.Furnishing,sum.Kids,sum.Kitchen,sum.Mens,sum.Mobiles,sum.Non-Fiction,sum.Personal Appliances,sum.Tools,sum.Women,1_day_amt.avg,7_day_amt.avg,30_day_amt.avg,churned_customer.last
0,268408,M,40_45,Pennsylvania,Northeast,converted,02_03,yes,yes,2011.0_12.0,11.0,6217.84,6491.88,3894.02,7668.7,1064.11,1033.17,890.63,7668.7,7526.15,6089.66,0.0,2873.0,0.0,4795.7,0.0,0.0,0.0,0.0,1033.17,0.0,1034.28,6089.66,6491.88,0.0,0.0,0.0,0.0,1954.74,22.25,155.74,667.44,no
1,269696,F,40_45,Texas,South,converted,03_04,yes,yes,2011.0_9.0,3.0,0.0,4488.51,0.0,0.0,0.0,0.0,0.0,4488.51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4488.51,0.0,0.0,0.0,0.0,3.83,26.83,114.99,yes


In [90]:
df_master_sub.rename(columns={'churned_customer.last':'churned'}, inplace=True)

In [91]:
df_master_sub = df_master_sub.sort_values('30_day_amt.avg',ascending=False)

df_master_sub['30_day_amt.avg.cumsum'] = df_master_sub['30_day_amt.avg'].cumsum()

df_master_sub = df_master_sub.sort_values('30_day_amt.avg.cumsum',ascending=True)

df_master_sub['cum_count'] = df_master_sub.reset_index().index+1

df_master_sub['30_day_amt.avg.cumsum_pct'] = (df_master_sub['30_day_amt.avg.cumsum']/df_master_sub['30_day_amt.avg.cumsum'].max())#*100

df_master_sub['cum_count_pct'] = (df_master_sub['cum_count']/df_master_sub['cum_count'].max())#*100

df_master_sub.head()

df_master_sub.tail()

Unnamed: 0,customer_id,gender,biological_age.group,state,region,customer_status,account_age.years.group,repeat_purchaser,returned_item_before,conversion_date.year_month,number_of_unique_purchase_days,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,sum.Bags,sum.Books,sum.Clothing,sum.Electronics,sum.Footwear,sum.Home and kitchen,sum.Academic,sum.Audio and video,sum.Bath,sum.Cameras,sum.Children,sum.Comics,sum.Computers,sum.DIY,sum.Fiction,sum.Furnishing,sum.Kids,sum.Kitchen,sum.Mens,sum.Mobiles,sum.Non-Fiction,sum.Personal Appliances,sum.Tools,sum.Women,1_day_amt.avg,7_day_amt.avg,30_day_amt.avg,churned,30_day_amt.avg.cumsum,cum_count,30_day_amt.avg.cumsum_pct,cum_count_pct
3951,274213,M,25_30,Texas,South,converted,00_01,no,no,2014.0_12.0,1.0,0.0,2943.72,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2943.72,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2943.72,0.0,0.0,0.0,0.0,0.0,0.0,2943.72,20606.04,88311.6,no,88311.6,1,0.06,0.0
1700,268801,M,35_40,Pennsylvania,Northeast,converted,00_01,no,no,2014.0_10.0,1.0,0.0,0.0,0.0,3787.94,0.0,0.0,0.0,0.0,0.0,3787.94,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3787.94,0.0,0.0,0.0,0.0,0.0,0.0,62.1,434.68,1862.92,no,90174.52,2,0.07,0.0
371,267634,M,40_45,Ohio,Midwest,converted,01_02,yes,no,2013.0_4.0,6.0,740.35,0.0,7641.07,21699.99,0.0,6139.38,740.35,6099.6,1405.56,15696.52,0.0,0.0,8055.45,0.0,0.0,6139.38,6099.6,0.0,0.0,0.0,740.35,7641.07,0.0,0.0,0.0,0.0,0.0,1405.56,49.97,349.78,1499.07,no,91673.6,3,0.07,0.0
169,273398,F,40_45,Texas,South,converted,02_03,yes,no,2012.0_9.0,4.0,0.0,15293.2,6723.93,5657.6,0.0,0.0,15293.2,5657.6,6723.93,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5657.6,0.0,0.0,0.0,14011.4,0.0,0.0,0.0,0.0,0.0,0.0,8005.73,34.51,241.55,1035.21,no,92708.81,4,0.07,0.0
5029,267592,F,22_25,Ohio,Midwest,converted,01_02,yes,yes,2013.0_5.0,6.0,0.0,3411.14,0.0,16320.85,0.0,0.0,0.0,0.0,12289.81,7442.18,0.0,0.0,7442.18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11483.16,0.0,0.0,0.0,0.0,0.0,0.0,806.65,34.08,238.56,1022.38,no,93731.19,5,0.07,0.0


Unnamed: 0,customer_id,gender,biological_age.group,state,region,customer_status,account_age.years.group,repeat_purchaser,returned_item_before,conversion_date.year_month,number_of_unique_purchase_days,sum.Flagship store,sum.MBR,sum.TeleShop,sum.e-Shop,sum.Bags,sum.Books,sum.Clothing,sum.Electronics,sum.Footwear,sum.Home and kitchen,sum.Academic,sum.Audio and video,sum.Bath,sum.Cameras,sum.Children,sum.Comics,sum.Computers,sum.DIY,sum.Fiction,sum.Furnishing,sum.Kids,sum.Kitchen,sum.Mens,sum.Mobiles,sum.Non-Fiction,sum.Personal Appliances,sum.Tools,sum.Women,1_day_amt.avg,7_day_amt.avg,30_day_amt.avg,churned,30_day_amt.avg.cumsum,cum_count,30_day_amt.avg.cumsum_pct,cum_count_pct
5476,270232,M,,California,West,voluntarily churned - not converted,,never purchased,no,nan_nan,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5643,,1.0
5493,268693,M,,Ohio,Midwest,voluntarily churned - not converted,,never purchased,no,nan_nan,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5644,,1.0
5533,266947,M,,Ohio,Midwest,voluntarily churned - not converted,,never purchased,no,nan_nan,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5645,,1.0
5540,270973,F,,Ohio,Midwest,voluntarily churned - not converted,,never purchased,no,nan_nan,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5646,,1.0
5641,272125,F,,Pennsylvania,Northeast,voluntarily churned - not converted,,never purchased,no,nan_nan,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5647,,1.0


In [92]:
df_master_sub['churned'].value_counts(dropna=False)

no     5052
yes     454
NaN     141
Name: churned, dtype: int64

## Export Data

In [93]:
df_master_sub.to_csv(filepaths.processed_churn_data, index=False)