## Import Modules

In [1]:
# Standard library. `imp` was removed in Python 3.12; importlib provides reload().
import os
from importlib import reload

# Data manipulation
import pandas as pd
import numpy as np
from scipy import stats

# Date manipulation
import datetime as dt
import calendar
calendar.setfirstweekday(calendar.SUNDAY) 

# Custom package for data preprocessing
import mytools as mt 

# Set notebook options
pd.set_option("display.max_columns", 150)
pd.set_option("display.max_rows", 10000)
pd.set_option("display.float_format", lambda x: '%.2f' % x)

# Display the result of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### File Location

In [2]:
# Data paths
import filepaths

## Load Data

In [3]:
# Read the master file and the interim transactions file.
# Both are comma-separated, which is read_csv's default delimiter.
df_master = pd.read_csv(filepaths.master_file_data)

df_transactions = pd.read_csv(filepaths.interim_transactions_data)

# df_transactions_aggregated = pd.read_csv(filepaths.interim_transactions_data_aggregated, sep=',')

In [4]:
df_transactions.head(2)

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,cogs,total_amt,store_type,returned_item,drop_record,prod_cat,prod_subcat,counter,assessment_date,duration,trans_date.day,trans_date.month_num,trans_date.year,trans_date.year_month,trans_date.hour,trans_date.weekday,trans_date.week_of_year,trans_date.month,trans_date.weekday_num,trans_date.week_of_month,first_purchase_date,first_purchase.cohort,purchase_date.cohort,customer_type.period,returned_item_before
0,25890929042,266783,2011-09-23,1,2,4,1321,554.82,5838.82,e-Shop,yes,yes,Footwear,Mens,1,2014-12-02,1.0,23,9,2011,2011_09,0,06_Friday,38,09_Sep,6,month.week4,2011-09-23,2011_09,2011_09,new,yes
1,25890929042,266783,2011-09-24,1,2,-4,-1321,554.82,-5838.82,e-Shop,yes,no,Footwear,Mens,1,2014-12-02,393.0,24,9,2011,2011_09,0,07_Saturday,38,09_Sep,7,month.week4,2011-09-23,2011_09,2011_09,existing,yes


In [5]:
# Remove rows that are flagged both as a returned item and as a record to drop;
# every other row is kept.
# .copy() makes the result an independent frame, so later column assignments
# (e.g. the trans_date conversion) do not raise SettingWithCopyWarning.
is_returned = df_transactions['returned_item'] == 'yes'
is_flagged_for_drop = df_transactions['drop_record'] == 'yes'
df_transactions = df_transactions[~(is_returned & is_flagged_for_drop)].copy()

mt.check_unique_no(df_transactions, ['customer_id'])
df_transactions.shape
df_transactions.head(2)

Data has 5506 unique customer_id


(20996, 32)

Unnamed: 0,trans_id,customer_id,trans_date,prod_subcat_code,prod_cat_code,qty,rate,cogs,total_amt,store_type,returned_item,drop_record,prod_cat,prod_subcat,counter,assessment_date,duration,trans_date.day,trans_date.month_num,trans_date.year,trans_date.year_month,trans_date.hour,trans_date.weekday,trans_date.week_of_year,trans_date.month,trans_date.weekday_num,trans_date.week_of_month,first_purchase_date,first_purchase.cohort,purchase_date.cohort,customer_type.period,returned_item_before
1,25890929042,266783,2011-09-24,1,2,-4,-1321,554.82,-5838.82,e-Shop,yes,no,Footwear,Mens,1,2014-12-02,393.0,24,9,2011,2011_09,0,07_Saturday,38,09_Sep,7,month.week4,2011-09-23,2011_09,2011_09,existing,yes
2,98477711300,266783,2012-10-21,4,1,3,93,29.3,308.3,TeleShop,no,yes,Clothing,Mens,1,2014-12-02,122.0,21,10,2012,2012_10,0,01_Sunday,42,10_Oct,1,month.week3,2011-09-23,2011_09,2012_10,existing,no


In [6]:
# Parse the transaction dates, then display the earliest and the latest one.
parsed_dates = pd.to_datetime(df_transactions['trans_date'])
df_transactions['trans_date'] = parsed_dates

df_transactions['trans_date'].min()
df_transactions['trans_date'].max()

Timestamp('2011-01-02 00:00:00')

Timestamp('2014-12-02 00:00:00')

In [14]:
# Reference date that recency is measured against in the RFM aggregation.
# Matches the latest trans_date displayed above (2014-12-02).
EXTRACTION_DATE = dt.datetime(2014,12,2)

In [13]:
def categorize_customers(rfm_score):
    """Translate an RFM score string into a named customer segment.

    Only four exact scores carry a dedicated label. Any other value,
    including non-string input such as NaN, is labelled 'other'.
    """
    named_segments = (
        ('111', 'best customers'),
        ('311', 'almost lost'),
        ('411', 'lost customers'),
        ('444', 'lost cheap customers'),
    )
    for score, segment in named_segments:
        if rfm_score == score:
            return segment
    return 'other'

In [15]:
# One row per customer:
#   trans_date -> days between EXTRACTION_DATE and the customer's latest transaction
#   trans_id   -> number of transaction rows
#   total_amt  -> summed transaction amount
rfm_aggregations = {
    'trans_date': lambda dates: (EXTRACTION_DATE - dates.max()).days,
    'trans_id': len,
    'total_amt': 'sum',
}
df_rfm = df_transactions.groupby('customer_id', as_index=False).agg(rfm_aggregations)

In [16]:
# Give the aggregated columns their RFM names.
df_rfm = df_rfm.rename(
    columns={'trans_date': 'recency', 'trans_id': 'frequency', 'total_amt': 'monetary'}
)

In [17]:
# Score each RFM dimension into quartiles.
# The original cell read from `rfm`, a name no visible cell defines; the
# aggregated frame is `df_rfm`.
# Label '1' is the best quartile: lowest recency, but highest frequency and
# monetary value, hence the reversed label order for the last two.
df_rfm['r_quartile'] = pd.qcut(df_rfm['recency'], 4, labels=['1', '2', '3', '4'])
df_rfm['f_quartile'] = pd.qcut(df_rfm['frequency'], 4, labels=['4', '3', '2', '1'])
df_rfm['m_quartile'] = pd.qcut(df_rfm['monetary'], 4, labels=['4', '3', '2', '1'])

In [18]:
# Concatenate the three quartile labels (R, then F, then M) into one score string, e.g. '111'.
r_label, f_label, m_label = (
    df_rfm[col].astype(str) for col in ('r_quartile', 'f_quartile', 'm_quartile')
)
df_rfm['RFM_Score'] = r_label + f_label + m_label

In [19]:
# Attach the RFM columns to the master table; the left join keeps every master row.
#
# df_master is overwritten with its own merge result, so re-running the original
# cell merged an already-merged frame and stacked `_x`/`_y` copies of every RFM
# column. Dropping RFM columns left over from a previous run makes the cell
# idempotent.
# validate= asserts that df_rfm has at most one row per customer_id, so a
# duplicated key raises MergeError instead of silently multiplying rows.
# NOTE(review): the saved run of this cell ended in MemoryError. The cause is not
# visible here; check df_master's size and whether customer_id repeats in it.
rfm_columns = [col for col in df_rfm.columns if col != 'customer_id']
df_master = pd.merge(
    df_master.drop(columns=rfm_columns, errors='ignore'),
    df_rfm,
    on='customer_id',
    how='left',
    validate='many_to_one',
)

MemoryError: 

In [None]:





# print(rfm)
# df_customer_produce = df_produce[['CustomerID', 'CustomerCityName', 'CustomerState', 
#                                   'CustomerRegion', 'CustomerDivision']].drop_duplicates(subset=['CustomerID']).reset_index(drop=True)

# df_master = pd.merge(df_master, df_transactions, on='customer_id', how='left')


# Named segment for the four exact RFM scores; everything else starts as 'other'.
df_master['rfm_customer_segment'] = df_master['RFM_Score'].apply(categorize_customers)

# Refine only the 'other' rows. The top monetary quartile takes precedence over
# the top frequency quartile.
is_other = df_master['rfm_customer_segment'] == 'other'
is_top_monetary = df_master['m_quartile'] == '1'
is_top_frequency = df_master['f_quartile'] == '1'
df_master['rfm_customer_segment'] = np.where(
    is_other & is_top_monetary, 'big spender',
    np.where(is_other & is_top_frequency, 'loyal customers',
             df_master['rfm_customer_segment']))

# Preview the column created above. The original asked for 'customer_segment',
# which no visible cell creates.
df_master[['customer_id', 'rfm_customer_segment']].head()

In [None]:
# NOTE(review): df_transactions_agg is not defined in any visible cell. The only
# candidate load (df_transactions_aggregated) is commented out in the Load Data
# cell and uses a different name, so this and the following cells fail on a
# fresh kernel until the frame is loaded or derived explicitly.
df_transactions_agg['purchase_date.cohort'].dtype

In [None]:
# Swap the underscore in the cohort label for a dash (cohort labels in the
# transactions preview look like '2011_09').
cohort_label = df_transactions_agg['purchase_date.cohort']
df_transactions_agg['purchase_date.month_start'] = cohort_label.str.replace('_', '-')
df_transactions_agg.head(2)

In [None]:
# Append the day component so the string names the first day of the month.
month_label = df_transactions_agg['purchase_date.month_start']
df_transactions_agg['purchase_date.month_start'] = month_label + '-01'
df_transactions_agg.head(2)

In [None]:
# The cohort label has been converted into purchase_date.month_start; drop the original.
df_transactions_agg = df_transactions_agg.drop(columns=['purchase_date.cohort'])

In [None]:
# Rename to the generic transaction column names and parse the month-start string as a date.
df_transactions_agg = df_transactions_agg.rename(
    columns={'purchase_date.month_start': 'trans_date', 'total_amt': 'trans_amt'}
)
df_transactions_agg['trans_date'] = pd.to_datetime(df_transactions_agg['trans_date'])

# Split the date into month number and year.
trans_date_parts = df_transactions_agg['trans_date'].dt
df_transactions_agg['trans_date_month_num'] = trans_date_parts.month
df_transactions_agg['trans_date_year'] = trans_date_parts.year
df_transactions_agg.head(2)

In [None]:
# Order rows chronologically within each customer.
df_transactions_agg = df_transactions_agg.sort_values(
    by=['customer_id', 'trans_date'], ascending=True
)
df_transactions_agg.head(2)

In [None]:
# Work on a copy. `df` is the notebook-level frame that _add_rows rebinds via
# `global df` when it appends filler rows.
df = df_transactions_agg.copy()

In [None]:
# Distinct customer ids, iterated by the gap-filling loop below.
unique_customer_no = df['customer_id'].unique()

# Month number (1-12) -> three-letter English abbreviation.
MONTH_NUMS = list(range(1, 13))
MONTHS = 'Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec'.split()
MONTH_NAMES = dict(zip(MONTH_NUMS, MONTHS))


def _add_rows(copy, months, year):
    """Append one zero-revenue filler row per month in ``months`` to the global ``df``.

    Every filler row starts as a copy of ``copy`` with the year and month
    fields overwritten and ``trans_amt_month_total`` set to 0.

    Parameters
    ----------
    copy : mapping or pandas.Series
        Row whose fields are carried over into each filler row (callers pass
        the customer's previous observed row).
    months : iterable of int
        Month numbers to create rows for; each must be a key of MONTH_NAMES.
    year : int
        Value written to ``trans_date_year`` of every filler row.

    Side effects
    ------------
    Rebinds the notebook-level ``df`` to a new frame with a fresh 0..n-1 index.
    """
    global df
    filler_rows = [
        {
            **dict(copy),
            'trans_date_year': year,
            'trans_date_month_num': month,
            'trans_date_month': MONTH_NAMES[month],
            'trans_amt_month_total': 0,
        }
        for month in months
    ]

    if not filler_rows:
        # Nothing to add; skip concatenating an empty frame.
        return

    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
    df = pd.concat([df, pd.DataFrame(filler_rows)], ignore_index=True)


In [None]:
%%time
for customer_no in unique_customer_no:
    _df = df[df['customer_id'] == customer_no]

    if _df.shape[0] < 2:
        continue

    _df = _df.sort_values(by=['trans_date_year', 'trans_date_month_num'], ascending=True)

    _prev = None
    for _, row in _df.iterrows():
        if _prev is None:
            _prev = row
            continue

        ys, ms = _prev['trans_date_year'], _prev['trans_date_month_num']
        ye, me = row['trans_date_year'], row['trans_date_month_num']

        if ys == ye:
            if ms < me - 1:
                # Fill missing months same year
                _add_rows(_prev, range(ms + 1, me), ys)
        else:
            if ms < 12:
                # Fill in months up to dec
                _add_rows(_prev, range(ms + 1, 13), ys)

            if me > 1:
                # Fill in months starting from jan
                _add_rows(_prev, range(1, me), ye)

        _prev = row

In [None]:
# Observed rows have no trans_amt_month_total yet (only filler rows were given 0);
# take their value from trans_amt, then drop the columns that are no longer needed.
df['trans_amt_month_total'] = df['trans_amt_month_total'].fillna(df['trans_amt'])
df = df.drop(columns=['trans_date', 'trans_amt', 'trans_date_month'])

In [None]:
# Render the month number as a two-character, zero-padded string ('01'..'12').
df['trans_date_month_num'] = df['trans_date_month_num'].map(
    lambda month_num: format(month_num, '02')
)

In [None]:
# Number of days in each row's calendar month, derived from the row's own year
# and month so that February is 29 in leap years (2012 lies inside the
# 2011-2014 range of the transaction dates). The original lookup table always
# used 28 for February and matched months by substring.
# trans_date_month_num is already a zero-padded string at this point.
month_start = pd.to_datetime(
    df['trans_date_year'].map(str) + '-' + df['trans_date_month_num'] + '-01',
    format='%Y-%m-%d',
)
df['trans_date_month_len'] = month_start.dt.days_in_month

In [None]:
# Store the month length as a 64-bit integer so it renders without a decimal part.
month_len = df['trans_date_month_len']
df['trans_date_month_len'] = month_len.astype('int64')

In [None]:
# Assemble 'year-month-day' text where day is the month's length, i.e. its last day.
year_text = df['trans_date_year'].map(str)
day_text = df['trans_date_month_len'].map(str)
df['month_end_date'] = year_text + '-' + df['trans_date_month_num'] + '-' + day_text

In [None]:
# Keep only the columns used downstream. .copy() detaches the selection from the
# wider frame so the assignment below does not raise SettingWithCopyWarning.
df = df[['customer_id', 'trans_amt_month_total', 'month_end_date']].copy()
df['month_end_date'] = pd.to_datetime(df['month_end_date'])

# Chronological order within each customer, with a clean 0..n-1 index.
df = df.sort_values(['customer_id', 'month_end_date']).reset_index(drop=True)
mt.check_unique_no(df, ['customer_id'])
df.shape
df.head(25)

# Later cells work on df_copy and leave df as the month-level baseline.
df_copy = df.copy()

In [None]:
# Keep strictly positive monthly totals; anything else (zero, negative, missing) becomes 0.
monthly_total = df_copy['trans_amt_month_total']
df_copy['trans_amt_month_total.clean'] = monthly_total.where(monthly_total > 0, 0)

In [None]:
# df_first_purchase_month = df.groupby(['customer_id'], as_index=False).agg({'month_end_date':'first'})
# df_first_purchase_month.head(2)

In [None]:
# df = pd.merge(df, df_first_purchase_month, on=['customer_id'], how='left')
# mt.check_unique_no(df, ['customer_id'])
# df.shape
# df.head(2)

In [None]:
# df['revenue_type'] = np.where(df['month_end_date_x'] == df['month_end_date_y'], 'new', 'existing')

In [None]:
df_copy.head(2)

In [None]:
# df['month_revenue_change'] = df.apply(lambda r: r['trans_amt_month_total'] if (r['revenue_type'] == 'new') else r['trans_amt_month_total'].transform(pd.Series.diff), axis=1)

In [None]:
# Previous row's cleaned total within the same customer; NaN on each customer's first row.
clean_by_customer = df_copy.groupby('customer_id')['trans_amt_month_total.clean']
df_copy['prev_trans_amt_month_total.clean'] = clean_by_customer.shift(periods=1)

In [None]:
# df_copy['prev_trans_amt_month_total.clean'] = np.where(df_copy['prev_trans_amt_month_total.clean'].isnull(),0, df_copy['prev_trans_amt_month_total.clean'])

In [None]:
# df_copy['month_revenue_change'] = df_copy.groupby(['customer_id'])['trans_amt_month_total.clean'].transform(pd.Series.diff)

In [None]:
# Change versus the previous row of the same customer (NaN where there is no previous row).
current_amt = df_copy['trans_amt_month_total.clean']
previous_amt = df_copy['prev_trans_amt_month_total.clean']
df_copy['month_revenue_change'] = current_amt - previous_amt

In [None]:
# %%time
# df_status = []
# for customer_no in unique_customer_no:
#     _df = df_copy[df_copy['customer_id'] == customer_no]

# #     if _df.shape[0] < 2:
# #         continue

#     _df = _df.sort_values(by=['month_end_date'], ascending=True)

#     _prev = None
#     for _, row in _df.iterrows():
#         if _prev is None:
#             _prev = row
#             continue

#     previous = _prev['trans_amt_month_total.clean']
#     current = row['trans_amt_month_total.clean']

#     if (previous == 0) & (current == 0):
#         _df['revenue_type'] = 'hibernating'
#     else:
#         _df['revenue_type'] = 'not hibernating'

# #         if ys == ye:
# #             if ms < me - 1:
# #                 # Fill missing months same year
# #                 _add_rows(_prev, range(ms + 1, me), ys)
# #         else:
# #             if ms < 12:
# #                 # Fill in months up to dec
# #                 _add_rows(_prev, range(ms + 1, 13), ys)

# #             if me > 1:
# #                 # Fill in months starting from jan
# #                 _add_rows(_prev, range(1, me), ye)

#         df_status.append(_df)
    
# df_new = pd.concat(df_status,ignore_index=True)

In [None]:
df_copy[df_copy['customer_id'] == 275264]

In [None]:
current_amt = df_copy['trans_amt_month_total.clean']
previous_amt = df_copy['prev_trans_amt_month_total.clean']

# Rules are checked top to bottom and the first match wins. Comparisons against
# a missing previous amount are all False, so a customer's first row falls
# through to the 'new' rule.
revenue_rules = [
    ((current_amt == 0) & (previous_amt > 0), 'lapsed'),
    ((current_amt == 0) & (previous_amt == 0), 'hibernating'),
    ((current_amt > 0) & (previous_amt == 0), 'reactivation'),
    (current_amt > previous_amt, 'expansion'),
    (current_amt < previous_amt, 'contraction'),
    (current_amt == previous_amt, 'same'),
    ((current_amt >= 0) & previous_amt.isnull(), 'new'),
]
df_copy['revenue_type'] = np.select(
    [condition for condition, _ in revenue_rules],
    [label for _, label in revenue_rules],
    default='unknown',
)

In [None]:
df_copy[df_copy['customer_id'] == 275264]

In [None]:
df_copy[df_copy['customer_id'] == 268720]

In [None]:
df_copy['customer_id'].value_counts(dropna=False).head(2)

In [None]:
df_copy['revenue_type'].value_counts(dropna=False)

In [None]:
df_copy[df_copy['revenue_type'] == 'same'].head(2)

In [None]:
df_copy[df_copy['customer_id'] == 269886]

In [None]:
df_copy[df_copy['customer_id'] == 266784]

In [None]:
df_copy[df_copy['revenue_type'] == 'unknown'].head(2)

In [None]:
df_copy[df_copy['customer_id'] == 266833]

In [None]:
df_copy = df_copy.sort_values(['customer_id', 'month_end_date'])

# Latest month end in the data; reference date for each customer's final row.
df_copy['assessment_date'] = df_copy['month_end_date'].max()

# Days from each row to the same customer's next row; for a customer's last row,
# days until assessment_date instead.
# The grouped shift replaces the original diff + ungrouped shift(-1), which only
# gave per-customer results because of the sort above, and fillna replaces the
# row-by-row apply(axis=1).
next_month_end = df_copy.groupby('customer_id')['month_end_date'].shift(-1)
time_to_next = next_month_end - df_copy['month_end_date']
time_to_assessment = df_copy['assessment_date'] - df_copy['month_end_date']
df_copy['duration'] = time_to_next.fillna(time_to_assessment) / np.timedelta64(1, 'D')

In [None]:
df_copy[df_copy['customer_id'] == 266833]

In [None]:
df_copy['duration'].describe()

In [None]:
# Collapse revenue_type into a coarser status; anything unlisted is 'unknown'.
active_types = ['new', 'expansion', 'reactivation', 'same']
at_risk_types = ['contraction', 'lapsed', 'hibernating']

revenue_type = df_copy['revenue_type']
df_copy['customer_status'] = np.where(
    revenue_type.isin(active_types), 'active',
    np.where(revenue_type.isin(at_risk_types), 'at-risk', 'unknown'))

In [None]:
# Chronological order within each customer, so the last row of a customer is the latest month.
df_copy = df_copy.sort_values(['customer_id', 'month_end_date'])

In [None]:
# One frame per customer, each re-indexed from 0.
result = []
for _, customer_rows in df_copy.groupby(['customer_id']):
    result.append(customer_rows.reset_index(drop=True))

In [None]:
# NUM_DAYS_SINCE_LAST_PURCHASE = (365 * 2)
# Every row of a customer is flagged 'yes' when that customer's last row has
# revenue_type 'hibernating', otherwise 'no'.
#
# The original looped over every row of every customer and rewrote the whole
# column on each iteration, so only the final row decided the value. It also
# rebound the notebook-level name `df` to the last customer's frame. This
# version produces the same labels in one vectorised pass and leaves `df` alone.
# The stable sort groups rows by customer while keeping their existing
# within-customer order, matching the per-customer concat of the original.
df_copy = df_copy.sort_values('customer_id', kind='stable').reset_index(drop=True)
last_revenue_type = df_copy.groupby('customer_id')['revenue_type'].transform('last')
df_copy['churned_customer'] = np.where(last_revenue_type == 'hibernating', 'yes', 'no')
df_copy.shape

In [None]:
df_copy['churned_customer'].value_counts(dropna=False)

In [None]:
# 730 days. Only referenced by the commented-out duration-based rule below.
NUM_DAYS_SINCE_LAST_PURCHASE = (365 * 2)
# df_res =[]
# for item in result:
#     df = item.copy()

#     for i, (index, row) in enumerate(df.iterrows()):
#         if (i == len(df) - 1) and ((df.loc[index, 'duration'] > NUM_DAYS_SINCE_LAST_PURCHASE)):
#             df['churned_customer'] = 'yes'
#         else:
#             df['churned_customer'] = 'no'


#     df_res.append(df)

# df_copy = pd.concat(df_res,ignore_index=True)
# df_copy.shape

In [None]:
# df_copy['churned_customer'].value_counts(dropna=False)

In [None]:
df_copy[df_copy['customer_id'] == 266794]

In [None]:
# def classify_as_churned(df, num_days_dormant):
#     churned_customers = list(df[df['duration'] > num_days_dormant]['customer_id'].unique())
#     df['churned_customer'] = np.where(df['customer_id'].isin(churned_customers), 'yes', 'no')
#     return df

In [None]:
# df_copy = classify_as_churned(df_copy, 365) #730

In [None]:
df_copy['churned_customer'].value_counts(dropna=False)

In [None]:
df_copy['churned_customer'].value_counts(dropna=False, normalize=True)

In [None]:
df_copy[df_copy['churned_customer'] == 'no'].head(2)

In [None]:
df_copy[df_copy['customer_id'] == 266783]

In [None]:
# df_copy[(df_copy['churned_customer'] == 'yes') & (df_copy['churn'] == 'no')].tail(2)

In [None]:
df_copy[df_copy['customer_id'] == 275233].tail()

In [None]:
df_copy[df_copy['customer_id'] == 266794].head()

In [None]:
# df_copy_duration_since_last_purchase = df_copy[['customer_id', 'duration']].groupby(['customer_id'], as_index=False).agg({'duration':'last'})

In [None]:
# df_copy_duration_since_last_purchase.head(2)

In [None]:
# Constant 1 per row; summed in the per-customer aggregation to count rows.
df_copy['counter'] = 1

In [None]:
def _is_label(x, label):
    """Return 1 when ``x`` equals ``label``, otherwise 0."""
    return 1 if x == label else 0


def count_active_status(x):
    """Return 1 when the customer status is 'active', else 0."""
    return _is_label(x, 'active')


def count_at_risk_status(x):
    """Return 1 when the customer status is 'at-risk', else 0."""
    return _is_label(x, 'at-risk')


def count_new_revenue(x):
    """Return 1 when the revenue type is 'new', else 0."""
    return _is_label(x, 'new')


def count_expansion_revenue(x):
    """Return 1 when the revenue type is 'expansion', else 0."""
    return _is_label(x, 'expansion')


def count_reactivation_revenue(x):
    """Return 1 when the revenue type is 'reactivation', else 0."""
    return _is_label(x, 'reactivation')


def count_same_revenue(x):
    """Return 1 when the revenue type is 'same', else 0."""
    return _is_label(x, 'same')


def count_lapsed_revenue(x):
    """Return 1 when the revenue type is 'lapsed', else 0."""
    return _is_label(x, 'lapsed')


def count_hibernating_revenue(x):
    """Return 1 when the revenue type is 'hibernating', else 0."""
    return _is_label(x, 'hibernating')


def count_contraction_revenue(x):
    """Return 1 when the revenue type is 'contraction', else 0."""
    return _is_label(x, 'contraction')

In [None]:
# (new 0/1 column, source column, indicator function), applied in this order.
indicator_specs = [
    ('customer_status.active', 'customer_status', count_active_status),
    ('customer_status.at_risk', 'customer_status', count_at_risk_status),
    ('revenue_type.new', 'revenue_type', count_new_revenue),
    ('revenue_type.expansion', 'revenue_type', count_expansion_revenue),
    ('revenue_type.reactivation', 'revenue_type', count_reactivation_revenue),
    ('revenue_type.same', 'revenue_type', count_same_revenue),
    ('revenue_type.lapsed', 'revenue_type', count_lapsed_revenue),
    ('revenue_type.hibernating', 'revenue_type', count_hibernating_revenue),
    ('revenue_type.contraction', 'revenue_type', count_contraction_revenue),
]
for indicator_col, source_col, indicator in indicator_specs:
    df_copy[indicator_col] = df_copy[source_col].apply(indicator)

In [None]:
customer_status_data = [] # NOTE(review): nothing in this cell appends to this list

# Per-customer scratch loop. cdf and prior_df are overwritten on every iteration,
# so after the loop they hold only the last customer's rows.
for customerid in df_copy['customer_id'].unique():
    #print(customerid)
    # This customer's rows, newest month first.
    cdf = df_copy.loc[df_copy['customer_id'] == customerid].sort_values(by=['month_end_date'], ascending=False)
    
    #print("date: {}, 28 days earlier: {}".format(cdf.iloc[0].date, cdf.iloc[0].date + pd.DateOffset(days=-28)))
    
    # Keeps rows dated later than (cdf's last row minus 9 months). Because cdf is
    # sorted newest-first, iloc[-1] is the customer's EARLIEST month, so every row
    # passes this filter.
    # NOTE(review): if a trailing 9-month window was intended, the anchor should
    # probably be cdf.iloc[0]; confirm before relying on prior_df.
    prior_df = cdf.loc[cdf['month_end_date']>cdf.iloc[-1]['month_end_date'] + pd.DateOffset(months=-9)] # offset is 9 months, not 28 days
    
#     #calculate the total deposits for each customer 28 days earlier using prior_df to minimize data amount
#     total_at_risk = df_copy.loc[df_copy['customer_status']=='at-risk'].shape[0]
#     total_at_risk_count = df_copy.loc[df_copy['customer_status']=='at-risk'].agg({'amount': 'sum'}).amount
    
#     print("total acc credits: {}, total_deposited:{}".format(total_deposits , total_amount_deposited))
#     customer_data.append([customerid, total_deposits, total_amount_deposited])
    
    #print(index)

In [None]:
df_copy.iloc[-1]['month_end_date'] + pd.DateOffset(months=-9)

In [None]:
df_copy.tail()

In [None]:
df_copy[df_copy['customer_id']==266783].shape

In [None]:
df_copy[df_copy['customer_id']==266783]

In [None]:
prior_df[prior_df['customer_id']==266783].shape

In [None]:
prior_df[prior_df['customer_id']==266783]

In [None]:
prior_df.head()

### Aggregate Data

In [None]:
# Per-customer summary. Insertion order of the dict fixes the output column order.
indicator_columns = [
    'customer_status.active',
    'customer_status.at_risk',
    'revenue_type.new',
    'revenue_type.expansion',
    'revenue_type.reactivation',
    'revenue_type.same',
    'revenue_type.lapsed',
    'revenue_type.hibernating',
    'revenue_type.contraction',
]
customer_aggregations = {
    'trans_amt_month_total.clean': 'sum',
    'counter': 'sum',
    'duration': ['min', 'max', 'mean', 'last'],
}
customer_aggregations.update({column: 'sum' for column in indicator_columns})
customer_aggregations['churned_customer'] = 'last'

df_copy_agg = df_copy.groupby(['customer_id'], as_index=False).agg(customer_aggregations)

In [None]:
# Flatten the two-level (column, aggregation) header into 'column.aggregation'.
# strip('.') removes the dot left behind when the aggregation level is empty.
flat_names = []
for header in df_copy_agg.columns:
    flat_names.append(".".join(header).strip('.'))
df_copy_agg.columns = flat_names

In [None]:
df_copy_agg.head()

In [None]:
df_copy[df_copy['customer_id'] == 266785]

In [None]:
df_master.head(2)

In [None]:
print(list(df_master))

In [None]:
# count_cols = list(df_master.columns[df_master.columns.str.startswith('count')])
# other_cols = ['customer_id', 'gender', 'city_code', 'customer_status', 'number_of_unique_purchase_days', 'tax.sum',  'account_age.years.group', 'biological_age.group', 'repeat_purchaser', 'returned_item_before']

# Column groups that are combined into select_cols in the next cell.
identifier = ['customer_id']
demographic_cols = ['gender', 'biological_age.group', 'state', 'region']
account_cols = ['customer_status', 'account_age.years.group', 'repeat_purchaser', 'returned_item_before', 'conversion_date.year_month']
day_cols = ['number_of_unique_purchase_days']
# Every master column whose name starts with 'sum'.
sum_cols = [column for column in df_master.columns if column.startswith('sum')]
avg_spend_days = ['{}_day_amt.avg'.format(days) for days in (1, 7, 30)]

In [None]:
# Final column order: id, demographics, account attributes, purchase days, sums, average spend.
select_cols = [
    *identifier,
    *demographic_cols,
    *account_cols,
    *day_cols,
    *sum_cols,
    *avg_spend_days,
]

In [None]:
# Restrict the master table to the selected columns and check the customer count.
df_master_sub = df_master.loc[:, select_cols]
mt.check_unique_no(df_master_sub, ['customer_id'])
df_master_sub.shape

In [None]:
mt.check_unique_no(df_copy_agg, ['customer_id'])
df_copy_agg.shape

In [None]:
# Bring the per-customer churn flag into the master subset; the left join keeps every master row.
churn_flags = df_copy_agg[['customer_id', 'churned_customer.last']]
df_master_sub = pd.merge(
    df_master_sub,
    churn_flags,
    on='customer_id',
    how='left',
    suffixes=('.master', '.agg'),
)
mt.check_unique_no(df_master_sub, ['customer_id'])
df_master_sub.shape

In [None]:
df_master_sub.head(2)

In [None]:
# Shorter name for the churn flag.
df_master_sub = df_master_sub.rename(columns={'churned_customer.last': 'churned'})

In [None]:
# Rank customers from highest to lowest 30-day average spend and accumulate it.
spend_col = '30_day_amt.avg'
cumsum_col = '30_day_amt.avg.cumsum'

df_master_sub = df_master_sub.sort_values(spend_col, ascending=False)
df_master_sub[cumsum_col] = df_master_sub[spend_col].cumsum()

# Re-sort on the running total and number the rows 1..n in that order.
df_master_sub = df_master_sub.sort_values(cumsum_col, ascending=True)
df_master_sub['cum_count'] = np.arange(1, len(df_master_sub) + 1)

# Both cumulative measures as a fraction (0-1) of their maximum.
running_total = df_master_sub[cumsum_col]
df_master_sub['30_day_amt.avg.cumsum_pct'] = running_total / running_total.max()

running_count = df_master_sub['cum_count']
df_master_sub['cum_count_pct'] = running_count / running_count.max()

df_master_sub.head()

df_master_sub.tail()

In [None]:
df_master_sub['churned'].value_counts(dropna=False)

## Export Data

In [None]:
# Write the master subset (including the churn flag) to CSV without the index column.
df_master_sub.to_csv(filepaths.processed_churn_data, index=False)