In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import feather
import datetime as dt
from IPython.core.display import display
pd.options.display.max_columns = 999
from tqdm import tqdm_notebook
from sklearn.preprocessing import LabelEncoder
import pickle

In [2]:
'''
IMPORTANT CHANGES FROM THE ORIGINAL VERSION:
- the invoice table was changed to the as-of (month, year) style
- the data is preprocessed so that the model aims to predict churn 60 days in advance
- elapse dates are measured relative to the month/date of each monthly table
- all records with an elapse date < 60 days are filtered out
- elapse days are shifted back by 60 days (-60)
'''

# account
# Read the sampled account table, index it by ACCOUNT and discard rows without a PROVINCE.
df_accs = (
    feather.read_dataframe('../src/df_accs_samp-raw')
    .drop('START_DT', axis=1)
    .set_index('ACCOUNT')
    .dropna(subset=['PROVINCE'])
)

# Label-encode every categorical column, keeping the fitted encoder of each column by name.
cat_cols = ['BILL_DELIVER','PAY_METHOD','PROVINCE']
enc_dict = {}
for col in cat_cols:
    encoder = LabelEncoder()
    df_accs[col] = encoder.fit_transform(df_accs[col])
    enc_dict[col] = encoder

# Write the dictionary of fitted encoders to disk.
with open('../out/enc.dictionary', 'wb') as enc_file:
    pickle.dump(enc_dict, enc_file)
    
# Build one feature table per month (2017-01 .. 2018-12) and write it to
# ../bin/monthly/df_feat_YYYYMM.csv. A month whose processing fails is skipped and reported.
#
# The loop body is deliberately kept at notebook level (not wrapped in a function): the
# validation cells further down read df_invs, acc_keep and serie_acc_churn as left behind
# by the last processed month.

# Prediction horizon in days: only rows with an elapse value above it are used, shifted back by it.
HORIZON_DAYS = 60
# Accounts whose smallest elapse_due is not below this value are dropped before the target is defined.
ALREADY_CHURNED_DAYS = 90

# Per-account statistics for the product table. The callables are intentionally the same
# objects the notebook always used (np.max, the nunique lambda, ...), because the flattened
# output column labels are derived from their names.
PRODUCT_AGG_SPEC = {'TEL': [len, lambda x: x.nunique()],
                    'elapse_start': [np.max, np.min, np.median, np.mean],
                    'elapse_end': [np.max, np.min, np.median, np.mean],
                    'CHARGE': [np.max, np.min, np.median, np.mean]}


def _shift_elapse(elapse, days=HORIZON_DAYS):
    """Subtract `days` from every value greater than `days`; other values (e.g. -1) stay untouched."""
    # Arithmetic form of `x - days if x > days else x`; keeps the input dtype.
    return elapse - days * (elapse > days)


def _flatten_columns(frame):
    """Rename two-level (column, statistic) labels to 'column-statistic' in place and return the frame."""
    frame.columns = [col + '-' + stat
                     for col, stat in zip(frame.columns.get_level_values(0),
                                          frame.columns.get_level_values(1))]
    return frame


# A list (unlike a bare zip) has a length, so the progress bar can display its total.
periods = [(year, month) for year in (2017, 2018) for month in range(1, 13)]

for year, month in tqdm_notebook(periods):

    stamp = f'{year:04d}{month:02d}'

    try:
        # products: keep rows with elapse_end above the horizon, plus rows marked -1
        # (the rows the "active product" aggregation below selects)
        df_prods = pd.read_csv(f'../bin/monthly/df_product_{stamp}.csv')
        # .copy() so the column assignments below act on an independent frame
        df_prods = df_prods.loc[(df_prods.elapse_end > HORIZON_DAYS) | (df_prods.elapse_end == -1)].copy()

        # adjust elapse date
        df_prods['elapse_end'] = _shift_elapse(df_prods['elapse_end'])
        df_prods['elapse_start'] = _shift_elapse(df_prods['elapse_start'])

        # aggregation of past product
        df_prods_agg_past = _flatten_columns(
            df_prods.loc[df_prods.elapse_end != -1].groupby('ACCOUNT').agg(PRODUCT_AGG_SPEC))

        # aggregation of active product
        df_prod_agg_act = _flatten_columns(
            df_prods.loc[df_prods.elapse_end == -1].groupby('ACCOUNT').agg(PRODUCT_AGG_SPEC))

        acc_list = df_prods.ACCOUNT.unique()
        # NOTE(review): if a TEL occurs on several product rows, its service rows are repeated
        # by the merge below -- confirm this is intended.
        tel_n_acc = df_prods[['ACCOUNT','TEL']]

        # aggregation of service
        df_scoms = pd.read_csv(f'../bin/monthly/df_scoms_{stamp}.csv')
        df_scoms = df_scoms.merge(tel_n_acc, how='inner', on='TEL').drop('TEL', axis=1)
        df_scoms['CMPTIME'] = pd.to_timedelta(df_scoms.CMPTIME)
        df_scoms = df_scoms.loc[df_scoms.elapse > HORIZON_DAYS].copy()

        # adjust elapse date
        df_scoms['elapse'] = _shift_elapse(df_scoms['elapse'])

        df_scoms_agg_detail = df_scoms.groupby('ACCOUNT').sum().drop('elapse', axis=1)
        # NOTE(review): these (column, statistic) labels are left unflattened, unlike the other
        # aggregates; kept as is so the written feature columns do not change.
        df_scoms_agg_recency = df_scoms.groupby('ACCOUNT').agg({'elapse':[np.max,np.min,np.median,np.mean]})

        # aggregation of invoice
        df_invs = pd.read_csv(f'../bin/monthly/df_invs_{stamp}.csv')
        df_invs = df_invs.loc[df_invs.ACCOUNT.isin(acc_list)]
        df_invs = df_invs.drop(['BILL_DT','DUE_DATE','SETTLED_DT'], axis=1)
        df_invs['ADJUST'] = df_invs.ADJUST.fillna(0)

        # keep only accounts whose smallest elapse_due is below ALREADY_CHURNED_DAYS
        # (the others are treated as having churned in the past)
        serie_filt = df_invs.groupby('ACCOUNT')['elapse_due'].min() < ALREADY_CHURNED_DAYS
        acc_keep = serie_filt.loc[serie_filt].index
        df_invs = df_invs.loc[df_invs.ACCOUNT.isin(acc_keep)]

        # churn target: 1 when the account's smallest elapse_due exceeds the horizon, else 0
        serie_acc_churn = (df_invs.groupby('ACCOUNT')['elapse_due'].min() > HORIZON_DAYS).astype('int64')
        serie_acc_churn.name = 'churn'

        # Shift the invoice elapse columns back by the horizon and aggregate only the rows that
        # remain positive (i.e. raw elapse_due > HORIZON_DAYS). The shift goes into a separate
        # frame so df_invs keeps its raw values: the validation cells below compare df_invs
        # against the raw 60 / 90 thresholds.
        df_invs_shifted = df_invs.assign(elapse_settled=df_invs['elapse_settled'] - HORIZON_DAYS,
                                         elapse_due=df_invs['elapse_due'] - HORIZON_DAYS)
        df_invs_agg = _flatten_columns(
            df_invs_shifted.loc[df_invs_shifted.elapse_due > 0]
            .groupby('ACCOUNT')
            .agg([np.max,np.min,np.median,np.mean]))

        # merge all feature
        # sort=True pins the row ordering the notebook has been producing and silences the
        # pandas FutureWarning about the changing default.
        df_all = pd.concat([df_accs,
                            df_prod_agg_act,
                            df_prods_agg_past,
                            df_scoms_agg_detail,
                            df_scoms_agg_recency,
                            df_invs_agg,serie_acc_churn],
                            axis=1, sort=True).dropna(subset=['churn']).fillna(0)

        df_all.to_csv(f'../bin/monthly/df_feat_{stamp}.csv')

    except Exception as exc:
        # Still best-effort (the month is skipped), but the reason is reported and
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        print(f'month: {month}, year: {year} skipped... ({type(exc).__name__}: {exc})')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

month: 1, year: 2017 skipped...


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.







# Recheck the validity of the target definition

In [36]:
# Sanity check: count the accounts in acc_keep whose smallest elapse_due is 90 or more.
# acc_keep was built with `min < 90`, so a violation is `>= 90` (testing `> 90` alone would
# miss a value of exactly 90). Expected result: 0.
# NOTE(review): df_invs and acc_keep are whatever the monthly loop left behind after its last
# processed month -- confirm df_invs holds raw (unshifted) elapse_due values when this runs.
min_due_kept = df_invs.loc[df_invs.ACCOUNT.isin(acc_keep)].groupby('ACCOUNT')['elapse_due'].min()
np.sum(min_due_kept >= 90)

0

In [37]:
# Restrict the invoice rows to the accounts listed in acc_keep.
is_kept_account = df_invs['ACCOUNT'].isin(acc_keep)
df_invs = df_invs[is_kept_account]

In [38]:
# Churn target per account: 1 when the smallest elapse_due over its invoices exceeds 60, else 0.
min_elapse_due = df_invs.groupby('ACCOUNT')['elapse_due'].min()
serie_acc_churn = (min_elapse_due > 60).astype('int64').rename('churn')

In [39]:
# Number of accounts, number of churned accounts, and the resulting churn rate.
n_accounts = len(serie_acc_churn)
n_churned = np.sum(serie_acc_churn)
n_accounts, n_churned, n_churned / n_accounts

(21708, 224, 0.010318776487930717)

In [40]:
# Account ids whose churn flag equals 1.
is_churn = serie_acc_churn.eq(1)
serie_acc_churn.loc[is_churn].index

Index(['03860012C092D86AB3096D8E3898AFE2', '04822E975F38F75D4559C104ECE9146B',
       '05507CEBF6D1C99EB55502168C7AF142', '05AF514541AC911832A74191A99EA493',
       '0676AA1421F1A087855CDCDC0EE0D895', '0884F8DD1FE3465F03C0437952A67F49',
       '0AC543E2EEBB1E7CFDA1627C840109E6', '0B7DAD5CC14BBC44B4A68FF30DE486C7',
       '0B9751082AE536B5A190867C69CF7287', '0BFE4671297C4103B08E68186719C1C8',
       ...
       'F79CC3E5CDC24D1DC0B55801F025AE42', 'F7F358706605567459C24574FBFE5EBE',
       'F81AD0F09E89761F8140341AAB6A8044', 'F929AF96DAD86515E099AF5DC75850F2',
       'F990E0556A4C1A7196C1AC4A992C9CCF', 'FA85A6841AB6BACC65547B3B423BA42D',
       'FAC665329E5A7B210CB814A6F188E382', 'FC2C82D11A6DDDBC17C49C91072C75C4',
       'FDAE4B20818DC1CAF08289AA8BF72212', 'FFB22E58737FF6B9A21C1DF79B496D7D'],
      dtype='object', name='ACCOUNT', length=224)

In [46]:
# Spot-check one flagged account: its smallest elapse_due.
account_id = 'FA85A6841AB6BACC65547B3B423BA42D'
df_invs.loc[df_invs['ACCOUNT'] == account_id, 'elapse_due'].min()

66