In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import feather
import datetime as dt
from IPython.core.display import display
pd.options.display.max_columns = 999
from tqdm import tqdm_notebook
from sklearn.preprocessing import LabelEncoder
import pickle

In [17]:
'''
IMPORTANT CHANGE FROM ORIGINAL VERSION:
-invoice table was change to the asof month, year style
-preprocess the data so that the model will aim to predict churn 60 day in advance
-elapse date is referenced with month, date of the table
-all the data with elapse date <60 days are filtered out
-elapse day is adjusted 60 days back (-60) 
'''

# account
df_accs = feather.read_dataframe('../src/df_accs_samp-raw')
df_accs = df_accs.drop('START_DT',axis=1)
df_accs = df_accs.set_index('ACCOUNT')

cat_cols = ['BILL_DELIVER','PAY_METHOD','PROVINCE']
enc_dict = dict()
for col in  cat_cols:
    enc_dict[col] = LabelEncoder()
    df_accs[col] = enc_dict[col].fit_transform(df_accs[col])

with open('../out/enc.dictionary', 'wb') as config_dictionary_file: 
    pickle.dump(enc_dict, config_dictionary_file)
    
for year, month in tqdm_notebook(zip([2017]*12, list(range(1,13)))):
    
    try:
        # aggregation of past product
        df_prods = pd.read_csv('../bin/monthly/df_product_'+str(year).zfill(4)+str(month).zfill(2)+'.csv')
        df_prods = df_prods.loc[(df_prods.elapse_end > 60) | (df_prods.elapse_end == -1)]
        
        # adjust elapse date
        df_prods['elapse_end'] = df_prods['elapse_end'].apply(lambda x:x-60 if x>60 else x)
        df_prods['elapse_start'] = df_prods['elapse_start'].apply(lambda x:x-60 if x>60 else x)
        df_prods_agg_past = df_prods.loc[df_prods.elapse_end != -1].groupby('ACCOUNT').agg({'TEL':[len,lambda x:x.nunique()],
                                                                                   'elapse_start':[np.max,np.min,np.median,np.mean],
                                                                                           'elapse_end':[np.max,np.min,np.median,np.mean],
                                                                                           'CHARGE':[np.max,np.min,np.median,np.mean]})

        df_prods_agg_past.columns = [col + '-' + stat for col, stat in zip(df_prods_agg_past.columns.get_level_values(0),df_prods_agg_past.columns.get_level_values(1))]

        # aggregation of active product
        df_prod_agg_act = df_prods.loc[df_prods.elapse_end == -1].groupby('ACCOUNT').agg({'TEL':[len,lambda x:x.nunique()],
                                                                                   'elapse_start':[np.max,np.min,np.median,np.mean],
                                                                                           'elapse_end':[np.max,np.min,np.median,np.mean],
                                                                                           'CHARGE':[np.max,np.min,np.median,np.mean]})
        df_prod_agg_act.columns = [col + '-' + stat for col, stat in zip(df_prod_agg_act.columns.get_level_values(0),df_prod_agg_act.columns.get_level_values(1))]

        acc_list = df_prods.ACCOUNT.unique()
        tel_n_acc = df_prods[['ACCOUNT','TEL']]

        # aggregation of service
        df_scoms = pd.read_csv('../bin/monthly/df_scoms_'+str(year).zfill(4)+str(month).zfill(2)+'.csv')
        df_scoms = df_scoms.merge(tel_n_acc,how='inner',on='TEL').drop('TEL',axis=1)
        df_scoms['CMPTIME'] = pd.to_timedelta(df_scoms.CMPTIME)
        df_scoms = df_scoms.loc[df_scoms.elapse>60]
        
        # adjust elapse date
        df_scoms['elapse'] = df_scoms['elapse'].apply(lambda x:x-60 if x>60 else x)
        
        df_scoms_agg_detail = df_scoms.groupby('ACCOUNT').sum().drop('elapse',axis=1)
        df_scoms_agg_recency = df_scoms.groupby('ACCOUNT').agg({'elapse':[np.max,np.min,np.median,np.mean]})

        # aggregation of invoice
        df_invs = pd.read_csv('../bin/monthly/df_invs_'+str(year).zfill(4)+str(month).zfill(2)+'.csv')
        df_invs = df_invs.loc[df_invs.ACCOUNT.isin(acc_list)]
        df_invs = df_invs.drop(['BILL_DT','DUE_DATE','SETTLED_DT'],axis=1)
        df_invs['ADJUST'] = df_invs.ADJUST.fillna(0)

        # filter out records that have been cuonted as churn in the past (elapse > 90)
        serie_filt =  df_invs.groupby('ACCOUNT')['elapse_settled'].agg(lambda x:True if x.min() < 90 else False)
        acc_keep = serie_filt.loc[serie_filt==True].index
        df_invs = df_invs.loc[df_invs.ACCOUNT.isin(acc_keep)]

        # define churn target as account with min settled elapse > 60
        serie_acc_churn = df_invs.groupby('ACCOUNT')['elapse_settled'].agg(lambda x:1 if x.min() > 60 else 0)
        serie_acc_churn.name = 'churn'

        # filter out records with elapse > 60
        df_invs['elapse_settled'] = df_invs['elapse_settled']-60
        df_invs['elapse_due'] = df_invs['elapse_due']-60
        df_invs_agg = df_invs.loc[df_invs.elapse_settled > 0].groupby('ACCOUNT').agg([np.max,np.min,np.median,np.mean])
        df_invs_agg.columns = [col + '-' + stat for col, stat in zip(df_invs_agg.columns.get_level_values(0),df_invs_agg.columns.get_level_values(1))]

        # merge all feature
        df_all = pd.concat([df_accs,
                            df_prod_agg_act,
                            df_prods_agg_past,
                            df_scoms_agg_detail,
                            df_scoms_agg_recency,
                            df_invs_agg,serie_acc_churn],
                            axis=1).dropna(subset=['churn']).fillna(0)

        df_all.to_csv('../bin/monthly/df_feat_'+str(year).zfill(4)+str(month).zfill(2)+'.csv')
    except:
        print(f'month: {month}, year: {year} skipped...')

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

month: 1, year: 2017 skipped...


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.







In [18]:
df_all.head()

Unnamed: 0,BILL_DELIVER,PAY_METHOD,PROVINCE,TEL-len,TEL-<lambda>,elapse_start-amax,elapse_start-amin,elapse_start-median,elapse_start-mean,elapse_end-amax,elapse_end-amin,elapse_end-median,elapse_end-mean,CHARGE-amax,CHARGE-amin,CHARGE-median,CHARGE-mean,TEL-len.1,TEL-<lambda>.1,elapse_start-amax.1,elapse_start-amin.1,elapse_start-median.1,elapse_start-mean.1,elapse_end-amax.1,elapse_end-amin.1,elapse_end-median.1,elapse_end-mean.1,CHARGE-amax.1,CHARGE-amin.1,CHARGE-median.1,CHARGE-mean.1,CHKWORK_AN,CHKWORK_CN,CHKWORK_FN,CHKWORK_HN,CHKWORK_LN,CHKWORK_NA,CHKWORK_SN,CHKWORK_UN,CHKWORK_WN,CHKWORK_XN,CHKWORK_nan,SERVICE_TYPE2_ADSL,SERVICE_TYPE2_FTTx,SERVICE_TYPE2_nan,FAULTRANGE_ภายใน 1 ชม.,FAULTRANGE_มากกว่า 1 - 2 ชม.,FAULTRANGE_มากกว่า 1 วัน - 7 วัน,FAULTRANGE_มากกว่า 2 - 3 ชม.,FAULTRANGE_มากกว่า 3 - 3.5 ชม.,FAULTRANGE_มากกว่า 3.5 - 4 ชม.,FAULTRANGE_มากกว่า 4 - 6 ชม.,FAULTRANGE_มากกว่า 6 - 24 ชม.,FAULTRANGE_มากกว่า 7 วัน,FAULTRANGE_อยู่ระหว่างดำเนินการ,FAULTRANGE_nan,"(elapse, amax)","(elapse, amin)","(elapse, median)","(elapse, mean)",AMT-amax,AMT-amin,AMT-median,AMT-mean,VAT-amax,VAT-amin,VAT-median,VAT-mean,TOTAL-amax,TOTAL-amin,TOTAL-median,TOTAL-mean,ADJUST-amax,ADJUST-amin,ADJUST-median,ADJUST-mean,PAYMENT-amax,PAYMENT-amin,PAYMENT-median,PAYMENT-mean,OUTSTANDING-amax,OUTSTANDING-amin,OUTSTANDING-median,OUTSTANDING-mean,late_payment_days-amax,late_payment_days-amin,late_payment_days-median,late_payment_days-mean,elapse_settled-amax,elapse_settled-amin,elapse_settled-median,elapse_settled-mean,elapse_due-amax,elapse_due-amin,elapse_due-median,elapse_due-mean,churn
00011C87BDA6234B88063995395FF4EA,2.0,1.0,145.0,1.0,1.0,594.0,594.0,594.0,594.0,-1.0,-1.0,-1.0,-1.0,590.0,590.0,590.0,590.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,256.0,256.0,256.0,256.0,590.0,590.0,590.0,590.0,41.3,41.3,41.3,41.3,631.3,631.3,631.3,631.3,0.0,0.0,0.0,0.0,631.3,631.3,631.3,631.3,0.0,0.0,0.0,0.0,1.0,-11.0,-7.5,-7.0,230.0,17.0,120.0,124.125,222.0,10.0,117.5,117.125,0.0
0008C0916E8E3BA503B50A9FC5A2886E,2.0,1.0,94.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,759.0,759.0,759.0,759.0,197.0,197.0,197.0,197.0,590.0,590.0,590.0,590.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,258.0,258.0,258.0,258.0,590.0,590.0,590.0,590.0,41.3,41.3,41.3,41.3,631.3,631.3,631.3,631.3,0.0,0.0,0.0,0.0,631.3,631.3,631.3,631.3,0.0,0.0,0.0,0.0,27.0,-13.0,3.0,2.666667,250.0,6.0,110.0,114.222222,237.0,-5.0,117.0,116.888889,0.0
00126E8B76AEBCE23AD373289C9DD713,2.0,1.0,4.0,1.0,1.0,476.0,476.0,476.0,476.0,-1.0,-1.0,-1.0,-1.0,690.0,690.0,690.0,690.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,255.0,255.0,255.0,255.0,690.0,623.23,690.0,667.743333,48.3,43.63,48.3,46.743333,738.3,666.86,738.3,714.486667,0.0,0.0,0.0,0.0,2214.9,666.86,738.3,1206.686667,0.0,0.0,0.0,0.0,6.0,-17.0,5.0,-2.0,211.0,53.0,184.0,149.333333,217.0,36.0,189.0,147.333333,0.0
0012C36934548CFD4C6CFA35881B3690,2.0,1.0,98.0,1.0,1.0,449.0,449.0,449.0,449.0,-1.0,-1.0,-1.0,-1.0,390.0,390.0,390.0,390.0,4.0,1.0,2461.0,1092.0,1335.5,1556.0,1519.0,450.0,1123.5,1054.0,640.0,390.0,565.0,540.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,390.0,390.0,390.0,390.0,27.3,27.3,27.3,27.3,417.3,417.3,417.3,417.3,274.29,176.13,225.21,225.21,834.6,417.3,625.95,625.95,0.0,0.0,0.0,0.0,39.0,8.0,12.0,17.75,201.0,72.0,139.0,137.75,209.0,87.0,163.0,155.5,0.0
0031DD2B8225C4BB612F1BE4803D8050,1.0,1.0,135.0,1.0,1.0,1776.0,1776.0,1776.0,1776.0,-1.0,-1.0,-1.0,-1.0,490.0,490.0,490.0,490.0,2.0,1.0,2111.0,2050.0,2080.5,2080.5,2051.0,1777.0,1914.0,1914.0,490.0,390.0,440.0,440.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,490.0,490.0,490.0,490.0,34.3,34.3,34.3,34.3,524.3,524.3,524.3,524.3,0.0,0.0,0.0,0.0,524.3,524.3,524.3,524.3,0.0,0.0,0.0,0.0,6.0,-1.0,4.0,3.25,217.0,7.0,114.0,113.875,222.0,10.0,117.5,117.125,0.0
