In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import feather
import datetime as dt
from IPython.core.display import display
pd.options.display.max_columns = 999
from tqdm import tqdm_notebook
from sklearn.preprocessing import LabelEncoder
import pickle

Prep data of last month (Dec 2018)

In [4]:
# account
df_accs = feather.read_dataframe('../src/df_accs_samp-raw')
df_accs = df_accs.drop('START_DT',axis=1)
df_accs = df_accs.set_index('ACCOUNT')
df_accs = df_accs.dropna(subset=['PROVINCE'])
cat_cols = ['BILL_DELIVER','PAY_METHOD','PROVINCE']
enc_dict = dict()
for col in  cat_cols:
    enc_dict[col] = LabelEncoder()
    df_accs[col] = enc_dict[col].fit_transform(df_accs[col])

with open('../out/enc.dictionary', 'wb') as config_dictionary_file: 
    pickle.dump(enc_dict, config_dictionary_file)
    
for year, month in tqdm_notebook([(2018,12)]):

    # aggregation of past product
    df_prods = pd.read_csv('../bin/monthly/df_product_'+str(year).zfill(4)+str(month).zfill(2)+'.csv')
    df_prods = df_prods.loc[(df_prods.elapse_end > 0) | (df_prods.elapse_end == -1)]

    # adjust elapse date
#         df_prods['elapse_end'] = df_prods['elapse_end'].apply(lambda x:x-60 if x>60 else x)
#         df_prods['elapse_start'] = df_prods['elapse_start'].apply(lambda x:x-60 if x>60 else x)
    df_prods_agg_past = df_prods.loc[df_prods.elapse_end != -1].groupby('ACCOUNT').agg({'TEL':[len,lambda x:x.nunique()],
                                                                               'elapse_start':[np.max,np.min,np.median,np.mean],
                                                                                       'elapse_end':[np.max,np.min,np.median,np.mean],
                                                                                       'CHARGE':[np.max,np.min,np.median,np.mean]})

    df_prods_agg_past.columns = [col + '-' + stat for col, stat in zip(df_prods_agg_past.columns.get_level_values(0),df_prods_agg_past.columns.get_level_values(1))]

    # aggregation of active product
    df_prod_agg_act = df_prods.loc[df_prods.elapse_end == -1].groupby('ACCOUNT').agg({'TEL':[len,lambda x:x.nunique()],
                                                                               'elapse_start':[np.max,np.min,np.median,np.mean],
                                                                                       'elapse_end':[np.max,np.min,np.median,np.mean],
                                                                                       'CHARGE':[np.max,np.min,np.median,np.mean]})
    df_prod_agg_act.columns = [col + '-' + stat for col, stat in zip(df_prod_agg_act.columns.get_level_values(0),df_prod_agg_act.columns.get_level_values(1))]

    acc_list = df_prods.ACCOUNT.unique()
    tel_n_acc = df_prods[['ACCOUNT','TEL']]

    # aggregation of service
    df_scoms = pd.read_csv('../bin/monthly/df_scoms_'+str(year).zfill(4)+str(month).zfill(2)+'.csv')
    df_scoms = df_scoms.merge(tel_n_acc,how='inner',on='TEL').drop('TEL',axis=1)
    df_scoms['CMPTIME'] = pd.to_timedelta(df_scoms.CMPTIME)
    df_scoms = df_scoms.loc[df_scoms.elapse>0]

    # adjust elapse date
#         df_scoms['elapse'] = df_scoms['elapse'].apply(lambda x:x-60 if x>60 else x)

    df_scoms_agg_detail = df_scoms.groupby('ACCOUNT').sum().drop('elapse',axis=1)
    df_scoms_agg_recency = df_scoms.groupby('ACCOUNT').agg({'elapse':[np.max,np.min,np.median,np.mean]})

    # aggregation of invoice
    df_invs = pd.read_csv('../bin/monthly/df_invs_'+str(year).zfill(4)+str(month).zfill(2)+'.csv')
    df_invs = df_invs.loc[df_invs.ACCOUNT.isin(acc_list)]
    df_invs = df_invs.drop(['BILL_DT','DUE_DATE','SETTLED_DT'],axis=1)
    df_invs['ADJUST'] = df_invs.ADJUST.fillna(0)

    # filter out records that already have been counted as churn in the past (elapse > 90)
    serie_filt =  df_invs.groupby('ACCOUNT')['elapse_due'].agg(lambda x:True if x.min() < 30 else False)

    acc_keep = serie_filt.loc[serie_filt==True].index
    df_invs = df_invs.loc[df_invs.ACCOUNT.isin(acc_keep)]

    # define churn target as account with min settled elapse > 60
#         serie_acc_churn = df_invs.groupby('ACCOUNT')['elapse_due'].agg(lambda x:1 if x.min() > 60 else 0)
#         serie_acc_churn.name = 'churn'

    # filter out records with elapse > 60
#         df_invs['elapse_settled'] = df_invs['elapse_settled']-60
#         df_invs['elapse_due'] = df_invs['elapse_due']-60
    df_invs_agg = df_invs.loc[df_invs.elapse_due > 0].groupby('ACCOUNT').agg([np.max,np.min,np.median,np.mean])
    df_invs_agg.columns = [col + '-' + stat for col, stat in zip(df_invs_agg.columns.get_level_values(0),df_invs_agg.columns.get_level_values(1))]

    # merge all feature
    df_all = pd.concat([df_accs,
                        df_prod_agg_act,
                        df_prods_agg_past,
                        df_scoms_agg_detail,
                        df_scoms_agg_recency,
                        df_invs_agg],
                        axis=1)

#         df_all.to_csv('../bin/monthly/df_feat_test.csv')

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [6]:
print(df_all.shape)
df_all = df_all.dropna(subset=['elapse_due-mean']).fillna(0)
print(df_all.shape)

(278198, 103)
(89235, 103)


Load model and predict

In [7]:
from sklearn.externals import joblib
# Output a pickle file for the model
# joblib.dump(rfc, '../out/rfc_1.pkl')
rfc = joblib.load('../out/rfc_1.pkl')

In [8]:
df_all.head()

Unnamed: 0,BILL_DELIVER,PAY_METHOD,PROVINCE,TEL-len,TEL-<lambda>,elapse_start-amax,elapse_start-amin,elapse_start-median,elapse_start-mean,elapse_end-amax,elapse_end-amin,elapse_end-median,elapse_end-mean,CHARGE-amax,CHARGE-amin,CHARGE-median,CHARGE-mean,TEL-len.1,TEL-<lambda>.1,elapse_start-amax.1,elapse_start-amin.1,elapse_start-median.1,elapse_start-mean.1,elapse_end-amax.1,elapse_end-amin.1,elapse_end-median.1,elapse_end-mean.1,CHARGE-amax.1,CHARGE-amin.1,CHARGE-median.1,CHARGE-mean.1,CHKWORK_0,CHKWORK_AN,CHKWORK_BN,CHKWORK_CN,CHKWORK_FN,CHKWORK_HN,CHKWORK_IN,CHKWORK_LN,CHKWORK_MN,CHKWORK_NA,CHKWORK_PN,CHKWORK_QN,CHKWORK_SN,CHKWORK_UN,CHKWORK_VN,CHKWORK_WN,CHKWORK_XN,CHKWORK_nan,SERVICE_TYPE2_ADSL,SERVICE_TYPE2_FTTx,SERVICE_TYPE2_nan,FAULTRANGE_ภายใน 1 ชม.,FAULTRANGE_มากกว่า 1 - 2 ชม.,FAULTRANGE_มากกว่า 1 วัน - 7 วัน,FAULTRANGE_มากกว่า 2 - 3 ชม.,FAULTRANGE_มากกว่า 3 - 3.5 ชม.,FAULTRANGE_มากกว่า 3.5 - 4 ชม.,FAULTRANGE_มากกว่า 4 - 6 ชม.,FAULTRANGE_มากกว่า 6 - 24 ชม.,FAULTRANGE_มากกว่า 7 วัน,FAULTRANGE_อยู่ระหว่างดำเนินการ,FAULTRANGE_nan,"(elapse, amax)","(elapse, amin)","(elapse, median)","(elapse, mean)",AMT-amax,AMT-amin,AMT-median,AMT-mean,VAT-amax,VAT-amin,VAT-median,VAT-mean,TOTAL-amax,TOTAL-amin,TOTAL-median,TOTAL-mean,ADJUST-amax,ADJUST-amin,ADJUST-median,ADJUST-mean,PAYMENT-amax,PAYMENT-amin,PAYMENT-median,PAYMENT-mean,OUTSTANDING-amax,OUTSTANDING-amin,OUTSTANDING-median,OUTSTANDING-mean,late_payment_days-amax,late_payment_days-amin,late_payment_days-median,late_payment_days-mean,elapse_settled-amax,elapse_settled-amin,elapse_settled-median,elapse_settled-mean,elapse_due-amax,elapse_due-amin,elapse_due-median,elapse_due-mean
0000E0AEB11C6EBF282C123D98CE2941,2.0,1.0,134.0,1.0,1.0,456.0,456.0,456.0,456.0,-1.0,-1.0,-1.0,-1.0,750.0,750.0,750.0,750.0,1.0,1.0,880.0,880.0,880.0,880.0,457.0,457.0,457.0,457.0,750.0,750.0,750.0,750.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,244.0,244.0,244.0,244.0,750.0,750.0,750.0,750.0,52.5,52.5,52.5,52.5,802.5,802.5,802.5,802.5,0.0,0.0,0.0,0.0,802.5,802.5,802.5,802.5,0.0,0.0,0.0,0.0,-4.0,-16.0,-12.5,-11.444444,667.0,24.0,408.0,379.277778,652.0,14.0,394.5,367.833333
0000F7F83B77CD37BBF29D292D6C0886,0.0,1.0,158.0,1.0,1.0,50.0,50.0,50.0,50.0,-1.0,-1.0,-1.0,-1.0,590.0,590.0,590.0,590.0,3.0,1.0,1474.0,371.0,751.0,865.333333,752.0,51.0,372.0,391.666667,790.0,590.0,590.0,656.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,464.0,464.0,464.0,464.0,590.0,590.0,590.0,590.0,41.3,41.3,41.3,41.3,631.3,631.3,631.3,631.3,0.0,0.0,0.0,0.0,631.3,631.3,631.3,631.3,0.0,0.0,0.0,0.0,6.0,-1.0,3.0,3.045455,641.0,5.0,326.0,326.0,647.0,9.0,328.5,329.045455
00011C87BDA6234B88063995395FF4EA,2.0,1.0,235.0,1.0,1.0,1019.0,1019.0,1019.0,1019.0,-1.0,-1.0,-1.0,-1.0,590.0,590.0,590.0,590.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,681.0,69.0,321.0,324.857143,590.0,590.0,590.0,590.0,41.3,41.3,41.3,41.3,631.3,631.3,631.3,631.3,0.0,0.0,0.0,0.0,631.3,631.3,631.3,631.3,0.0,0.0,0.0,0.0,3.0,-13.0,-5.0,-4.545455,655.0,8.0,337.5,333.590909,647.0,9.0,328.5,329.045455
00018FD1D88359BFA190BE3609329CBC,2.0,1.0,169.0,1.0,1.0,1882.0,1882.0,1882.0,1882.0,-1.0,-1.0,-1.0,-1.0,390.0,390.0,390.0,390.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,1.0,4.0,0.0,2.0,0.0,2.0,0.0,0.0,0.0,0.0,357.0,4.0,77.0,125.333333,390.0,377.0,390.0,389.409091,27.3,26.39,27.3,27.258636,417.3,403.39,417.3,416.667727,0.0,0.0,0.0,0.0,417.3,403.39,417.3,416.667727,0.0,0.0,0.0,0.0,39.0,-4.0,5.5,9.090909,663.0,5.0,330.0,334.954545,662.0,24.0,343.5,344.045455
0001E3C0291F8C876196D97E9D800013,2.0,1.0,147.0,1.0,1.0,194.0,194.0,194.0,194.0,-1.0,-1.0,-1.0,-1.0,590.0,590.0,590.0,590.0,2.0,1.0,592.0,478.0,535.0,535.0,479.0,195.0,337.0,337.0,700.0,590.0,645.0,645.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,147.0,147.0,147.0,147.0,700.0,66.78,696.78,635.976842,49.0,4.67,48.77,44.517895,749.0,71.45,745.55,680.494737,0.0,0.0,0.0,0.0,749.0,71.45,745.55,680.494737,0.0,0.0,0.0,0.0,9.0,-18.0,6.0,5.631579,576.0,3.0,276.0,277.789474,558.0,9.0,282.0,283.421053


In [9]:
print(df_all.shape)
df_all = df_all.loc[df_all['PAYMENT-mean']>500]
print(df_all.shape)

(89235, 103)
(75645, 103)


In [10]:
y_pred = rfc.predict_proba(df_all.values)

In [12]:
df_out = pd.DataFrame({'account':df_all.index,'churn_prob':y_pred[:,1]})

In [14]:
df_out.to_csv('../out/submission_last_month.csv',index=False)