In [1]:
%pylab inline
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt

# Numeric codes compared against the 'EVENT_TYPE' column, keyed by event name.
EVENT = dict(
    INCOMING_CALL=0,
    OUTGOING_CALL=1,
    IDD_CALL=2,
    OUTGOING_SMS=4,
    INCOMING_SMS=5,
)

Populating the interactive namespace from numpy and matplotlib


In [2]:
infile = '../Data/cleaned_data_2.csv'

# Load the '|'-separated event log. 'EVENT_DATE' is parsed into Timestamps by
# the reader; 'DURATION' is converted to timedeltas right after loading.
raw_data = pd.read_csv(infile, sep='|', parse_dates=['EVENT_DATE']).assign(
    DURATION=lambda events: pd.to_timedelta(events['DURATION'])
)

In [3]:
# One group of event records per distinct 'A_NUMBER' value.
group = raw_data.groupby(by='A_NUMBER')

In [11]:
# Churn duration and end date, kept both as the raw strings and as the
# parsed pandas Timedelta / Timestamp objects.
churn_duration_str, end_date_str = '15 days', '2015-4-1'
churn_timedelta, end_date = pd.to_timedelta(churn_duration_str), pd.to_datetime(end_date_str)
    
def aggregations(x):
    """Summarise one group of event records as a single row of KPIs.

    Parameters
    ----------
    x : pandas.DataFrame
        Event records. The columns 'EVENT_TYPE', 'B_NUMBER', 'EVENT_DATE'
        and 'DURATION' are read; 'EVENT_TYPE' is compared against the codes
        in the module-level ``EVENT`` mapping, and 'DURATION' is divided by
        one second, so it must hold timedeltas.

    Returns
    -------
    pandas.Series
        Seventeen values labelled 'out degree call', 'out degree sms',
        'in degree call', 'in degree sms', 'first recds', 'last recds',
        'num outgoing calls', 'total out call duration in sec',
        'num incoming calls', 'total in call duration in sec',
        'num IDD calls', 'num outgoing sms', 'num incoming sms',
        'last call', 'last sms', 'last idd', 'last activity'.
        The 'last ...' entries are NaT when no event of that kind exists.
    """
    # Split the records by event type once instead of re-filtering per KPI.
    event_type = x['EVENT_TYPE']
    out_calls = x[event_type == EVENT['OUTGOING_CALL']]
    in_calls = x[event_type == EVENT['INCOMING_CALL']]
    idd_calls = x[event_type == EVENT['IDD_CALL']]
    out_sms = x[event_type == EVENT['OUTGOING_SMS']]
    in_sms = x[event_type == EVENT['INCOMING_SMS']]

    # Pure Social KPI: number of distinct 'B_NUMBER' values per direction/channel
    out_degree_call = len(pd.unique(out_calls['B_NUMBER']))
    out_degree_sms = len(pd.unique(out_sms['B_NUMBER']))
    in_degree_call = len(pd.unique(in_calls['B_NUMBER']))
    in_degree_sms = len(pd.unique(in_sms['B_NUMBER']))

    first_recds = x['EVENT_DATE'].min()
    last_recds = x['EVENT_DATE'].max()

    # Volume KPIs; dividing a timedelta by one second yields float seconds.
    one_second = np.timedelta64(1, 's')
    num_out_calls = len(out_calls)
    total_out_call_duration_sec = out_calls['DURATION'].sum() / one_second

    num_in_calls = len(in_calls)
    total_in_call_duration_sec = in_calls['DURATION'].sum() / one_second

    num_IDD_calls = len(idd_calls)

    num_out_sms = len(out_sms)
    num_in_sms = len(in_sms)

    # Last KPI
    last_call = out_calls['EVENT_DATE'].max()
    last_sms = out_sms['EVENT_DATE'].max()
    last_idd = idd_calls['EVENT_DATE'].max()
    # Built-in max() compares with '>' and every comparison against NaT is
    # False, so a leading NaT (no outgoing call) used to win even when a later
    # SMS/IDD timestamp existed. Series.max() skips NaT and only returns NaT
    # when all three are missing.
    last_activity = pd.to_datetime(pd.Series([last_call, last_sms, last_idd])).max()

    # Churner identifying -- warning: will not work for label propagation, because churner label is only identified next month
    # TODO ChurnerOutDegree, ChurnerInDegree

    attr_list = [out_degree_call, out_degree_sms, in_degree_call, in_degree_sms,
                 first_recds, last_recds,
                 num_out_calls, total_out_call_duration_sec,
                 num_in_calls, total_in_call_duration_sec,
                 num_IDD_calls,
                 num_out_sms, num_in_sms,
                 last_call, last_sms, last_idd, last_activity]

    headers_list = ['out degree call', 'out degree sms', 'in degree call', 'in degree sms',
                    'first recds', 'last recds',
                    'num outgoing calls', 'total out call duration in sec',
                    'num incoming calls', 'total in call duration in sec',
                    'num IDD calls',
                    'num outgoing sms', 'num incoming sms',
                    'last call', 'last sms', 'last idd', 'last activity']

    return pd.Series(attr_list, index=headers_list)

# Build one KPI row per 'A_NUMBER' group over the whole data set and display it.
agg_data = group.apply(func=aggregations)
agg_data

Unnamed: 0_level_0,out degree call,out degree sms,in degree call,in degree sms,first recds,last recds,num outgoing calls,total out call duration in sec,num incoming calls,total in call duration in sec,num IDD calls,num outgoing sms,num incoming sms,last call,last sms,last idd,last activity
A_NUMBER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
710000810,174,16,144,37,2014-10-01 07:16:46,2015-03-31 19:23:17,1004,70222,1173,79910,0,23,1253,2015-03-31 12:41:02,2015-03-19 15:59:17,NaT,2015-03-31 12:41:02
710002560,136,24,117,35,2014-10-01 06:01:11,2015-03-31 23:50:08,1499,280851,456,36042,0,294,909,2015-03-31 20:18:22,2015-03-31 23:13:06,NaT,2015-03-31 23:13:06
710003902,174,3,151,27,2014-10-01 04:14:52,2015-03-31 22:12:31,2095,429884,972,107018,0,4,395,2015-03-31 20:21:47,2015-02-26 09:49:21,NaT,2015-03-31 20:21:47
710004420,134,7,128,29,2014-10-01 11:17:05,2015-03-31 20:05:40,840,64089,1069,59337,0,25,434,2015-03-31 20:05:40,2015-02-26 05:30:19,NaT,2015-03-31 20:05:40
710004816,70,20,106,27,2014-10-01 03:37:00,2015-03-31 13:15:26,1642,352632,1467,383093,0,124,172,2015-03-31 09:24:38,2015-03-27 06:20:16,NaT,2015-03-31 09:24:38
710005499,156,8,165,53,2014-10-01 09:43:43,2015-03-31 21:10:51,1046,254694,783,126365,15,30,1443,2015-03-31 16:37:28,2015-03-17 14:01:44,2015-01-08 12:44:40,2015-03-31 16:37:28
710007474,188,18,230,44,2014-10-01 07:30:17,2015-03-31 20:00:37,1567,177744,1265,70837,0,43,411,2015-03-30 14:42:42,2015-03-25 20:48:50,NaT,2015-03-30 14:42:42
710008915,62,3,35,10,2015-02-18 18:01:59,2015-03-31 22:02:46,252,22247,233,16940,0,9,121,2015-03-31 20:10:53,2015-03-22 21:23:52,NaT,2015-03-31 20:10:53
710008952,103,7,88,27,2014-10-01 08:02:57,2015-03-31 16:54:36,1689,406603,826,178513,0,24,286,2015-03-31 16:54:36,2015-03-29 17:23:25,NaT,2015-03-31 16:54:36
710010999,120,41,110,50,2014-10-01 08:00:59,2015-03-31 20:05:20,1122,170523,895,148304,0,413,307,2015-03-31 17:40:10,2015-03-31 19:23:08,NaT,2015-03-31 19:23:08


In [12]:
# First month data (October 2014)
churn_duration_str = '15 days'
# Last day of the first month. The displayed records run from 2014-10-01 to
# 2015-03-31, so the earlier '2015-10-31' pointed at a month with no data.
end_date_str = '2014-10-31'

churn_timedelta = pd.to_timedelta(churn_duration_str)
end_date = pd.to_datetime(end_date_str)

# Select on year AND month (taken from end_date) so that records from the same
# calendar month of another year can never be mixed into the first month.
firstmonth_raw = raw_data[(raw_data['EVENT_DATE'].dt.year == end_date.year)
                          & (raw_data['EVENT_DATE'].dt.month == end_date.month)]
firstmonth_group = firstmonth_raw.groupby('A_NUMBER')

# One KPI row per 'A_NUMBER', restricted to the first month's events.
firstmonth_features = firstmonth_group.apply(aggregations)

In [13]:
firstmonth_features

Unnamed: 0_level_0,out degree call,out degree sms,in degree call,in degree sms,first recds,last recds,num outgoing calls,total out call duration in sec,num incoming calls,total in call duration in sec,num IDD calls,num outgoing sms,num incoming sms,last call,last sms,last idd,last activity
A_NUMBER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
710000810,52,0,45,8,2014-10-01 07:16:46,2014-10-31 21:46:52,159,11047,189,13240,0,0,183,2014-10-31 18:48:21,NaT,NaT,2014-10-31 18:48:21
710002560,42,3,30,11,2014-10-01 06:01:11,2014-10-31 23:46:22,222,45173,74,4675,0,37,183,2014-10-31 21:25:15,2014-10-31 23:46:22,NaT,2014-10-31 23:46:22
710003902,50,0,37,8,2014-10-01 04:14:52,2014-10-31 21:28:56,427,83280,131,11089,0,0,57,2014-10-31 21:28:56,NaT,NaT,2014-10-31 21:28:56
710004420,36,0,48,7,2014-10-01 11:17:05,2014-10-31 17:42:55,113,8689,145,10741,0,0,59,2014-10-31 13:35:27,NaT,NaT,2014-10-31 13:35:27
710004816,27,7,39,12,2014-10-01 03:37:00,2014-10-31 21:17:35,285,60163,280,70742,0,28,38,2014-10-31 19:30:28,2014-10-30 19:53:29,NaT,2014-10-31 19:30:28
710005499,47,0,47,10,2014-10-01 09:43:43,2014-10-31 20:48:23,119,20067,106,21917,2,0,212,2014-10-31 20:25:41,NaT,2014-10-20 20:13:31,2014-10-31 20:25:41
710007474,56,5,74,11,2014-10-01 07:30:17,2014-10-31 21:32:14,272,36201,229,15440,0,6,67,2014-10-31 21:32:14,2014-10-21 14:17:17,NaT,2014-10-31 21:32:14
710008952,31,0,29,4,2014-10-01 08:02:57,2014-10-31 20:34:33,296,74457,154,33780,0,0,39,2014-10-31 20:34:33,NaT,NaT,2014-10-31 20:34:33
710010999,50,4,34,5,2014-10-01 08:00:59,2014-10-31 16:42:32,209,21341,114,14967,0,23,26,2014-10-31 16:42:32,2014-10-31 08:22:52,NaT,2014-10-31 16:42:32
710011709,17,0,3,3,2014-10-01 09:53:06,2014-10-31 07:23:35,58,8532,4,217,19,0,21,2014-10-30 13:00:29,NaT,2014-10-31 07:23:35,2014-10-31 07:23:35


In [26]:
# Keep only events dated on days 1-15 of month 11, then group them by 'A_NUMBER'.
halfmonth_raw = raw_data.loc[
    (raw_data['EVENT_DATE'].dt.month == 11) & (raw_data['EVENT_DATE'].dt.day <= 15)
]
halfmonth_group = halfmonth_raw.groupby(by=['A_NUMBER'])

def nonchurns(x):
    """Return a one-element Series labelled 'churner' whose value is 0.

    The argument is never read, so applying this to a groupby produces a
    'churner' value of 0 for every group key that is present.
    """
    return pd.Series({'churner': 0})
    
# Every 'A_NUMBER' with at least one event in the half-month window gets churner = 0.
halfmonth_agg = halfmonth_group.apply(func=nonchurns)
halfmonth_agg

Unnamed: 0_level_0,churner
A_NUMBER,Unnamed: 1_level_1
710000810,0
710002560,0
710003902,0
710004420,0
710004816,0
710005499,0
710007474,0
710008952,0
710010999,0
710011709,0


In [56]:
# Keep only events dated on days 1-15 of month 11, then group them by 'A_NUMBER'.
halfmonth_raw = raw_data.loc[
    (raw_data['EVENT_DATE'].dt.month == 11) & (raw_data['EVENT_DATE'].dt.day <= 15)
]
halfmonth_group = halfmonth_raw.groupby(by=['A_NUMBER'])

def nonchurns(x):
    """Return a one-element Series labelled 'churner' whose value is 0.

    The argument is never read, so applying this to a groupby produces a
    'churner' value of 0 for every group key that is present.
    """
    return pd.Series({'churner': 0})
    
# Every 'A_NUMBER' with at least one event in the half-month window gets churner = 0.
halfmonth_agg = halfmonth_group.apply(func=nonchurns)

# Left-join the churn flag onto the first-month features. An 'A_NUMBER' with
# no event in the half-month window has no flag after the join (NaN), and is
# therefore labelled as a churner (1).
joinchurn = firstmonth_features.join(halfmonth_agg, how='left').fillna({'churner': 1})

In [60]:
# churners list
joinchurn[joinchurn['churner'] != 0]

Unnamed: 0_level_0,out degree call,out degree sms,in degree call,in degree sms,first recds,last recds,num outgoing calls,total out call duration in sec,num incoming calls,total in call duration in sec,num IDD calls,num outgoing sms,num incoming sms,last call,last sms,last idd,last activity,churner
A_NUMBER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
710206458,0,0,35,2,2014-10-01 12:41:12,2014-10-25 17:55:19,0,0,124,6905,0,0,4,NaT,NaT,NaT,0,1
710358916,0,0,1,0,2014-10-18 11:46:07,2014-10-20 22:12:54,0,0,3,63,0,0,0,NaT,NaT,NaT,0,1
710453687,9,1,6,3,2014-10-01 03:05:31,2014-10-08 08:09:58,10,1776,9,966,0,48,53,2014-10-07 14:41:23,2014-10-03 21:53:54,NaT,7,1
710532062,21,8,13,15,2014-10-01 08:29:17,2014-10-12 11:31:36,54,6399,20,2198,0,14,38,2014-10-05 12:34:52,2014-10-04 13:50:01,NaT,5,1
710552090,0,0,24,8,2014-10-01 07:03:12,2014-10-17 16:44:39,0,0,39,5054,0,0,28,NaT,NaT,NaT,0,1
710738110,1,0,0,0,2014-10-12 21:08:15,2014-10-21 21:39:50,4,1062,0,0,0,0,0,2014-10-21 21:39:50,NaT,NaT,21,1
710835438,27,2,31,22,2014-10-01 06:35:48,2014-10-25 10:32:16,174,61055,127,14920,0,10,77,2014-10-25 10:32:16,2014-10-24 23:59:48,NaT,25,1
711008183,15,2,10,8,2014-10-01 00:29:23,2014-10-12 10:13:38,162,57867,33,3358,0,27,80,2014-10-07 00:56:42,2014-10-05 09:54:32,NaT,7,1
711070882,0,0,0,1,2014-10-13 20:07:43,2014-10-13 20:07:43,0,0,0,0,0,0,1,NaT,NaT,NaT,0,1
711176896,0,0,0,1,2014-10-01 12:22:40,2014-10-01 12:22:40,0,0,0,0,0,0,1,NaT,NaT,NaT,0,1


In [77]:
# `.dt` is a Series accessor (joinchurn.dt.day does not work on a DataFrame),
# so the datetime columns are converted one by one.
# Each datetime column is replaced by its day of month; missing dates (NaT) become 0.
for column in joinchurn:
    # dtype.kind == 'M' matches datetime columns of any resolution or timezone.
    # Comparing against '<M8[ns]' only matched naive nanosecond columns and
    # silently left every other datetime column unconverted.
    if joinchurn[column].dtype.kind == 'M':
        joinchurn[column] = joinchurn[column].dt.day.fillna(0)


In [78]:
joinchurn

Unnamed: 0_level_0,out degree call,out degree sms,in degree call,in degree sms,first recds,last recds,num outgoing calls,total out call duration in sec,num incoming calls,total in call duration in sec,num IDD calls,num outgoing sms,num incoming sms,last call,last sms,last idd,last activity,churner
A_NUMBER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
710000810,52,0,45,8,1,31,159,11047,189,13240,0,0,183,31,0,0,31,0
710002560,42,3,30,11,1,31,222,45173,74,4675,0,37,183,31,31,0,31,0
710003902,50,0,37,8,1,31,427,83280,131,11089,0,0,57,31,0,0,31,0
710004420,36,0,48,7,1,31,113,8689,145,10741,0,0,59,31,0,0,31,0
710004816,27,7,39,12,1,31,285,60163,280,70742,0,28,38,31,30,0,31,0
710005499,47,0,47,10,1,31,119,20067,106,21917,2,0,212,31,0,20,31,0
710007474,56,5,74,11,1,31,272,36201,229,15440,0,6,67,31,21,0,31,0
710008952,31,0,29,4,1,31,296,74457,154,33780,0,0,39,31,0,0,31,0
710010999,50,4,34,5,1,31,209,21341,114,14967,0,23,26,31,31,0,31,0
710011709,17,0,3,3,1,31,58,8532,4,217,19,0,21,30,0,31,31,0


In [69]:
joinchurn['last sms'].dtypes == '<M8[ns]'

True

In [79]:
# index=False leaves the 'A_NUMBER' index out of the file, so only the
# feature columns and the 'churner' label are written.
joinchurn.to_csv(path_or_buf='OneMonthData.csv', index=False)

<H3>Classifier</H3>

In [81]:
# Use Weka

<H3> Next Month's Data and Churner </H3>

In [82]:
# Second month data: KPI features from the events of month 11
secondmonth_raw = raw_data[raw_data['EVENT_DATE'].dt.month == 11]
secondmonth_group = secondmonth_raw.groupby('A_NUMBER')
secondmonth_features = secondmonth_group.apply(aggregations)

# Every 'A_NUMBER' with an event on days 1-15 of month 12 gets churner = 0
halfmonth2_raw = raw_data[(raw_data['EVENT_DATE'].dt.month == 12) & (raw_data['EVENT_DATE'].dt.day <16)]
halfmonth2_group = halfmonth2_raw.groupby(['A_NUMBER'])
halfmonth2_agg = halfmonth2_group.apply(nonchurns)

# join features and churn; an 'A_NUMBER' without a flag after the join is a churner (1)
joinchurn2 = secondmonth_features.join(halfmonth2_agg)
joinchurn2['churner'] = joinchurn2['churner'].fillna(1)

# Convert datetime to day of month, and fill in NaN->0
for column in joinchurn2:
    # dtype.kind == 'M' matches datetime columns of any resolution or timezone.
    # Comparing against '<M8[ns]' only matched naive nanosecond columns and
    # silently left every other datetime column unconverted.
    if joinchurn2[column].dtype.kind == 'M':
        joinchurn2[column] = joinchurn2[column].dt.day.fillna(0)


In [83]:
# Write to csv; index=False leaves the 'A_NUMBER' index out of the file.
joinchurn2.to_csv(path_or_buf='SecondMonthData.csv', index=False)

In [84]:
joinchurn2

Unnamed: 0_level_0,out degree call,out degree sms,in degree call,in degree sms,first recds,last recds,num outgoing calls,total out call duration in sec,num incoming calls,total in call duration in sec,num IDD calls,num outgoing sms,num incoming sms,last call,last sms,last idd,last activity,churner
A_NUMBER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
710000810,57,2,47,11,1,30,168,10124,170,12701,0,2,142,30,16,0,30,0
710002560,38,4,32,11,1,30,236,42105,90,6099,0,63,151,30,29,0,30,0
710003902,53,0,45,7,1,30,320,72977,132,17453,0,0,52,30,0,0,30,0
710004420,43,1,43,7,1,30,99,6429,142,7112,0,1,50,30,25,0,30,0
710004816,28,8,33,12,1,30,343,91015,297,65623,0,45,36,30,30,0,30,0
710005499,49,2,53,15,1,30,149,16387,136,16456,8,4,188,30,29,17,30,0
710007474,69,1,70,13,1,30,261,31462,223,11288,0,1,61,30,27,0,30,0
710008952,34,2,36,7,1,30,292,61940,151,32814,0,15,51,30,10,0,30,0
710010999,47,3,44,4,1,30,180,31168,134,18677,0,3,4,30,13,0,30,0
710011709,30,0,3,3,2,30,75,13158,3,82,20,0,16,30,0,30,30,0
