In [2]:
%pylab inline
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import networkx as nx

# Symbolic names for the numeric codes compared against the 'EVENT_TYPE' column.
# NOTE(review): code 3 has no entry here -- confirm whether that is intentional.
EVENT = {'INCOMING_CALL':0, 'OUTGOING_CALL':1, 'IDD_CALL':2, 'OUTGOING_SMS':4, 'INCOMING_SMS':5}


Populating the interactive namespace from numpy and matplotlib


In [3]:
# Location of the pipe-delimited CDR file, relative to this notebook.
infile = '../Data/cleaned_data_2.csv'

# Read the CSV:
#   - 'EVENT_DATE' is parsed into timestamps while loading
#   - 'DURATION' is converted to timedelta afterwards
raw_data = pd.read_csv(infile, sep='|', parse_dates=['EVENT_DATE']) 
raw_data['DURATION'] = pd.to_timedelta(raw_data['DURATION'])

In [4]:
# Split the raw records into six monthly slices.
# With first_month = 10 the loop selects months 10, 11, 12, 1, 2, 3 in that order,
# so monthly_data[0] is month 10 and monthly_data[5] is month 3.
# NOTE(review): rows are matched on the month number only, not the year --
# this assumes the data covers less than twelve months; confirm.
first_month = 10
monthly_data = []
for i in range(6):
    # Wrap around the year end so the month number stays within 1..12.
    month = (first_month + i - 1) % 12 + 1
    monthly_data.append(raw_data[raw_data['EVENT_DATE'].dt.month == month])

In [5]:
def feature_engineer(data):
    """Build one row of aggregate features per subscriber.

    The rows of ``data`` are grouped by 'A_NUMBER' and every group is
    summarised by ``aggregations``; the result of that grouped apply is
    returned unchanged.
    """
    return data.groupby('A_NUMBER').apply(aggregations)
    
def set_label(feature_data, churn_data):
    """Attach a 'churner' column to the feature table.

    feature_data -- DataFrame of features (joined on its index).
    churn_data   -- raw CDR rows for the churn-observation period.

    Every 'A_NUMBER' present in ``churn_data`` gets churner=0 (via
    ``nonchurns``); rows of ``feature_data`` with no match get churner=1.
    The joined frame is passed through ``polish_data`` (in place) and returned.
    """
    # One churner=0 row for every subscriber that shows any activity.
    active_labels = churn_data.groupby(['A_NUMBER']).apply(nonchurns)

    # Left join on the index; subscribers without activity end up with NaN.
    labelled = feature_data.join(active_labels)
    labelled['churner'] = labelled['churner'].fillna(1)

    polish_data(labelled)
    return labelled

def aggregations(x):
    """Summarise one subscriber's CDR rows into a labelled feature vector.

    Parameters
    ----------
    x : DataFrame
        Rows of a single 'A_NUMBER' group.  The columns read are
        'EVENT_TYPE', 'B_NUMBER', 'EVENT_DATE' and 'DURATION'.

    Returns
    -------
    Series
        Seventeen values indexed by the feature names in ``headers_list``:
        distinct-counterparty counts, first/last record time, call and SMS
        counts, total call durations in seconds, and last-activity timestamps.
        Timestamp features are NaT when the subscriber has no rows of the
        corresponding kind.
    """
    # Filter each event kind once instead of re-filtering for every KPI.
    event_type = x['EVENT_TYPE']
    out_calls = x[event_type == EVENT['OUTGOING_CALL']]
    in_calls = x[event_type == EVENT['INCOMING_CALL']]
    idd_calls = x[event_type == EVENT['IDD_CALL']]
    out_sms = x[event_type == EVENT['OUTGOING_SMS']]
    in_sms = x[event_type == EVENT['INCOMING_SMS']]

    one_second = np.timedelta64(1, 's')

    # Pure Social KPI: number of distinct B_NUMBERs per direction and channel
    out_degree_call = len(pd.unique(out_calls['B_NUMBER']))
    out_degree_sms = len(pd.unique(out_sms['B_NUMBER']))
    in_degree_call = len(pd.unique(in_calls['B_NUMBER']))
    in_degree_sms = len(pd.unique(in_sms['B_NUMBER']))

    first_recds = x['EVENT_DATE'].min()
    last_recds = x['EVENT_DATE'].max()

    # Volume KPI; durations are converted from timedelta to seconds.
    num_out_calls = len(out_calls)
    total_out_call_duration_sec = out_calls['DURATION'].sum() / one_second

    num_in_calls = len(in_calls)
    total_in_call_duration_sec = in_calls['DURATION'].sum() / one_second

    num_IDD_calls = len(idd_calls)

    num_out_sms = len(out_sms)
    num_in_sms = len(in_sms)

    # Last KPI
    last_call = out_calls['EVENT_DATE'].max()
    last_sms = out_sms['EVENT_DATE'].max()
    last_idd = idd_calls['EVENT_DATE'].max()
    # The built-in max() is order-dependent when NaT is present (every
    # comparison with NaT is False), so a missing last_call used to hide real
    # SMS/IDD activity.  Series.max() skips NaT and returns NaT only when all
    # three values are missing.
    last_activity = pd.to_datetime(pd.Series([last_call, last_sms, last_idd])).max()

    # Churner identifying -- warning: will not work for label propagation, because churner label is only identified next month
    # TODO ChurnerOutDegree, ChurnerInDegree

    attr_list = [out_degree_call, out_degree_sms, in_degree_call, in_degree_sms,
                 first_recds, last_recds,
                 num_out_calls, total_out_call_duration_sec,
                 num_in_calls, total_in_call_duration_sec,
                 num_IDD_calls,
                 num_out_sms, num_in_sms,
                 last_call, last_sms, last_idd, last_activity]

    headers_list = ['out degree call', 'out degree sms', 'in degree call', 'in degree sms',
                    'first recds', 'last recds',
                    'num outgoing calls', 'total out call duration in sec',
                    'num incoming calls', 'total in call duration in sec',
                    'num IDD calls',
                    'num outgoing sms', 'num incoming sms',
                    'last call', 'last sms', 'last idd', 'last activity']

    return pd.Series(attr_list, index=headers_list)

def nonchurns(x):
    """Return the non-churner label for a group.

    The group ``x`` itself is not inspected: the result is always a
    one-element Series with churner=0.
    """
    return pd.Series({'churner': 0})

# Convert datetime to integer + fill in missing data
def polish_data(data):
    """Replace every datetime column of ``data`` by its day-of-month, in place.

    Missing timestamps become 0.  Columns of any other dtype are left
    untouched.  Nothing is returned.
    """
    # '<M8[ns]' is the numpy spelling of datetime64[ns].
    datetime_columns = [name for name in data if data[name].dtypes == '<M8[ns]']
    for name in datetime_columns:
        data[name] = data[name].dt.day.fillna(0)

In [126]:
def build_graph(data):
    """Construct a weighted graph from CDR rows.

    Durations are summed per ('A_NUMBER', 'B_NUMBER') pair and converted to
    seconds; each pair becomes one edge carrying a 'DURATION_SEC' attribute.
    Returns the networkx graph.
    """
    pair_totals = data.groupby(['A_NUMBER', 'B_NUMBER'])['DURATION'].sum().reset_index()
    # Edge weight for networkx: total duration in seconds.
    pair_totals['DURATION_SEC'] = pair_totals['DURATION'] / np.timedelta64(1, 's')
    return nx.from_pandas_dataframe(pair_totals, 'A_NUMBER', 'B_NUMBER', ['DURATION_SEC'])

def set_initial_churners(data0, data1):
    """Return the subscribers considered churners going from data0 to data1.

    A churner is an 'A_NUMBER' that appears in ``data0`` but has no record in
    ``data1`` dated before the 16th of the month.

    Returns a list of A_NUMBER values (order unspecified).
    """
    # `.values` replaces Series.ravel(), which newer pandas deprecates; the
    # underlying array is the same.
    seen_before = set(pd.unique(data0['A_NUMBER'].values))
    first_half = data1[data1['EVENT_DATE'].dt.day < 16]
    seen_after = set(pd.unique(first_half['A_NUMBER'].values))

    return list(seen_before - seen_after)

# Remove lone nodes
def remove_lone_nodes(G):
    """Remove, in place, every node whose degree is exactly 1.

    Degrees are read once before anything is removed, so a node whose degree
    only drops to 1 (or 0) as a consequence of the removals is kept.
    Nothing is returned.
    """
    # Snapshot the degrees and collect the nodes first, so the graph is never
    # mutated while it is being iterated.
    degree_by_node = dict(G.degree())
    lone_nodes = [node for node, degree in degree_by_node.items() if degree == 1]
    G.remove_nodes_from(lone_nodes)
    
# Connect 2 A_NUMBER that shares the same B_NUMBER
def connect_edges_from_node(G, b, neighbors):
    """Link every pair of ``neighbors`` through their shared node ``b``.

    For each unordered pair (first, second) taken from ``neighbors``, the sum
    of the 'DURATION_SEC' weights of the edges first-b and second-b is added
    to the first-second edge, creating that edge when it does not exist yet.
    ``G`` is modified in place; nothing is returned.
    """
    for position, first in enumerate(neighbors):
        for second in neighbors[position + 1:]:
            via_b = G[first][b]['DURATION_SEC'] + G[second][b]['DURATION_SEC']
            if not G.has_edge(first, second):
                G.add_edge(first, second, DURATION_SEC=via_b)
            else:
                G[first][second]['DURATION_SEC'] += via_b
       
def connect_edges(G):
    """Connect the neighbours of every string-typed node with each other.

    Nodes whose type is exactly ``str`` are treated as the shared endpoints;
    for each one, ``connect_edges_from_node`` links all of its neighbours.
    ``G`` is modified in place.
    """
    shared_nodes = [node for node in G.nodes() if type(node) is str]
    for shared in shared_nodes:
        connect_edges_from_node(G, shared, G.neighbors(shared))

# remove B_NUMBER nodes
def remove_BNUMBER_nodes(G):
    """Delete, in place, every node of ``G`` whose type is exactly ``str``."""
    string_nodes = [node for node in G.nodes() if type(node) is str]
    for node in string_nodes:
        G.remove_node(node)
            
def add_influence_label(G, top_n=50):
    """Tag every node of ``G`` with an 'influence' attribute (0 or 1), in place.

    The threshold is the ``top_n``-th largest degree (or the smallest degree
    when the graph has fewer than ``top_n`` nodes).  Nodes whose degree is at
    least the threshold get influence=1, all others influence=0, so ties at
    the threshold can mark more than ``top_n`` nodes.

    Parameters
    ----------
    G : graph
        Graph whose nodes are labelled.
    top_n : int, optional
        How many of the highest degrees define the threshold (default 50).

    Raises
    ------
    ValueError
        If ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError('top_n must be at least 1')

    degree_by_node = dict(G.degree())
    if not degree_by_node:
        # Empty graph: nothing to label (previously an IndexError).
        return

    threshold = sorted(degree_by_node.values(), reverse=True)[:top_n][-1]

    # Add label
    for node, degree in degree_by_node.items():
        G.add_node(node, influence=0 if degree < threshold else 1)
            

            
#### Main Function #####
# param data is the monthly data
def set_graph_features(data0, data1):
    """Compute label-propagation scores for the subscribers of two months.

    Parameters
    ----------
    data0, data1 : DataFrame
        Raw CDR rows of two monthly slices.  Both are used to build the call
        graph; churners are the subscribers returned by
        ``set_initial_churners(data0, data1)``.

    Returns
    -------
    DataFrame
        Columns 'churn_prob', 'influence_prob' and 'A_NUMBER', one row per
        node of the processed graph, on a default integer index.
    """
    data = pd.concat([data0, data1])
    # Aggregate call data: keep only outgoing and incoming calls.
    is_call = ((data['EVENT_TYPE'] == EVENT['OUTGOING_CALL']) |
               (data['EVENT_TYPE'] == EVENT['INCOMING_CALL']))
    call_data = data[is_call]

    # Construct graph
    G = build_graph(call_data)

    # Preprocess
    remove_lone_nodes(G)
    connect_edges(G)
    remove_BNUMBER_nodes(G)

    # Add churner attribute to nodes.  A set makes each membership test O(1)
    # instead of scanning the churner list once per node.
    churners = set(set_initial_churners(data0, data1))
    for n in G.nodes():
        G.add_node(n, churner=1 if n in churners else 0)

    # Add influence attribute to nodes
    add_influence_label(G)

    # Label Propagation
    Y = label_propagate(G)

    # Combine Y with A_NUMBER -> return df
    df = pd.DataFrame(Y, columns=['churn_prob', 'influence_prob'])
    df['A_NUMBER'] = pd.Series(G.nodes())

    return df

In [130]:
def build_Y(G):
    """Return the initial label matrix of ``G``.

    One row per node, in ``G.nodes()`` order; column 0 holds the node's
    'churner' attribute and column 1 its 'influence' attribute.
    """
    label_rows = [[G.node[node]['churner'], G.node[node]['influence']]
                  for node in G.nodes()]
    return np.matrix(label_rows)
   
def fix_labels(G, Y):
    """Reset the rows of ``Y`` that belong to labelled nodes, in place.

    Rows follow ``G.nodes()`` order.  A node with churner == 1 is clamped to
    [1.0, 0.0]; otherwise a node with influence == 1 is clamped to [0.0, 1.0].
    All other rows are left as they are.  Returns ``Y``.
    """
    for row, node in enumerate(G.nodes()):
        attributes = G.node[node]
        if attributes['churner'] == 1:
            Y[row, 0], Y[row, 1] = 1.0, 0.0
        elif attributes['influence'] == 1:
            Y[row, 0], Y[row, 1] = 0.0, 1.0
    return Y

def label_propagate(G, max_iter=None):
    """Propagate the churner/influence labels over the weighted graph.

    Each round multiplies the 'DURATION_SEC'-weighted adjacency matrix by the
    current label matrix, L1-normalises every row, and clamps the rows of the
    labelled nodes again (``fix_labels``).  Iteration stops when two
    consecutive label matrices agree within an absolute tolerance of 1e-6.

    Parameters
    ----------
    G : graph
        Graph whose nodes carry 'churner' and 'influence' attributes.
    max_iter : int or None, optional
        Upper bound on the number of rounds.  The default ``None`` keeps
        iterating until convergence.

    Returns
    -------
    numpy.matrix
        One row per node in ``G.nodes()`` order; columns are the churner and
        influence scores.
    """
    # Imported here because the notebook only imports `normalize` in a cell
    # that comes after this definition.
    from sklearn.preprocessing import normalize

    A = nx.adjacency_matrix(G, weight='DURATION_SEC')
#     T = normalize(A, axis=0, norm='l1')
    # The unnormalised adjacency matrix is used as the transition matrix.
    T = A
    Y = build_Y(G)

    rounds = 0
    while max_iter is None or rounds < max_iter:
        Y1 = T*Y
        Y1 = normalize(Y1, axis=1, norm='l1') # Row normalize Y
        Y1 = fix_labels(G, Y1)
        Y1 = np.matrix(Y1)
        if np.allclose(Y1, Y, atol=1e-6):
            break
        Y = Y1
        rounds += 1

    return Y
    

In [131]:
# Main program
# Churn-observation window: records of the third monthly slice dated before the 16th.
churn_period = monthly_data[2][monthly_data[2]['EVENT_DATE'].dt.day <16]
feature_data = feature_engineer(monthly_data[1])

# Add graph features. Build graph from CDR and add churner&influence label
graph_data = set_graph_features(monthly_data[0], monthly_data[1])

# Create training set
# feature_data is indexed by A_NUMBER (it comes from a groupby/apply), while
# graph_data carries A_NUMBER as a column on a 0..N-1 index.  Join on A_NUMBER
# so the graph scores line up with the right subscriber.
feature_data = feature_data.join(graph_data.set_index('A_NUMBER'))
train_data = set_label(feature_data, churn_period)

# Write to csv (the A_NUMBER index is not written because index=False)
train_data.to_csv('Month2_LP.csv', index=False)

KeyboardInterrupt: 

In [39]:
# Inspect the first monthly slice's records for one A_NUMBER.
monthly_data[0][monthly_data[0]['A_NUMBER']==716753852]

Unnamed: 0,A_NUMBER,B_NUMBER,EVENT_TYPE,EVENT_DATE,DURATION,EVENT_COST
17701811,716753852,0372222271,1,2014-10-23 08:57:04,00:00:42,0.0
17701812,716753852,0372222271,1,2014-10-23 10:16:41,00:00:11,0.0
17701817,716753852,0412228619,1,2014-10-03 07:55:23,00:00:08,0.0
17701819,716753852,0412255339,1,2014-10-22 20:41:20,00:00:12,0.5
17701820,716753852,0412255555,1,2014-10-18 15:51:43,00:00:11,0.0
17701821,716753852,0412255555,1,2014-10-22 20:13:50,00:00:15,0.5
17701825,716753852,0412255597,0,2014-10-19 07:23:14,00:00:02,0.0
17701826,716753852,0412255597,1,2014-10-19 07:16:37,00:01:56,0.0
17701827,716753852,0412255597,1,2014-10-19 07:24:05,00:00:34,0.0
17701828,716753852,0412255597,1,2014-10-19 13:12:19,00:00:23,0.0


In [62]:
# Scratch cell: the earlier probes are commented out; only the degree lookup runs.
# G.edges()
# A = nx.adjacency_matrix(G)
# print A
# print G[1][2]['DURATION']
# G.number_of_nodes()
# Show the degree of every node in G.
nx.degree(G)

{'0779438464': 1,
 '0714238693': 1,
 '0714238690': 2,
 '0714238691': 1,
 '0332296289': 1,
 '0714238695': 1,
 '0112869513': 1,
 '0719197286': 1,
 '0112869514': 1,
 '0729887509': 1,
 '0779312008': 1,
 '0711974025': 1,
 '0776484388': 1,
 '0725911007': 1,
 '0714238696': 1,
 '0714879421': 1,
 '0784717011': 1,
 '0716552726': 1,
 '0717534633': 1,
 '0522279128': 1,
 '0522279129': 1,
 '0522279126': 1,
 '0522279122': 3,
 '0776026960': 1,
 '0522279120': 1,
 '0777470892': 1,
 '0752583684': 1,
 '0729236560': 1,
 '0729236566': 1,
 '0728645375': 1,
 '0773965420': 1,
 '0773093151': 1,
 '0713610322': 1,
 '0715127664': 1,
 '0412292722': 1,
 '0372224486': 1,
 '0712427618': 1,
 '0412225876': 1,
 '0343749478': 2,
 '0512232297': 1,
 '0771136127': 1,
 '0774232690': 1,
 '0712427615': 1,
 '0718358899': 2,
 '0752864543': 1,
 '0712803577': 1,
 '0711211404': 2,
 '0715753660': 1,
 '0718352295': 1,
 '0719891144': 1,
 '0713566489': 1,
 '0718609877': 1,
 '0712229673': 1,
 '0718609873': 1,
 '0713000223': 3,
 '07784039

In [13]:
# Remove lone nodes
def remove_lone_nodes(G):
    """Remove, in place, every node whose degree is exactly 1.

    Degrees are read once before anything is removed, so a node whose degree
    only drops to 1 (or 0) as a consequence of the removals is kept.
    Nothing is returned.
    """
    # Snapshot the degrees and collect the nodes first, so the graph is never
    # mutated while it is being iterated.
    degree_by_node = dict(G.degree())
    lone_nodes = [node for node, degree in degree_by_node.items() if degree == 1]
    G.remove_nodes_from(lone_nodes)

In [14]:
# Count the nodes of G whose degree is 0, printing each one as it is found.
# Result recorded when this cell was run: 6
count = 0
for n in G.nodes():
    if G.degree(n)==0:
        count += 1
        print n
print count

719531615
715373859
711475353
711208643
713704741
716944415
6


In [12]:
# Connect 2 A_NUMBER that shares the same B_NUMBER
def connect_edges_from_node(G, b, neighbors):
    """Link every pair of ``neighbors`` through their shared node ``b``.

    For each unordered pair (first, second) taken from ``neighbors``, the sum
    of the 'DURATION_SEC' weights of the edges first-b and second-b is added
    to the first-second edge, creating that edge when it does not exist yet.
    ``G`` is modified in place; nothing is returned.
    """
    for position, first in enumerate(neighbors):
        for second in neighbors[position + 1:]:
            via_b = G[first][b]['DURATION_SEC'] + G[second][b]['DURATION_SEC']
            if not G.has_edge(first, second):
                G.add_edge(first, second, DURATION_SEC=via_b)
            else:
                G[first][second]['DURATION_SEC'] += via_b
       
def connect_edges(G):
    """Connect the neighbours of every string-typed node with each other.

    Nodes whose type is exactly ``str`` are treated as the shared endpoints;
    for each one, ``connect_edges_from_node`` links all of its neighbours.
    ``G`` is modified in place.
    """
    shared_nodes = [node for node in G.nodes() if type(node) is str]
    for shared in shared_nodes:
        connect_edges_from_node(G, shared, G.neighbors(shared))

In [48]:
# Number of neighbours of one node in G.
len(G.neighbors(716753852))
# G.nodes()

6825

In [49]:
# Look at the first monthly slice's records for one A_NUMBER
# (scratch reference from the original cell: 0719340809).
monthly_data[0][(monthly_data[0]['A_NUMBER']==716753852) ]
# G[716753852]['0714888281']['DURATION']

Unnamed: 0,A_NUMBER,B_NUMBER,EVENT_TYPE,EVENT_DATE,DURATION,EVENT_COST
17701811,716753852,0372222271,1,2014-10-23 08:57:04,00:00:42,0.0
17701812,716753852,0372222271,1,2014-10-23 10:16:41,00:00:11,0.0
17701817,716753852,0412228619,1,2014-10-03 07:55:23,00:00:08,0.0
17701819,716753852,0412255339,1,2014-10-22 20:41:20,00:00:12,0.5
17701820,716753852,0412255555,1,2014-10-18 15:51:43,00:00:11,0.0
17701821,716753852,0412255555,1,2014-10-22 20:13:50,00:00:15,0.5
17701825,716753852,0412255597,0,2014-10-19 07:23:14,00:00:02,0.0
17701826,716753852,0412255597,1,2014-10-19 07:16:37,00:01:56,0.0
17701827,716753852,0412255597,1,2014-10-19 07:24:05,00:00:34,0.0
17701828,716753852,0412255597,1,2014-10-19 13:12:19,00:00:23,0.0


In [24]:
def build_graph(data):
    """Construct a weighted graph from CDR rows.

    Durations are summed per ('A_NUMBER', 'B_NUMBER') pair and converted to
    seconds; each pair becomes one edge carrying a 'DURATION_SEC' attribute.
    Returns the networkx graph.
    """
    pair_totals = data.groupby(['A_NUMBER', 'B_NUMBER'])['DURATION'].sum().reset_index()
    # Edge weight for networkx: total duration in seconds.
    pair_totals['DURATION_SEC'] = pair_totals['DURATION'] / np.timedelta64(1, 's')
    return nx.from_pandas_dataframe(pair_totals, 'A_NUMBER', 'B_NUMBER', ['DURATION_SEC'])

def set_initial_churners(data0, data1):
    """Return the subscribers considered churners going from data0 to data1.

    A churner is an 'A_NUMBER' that appears in ``data0`` but has no record in
    ``data1`` dated before the 16th of the month.

    Returns a list of A_NUMBER values (order unspecified).
    """
    # `.values` replaces Series.ravel(), which newer pandas deprecates; the
    # underlying array is the same.
    seen_before = set(pd.unique(data0['A_NUMBER'].values))
    first_half = data1[data1['EVENT_DATE'].dt.day < 16]
    seen_after = set(pd.unique(first_half['A_NUMBER'].values))

    return list(seen_before - seen_after)
    
# param data is the monthly data
def set_graph_features(data0, data1):
    """Build the call graph of two months and label its nodes.

    NOTE(review): an earlier cell of this notebook defines a function with the
    same name that returns a DataFrame of propagated scores; this variant
    returns the graph itself, and whichever cell runs last wins.

    Parameters
    ----------
    data0, data1 : DataFrame
        Raw CDR rows of two monthly slices.  Both are used to build the call
        graph; churners are the subscribers returned by
        ``set_initial_churners(data0, data1)``.

    Returns
    -------
    graph
        The processed graph whose nodes carry 'churner' and 'influence'
        attributes.
    """
    data = pd.concat([data0, data1])
    # Aggregate call data: keep only outgoing and incoming calls.
    is_call = ((data['EVENT_TYPE'] == EVENT['OUTGOING_CALL']) |
               (data['EVENT_TYPE'] == EVENT['INCOMING_CALL']))
    call_data = data[is_call]

    # Construct graph
    G = build_graph(call_data)

    # Preprocess
    remove_lone_nodes(G)
    connect_edges(G)
    remove_BNUMBER_nodes(G)

    # Add churner attribute to nodes.  A set makes each membership test O(1)
    # instead of scanning the churner list once per node.
    churners = set(set_initial_churners(data0, data1))
    for n in G.nodes():
        G.add_node(n, churner=1 if n in churners else 0)

    # Add influence attribute to nodes
    add_influence_label(G)

    return G

In [84]:
# Scratch run of the preprocessing on G2 (lone-node removal is disabled here).
# NOTE(review): G2 is not defined in any cell visible in this notebook.
# remove_lone_nodes(G2)
connect_edges(G2)

In [85]:
# G2[714954079]['0812389908']['DURATION_SEC']
# len(G2.neighbors(714954079))
# G2.number_of_nodes()

34

In [23]:
# remove B_NUMBER nodes
def remove_BNUMBER_nodes(G):
    """Delete, in place, every node of ``G`` whose type is exactly ``str``."""
    string_nodes = [node for node in G.nodes() if type(node) is str]
    for node in string_nodes:
        G.remove_node(node)
            
def add_influence_label(G, top_n=50):
    """Tag every node of ``G`` with an 'influence' attribute (0 or 1), in place.

    The threshold is the ``top_n``-th largest degree (or the smallest degree
    when the graph has fewer than ``top_n`` nodes).  Nodes whose degree is at
    least the threshold get influence=1, all others influence=0, so ties at
    the threshold can mark more than ``top_n`` nodes.

    Parameters
    ----------
    G : graph
        Graph whose nodes are labelled.
    top_n : int, optional
        How many of the highest degrees define the threshold (default 50).

    Raises
    ------
    ValueError
        If ``top_n`` is smaller than 1.
    """
    if top_n < 1:
        raise ValueError('top_n must be at least 1')

    # This cell previously sorted an undefined name (`degree_dict`), which
    # raised NameError on the first call.
    degree_by_node = dict(G.degree())
    if not degree_by_node:
        # Empty graph: nothing to label.
        return

    threshold = sorted(degree_by_node.values(), reverse=True)[:top_n][-1]

    # Add label
    for node, degree in degree_by_node.items():
        G.add_node(node, influence=0 if degree < threshold else 1)


In [90]:
# Adjacency matrix of G2 weighted by 'DURATION_SEC'.
# The recorded run of this cell ended with "IndexError: index (10) out of range".
A = nx.adjacency_matrix(G2, weight='DURATION_SEC')


IndexError: index (10) out of range

In [92]:
# Dense copy of the adjacency matrix A.
M = A.todense() # Convert to dense matrix

In [96]:
# Get the 50 most influential node
# centrality = nx.katz_centrality(G,weight='DURATION_SEC') # Read more on this. Computation very slow. 

KeyboardInterrupt: 

In [111]:
# Inline version of set_initial_churners: A_NUMBERs present in the first
# monthly slice but absent from the records of the second slice dated before the 16th.
a0 = pd.unique(monthly_data[0].A_NUMBER.ravel())
a1 = pd.unique(monthly_data[1][monthly_data[1]['EVENT_DATE'].dt.day <16].A_NUMBER.ravel())
churner = list(set(a0) - set(a1))

In [121]:
# Set the 'churner' attribute on every node of G: 1 if the node is in the
# `churner` list, otherwise 0.
for n in G.nodes():
    if n in churner:
        G.add_node(n, churner=1)
    else:
        G.add_node(n, churner=0)
    

In [124]:
# Spot-check the 'churner' attribute of one node.
G.node[710358916]['churner']

1

In [16]:
# Build the labelled graph from the first two monthly slices.
# NOTE(review): the cells that follow call G.degree() and G.nodes(), so this
# relies on the set_graph_features variant that returns the graph, not the
# one that returns a DataFrame.
G = set_graph_features(monthly_data[0], monthly_data[1])

In [28]:
# 50th largest node degree -- the threshold expression used by add_influence_label.
sorted(G.degree().values(), reverse=True)[:50][-1]

4171

In [32]:
# Adjacency matrix of G weighted by 'DURATION_SEC'.
A = nx.adjacency_matrix(G, weight='DURATION_SEC')

In [31]:
# Number of nodes in G.
len(G.nodes())

6944

In [33]:
# Dense copy of the adjacency matrix A.
M = A.todense()

In [34]:
# Display the dense adjacency matrix.
M

matrix([[  0.,   0.,   0., ...,  21.,  40.,   0.],
        [  0.,   0.,   0., ...,   0.,   0.,   0.],
        [  0.,   0.,   0., ...,  15.,   0.,   0.],
        ..., 
        [ 21.,   0.,  15., ...,   0.,  27.,   0.],
        [ 40.,   0.,   0., ...,  27.,   0.,   0.],
        [  0.,   0.,   0., ...,   0.,   0.,   0.]])

In [39]:
# NOTE(review): label_propagate, defined in an earlier cell of this notebook,
# calls `normalize`, but this is the only cell that imports it.
from sklearn.preprocessing import normalize
# L1-normalise the dense adjacency matrix along axis=1 and along axis=0.
row_norm = normalize(M, axis=1, norm='l1')
col_norm = normalize(M, axis=0, norm='l1')

In [54]:
# Scratch cell.  The commented-out lines are an inline copy of the
# influence-labelling logic; only the loop at the bottom runs.
# G.node[G.nodes()[0]]['influence']
# degree_list = G.degree().values()
# sorted_degree = sorted(degree_list, reverse=True)[:50]
# threshold = sorted_degree[-1]

# # Add label
# for n in G.nodes():
#     if G.degree(n) < threshold:
#         G.add_node(n, influence=0)
#     else:
#         G.add_node(n, influence=1)

# Collect [churner, influence] for every node and display the result as a
# matrix (same construction as build_Y).
arr = []
for n in G.nodes():
#     if G.node[n]['churner']==1 and G.node[n]['influence']==1:
#         print 'error'
    arr.append([G.node[n]['churner'], G.node[n]['influence']])
np.matrix(arr)

In [75]:
# Convert Y to np.matrix.
# NOTE(review): Y must already exist from another cell; this cell depends on execution order.
Y = np.matrix(Y)

In [76]:
# One manual propagation step: multiply T by the current label matrix.
# NOTE(review): T is only assigned in a cell further down this notebook (T = A).
Y = T*Y

In [62]:
# L1-normalise Y along axis=1.
Y = normalize(Y, axis=1, norm='l1')

In [71]:
# Display the current label matrix.
Y

array([[ 0.1504285 ,  0.8495715 ],
       [ 0.        ,  0.        ],
       [ 0.95030817,  0.04969183],
       ..., 
       [ 0.74262989,  0.25737011],
       [ 0.17054664,  0.82945336],
       [ 0.        ,  1.        ]])

In [70]:
# Clamp the rows of the nodes labelled churner/influence back to their fixed values.
Y = fix_labels(G, Y)

In [101]:
# Inline run of the same loop as label_propagate, on the global A and G.
# The adjacency matrix is used as the transition matrix without normalisation.
T = A
Y = build_Y(G)

# Repeat propagate -> row-normalise -> clamp until two consecutive label
# matrices agree within an absolute tolerance of 1e-6.
while(True):
    Y1 = T*Y
    Y1 = normalize(Y1, axis=1, norm='l1') # Row normalize Y
    Y1 = fix_labels(G, Y1)
    Y1 = np.matrix(Y1)
    if np.allclose(Y1, Y, atol=1e-6):
        break
    else:
        Y=Y1

In [102]:
# Scratch tolerance value; it equals the atol passed to np.allclose in the
# propagation loop, which uses the literal rather than this name.
eps = 1e-6

In [90]:
# Display eps.  The cell source was the incomplete expression `eps*`, a
# SyntaxError; the recorded output (1e-06) is the value of eps itself.
eps

1e-06

In [112]:
# Print each of the first 6000 rows of Y whose column-0 (churner) value exceeds 0.1.
# NOTE(review): 6000 is hard-coded; an earlier cell reported 6944 nodes in G.
for i in range(6000):
    if Y[i,0] > 0.1:
        print Y[i]

[[ 0.12894933  0.87105067]]
[[ 1.  0.]]
[[ 1.  0.]]
[[ 1.  0.]]
[[ 0.13208994  0.86791006]]
[[ 0.12155981  0.87844019]]
[[ 0.14239356  0.85760644]]
[[ 0.16416212  0.83583788]]
[[ 1.  0.]]
[[ 1.  0.]]
[[ 1.  0.]]
[[ 0.22933823  0.77066177]]
[[ 1.  0.]]
[[ 1.  0.]]
[[ 1.  0.]]
[[ 0.11304139  0.88695861]]
[[ 1.  0.]]
[[ 0.15165232  0.84834768]]
[[ 1.  0.]]
[[ 1.  0.]]
[[ 1.  0.]]
[[ 0.10193919  0.89806081]]
[[ 1.  0.]]
[[ 1.  0.]]
[[ 0.13747028  0.86252972]]
[[ 1.  0.]]
[[ 1.  0.]]
[[ 1.  0.]]
[[ 1.  0.]]
[[ 1.  0.]]
[[ 1.  0.]]
[[ 1.  0.]]
[[ 1.  0.]]
[[ 1.  0.]]
[[ 1.  0.]]
[[ 0.10974072  0.89025928]]
[[ 1.  0.]]
[[ 1.  0.]]
[[ 0.13990637  0.86009363]]
[[ 1.  0.]]
[[ 1.  0.]]
[[ 0.25420595  0.74579405]]
[[ 1.  0.]]


In [109]:
# Maximum of Y along axis=0.
# NOTE(review): `amax` is not imported explicitly -- presumably supplied by `%pylab inline`; confirm.
amax(Y,axis=0)

matrix([[ 1.,  1.]])

In [115]:
# Scratch: build a single [id, churner value, influence value] row from row 1 of Y.
arr = []
arr.append([1,Y[1,0], Y[1,1]])

In [120]:
# Put the propagated scores into a DataFrame and attach the node ids,
# relying on the rows of Y being in G.nodes() order.
df = pd.DataFrame(Y, columns=['churner', 'influence'])
df['A_NUMBER']=pd.Series(G.nodes())

In [121]:
# Display the score table.
df

Unnamed: 0,churner,influence,A_NUMBER
0,0.062276,0.937724,714080396
1,0.062568,0.937432,717226169
2,0.067409,0.932591,714080694
3,0.064053,0.935947,712332722
4,0.067637,0.932363,713032493
5,0.062201,0.937799,719622133
6,0.062463,0.937537,713032823
7,0.062367,0.937633,711983298
8,0.064568,0.935432,711808537
9,0.064912,0.935088,712682431


In [135]:
# Churn-observation window: records of the third monthly slice dated before the 16th.
churn_period = monthly_data[2][monthly_data[2]['EVENT_DATE'].dt.day <16]
# Per-subscriber aggregate features from the second monthly slice.
feature_data = feature_engineer(monthly_data[1])

# Add graph features. Build graph from CDR and add churner&influence label
graph_data = set_graph_features(monthly_data[0], monthly_data[1])

In [137]:
# Preview graph_data indexed by A_NUMBER.  The result is only displayed;
# graph_data itself is not modified.
graph_data.set_index('A_NUMBER')

Unnamed: 0_level_0,churn_prob,influence_prob
A_NUMBER,Unnamed: 1_level_1,Unnamed: 2_level_1
714080396,0.062276,0.937724
717226169,0.062568,0.937432
714080694,0.067409,0.932591
712332722,0.064053,0.935947
713032493,0.067637,0.932363
719622133,0.062201,0.937799
713032823,0.062463,0.937537
711983298,0.062367,0.937633
711808537,0.064568,0.935432
712682431,0.064912,0.935088


In [138]:
# Display graph_data; it still has the default integer index with A_NUMBER as a column.
graph_data

Unnamed: 0,churn_prob,influence_prob,A_NUMBER
0,0.062276,0.937724,714080396
1,0.062568,0.937432,717226169
2,0.067409,0.932591,714080694
3,0.064053,0.935947,712332722
4,0.067637,0.932363,713032493
5,0.062201,0.937799,719622133
6,0.062463,0.937537,713032823
7,0.062367,0.937633,711983298
8,0.064568,0.935432,711808537
9,0.064912,0.935088,712682431
