In [1]:
# data cleaning packages
import pandas as pd
import numpy as np
import datetime as dt

# plot packages
import seaborn as sns; sns.set(color_codes=True)
import matplotlib.pyplot as plt

# network packages
import networkx as nx
from networkx.algorithms import approximation
# NOTE(review): the bare `import community` two lines below rebinds the name
# `community`, so the `networkx.algorithms.community` module imported on the
# next line is no longer reachable under that name after this cell runs.
# Presumably the bare import is the python-louvain package -- confirm, and
# alias one of the two if both are needed.
from networkx.algorithms import community
import community
import collections

In [2]:
# Load the 2015 relationship table from a local pickle file.
# NOTE: unpickling can execute arbitrary code embedded in the file, so only
# load pickles that come from a trusted source.
df2015 = pd.read_pickle('df2015')

In [4]:
# Show column dtypes and non-null counts for the loaded table.
df2015.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 87431 entries, 11210 to 513785
Data columns (total 9 columns):
id               87431 non-null object
start            87431 non-null datetime64[ns]
end              87431 non-null datetime64[ns]
rel_type         87431 non-null object
source_ticker    87395 non-null object
target_ticker    87367 non-null object
source_cusip     87431 non-null object
target_cusip     87431 non-null object
year             87431 non-null object
dtypes: datetime64[ns](2), object(7)
memory usage: 6.7+ MB


In [3]:
# Preview the first rows of the loaded table.
df2015.head()

Unnamed: 0,id,start,end,rel_type,source_ticker,target_ticker,source_cusip,target_cusip,year
11210,CUST-102793,2003-04-03,2016-12-14,CUSTOMER,LOGN-CH,IM,H50430232,457153104,2015-12-31
11211,CUST-102795,2003-04-03,2017-08-10,CUSTOMER,LOGN-CH,TECD,H50430232,878237106,2015-12-31
11212,CUST-102796,2003-04-27,2017-08-10,CUSTOMER,LOGN-CH,BBY,H50430232,86516101,2015-12-31
11213,CUST-102797,2003-04-27,2017-08-10,CUSTOMER,LOGN-CH,WMT,H50430232,931142103,2015-12-31
11214,CUST-102798,2003-04-27,2017-08-10,CUSTOMER,LOGN-CH,ODP,H50430232,676220106,2015-12-31


# Network centrality

In [5]:
# Edge tables to turn into per-node network features, one frame per snapshot
# (only 2015 here). Deliberately NOT named `list`: binding that name would
# shadow the builtin `list` in every later cell of the kernel session.
edge_frames = [df2015]

# Output column name -> callable computing that per-node metric on a graph.
# Each callable returns a {node: value} mapping; the dict's insertion order
# fixes the column order of the resulting feature table.
node_metrics = {
    'out_degree_centrality': nx.out_degree_centrality,
    'in_degree_centrality': nx.in_degree_centrality,
    'closeness_centrality': nx.closeness_centrality,
    'betweenness_centrality': lambda g: nx.betweenness_centrality(g, normalized=True),
    'eigenvector_centrality': lambda g: nx.eigenvector_centrality(
        g, max_iter=100, tol=1e-06, nstart=None, weight=None),
    'harmonic_centrality': nx.harmonic_centrality,
    'average_neighbor_degree': nx.average_neighbor_degree,
    # NOTE(review): the describe() output further down shows a
    # square_clustering maximum of 23.0; presumably the coefficient is only
    # meaningful on undirected graphs -- confirm before using this column.
    'square_clustering': nx.square_clustering,
    'core_number': nx.core_number,
    'pagerank': lambda g: nx.pagerank(g, alpha=0.9),
}

# One feature table per input frame; concatenated once after the loop instead
# of growing a DataFrame with the removed-in-pandas-2.0 `DataFrame.append`.
feature_frames = []
for item in edge_frames:
    # Directed graph with an edge source_cusip -> target_cusip per row.
    G = nx.from_pandas_edgelist(item, 'source_cusip', 'target_cusip', create_using = nx.DiGraph())
    # Drop self-loops before computing the metrics. The generator is
    # materialised first so edges are not removed while it is being consumed.
    G.remove_edges_from(list(nx.selfloop_edges(G)))

    # One named Series per metric, indexed by node.
    metric_columns = [
        pd.Series(compute(G), name=column_name)
        for column_name, compute in node_metrics.items()
    ]

    # hubs and authorities (HITS returns the two score mappings as a pair)
    hub_scores, authority_scores = nx.hits(G, max_iter=500)
    metric_columns.append(pd.Series(hub_scores, name='hubs'))
    metric_columns.append(pd.Series(authority_scores, name='authorities'))

    # Align every metric on the node index -> one row per node.
    data_merge_time = pd.concat(metric_columns, axis=1)
    #data_merge_time['year'] = " ".join(str(x) for x in item['year'].unique())

    feature_frames.append(data_merge_time)

# Stack all snapshots and persist once (the original rewrote the pickle on
# every loop iteration). With no input frames nothing is written, as before.
if feature_frames:
    net_centrality = pd.concat(feature_frames)
    net_centrality.to_pickle('network_features_2015.pkl')
else:
    net_centrality = pd.DataFrame()

In [6]:
# Preview the per-node feature table (one row per graph node).
net_centrality.head()

Unnamed: 0,out_degree_centrality,in_degree_centrality,closeness_centrality,betweenness_centrality,eigenvector_centrality,harmonic_centrality,average_neighbor_degree,square_clustering,core_number,pagerank,hubs,authorities
H50430232,0.002037,0.000719,0.122002,0.000145,0.006178,2188.354365,8.647059,0.00247,19,4.3e-05,0.000341,0.000204
457153104,0.000779,0.003535,0.155264,0.000662,0.073291,2889.579762,6.461538,0.002287,19,0.000424,4.4e-05,0.001082
878237106,0.002636,0.001977,0.14258,0.002216,0.036882,2620.60119,52.568182,0.002846,19,0.000147,0.00057,0.000587
086516101,0.000419,0.006651,0.156708,0.000706,0.077164,2940.965873,23.285714,0.002941,19,0.000543,4.6e-05,0.001983
931142103,0.001738,0.024028,0.183122,0.012074,0.155457,3534.92381,8.137931,0.002457,19,0.003122,6.4e-05,0.004436


In [7]:
# Show column dtypes and non-null counts for the feature table.
net_centrality.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16690 entries, H50430232 to Y7081A107
Data columns (total 12 columns):
out_degree_centrality      16690 non-null float64
in_degree_centrality       16690 non-null float64
closeness_centrality       16690 non-null float64
betweenness_centrality     16690 non-null float64
eigenvector_centrality     16690 non-null float64
harmonic_centrality        16690 non-null float64
average_neighbor_degree    16690 non-null float64
square_clustering          16690 non-null float64
core_number                16690 non-null int64
pagerank                   16690 non-null float64
hubs                       16690 non-null float64
authorities                16690 non-null float64
dtypes: float64(11), int64(1)
memory usage: 1.7+ MB


In [8]:
# Summary statistics per feature, transposed so each feature is a row.
net_centrality.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
out_degree_centrality,16690.0,0.000292,0.000619,0.0,0.0,5.991971e-05,0.0003,0.018395
in_degree_centrality,16690.0,0.000292,0.000939,0.0,5.991971e-05,5.991971e-05,0.00024,0.024687
closeness_centrality,16690.0,0.067169,0.055858,0.0,5.991971e-05,0.09359593,0.114743,0.183122
betweenness_centrality,16690.0,9.4e-05,0.000562,0.0,0.0,0.0,3.5e-05,0.029313
eigenvector_centrality,16690.0,0.002412,0.007356,1.2171e-24,2.3124900000000003e-23,5.852633e-05,0.001389,0.155457
harmonic_centrality,16690.0,1188.778773,996.049835,0.0,1.0,1622.659,2028.832909,3534.92381
average_neighbor_degree,16690.0,11.430576,20.273111,0.0,0.0,3.0,15.25,230.0
square_clustering,16690.0,0.036473,0.507751,0.0,0.0,0.0,0.001137,23.0
core_number,16690.0,5.052067,4.973907,1.0,1.0,3.0,7.0,19.0
pagerank,16690.0,6e-05,0.000111,1.921588e-05,1.948718e-05,2.597707e-05,5.3e-05,0.003122
