In [2]:
# data clean packages
import pandas as pd
import numpy as np
import datetime as dt

# plot packages
import seaborn as sns; sns.set(color_codes=True)
import matplotlib.pyplot as plt

# network packages
import networkx as nx
from networkx.algorithms import approximation
from networkx.algorithms import community
import community
import collections

In [3]:
df = pd.read_pickle('data_competitor_2015.pkl')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 886 entries, 4 to 1656
Data columns (total 6 columns):
ID               886 non-null object
start            886 non-null datetime64[ns]
end              886 non-null datetime64[ns]
source_ticker    886 non-null object
target_ticker    886 non-null object
date             886 non-null object
dtypes: datetime64[ns](2), object(4)
memory usage: 48.5+ KB


# undirected network features

In [5]:
G = nx.from_pandas_edgelist(df, 'source_ticker', 'target_ticker', create_using = nx.Graph())

node_clique_number = pd.Series(nx.node_clique_number(G))
node_clique_number = pd.DataFrame(node_clique_number, columns = ['node_clique_number'])

load_centrality = pd.Series(nx.load_centrality(G))
load_centrality = pd.DataFrame(load_centrality, columns = ['load_centrality'])

number_of_cliques = pd.Series(nx.number_of_cliques(G))
number_of_cliques = pd.DataFrame(number_of_cliques, columns = ['load_centrality'])

triangles = pd.Series(nx.triangles(G))
triangles = pd.DataFrame(triangles, columns = ['triangles'])

clustering = pd.Series(nx.clustering(G))
clustering = pd.DataFrame(clustering, columns = ['triangles'])

undirected = pd.concat([node_clique_number,load_centrality,number_of_cliques,triangles,clustering],axis =1)

In [6]:
undirected.head()

Unnamed: 0,node_clique_number,load_centrality,load_centrality.1,triangles,triangles.1
TIT-IT,2,0.0,1,0,0.0
MSFT,4,0.338732,214,119,0.005581
TTEC,2,0.0,1,0,0.0
IBM,4,0.316568,220,116,0.005236
CRAY,3,0.0,1,1,1.0


In [24]:
node_clique_number = nx.node_clique_number(G)

In [31]:
load_centrality = nx.load_centrality(G)

In [40]:
number_of_cliques = nx.number_of_cliques(G)

In [43]:
triangles = nx.triangles(G)

In [45]:
clustering = nx.clustering(G)

# network centrality

In [10]:
list = [df]
net_centrality = pd.DataFrame()
for item in list:
    G = nx.from_pandas_edgelist(item, 'source_ticker', 'target_ticker', create_using = nx.DiGraph())
    # degree centrality
    degree_centrality = pd.Series(nx.out_degree_centrality(G))
    degree_centrality = pd.DataFrame(degree_centrality, columns = ['degree_centrality'])
    
    # closeness
    closeness_centrality = pd.Series(nx.closeness_centrality(G))
    closeness_centrality = pd.DataFrame(closeness_centrality, columns = ['closeness_centrality'])
    
    # betweenness
    betweenness_centrality = pd.Series(nx.betweenness_centrality(G, normalized=True))
    betweenness_centrality = pd.DataFrame(betweenness_centrality, columns=['betweenness_centrality'])
    
    # eigenvector
    eigenvector_centrality = pd.Series(nx.eigenvector_centrality(G, max_iter=100, tol=1e-06, nstart=None, weight=None))
    eigenvector_centrality = pd.DataFrame(eigenvector_centrality, columns=['eigenvector_centrality'])
    
    #katz_centrality
    katz_centrality = pd.Series(nx.katz_centrality(G, alpha=0.1, beta=1.0, max_iter=1000, tol=1e-06, nstart=None, normalized=True, weight=None))
    katz_centrality = pd.DataFrame(katz_centrality, columns=['katz_centrality'])
    
    # harmonic_centrality
    harmonic_centrality = pd.Series(nx.harmonic_centrality(G))
    harmonic_centrality = pd.DataFrame(harmonic_centrality, columns = ['harmonic_centrality'])
    
    # merge_time
    data_merge_time = pd.concat([degree_centrality, closeness_centrality, 
                                 betweenness_centrality, eigenvector_centrality, 
                                 katz_centrality, harmonic_centrality],axis =1)
    #data_merge_time['date'] = " ".join(str(x) for x in item['date'].unique())
    
    # merge_all
    net_centrality = data_merge_time.append(net_centrality)

In [11]:
net_centrality.head()

Unnamed: 0,degree_centrality,closeness_centrality,betweenness_centrality,eigenvector_centrality,katz_centrality,harmonic_centrality
TIT-IT,0.001435,0.0,0.0,2.251837e-12,0.015547,0.0
MSFT,0.038737,0.420229,0.023059,0.5104381,0.523359,329.833333
TTEC,0.001435,0.0,0.0,2.251837e-12,0.015547,0.0
IBM,0.021521,0.380832,0.013784,0.3526544,0.469269,318.666667
CRAY,0.002869,0.0,0.0,2.251837e-12,0.015547,0.0


In [12]:
net_centrality.info()

<class 'pandas.core.frame.DataFrame'>
Index: 698 entries, TIT-IT to TTWO
Data columns (total 6 columns):
degree_centrality         698 non-null float64
closeness_centrality      698 non-null float64
betweenness_centrality    698 non-null float64
eigenvector_centrality    698 non-null float64
katz_centrality           698 non-null float64
harmonic_centrality       698 non-null float64
dtypes: float64(6)
memory usage: 38.2+ KB


In [13]:
net_centrality.describe()

Unnamed: 0,degree_centrality,closeness_centrality,betweenness_centrality,eigenvector_centrality,katz_centrality,harmonic_centrality
count,698.0,698.0,698.0,698.0,698.0,698.0
mean,0.001838,0.014127,6.8e-05,0.007205444,0.021,10.459408
std,0.003138,0.059063,0.001033,0.03718504,0.031513,43.944082
min,0.0,0.0,0.0,2.251837e-12,0.015547,0.0
25%,0.001435,0.0,0.0,2.251837e-12,0.015547,0.0
50%,0.001435,0.0,0.0,2.251837e-12,0.015547,0.0
75%,0.001435,0.001913,0.0,1.705766e-10,0.017257,1.5
max,0.038737,0.420229,0.023059,0.5104381,0.523359,329.833333


In [14]:
net_centrality.corr()

Unnamed: 0,degree_centrality,closeness_centrality,betweenness_centrality,eigenvector_centrality,katz_centrality,harmonic_centrality
degree_centrality,1.0,0.153226,0.510317,0.306582,0.401534,0.166247
closeness_centrality,0.153226,1.0,0.375451,0.920893,0.76533,0.998632
betweenness_centrality,0.510317,0.375451,1.0,0.656879,0.841432,0.407434
eigenvector_centrality,0.306582,0.920893,0.656879,1.0,0.937185,0.931154
katz_centrality,0.401534,0.76533,0.841432,0.937185,1.0,0.791176
harmonic_centrality,0.166247,0.998632,0.407434,0.931154,0.791176,1.0


# network features

In [69]:
list = [df]
net_features = pd.DataFrame()
for item in list:
    G = nx.from_pandas_edgelist(item, 'source_ticker', 'target_ticker', create_using = nx.DiGraph())
    # average_neighbor_degree
    average_neighbor_degree = pd.Series(nx.average_neighbor_degree(G))
    average_neighbor_degree = pd.DataFrame(average_neighbor_degree, columns = ['average_neighbor_degree'])
    
    # clustering
#     clustering = pd.Series(nx.clustering(G))
#     clustering = pd.DataFrame(clustering, columns = ['clustering'])
    
    #square_clustering 
    square_clustering = pd.Series(nx.square_clustering(G))
    square_clustering = pd.DataFrame(square_clustering, columns = ['square_clustering'])
    
    # core_number 
    core_number = pd.Series(nx.core_number(G))
    core_number = pd.DataFrame(core_number, columns = ['core_number'])
    
    # pagerank
    pagerank = pd.Series(nx.pagerank(G, alpha = 0.9))
    pagerank = pd.DataFrame(pagerank, columns = ['pagerank'])
    
    # hubs and authorties
    hits = nx.hits(G, max_iter=500)
    
    hubs = pd.Series(hits[0])
    hubs = pd.DataFrame(hubs, columns = ['hubs'])
    
    authorities = pd.Series(hits[1])
    authorities = pd.DataFrame(authorities, columns = ['authorities'])
    
    # constraint
    constraint = pd.Series(nx.constraint(G))
    constraint = pd.DataFrame(constraint, columns = ['constraint'])
    
    #merge_time
    data_merge_time = pd.concat([average_neighbor_degree,square_clustering, core_number, pagerank,hubs, authorities, constraint],axis =1)
    #data_merge_time['date'] = " ".join(str(x) for x in item['date'].unique())
    
    #merge_all
    net_features = data_merge_time.append(net_features)

In [70]:
net_features.head()

Unnamed: 0,average_neighbor_degree,square_clustering,core_number,pagerank,hubs,authorities,constraint,date
TIT-IT,27.0,0.0,1,0.000477,0.002051,0.0,1.0,2015-12-31
MSFT,1.703704,45.0,5,0.168897,0.003051,0.341465,0.033211,2015-12-31
TTEC,15.0,0.0,1,0.000477,0.002038,0.0,1.0,2015-12-31
IBM,2.733333,16.0,5,0.098422,0.002782,0.339364,0.034203,2015-12-31
CRAY,7.5,0.0,2,0.000477,0.002546,0.0,0.510271,2015-12-31


In [4]:
ratio = pd.read_sas('firm_ratio.sas7bdat', encoding='iso-8859-1')

In [5]:
ratio.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 722492 entries, 0 to 722491
Data columns (total 75 columns):
GVKEY              722492 non-null object
public_date        722492 non-null datetime64[ns]
adate              720043 non-null datetime64[ns]
qdate              722492 non-null datetime64[ns]
DPR                500868 non-null float64
PEG_TRAILING       331072 non-null float64
BM                 697244 non-null float64
CAPEI              700041 non-null float64
DIVYIELD           256676 non-null float64
EVM                717862 non-null float64
PCF                718144 non-null float64
PE_EXI             707475 non-null float64
PE_INC             707550 non-null float64
PE_OP_BASIC        707370 non-null float64
PE_OP_DIL          707332 non-null float64
PS                 709020 non-null float64
PTB                697244 non-null float64
EFFTAX             484669 non-null float64
GPROF              719752 non-null float64
AFTRET_EQ          718186 non-null float64
AFTRET_EQ