In [3]:
import numpy as np
import pandas as pd
import random
import networkx as nx
import pickle
from tqdm import tqdm
import math

In [4]:
with open('graph.txt', 'rb') as file:
    graph = pickle.load(file)

In [None]:
with open('Train_HHH.txt', 'rb') as file:
    train = pickle.load(file)

In [None]:
# Trian w label
source=train.Source
sink=train.Sink
Label=train.Label
dataframe_train['Label']=Label

## Nodes Features

In [10]:
# Source following
following_so=[]
for elem in source:
    following_so.append(graph.out_degree(elem))

In [12]:
dataframe_train['Source_following']=following_so

In [13]:
# Sink following
following_si=[]
for elem in sink:
    following_si.append(graph.out_degree(elem))

In [14]:
dataframe_train['Sink_following']=following_si

In [15]:
# Source follower
follower_so=[]
for elem in source:
    follower_so.append(graph.in_degree(elem))

In [16]:
dataframe_train['Source_follower']=follower_so

In [17]:
# Sink follower
follower_si=[]
for elem in sink:
    follower_si.append(graph.in_degree(elem))

In [18]:
dataframe_train['Sink_follower']=follower_si

In [20]:
shortest_path=[]
for i in range(len(source)):
    try:
        ll=nx.shortest_path_length(graph,source[i],sink[i])
    except:
        ll=0
    shortest_path.append(ll)

In [21]:
x = np.array(shortest_path)
np.unique(x)

array([0, 2, 3, 4, 5])

In [22]:
dataframe_train['Shortest_path']=shortest_path

In [23]:
dataframe_train

Unnamed: 0,Source_following,Sink_following,Source_follower,Sink_follower,Shortest_path
0,21,0,3,29,4
1,71,0,13,9,3
2,205,0,80,17,2
3,506,0,32,36,2
4,18,0,5,46,3
...,...,...,...,...,...
1995,53,0,16,2,3
1996,95,0,53,41,3
1997,27,0,6,2,3
1998,56,0,7,3,3


In [31]:
print('The Number of zeros:')
dataframe_train.isin([0]).sum()

The Number of zeros


Source_following       0
Sink_following      1631
Source_follower        0
Sink_follower          0
Shortest_path         23
dtype: int64

## Similarity Functions for Directed Graph

In [32]:
def getNeighbors_Source(node, graph):
    nei = [n for n in graph.neighbors(node)]
    return nei

In [33]:
def getNeighbors_Sink(node, graph):
    nei = [n for n in graph.predecessors(node)]
    return nei

In [34]:
# Common Neighbors
def CN(n1,n2,graph):
    nei1 = getNeighbors_Source(n1, graph)
    nei2 = getNeighbors_Sink(n2, graph)
    cn = len(set(nei1).intersection(set(nei2)))
    return cn

In [35]:
# All the Neighbors for the Two Nodes
def union(n1,n2,graph):
    nei1 = getNeighbors_Source(n1, graph)
    nei2 = getNeighbors_Sink(n2, graph)
    union = len(set(nei1).union(set(nei2)))
    return union

In [36]:
# Jaccard Coefficient
def JC(n1,n2,graph):
    cn = CN(n1,n2,graph)
    un = union(n1,n2,graph)
    jc = cn/(un+1)
    return jc

In [37]:
# Adamic Adar Index
def AAI(n1, n2, graph):
    nei1 = getNeighbors_Source(n1, graph)
    nei2 = getNeighbors_Sink(n2, graph)  
    cn = set(nei1).intersection(set(nei2))
    
    aai = 0
    for i in cn:
        n = getNeighbors_Source(i, graph)
        num = len(n)
        aai += 1 / math.log(num + 0.5)   
    return aai

In [38]:
# Preferential Attachment (*)
def PA(n1,n2,graph):
    nei1 = getNeighbors_Source(n1, graph)
    nei2 = getNeighbors_Sink(n2, graph)
    pa = len(nei1)*len(nei2)
    return pa

In [57]:
# CN/min()
def HP(n1,n2,graph):
    nei1 = getNeighbors_Source(n1, graph)
    nei2 = getNeighbors_Sink(n2, graph)
    cn = set(nei1).intersection(set(nei2))
    hp = len(cn)/min(len(nei1),len(nei2))
    return hp

In [51]:
# Resource Allocation Index
def RA(n1,n2,graph):
    nei1 = getNeighbors_Source(n1, graph)
    nei2 = getNeighbors_Sink(n2, graph)  
    cn = set(nei1).intersection(set(nei2))
    
    ra = 0
    for i in cn:
        n = getNeighbors_Source(i, graph)
        num = len(n)
        ra += 1 / (num + 0.5)
    return ra

In [58]:
# CN/PA
def LHN(n1,n2,graph):
    nei1 = getNeighbors_Source(n1, graph)
    nei2 = getNeighbors_Sink(n2, graph)  
    cn = set(nei1).intersection(set(nei2))
    lhn = len(cn)/(PA(n1,n2,graph))
    return lhn

## Extract the Similarity Features from Dataset

In [41]:
dataframe_train

Unnamed: 0,Source_following,Sink_following,Source_follower,Sink_follower,Shortest_path
0,21,0,3,29,4
1,71,0,13,9,3
2,205,0,80,17,2
3,506,0,32,36,2
4,18,0,5,46,3
...,...,...,...,...,...
1995,53,0,16,2,3
1996,95,0,53,41,3
1997,27,0,6,2,3
1998,56,0,7,3,3


In [42]:
cn = []
for i in tqdm(range(len(source))):
    cn.append(CN(source[i],sink[i],graph))

100%|██████████| 2000/2000 [00:00<00:00, 3451.20it/s]


In [43]:
dataframe_train['CN']=cn

In [44]:
aai = []
for i in tqdm(range(len(source))):
    aai.append(AAI(source[i],sink[i],graph))

100%|██████████| 2000/2000 [00:28<00:00, 69.67it/s] 


In [45]:
dataframe_train['AAI']=aai

In [46]:
jc = []
for i in tqdm(range(len(source))):
    jc.append(JC(source[i],sink[i],graph))

100%|██████████| 2000/2000 [00:00<00:00, 2022.19it/s]


In [47]:
dataframe_train['JC']=jc

In [48]:
pa = []
for i in tqdm(range(len(source))):
    pa.append(PA(source[i],sink[i],graph))

100%|██████████| 2000/2000 [00:00<00:00, 8695.68it/s]


In [49]:
dataframe_train['PA']=pa

In [52]:
ra = []
for i in tqdm(range(len(source))):
    ra.append(RA(source[i],sink[i],graph))

100%|██████████| 2000/2000 [00:20<00:00, 99.50it/s] 


In [53]:
dataframe_train['RA']=ra

In [59]:
hp = []
for i in tqdm(range(len(source))):
    hp.append(HP(source[i],sink[i],graph))

100%|██████████| 2000/2000 [00:00<00:00, 4010.40it/s]


In [60]:
dataframe_train['HP']=hp

In [61]:
lhn = []
for i in tqdm(range(len(source))):
    lhn.append(LHN(source[i],sink[i],graph))

100%|██████████| 2000/2000 [00:00<00:00, 3141.72it/s]


In [62]:
dataframe_train['LHN']=lhn

In [65]:
with open('PageRank.txt', 'rb') as file:
    pr = pickle.load(file)
    
dataframe_train['PageRank_Src'] = source.apply(lambda row: pr.get(row))
dataframe_train['PageRank_Sink'] = sink.apply(lambda row: pr.get(row))


In [66]:
dataframe_train

Unnamed: 0,Source_following,Sink_following,Source_follower,Sink_follower,Shortest_path,CN,AAI,JC,PA,RA,HP,LHN,PageRank_Src,PageRank_Sink
0,21,0,3,29,4,0,0.000000,0.000000,609,0.000000,0.000000,0.000000,2.100194e-07,2.108565e-07
1,71,0,13,9,3,0,0.000000,0.000000,639,0.000000,0.000000,0.000000,2.097592e-07,2.196712e-07
2,205,0,80,17,2,2,0.404742,0.009050,3485,0.014290,0.117647,0.000574,2.502170e-07,2.185784e-07
3,506,0,32,36,2,2,0.178818,0.003697,18216,0.000028,0.055556,0.000110,2.079456e-07,2.060563e-07
4,18,0,5,46,3,0,0.000000,0.000000,828,0.000000,0.000000,0.000000,2.059882e-07,2.053494e-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,53,0,16,2,3,0,0.000000,0.000000,106,0.000000,0.000000,0.000000,2.198319e-07,2.047780e-07
1996,95,0,53,41,3,0,0.000000,0.000000,3895,0.000000,0.000000,0.000000,2.553266e-07,2.050740e-07
1997,27,0,6,2,3,0,0.000000,0.000000,54,0.000000,0.000000,0.000000,2.064844e-07,2.049514e-07
1998,56,0,7,3,3,0,0.000000,0.000000,168,0.000000,0.000000,0.000000,2.287362e-07,2.048797e-07


In [67]:
print('The Number of Zeros:')
dataframe_train.isin([0]).sum()

The Number of Zeros:


Source_following       0
Sink_following      1631
Source_follower        0
Sink_follower          0
Shortest_path         23
CN                   922
AAI                  922
JC                   922
PA                     0
RA                   922
HP                   922
LHN                  922
PageRank_Src           0
PageRank_Sink          0
dtype: int64

## Centrality Features

In [69]:
with open('Eigenvector_Centrality.txt', 'rb') as file:
    centrality = pickle.load(file)
dataframe_train['ECentrality_Sour'] = source.apply(lambda row: centrality.get(row))
dataframe_train['ECentrality_Sink'] = sink.apply(lambda row: centrality.get(row))

In [70]:
with open('Degree_centrality.txt', 'rb') as file:
    dc = pickle.load(file)
dataframe_train['Degree_Centrality_Sour'] = source.apply(lambda row: dc.get(row))
dataframe_train['Degree_Centrality_Sink'] = sink.apply(lambda row: dc.get(row))

In [71]:
print('The Number of Zeros:')
dataframe_train.isin([0]).sum()

The Number of Zeros:


Source_following             0
Sink_following            1631
Source_follower              0
Sink_follower                0
Shortest_path               23
CN                         922
AAI                        922
JC                         922
PA                           0
RA                         922
HP                         922
LHN                        922
PageRank_Src                 0
PageRank_Sink                0
ECentrality_Sour             0
ECentrality_Sink             0
Degree_Centrality_Sour       0
Degree_Centrality_Sink       0
dtype: int64

## Output Feature Dataframe

In [72]:
with open('dataframe_train.txt','wb') as file:
    pickle.dump(dataframe_train,file)

In [115]:
dataframe_train

Unnamed: 0,Source,Sink,CN,AAI,JC,PA,RA
0,3563811,3600160,0,0.000000,0.000000,609,0.000000
1,2052043,1401960,0,0.000000,0.000000,639,0.000000
2,4517994,1690636,2,0.404742,0.009050,3485,0.014290
3,1660006,4349447,2,0.178818,0.003697,18216,0.000028
4,581111,1882617,0,0.000000,0.000000,828,0.000000
...,...,...,...,...,...,...,...
1995,1461386,2341683,0,0.000000,0.000000,106,0.000000
1996,4057755,1871227,0,0.000000,0.000000,3895,0.000000
1997,4242514,1413468,0,0.000000,0.000000,54,0.000000
1998,555531,1290080,0,0.000000,0.000000,168,0.000000


## Code Debug

In [76]:
print(getNeighbors('3563811', graph),getNeighbors('3600160', graph))

['787039', '3519329', '2472993', '3274877', '1541304', '673835', '1405689', '2401069', '1628939', '4016137', '4008078', '3017264', '2331231', '127885', '3355752', '1523982', '2166423', '1120944', '3638703', '2558493', '4027460'] []


In [77]:
getNeighbors('3600160', graph)

[]

In [78]:
with open('source_node.txt', 'rb') as file:
    source_node = pickle.load(file)

In [80]:
'3600160' in source_node

False

In [None]:
list(graph.predecessors('3600160'))