In [1]:
import numpy as np
import pandas as pd
import random
import networkx as nx
import pickle
from tqdm import tqdm
import math

In [2]:
# Load the pickled graph object; despite the .txt extension the file is read
# in binary mode and decoded with pickle. Later cells call
# graph.neighbors() and graph.predecessors() on it.
# NOTE: pickle.load can execute arbitrary code - only open files you trust.
with open('graph.txt', 'rb') as file:
    graph = pickle.load(file)

In [3]:
# Load the pickled training table; later cells read its `Source` and `Sink`
# attributes. The file is a pickle (binary) even though it is named .txt.
# NOTE: pickle.load can execute arbitrary code - only open files you trust.
with open('Train_HHH.txt', 'rb') as file:
    train = pickle.load(file)

## Similarity Functions for Directed Graphs

In [99]:
def getNeighbors_Source(node, graph):
    """Return everything ``graph.neighbors(node)`` yields, as a list.

    For a networkx directed graph these are the successors of ``node``
    (the nodes it points to).
    """
    return list(graph.neighbors(node))

In [83]:
def getNeighbors_Sink(node, graph):
    """Return everything ``graph.predecessors(node)`` yields, as a list.

    These are the nodes that have an edge pointing into ``node``.
    """
    return list(graph.predecessors(node))

In [84]:
# Common Neighbors
def CN(n1,n2,graph):
    """Count the nodes that sit between ``n1`` and ``n2``.

    A node is counted when it appears both in
    ``getNeighbors_Source(n1, graph)`` and in
    ``getNeighbors_Sink(n2, graph)``.
    """
    out_of_n1 = set(getNeighbors_Source(n1, graph))
    into_n2 = set(getNeighbors_Sink(n2, graph))
    return len(out_of_n1 & into_n2)

In [85]:
# Size of the combined neighbourhood of the two nodes
def union(n1,n2,graph):
    """Count the distinct nodes found in either neighbour list.

    Combines ``getNeighbors_Source(n1, graph)`` with
    ``getNeighbors_Sink(n2, graph)`` and returns how many different
    nodes the two lists contain together.
    """
    out_of_n1 = set(getNeighbors_Source(n1, graph))
    into_n2 = set(getNeighbors_Sink(n2, graph))
    return len(out_of_n1 | into_n2)

In [86]:
# Jaccard Coefficient (smoothed)
def JC(n1,n2,graph):
    """Return ``CN / (union + 1)`` for the pair ``(n1, n2)``.

    The ``+ 1`` keeps the denominator at least 1, so the result is defined
    even when both neighbour lists are empty; it also makes the value
    slightly smaller than the plain intersection-over-union ratio.
    """
    shared = CN(n1,n2,graph)
    combined = union(n1,n2,graph)
    return shared / (combined + 1)

In [None]:
# Adamic Adar Index
def AAI(n1, n2, graph):
    """Sum a log-damped weight over the common nodes of ``n1`` and ``n2``.

    Common nodes are those in both ``getNeighbors_Source(n1, graph)`` and
    ``getNeighbors_Sink(n2, graph)``. Each one contributes
    ``1 / log(k + 0.5)``, where ``k`` is the length of its own
    ``getNeighbors_Source`` list. Returns 0 when there are no common nodes.
    """
    out_of_n1 = set(getNeighbors_Source(n1, graph))
    into_n2 = set(getNeighbors_Sink(n2, graph))
    # The 0.5 offset presumably avoids log(1) == 0 for a node with a single
    # outgoing neighbour - confirm before changing it.
    return sum(
        1 / math.log(len(getNeighbors_Source(mid, graph)) + 0.5)
        for mid in out_of_n1 & into_n2
    )

In [88]:
# Preferential Attachment (*)
def PA(n1,n2,graph):
    """Multiply the sizes of the two neighbour lists.

    Returns ``len(getNeighbors_Source(n1, graph))`` times
    ``len(getNeighbors_Sink(n2, graph))``.
    """
    n_out = len(getNeighbors_Source(n1, graph))
    n_in = len(getNeighbors_Sink(n2, graph))
    return n_out * n_in

In [90]:
# Hub Promoted index: CN / min(len(neighbours of n1), len(predecessors of n2))
def HP(n1,n2,graph):
    """Return the common-neighbour count divided by the smaller list size.

    Parameters
    ----------
    n1 : node whose ``graph.neighbors`` list is used (the source side).
    n2 : node whose ``graph.predecessors`` list is used (the sink side).
    graph : object exposing ``neighbors(node)`` and ``predecessors(node)``.

    Returns
    -------
    float
        ``|common| / min(len(nei1), len(nei2))``, or ``0.0`` when either
        list is empty (the ratio is undefined there and no common node can
        exist anyway).
    """
    # Same graph queries that getNeighbors_Source / getNeighbors_Sink make.
    nei1 = list(graph.neighbors(n1))
    nei2 = list(graph.predecessors(n2))

    smaller = min(len(nei1), len(nei2))
    if smaller == 0:
        return 0.0

    # Divide the *size* of the intersection; the set itself cannot be
    # divided by a number.
    common = set(nei1) & set(nei2)
    return len(common) / smaller

In [93]:
# Leicht-Holme-Newman style index: CN / PA
def LHN(n1,n2,graph):
    """Return the common-neighbour count divided by the product of list sizes.

    Parameters
    ----------
    n1 : node whose ``graph.neighbors`` list is used (the source side).
    n2 : node whose ``graph.predecessors`` list is used (the sink side).
    graph : object exposing ``neighbors(node)`` and ``predecessors(node)``.

    Returns
    -------
    float
        ``|common| / (len(nei1) * len(nei2))``, or ``0.0`` when either list
        is empty (the product is 0 and no common node can exist).
    """
    # Same graph queries that getNeighbors_Source / getNeighbors_Sink make.
    nei1 = list(graph.neighbors(n1))
    nei2 = list(graph.predecessors(n2))

    # Same value PA(n1, n2, graph) produces, without fetching both
    # neighbour lists a second time.
    pa = len(nei1) * len(nei2)
    if pa == 0:
        return 0.0

    # Divide the *size* of the intersection; the set itself cannot be
    # divided by a number.
    common = set(nei1) & set(nei2)
    return len(common) / pa

## Extract Similarity Features from the Dataset

In [117]:
# Pull the two endpoint columns out of `train`; the feature loops below pass
# one (source, sink) pair at a time to the similarity functions.
source=train.Source
sink=train.Sink

In [120]:
# Start from an empty frame; the endpoint and feature columns are attached
# to it one at a time in the cells that follow.
dataframe_train=pd.DataFrame()

In [121]:
# Copy the node-pair columns into the feature table.
dataframe_train['Source']=source
dataframe_train['Sink']=sink

In [122]:
# Display the table to check the Source/Sink columns before adding features.
dataframe_train

Unnamed: 0,Source,Sink
0,1604753,3794609
1,1927211,2402534
2,3438576,3589472
3,2029671,2363623
4,2031305,4169107
...,...,...
59995,1840280,4251197
59996,506428,1721946
59997,261881,3480188
59998,2673351,2388735


In [None]:
cn = []
# Walk the two columns in lock-step by position. Looking rows up as
# source[i] / sink[i] is a label lookup on a pandas Series, so it only works
# while the index is exactly 0..n-1 (it breaks after filtering or sampling).
for src, dst in tqdm(zip(source, sink), total=len(source)):
    cn.append(CN(src, dst, graph))

In [None]:
# Attach the common-neighbour counts computed in the previous cell.
dataframe_train['CN']=cn

In [None]:
hp = []
# Walk the two columns in lock-step by position. Looking rows up as
# source[i] / sink[i] is a label lookup on a pandas Series, so it only works
# while the index is exactly 0..n-1 (it breaks after filtering or sampling).
for src, dst in tqdm(zip(source, sink), total=len(source)):
    hp.append(HP(src, dst, graph))

In [None]:
# Attach the HP scores computed in the previous cell.
dataframe_train['HP']=hp

In [None]:
lhn = []
# Walk the two columns in lock-step by position. Looking rows up as
# source[i] / sink[i] is a label lookup on a pandas Series, so it only works
# while the index is exactly 0..n-1 (it breaks after filtering or sampling).
for src, dst in tqdm(zip(source, sink), total=len(source)):
    lhn.append(LHN(src, dst, graph))

In [None]:
# Attach the LHN scores computed in the previous cell.
dataframe_train['LHN']=lhn

In [112]:
# Display the feature table.
# NOTE(review): the output recorded under this cell lists the columns
# CN, AAI, JC, PA, RA, while the cells above add CN, HP and LHN - the saved
# output looks stale; re-run from a fresh kernel to confirm.
dataframe_train

Unnamed: 0,Source,Sink,CN,AAI,JC,PA,RA
0,3563811,3600160,0,0.000000,0.000000,609,0.000000
1,2052043,1401960,0,0.000000,0.000000,639,0.000000
2,4517994,1690636,2,0.404742,0.009050,3485,0.014290
3,1660006,4349447,2,0.178818,0.003697,18216,0.000028
4,581111,1882617,0,0.000000,0.000000,828,0.000000
...,...,...,...,...,...,...,...
1995,1461386,2341683,0,0.000000,0.000000,106,0.000000
1996,4057755,1871227,0,0.000000,0.000000,3895,0.000000
1997,4242514,1413468,0,0.000000,0.000000,54,0.000000
1998,555531,1290080,0,0.000000,0.000000,168,0.000000


In [None]:
# Report how many node pairs scored exactly zero on each index.
hp_zero_count = hp.count(0)
lhn_zero_count = lhn.count(0)
print('The Number of Zeros: \nHP： %d \nLHN: %d' % (hp_zero_count, lhn_zero_count))

In [114]:
# Persist the feature table as a pickle (binary, despite the .txt name) so it
# can be reloaded with pickle.load without recomputing the features.
with open('dataframe_train2.txt','wb') as file:
    pickle.dump(dataframe_train,file)

In [115]:
# NOTE(review): `dataframe_test` is never assigned in the visible part of
# this notebook, so this cell raises NameError on a fresh kernel unless it is
# created elsewhere. The output recorded here is identical to the one shown
# for `dataframe_train` above - confirm which table was meant.
dataframe_test

Unnamed: 0,Source,Sink,CN,AAI,JC,PA,RA
0,3563811,3600160,0,0.000000,0.000000,609,0.000000
1,2052043,1401960,0,0.000000,0.000000,639,0.000000
2,4517994,1690636,2,0.404742,0.009050,3485,0.014290
3,1660006,4349447,2,0.178818,0.003697,18216,0.000028
4,581111,1882617,0,0.000000,0.000000,828,0.000000
...,...,...,...,...,...,...,...
1995,1461386,2341683,0,0.000000,0.000000,106,0.000000
1996,4057755,1871227,0,0.000000,0.000000,3895,0.000000
1997,4242514,1413468,0,0.000000,0.000000,54,0.000000
1998,555531,1290080,0,0.000000,0.000000,168,0.000000


## Debugging

In [76]:
# `getNeighbors` is not defined anywhere in this notebook (the helpers are
# getNeighbors_Source / getNeighbors_Sink), so the original call fails on a
# fresh kernel. The follow-up cells inspect whether '3600160' is a source
# node, which suggests the outgoing-neighbour helper is the one intended here.
print(getNeighbors_Source('3563811', graph),getNeighbors_Source('3600160', graph))

['787039', '3519329', '2472993', '3274877', '1541304', '673835', '1405689', '2401069', '1628939', '4016137', '4008078', '3017264', '2331231', '127885', '3355752', '1523982', '2166423', '1120944', '3638703', '2558493', '4027460'] []


In [77]:
# `getNeighbors` is not defined in this notebook; use the outgoing-neighbour
# helper to check that '3600160' has no nodes it points to.
getNeighbors_Source('3600160', graph)

[]

In [78]:
# Load the pickled `source_node` collection used by the membership check in
# the next cell.
# NOTE: pickle.load can execute arbitrary code - only open files you trust.
with open('source_node.txt', 'rb') as file:
    source_node = pickle.load(file)

In [80]:
# Check whether the node with the empty neighbour list appears in
# `source_node` at all.
'3600160' in source_node

False

In [None]:
# List the nodes with an edge pointing into '3600160', to see whether it
# only ever appears on the receiving end of an edge.
list(graph.predecessors('3600160'))