In [None]:
import numpy as np
import pandas as pd
import networkx as nx
import pickle
from tqdm import tqdm
import math

In [None]:
# Load the graph object; the file is a pickle despite its .txt suffix.
with open('graph.txt', 'rb') as fh:
    graph = pickle.load(fh)

In [None]:
# Load the pickled training set (it exposes Source, Sink and Label below).
with open('final_neg_30000.txt', 'rb') as fh:
    train = pickle.load(fh)

In [None]:
# Pull the edge endpoints and labels out of the training set; the feature
# frame starts out holding only the label column.
source, sink, Label = train.Source, train.Sink, train.Label
dataframe_train = pd.DataFrame()
dataframe_train['Label'] = Label

## Node Features

In [None]:
# Out-degree of every source node
following_so = [graph.out_degree(node) for node in source]

In [None]:
# Store the source out-degrees as a feature column.
dataframe_train['Source_following'] = following_so

In [None]:
# Out-degree of every sink node
following_si = [graph.out_degree(node) for node in sink]

In [None]:
# Store the sink out-degrees as a feature column.
dataframe_train['Sink_following'] = following_si

In [None]:
# In-degree of every source node
follower_so = [graph.in_degree(node) for node in source]

In [None]:
# Store the source in-degrees as a feature column.
dataframe_train['Source_follower'] = follower_so

In [None]:
# In-degree of every sink node
follower_si = [graph.in_degree(node) for node in sink]

In [None]:
# Store the sink in-degrees as a feature column.
dataframe_train['Sink_follower'] = follower_si

In [None]:
# Shortest path length from each source to its sink.
# Pairs are walked positionally with zip, so the list lines up with the rows
# of `train` whatever index it carries.
# 0 is kept as the "no path" sentinel; note that it is also the true distance
# of a pair whose source and sink are the same node.
shortest_path = []
for src, dst in zip(source, sink):
    try:
        hops = nx.shortest_path_length(graph, src, dst)
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        # Sink unreachable from source, or an endpoint missing from the graph.
        hops = 0
    shortest_path.append(hops)

In [None]:
# Show which distinct path lengths occur
x = np.asarray(shortest_path)
np.unique(x)

In [None]:
# Store the path lengths as a feature column.
dataframe_train['Shortest_path'] = shortest_path

In [None]:
# Display the feature frame as it stands after the node-level features.
dataframe_train

In [None]:
# Per-column count of cells equal to 0
print('The Number of zeros:')
dataframe_train.isin([0]).sum(axis=0)

## Similarity Functions for Directed Graph

In [None]:
# Out-neighbours
def getNeighbors_Source(node, graph):
    """Return everything yielded by `graph.neighbors(node)` as a list."""
    return list(graph.neighbors(node))

In [None]:
# In-neighbours
def getNeighbors_Sink(node, graph):
    """Return everything yielded by `graph.predecessors(node)` as a list."""
    return list(graph.predecessors(node))

In [None]:
# Common Neighbors
def CN(n1,n2,graph):
    """Count the nodes that are both an out-neighbour of `n1` and a predecessor of `n2`."""
    out_of_n1 = set(getNeighbors_Source(n1, graph))
    into_n2 = set(getNeighbors_Sink(n2, graph))
    return len(out_of_n1 & into_n2)

In [None]:
# Size of the combined neighbourhood of the two nodes
def union(n1,n2,graph):
    """Count the distinct nodes that are an out-neighbour of `n1` or a predecessor of `n2`."""
    combined = set(getNeighbors_Source(n1, graph))
    combined.update(getNeighbors_Sink(n2, graph))
    return len(combined)

In [None]:
# Jaccard Coefficient (smoothed)
def JC(n1,n2,graph):
    """Return CN(n1, n2) / (union(n1, n2) + 1).

    The +1 keeps the denominator non-zero when both neighbourhoods are empty.
    """
    shared = CN(n1, n2, graph)
    total = union(n1, n2, graph)
    return shared / (total + 1)

In [None]:
# Adamic Adar Index
def AAI(n1, n2, graph):
    """Sum 1 / log(k + 0.5) over the common neighbours of the pair.

    A common neighbour is a node that is both an out-neighbour of `n1` and a
    predecessor of `n2`; k is the number of out-neighbours of that node.
    Returns 0 when there is no common neighbour.
    """
    bridges = set(getNeighbors_Source(n1, graph)) & set(getNeighbors_Sink(n2, graph))

    score = 0
    for mid in bridges:
        out_count = len(getNeighbors_Source(mid, graph))
        score += 1 / math.log(out_count + 0.5)
    return score

In [None]:
# Preferential Attachment (*)
def PA(n1,n2,graph):
    """Return (number of out-neighbours of `n1`) * (number of predecessors of `n2`)."""
    out_count = len(getNeighbors_Source(n1, graph))
    in_count = len(getNeighbors_Sink(n2, graph))
    return out_count * in_count

In [None]:
# CN/+
def SI(n1,n2,graph):
    """Return the common-neighbour count divided by the summed neighbourhood sizes.

    The numerator counts nodes that are both an out-neighbour of `n1` and a
    predecessor of `n2`; the denominator is len(out-neighbours of n1) +
    len(predecessors of n2). Returns 0 when both neighbourhoods are empty,
    matching the zero fallback used by HP and LHN.
    """
    nei1 = getNeighbors_Source(n1, graph)
    nei2 = getNeighbors_Sink(n2, graph)
    total = len(nei1) + len(nei2)
    if total == 0:
        return 0
    # Divide the *size* of the intersection; dividing the set itself raises TypeError.
    shared = len(set(nei1).intersection(set(nei2)))
    return shared / total

In [None]:
# CN/min()
def HP(n1,n2,graph):
    """Return the common-neighbour count divided by the smaller neighbourhood size.

    The neighbourhoods are the out-neighbours of `n1` and the predecessors of
    `n2`. Returns 0 when either neighbourhood is empty.
    """
    nei1 = getNeighbors_Source(n1, graph)
    nei2 = getNeighbors_Sink(n2, graph)
    smaller = min(len(nei1), len(nei2))
    # Explicit guard instead of a bare except around the division.
    if smaller == 0:
        return 0
    shared = len(set(nei1).intersection(set(nei2)))
    return shared / smaller

In [None]:
# CN/max()
def HD(n1,n2,graph):
    """Return the common-neighbour count divided by the larger neighbourhood size.

    The neighbourhoods are the out-neighbours of `n1` and the predecessors of
    `n2`. Returns 0 when both neighbourhoods are empty, matching the zero
    fallback used by HP and LHN.
    """
    nei1 = getNeighbors_Source(n1, graph)
    nei2 = getNeighbors_Sink(n2, graph)
    larger = max(len(nei1), len(nei2))
    if larger == 0:
        return 0
    # Divide the *size* of the intersection; dividing the set itself raises TypeError.
    shared = len(set(nei1).intersection(set(nei2)))
    return shared / larger

In [None]:
# Resource Allocation Index
def RA(n1,n2,graph):
    """Sum 1 / (k + 0.5) over the common neighbours of the pair.

    A common neighbour is a node that is both an out-neighbour of `n1` and a
    predecessor of `n2`; k is the number of out-neighbours of that node.
    Returns 0 when there is no common neighbour.
    """
    bridges = set(getNeighbors_Source(n1, graph)) & set(getNeighbors_Sink(n2, graph))

    score = 0
    for mid in bridges:
        out_count = len(getNeighbors_Source(mid, graph))
        score += 1 / (out_count + 0.5)
    return score

In [None]:
# CN/PA
def LHN(n1,n2,graph):
    """Return the common-neighbour count divided by the product of neighbourhood sizes.

    The neighbourhoods are the out-neighbours of `n1` and the predecessors of
    `n2`; their size product is the same value PA computes. Returns 0 when
    either neighbourhood is empty.
    """
    nei1 = getNeighbors_Source(n1, graph)
    nei2 = getNeighbors_Sink(n2, graph)
    # Reuse the lists already fetched instead of refetching them through PA.
    product = len(nei1) * len(nei2)
    # Explicit guard instead of a bare except around the division.
    if product == 0:
        return 0
    shared = len(set(nei1).intersection(set(nei2)))
    return shared / product

## Extract the Similarity Features from Dataset

In [None]:
# Common-neighbour count for each (source, sink) pair. Pairs are walked
# positionally with zip so the list lines up with the rows of `train`
# whatever index it carries.
cn = [
    CN(src, dst, graph)
    for src, dst in tqdm(zip(source, sink), total=len(source))
]

In [None]:
# Store the common-neighbour counts as a feature column.
dataframe_train['CN'] = cn

In [None]:
# Adamic-Adar score for each (source, sink) pair. Pairs are walked
# positionally with zip so the list lines up with the rows of `train`
# whatever index it carries.
aai = [
    AAI(src, dst, graph)
    for src, dst in tqdm(zip(source, sink), total=len(source))
]

In [None]:
# Store the Adamic-Adar scores as a feature column.
dataframe_train['AAI'] = aai

In [None]:
# Jaccard score for each (source, sink) pair. Pairs are walked positionally
# with zip so the list lines up with the rows of `train` whatever index it
# carries.
jc = [
    JC(src, dst, graph)
    for src, dst in tqdm(zip(source, sink), total=len(source))
]

In [None]:
# Store the Jaccard scores as a feature column.
dataframe_train['JC'] = jc

In [None]:
# Preferential-attachment score for each (source, sink) pair. Pairs are
# walked positionally with zip so the list lines up with the rows of `train`
# whatever index it carries.
pa = [
    PA(src, dst, graph)
    for src, dst in tqdm(zip(source, sink), total=len(source))
]

In [None]:
# Store the preferential-attachment scores as a feature column.
dataframe_train['PA'] = pa

In [None]:
# Resource-allocation score for each (source, sink) pair. Pairs are walked
# positionally with zip so the list lines up with the rows of `train`
# whatever index it carries.
ra = [
    RA(src, dst, graph)
    for src, dst in tqdm(zip(source, sink), total=len(source))
]

In [None]:
# Store the resource-allocation scores as a feature column.
dataframe_train['RA'] = ra

In [None]:
# HP (CN/min) score for each (source, sink) pair. Pairs are walked
# positionally with zip so the list lines up with the rows of `train`
# whatever index it carries.
hp = [
    HP(src, dst, graph)
    for src, dst in tqdm(zip(source, sink), total=len(source))
]

In [None]:
# Store the HP scores as a feature column.
dataframe_train['HP'] = hp

In [None]:
# LHN (CN/PA) score for each (source, sink) pair. Pairs are walked
# positionally with zip so the list lines up with the rows of `train`
# whatever index it carries.
lhn = [
    LHN(src, dst, graph)
    for src, dst in tqdm(zip(source, sink), total=len(source))
]

In [None]:
# Store the LHN scores as a feature column.
dataframe_train['LHN'] = lhn

In [None]:
# Load the pickled PageRank lookup and map it onto both endpoints.
# `.get` is used for the lookup, so a node missing from the mapping is not an
# error (for a plain dict it yields None).
with open('PageRank.txt', 'rb') as fh:
    pr = pickle.load(fh)

dataframe_train['PageRank_Src'] = source.apply(pr.get)
dataframe_train['PageRank_Sink'] = sink.apply(pr.get)


In [None]:
# Display the feature frame as it stands after the similarity and PageRank features.
dataframe_train

In [None]:
# Per-column count of cells equal to 0
print('The Number of Zeros:')
dataframe_train.isin([0]).sum(axis=0)

## Centrality Features

In [None]:
# Load the pickled eigenvector-centrality lookup and map it onto both endpoints.
with open('Eigenvector_Centrality.txt', 'rb') as fh:
    centrality = pickle.load(fh)
dataframe_train['ECentrality_Sour'] = source.apply(centrality.get)
dataframe_train['ECentrality_Sink'] = sink.apply(centrality.get)

In [None]:
# Load the pickled degree-centrality lookup and map it onto both endpoints.
with open('Degree_centrality.txt', 'rb') as fh:
    dc = pickle.load(fh)
dataframe_train['Degree_Centrality_Sour'] = source.apply(dc.get)
dataframe_train['Degree_Centrality_Sink'] = sink.apply(dc.get)

In [None]:
# Per-column count of cells equal to 0
print('The Number of Zeros:')
dataframe_train.isin([0]).sum(axis=0)

## Output Feature Dataframe

In [None]:
# Persist the finished feature frame as a pickle (the .txt suffix is kept as is).
with open('dataframe_train.txt', 'wb') as fh:
    pickle.dump(dataframe_train, fh)

In [None]:
# Display the final feature frame that was written to disk.
dataframe_train