In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from tqdm import tqdm
import pickle

***
**Read the `.txt` file to dataframe**
***

In [2]:
train_data=pd.read_csv("train.txt", delimiter=",", header=None,names=['Neighbours'],index_col=False)

In [3]:
train_data['ID']=train_data['Neighbours'].apply(lambda x: x.split('\t')[0])  # Get the ID

In [4]:
train_data['Neighbours']=train_data['Neighbours'].apply(lambda x: x.split('\t')[1:])# set up the neighbours

In [5]:
train_data=train_data[["ID","Neighbours"]] # set "ID" as the index

***
**Transfer the table data to the graph data**
***

In [6]:
num_source=train_data.shape[0]
sink=train_data.iloc[0,]
len_sink=len(sink)

In [7]:
train_data

Unnamed: 0,ID,Neighbours
0,540762,"[1912140, 1537559, 3091331, 2757277, 3237295, ..."
1,2129843,"[65840, 3414168, 4523797, 2851163, 4321895, 13..."
2,3361377,"[955840, 3342058, 1536902, 1850727, 1504632, 1..."
3,1199298,"[2300061, 2635670, 2803600, 744722, 881446, 28..."
4,1392121,"[3845572, 546016, 4361302, 678461, 4294597, 24..."
...,...,...
19995,585576,"[660302, 3279973, 2094235, 2355188, 1296935, 3..."
19996,505961,"[3875645, 2148630, 4288909, 4011139, 340232, 1..."
19997,125824,"[54421, 868022, 385000, 2050130, 3446665, 2040..."
19998,896087,"[431577, 1007572, 499457, 3642500, 3734728, 28..."


In [16]:
diG = nx.DiGraph()
for i in range(num_source):
    source=train_data.iloc[i,0] # The source
    sinks=train_data.iloc[i,1] # Neighbours
    len_sink=len(sinks)
    for j in range(len_sink):
        sink=sinks[j]
        diG.add_edge(source,sink)
        if sink not in diG.nodes:
            diG.add_node(sink)      

In [17]:
list(diG.edges)[:5]

[('540762', '1912140'),
 ('540762', '1537559'),
 ('540762', '3091331'),
 ('540762', '2757277'),
 ('540762', '3237295')]

In [18]:
print("The Di-graph contains %d nodes and %d edges" %(len(diG.nodes),len(diG.edges)))

The Di-graph contains 4867136 nodes and 23946602 edges


***
Bulid the adjacent matrix for the _source nodes_
***

In [12]:
nodelist=train_data.iloc[:,0]
nodelist.shape
type(nodelist)
list_node = list(nodelist)
list_1 = list_node[:10000]
list_2 = list_node[10000:]
list_1[:5]

['540762', '2129843', '3361377', '1199298', '1392121']

In [None]:
adj_matrix1={}
for node in tqdm(list_1):
    for n in list_1:
        if diG.has_edge(node, n):
            adj_matrix1[(node,n)]=1
        else:
            adj_matrix1[(node,n)]=0

In [11]:
#adj_matrix1[('540762',1912140)]
diG.has_edge('540762','1912140')

True

In [None]:
with open ('dict_matrix1.txt', 'wb') as file:
    tqdm(must_save = adj_matrix1)
    pickle.dump(must_save, file)

In [None]:
adj_matrix={}
for node in tqdm(list_2):
    for n in list_2:
        if diG.has_edge(int(node), n):
            adj_matrix[(node,n)]=1
        else:
            adj_matrix[(node,n)]=0

In [18]:
adj_matrix.shape

In [None]:
nodelist=train_data.iloc[:,0]
n=len(nodelist)
adj_matrix=np.zeros((n,n))
for i in tqdm(range(n)):
    for j in range(n):
        if diG.has_edge(nodelist[i],nodelist[j]) and i!=j:
            adj_matrix[i][j]=1

In [None]:
with open ('dict_matrix.pkl', 'wb') as file:
    must_save = adj_matrix
    pickle.dump(must_save, file)

In [None]:
adj_Matrix=nx.to_numpy_matrix(diG,nodelist)

In [14]:
distance_twoNodes=dict(nx.shortest_path_length(diG)) # find the distance between two nodes

In [15]:
distance_twoNodes[][]

4886706

In [None]:
nx.all_simple_paths(diG,nodelist,nodelist)

***
** Get the positive samples: `edges_pairs` and negative samples: `No_edges_pairs`.**
***

In [None]:
No_edges_pairs = []
edges_pairs=[]

# traverse adjacency matrix
offset = 0
for i in range(adj_Matrix.shape[0]):
  for j in range(offset,adj_Matrix.shape[1]):
    if i != j:
        if adj_Matrix[i,j] == 0 and distance_twoNodes[nodelist[i]][nodelist[j]]<=3:
            
            No_edges_pairs.append([nodelist[i],nodelist[j]])
        else:
            edges_pairs.append([nodelist[i],nodelist[j]])
  offset = offset + 1