In [3]:
import os
import sys

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

sys.path.append('../')

from utils import Graph

# Connection settings are read from the environment so that credentials do not
# have to live in the notebook; the fallbacks are the previous hardcoded
# local-development values, so an unconfigured environment behaves as before.
graph = Graph(
    os.environ.get('NEO4J_URI', 'bolt://localhost:7687'),
    os.environ.get('NEO4J_USER', 'neo4j'),
    os.environ.get('NEO4J_PASSWORD', 'neo4jneo4j'),
)

In [4]:
# Positive examples: pairs of Company nodes that already share a SUPPLIES
# relationship, each tagged with label 1.
# NOTE(review): the pattern is undirected, so the same relationship can be
# returned once per orientation, as (a, b) and as (b, a) -- confirm that
# mirrored pairs are intended.
# NOTE(review): LIMIT is applied without ORDER BY, so which 10000 rows come
# back is not guaranteed to be stable between runs.
positive_links_query = """
MATCH (n:Company)-[r:SUPPLIES]-(p:Company)
RETURN n.id AS node1, p.id AS node2, 1 AS label
LIMIT 10000
"""
train_existing_links = graph.query_run_df(positive_links_query, {})

In [5]:
# Remove exact repeats of a (node1, node2, label) row. Rebinding the name
# instead of using inplace=True keeps the step an explicit assignment; the
# surviving rows keep their original index labels.
# NOTE(review): mirrored pairs such as (a, b) and (b, a) are different rows and
# are NOT removed by this step.
train_existing_links = train_existing_links.drop_duplicates()
# Bare last expression so the notebook renders the de-duplicated frame.
train_existing_links

Unnamed: 0,node1,node2,label
0,132317,26284,1
1,132317,12842,1
2,132317,17277,1
3,29965,364847,1
4,29965,39826,1
...,...,...,...
9995,25545,119686,1
9996,25545,176063,1
9997,25545,28991,1
9998,25545,94969,1


In [6]:
# Negative examples: for every company that appears as node1 among the
# positives, take up to 6 companies reachable over exactly two SUPPLIES hops
# that have no direct SUPPLIES relationship to it, and tag each pair with
# label 0.
#
# .tolist() converts the NumPy scalars produced by unique() into plain Python
# values before they are sent as a query parameter.
# NOTE(review): support for NumPy scalars as Bolt parameters depends on the
# driver version behind utils.Graph -- plain values work either way.
nodes = train_existing_links['node1'].unique().tolist()

# NOTE(review): the subquery returns p without DISTINCT, so the same company
# can come back more than once when several two-hop paths lead to it.
train_missing_links = graph.query_run_df("""
UNWIND $nodes AS node1
MATCH (n:Company) WHERE n.id = node1
CALL {
    WITH n
    MATCH (n:Company)-[r:SUPPLIES*2]-(p:Company) WHERE not((n:Company)-[:SUPPLIES]-(p:Company))
    RETURN p
    LIMIT 6
}
RETURN n.id AS node1, p.id as node2, 0 as label
""", {"nodes" : nodes})

In [7]:
# Remove exact repeats of a (node1, node2, label) row. Rebinding the name
# instead of using inplace=True keeps the step an explicit assignment; the
# surviving rows keep their original index labels.
train_missing_links = train_missing_links.drop_duplicates()
# Bare last expression so the notebook renders the de-duplicated frame.
train_missing_links

Unnamed: 0,node1,node2,label
0,132317,30132,0
1,132317,66459,0
2,132317,75459,0
3,132317,282214,0
4,132317,293629,0
...,...,...,...
8323,25545,35130,0
8324,25545,332289,0
8325,25545,81791,0
8326,25545,1854,0


In [8]:
# Stack the positive and negative pairs into one labelled frame with a fresh
# 0..n-1 index.
dataset = pd.concat(
    [train_existing_links, train_missing_links],
    ignore_index=True,
)
X = dataset[['node1', 'node2']]
y = dataset['label']

# First cut: 80% train, 20% held out, stratified on the label.
X_train, X_rem, y_train, y_rem = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=y,
)
# Second cut: halve the held-out part into validation and test sets, again
# stratified on the label.
X_val, X_test, y_val, y_test = train_test_split(
    X_rem,
    y_rem,
    test_size=0.5,
    random_state=42,
    shuffle=True,
    stratify=y_rem,
)

In [9]:
def apply_graphy_features(data, rel_type):
    """Attach neighbourhood-based link-prediction scores to node pairs.

    Every distinct ``(node1, node2)`` pair in ``data`` is sent to the graph
    once; the query returns three GDS link-prediction scores per pair, and the
    scores are joined back onto ``data`` on the pair columns.

    Uses the module-level ``graph`` object to run the query.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``node1`` and ``node2`` columns; their values are matched
        against the ``id`` property of nodes in the graph.
    rel_type : str
        Passed to each GDS function as ``relationshipQuery``.

    Returns
    -------
    pandas.DataFrame
        ``data`` inner-joined with the columns ``cn`` (commonNeighbors),
        ``pa`` (preferentialAttachment) and ``tn`` (totalNeighbors). Rows of
        ``data`` for which the query returns no result are dropped by the
        inner join. Empty input yields an empty frame that still carries the
        three feature columns.
    """
    feature_columns = ("cn", "pa", "tn")

    # NOTE(review): the MATCH clauses carry no node label, so ids are looked up
    # across all nodes -- confirm that `id` is unique graph-wide.
    query = """
    UNWIND $pairs AS pair
    MATCH (p1) WHERE p1.id = pair.node1
    MATCH (p2) WHERE p2.id = pair.node2
    RETURN pair.node1 AS node1,
           pair.node2 AS node2,
           gds.alpha.linkprediction.commonNeighbors(
               p1, p2, {relationshipQuery: $relType}) AS cn,
           gds.alpha.linkprediction.preferentialAttachment(
               p1, p2, {relationshipQuery: $relType}) AS pa,
           gds.alpha.linkprediction.totalNeighbors(
               p1, p2, {relationshipQuery: $relType}) AS tn
    """

    # Query each pair only once. Sending a repeated pair k times would produce
    # k feature rows, and the key merge below would then expand those k input
    # rows into k * k output rows.
    unique_pairs = data[["node1", "node2"]].drop_duplicates()
    pairs = [
        {"node1": node1, "node2": node2}
        for node1, node2 in unique_pairs.values.tolist()
    ]

    if not pairs:
        # Nothing to score: skip the round-trip and return the (empty) input
        # with empty feature columns so callers still see the usual schema.
        return data.assign(
            **{name: pd.Series(dtype="float64") for name in feature_columns}
        )

    params = {"pairs": pairs, "relType": rel_type}
    features = graph.query_run_df(query, params)
    return pd.merge(data, features, on=["node1", "node2"])

In [10]:
# Compute the SUPPLIES-based link-prediction features for the training pairs.
a = apply_graphy_features(data=X_train, rel_type='SUPPLIES')

In [10]:
# Write the training features to train.csv, leaving out the DataFrame index.
a.to_csv(path_or_buf='train.csv', index=False)

Unnamed: 0,node1,node2,cn,pa,tn
0,29965,165578,0.0,120.0,26.0
1,132317,66459,3.0,18.0,6.0
2,122314,296081,1.0,1.0,1.0
3,132317,30132,2.0,21.0,8.0
4,29965,28743,2.0,48.0,12.0
5,122314,31143,1.0,5.0,4.0
6,132317,282214,1.0,3.0,3.0
7,132317,293629,1.0,3.0,3.0
8,29965,2611,3.0,3228.0,532.0
9,132317,30093,2.0,96.0,32.0
