In [5]:
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

sys.path.append('../')

from utils import Graph

graph = Graph('bolt://localhost:7687', 'neo4j', 'neo4jneo4j')

In [16]:
#train sub graph
query = """
MATCH (a)-[r:SUPPLIES]->(b) 
WHERE r.date.year < 2021
WITH a,b,r
MERGE (a)-[l:SUPPLIES_TRAIN]-(b)
ON CREATE
    SET
        l.year = r.date.year, 
        l.distance = r.distance, 
        l.revenue_pct = r.revenue_pct
"""
graph.query_run(query, {})

[]


In [3]:
#validation sub graph
query = """
MATCH (a)-[r:SUPPLIES]->(b) 
WHERE r.date.year >= 2021 AND r.date.year < 2022
WITH a,b,r
MERGE (a)-[l:SUPPLIES_VALID]-(b)
ON CREATE
    SET
        l.year = r.date.year, 
        l.distance = r.distance, 
        l.revenue_pct = r.revenue_pct
"""
graph.query_run(query, {})

[]


In [4]:
#test sub graph
query = """
MATCH (a)-[r:SUPPLIES]->(b) 
WHERE r.date.year >= 2022
WITH a,b,r
MERGE (a)-[l:SUPPLIES_TEST]-(b)
ON CREATE
    SET
        l.year = r.date.year, 
        l.distance = r.distance, 
        l.revenue_pct = r.revenue_pct
"""
graph.query_run(query, {})

[]


# Training set

In [18]:
# Find positive examples
train_existing_links = graph.query_run_df("""
MATCH (n:Company)-[r:SUPPLIES_TRAIN]-(p:Company)
RETURN n.id AS node1, p.id AS node2, 1 AS label
LIMIT 25000
""",{})

In [19]:
train_existing_links.drop_duplicates(inplace = True)
train_existing_links 

Unnamed: 0,node1,node2,label
0,194545,31763,1
1,194552,316262,1
2,194704,13550,1
3,368240,24973,1
4,194904,48662,1
...,...,...,...
24995,205832,29148,1
24996,205832,144924,1
24997,205832,243354,1
24998,29992,8147,1


In [22]:
# Find negative examples
nodes = list(train_existing_links['node1'].unique())

train_missing_links = graph.query_run_df("""
UNWIND $nodes AS node1
MATCH (n:Company) WHERE n.id = node1
CALL {
    WITH n
    MATCH (n:Company)-[r:SUPPLIES_TRAIN*2..3]-(p:Company) WHERE not((n:Company)-[:SUPPLIES_TRAIN]-(p:Company))
    RETURN p
    LIMIT 2
}
RETURN n.id AS node1, p.id as node2, 0 as label
""", {"nodes" : nodes})


# # Find negative examples
# train_missing_links = graph.query_run_df("""
# MATCH (a:Company)
# WHERE (a)-[:SUPPLIES_TRAIN]-()
# MATCH (a)-[:SUPPLIES_TRAIN*2..3]-(other)
# WHERE not((a)-[:SUPPLIES_TRAIN]-(other))
# RETURN id(a) AS node1, id(other) AS node2, 0 AS label
# LIMIT 10000
# """,{})

In [23]:
train_missing_links.drop_duplicates(inplace = True)
train_missing_links 

Unnamed: 0,node1,node2,label
0,194545,70003,0
1,194545,118332,0
2,194552,23973,0
3,194552,77718,0
4,194704,7790,0
...,...,...,...
28099,334375,248011,0
28100,205832,34100,0
28101,205832,308523,0
28102,29992,2103,0


In [24]:
# Down sample negative examples
train_missing_links = train_missing_links.sample(
    n=len(train_existing_links))

In [25]:
training_data = pd.concat([train_existing_links, train_missing_links], ignore_index=True)

In [26]:
training_data

Unnamed: 0,node1,node2,label
0,194545,31763,1
1,194552,316262,1
2,194704,13550,1
3,368240,24973,1
4,194904,48662,1
...,...,...,...
49995,128587,270190,0
49996,90694,205429,0
49997,101284,212604,0
49998,132572,5267,0


In [6]:
#TODO: make a batched version 


def apply_graphy_features(data, rel_type):

    query = """
    UNWIND $pairs AS pair
    MATCH (p1) WHERE p1.id = pair.node1
    MATCH (p2) WHERE p2.id = pair.node2
    RETURN pair.node1 AS node1,
           pair.node2 AS node2,
           gds.alpha.linkprediction.commonNeighbors(
               p1, p2, {relationshipQuery: $relType}) AS cn,
           gds.alpha.linkprediction.preferentialAttachment(
               p1, p2, {relationshipQuery: $relType}) AS pa,
           gds.alpha.linkprediction.totalNeighbors(
               p1, p2, {relationshipQuery: $relType}) AS tn
    """

    pairs = [{"node1": data.node1, "node2": data.node2} for _, data in data.iterrows()]

    params = {"pairs": pairs, "relType": rel_type}
    
    features = graph.query_run_df(query, params)

    return pd.merge(data, features, on = ['node1', 'node2'])

In [28]:
training = apply_graphy_features(training_data, 'SUPPLIES_TRAIN')

In [29]:
training 

Unnamed: 0,node1,node2,label,cn,pa,tn
0,194545,31763,1,0.0,14.0,15.0
1,194552,316262,1,0.0,29.0,30.0
2,194704,13550,1,0.0,54.0,55.0
3,368240,24973,1,0.0,31.0,32.0
4,194904,48662,1,0.0,15.0,16.0
...,...,...,...,...,...,...
49995,128587,270190,0,1.0,1.0,1.0
49996,90694,205429,0,1.0,1.0,1.0
49997,101284,212604,0,1.0,1.0,1.0
49998,132572,5267,0,1.0,711.0,711.0


In [30]:
training.to_csv('train1.csv', index=False)

# Validation Data

In [7]:
# Find positive examples
validation_existing_links = graph.query_run_df("""
MATCH (n:Company)-[r:SUPPLIES_VALID]-(p:Company)
RETURN n.id AS node1, p.id AS node2, 1 AS label
LIMIT 25000
""",{})

validation_existing_links.drop_duplicates(inplace = True)
validation_existing_links 

Unnamed: 0,node1,node2,label
0,194545,32137,1
1,326676,354678,1
2,195084,166537,1
3,195473,50247,1
4,195920,3599,1
...,...,...,...
24995,17225,352238,1
24996,29760,333732,1
24997,29760,5267,1
24998,29760,229473,1


In [8]:
# Find negative examples
nodes = list(validation_existing_links['node1'].unique())

validation_missing_links = graph.query_run_df("""
UNWIND $nodes AS node1
MATCH (n:Company) WHERE n.id = node1
CALL {
    WITH n
    MATCH (n:Company)-[r:SUPPLIES_VALID*2..3]-(p:Company) WHERE not((n:Company)-[:SUPPLIES_VALID]-(p:Company))
    RETURN p
    LIMIT 2
}
RETURN n.id AS node1, p.id as node2, 0 as label
""", {"nodes" : nodes})


# # Find negative examples
# train_missing_links = graph.query_run_df("""
# MATCH (a:Company)
# WHERE (a)-[:SUPPLIES_TRAIN]-()
# MATCH (a)-[:SUPPLIES_TRAIN*2..3]-(other)
# WHERE not((a)-[:SUPPLIES_TRAIN]-(other))
# RETURN id(a) AS node1, id(other) AS node2, 0 AS label
# LIMIT 10000
# """,{})

In [9]:
validation_missing_links.drop_duplicates(inplace = True)
validation_missing_links 

Unnamed: 0,node1,node2,label
0,194545,173061,0
1,194545,36816,0
2,326676,6283,0
3,326676,313331,0
4,195084,2960,0
...,...,...,...
30560,117396,304573,0
30561,17225,130220,0
30562,17225,25450,0
30563,29760,293155,0


In [5]:
# Down sample negative examples
validation_missing_links = validation_missing_links.sample(
    n=len(validation_existing_links))

In [6]:
validation_data = pd.concat([validation_existing_links, validation_missing_links], ignore_index=True)

In [9]:
validation = apply_graphy_features(validation_data, 'SUPPLIES_VALID')

In [9]:
validation.to_csv('val.csv', index=False)

# Testing dataset

In [10]:
# Find positive examples
test_existing_links = graph.query_run_df("""
MATCH (n:Company)-[r:SUPPLIES_TEST]-(p:Company)
RETURN n.id AS node1, p.id AS node2, 1 AS label
LIMIT 20000
""",{})

test_existing_links.drop_duplicates(inplace = True)
test_existing_links 

Unnamed: 0,node1,node2,label
0,1441,29450,1
1,29450,1441,1
2,8912,1596,1
3,1596,8912,1
4,4202,455,1
...,...,...,...
19995,373587,24450,1
19996,24450,8586,1
19997,8586,24450,1
19998,24450,3019,1


In [11]:
# Find negative examples
nodes = list(test_existing_links['node1'].unique())

test_missing_links = graph.query_run_df("""
UNWIND $nodes AS node1
MATCH (n:Company) WHERE n.id = node1
CALL {
    WITH n
    MATCH (n:Company)-[r:SUPPLIES_TEST*2..3]-(p:Company) WHERE not((n:Company)-[:SUPPLIES_TEST]-(p:Company))
    RETURN p
    LIMIT 3
}
RETURN n.id AS node1, p.id as node2, 0 as label
""", {"nodes" : nodes})


# # Find negative examples
# train_missing_links = graph.query_run_df("""
# MATCH (a:Company)
# WHERE (a)-[:SUPPLIES_TRAIN]-()
# MATCH (a)-[:SUPPLIES_TRAIN*2..3]-(other)
# WHERE not((a)-[:SUPPLIES_TRAIN]-(other))
# RETURN id(a) AS node1, id(other) AS node2, 0 AS label
# LIMIT 10000
# """,{})

In [12]:
test_missing_links.drop_duplicates(inplace = True)
test_missing_links 

Unnamed: 0,node1,node2,label
0,1441,316654,0
1,1441,242061,0
2,1441,1230,0
3,29450,22221,0
4,29450,253753,0
...,...,...,...
20036,315061,17170,0
20037,315061,220,0
20038,373587,38013,0
20039,373587,17170,0


In [13]:
# Down sample negative examples
test_missing_links = test_missing_links.sample(
    n=len(test_existing_links))

In [14]:
test_data = pd.concat([test_existing_links, test_missing_links], ignore_index=True)

In [15]:
test= apply_graphy_features(test_data, 'SUPPLIES_TEST')

In [16]:
test.to_csv('test.csv', index=False)

In [17]:
train = pd.read_csv('train.csv')
val = pd.read_csv('val.csv')
test = pd.read_csv('test.csv')

# Choosing Random Forest Classifier 

In [18]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=30, max_depth=10,  
                                    random_state=0)

## Train the model

In [19]:
columns = ["cn", "pa", "tn"]
X = train[columns]
y = train["label"]
classifier.fit(X, y)

In [20]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
def evaluate_model(predictions, actual):
    accuracy = accuracy_score(actual, predictions)
    precision = precision_score(actual, predictions)
    recall = recall_score(actual, predictions)
    
    metrics = ["accuracy", "precision", "recall"]
    values = [accuracy, precision, recall]    
    return pd.DataFrame(data={'metric': metrics, 'value': values})
def feature_importance(columns, classifier):        
    features = list(zip(columns, classifier.feature_importances_))
    sorted_features = sorted(features, key = lambda x: x[1]*-1)
    
    keys = [value[0] for value in sorted_features]
    values = [value[1] for value in sorted_features]
    return pd.DataFrame(data={'feature': keys, 'value': values})

In [21]:
predictions = classifier.predict(test[columns])
y_test = test["label"]
evaluate_model(predictions, y_test)

Unnamed: 0,metric,value
0,accuracy,0.71365
1,precision,0.651837
2,recall,0.9172


In [22]:
feature_importance(columns, classifier)

Unnamed: 0,feature,value
0,cn,0.400586
1,pa,0.365554
2,tn,0.23386


# Introducing more features (Triangles and The Clustering Coefficient)

In [23]:
query1 = """
CALL gds.graph.drop('myGraph1') YIELD graphName;
"""

query2 = """
CALL gds.graph.drop('myGraph2') YIELD graphName;
"""

query3 = """
CALL gds.graph.drop('myGraph3') YIELD graphName;
"""



graph.query_run(query1,{})
graph.query_run(query2,{})
graph.query_run(query3,{})



ClientError: {code: Neo.ClientError.Procedure.ProcedureCallFailed} {message: Failed to invoke procedure `gds.graph.drop`: Caused by: java.util.NoSuchElementException: Graph with name `myGraph1` does not exist on database `neo4j`. It might exist on another database.}

In [24]:
# Make the in memory graphs for adding triangle counts and clustering coefficients
query1 = """
CALL gds.graph.project(
  'myGraph1',
  'Company',
  {
    SUPPLIES_TRAIN: {
      orientation: 'UNDIRECTED'
    }
  }
)
"""

query2 = """
CALL gds.graph.project(
  'myGraph2',
  'Company',
  {
    SUPPLIES_VALID: {
    orientation: 'UNDIRECTED'
}
}
)
"""
query3 = """
CALL gds.graph.project(
  'myGraph3',
  'Company',
  {
    SUPPLIES_TEST: {
    orientation: 'UNDIRECTED'
}
  }
)
"""

graph.query_run(query1,{})
graph.query_run(query2,{})
graph.query_run(query3,{})


[{'nodeProjection': {'Company': {'label': 'Company', 'properties': {}}}, 'relationshipProjection': {'SUPPLIES_TRAIN': {'orientation': 'UNDIRECTED', 'aggregation': 'DEFAULT', 'type': 'SUPPLIES_TRAIN', 'properties': {}}}, 'graphName': 'myGraph1', 'nodeCount': 195925, 'relationshipCount': 251148, 'projectMillis': 442}]
[{'nodeProjection': {'Company': {'label': 'Company', 'properties': {}}}, 'relationshipProjection': {'SUPPLIES_VALID': {'orientation': 'UNDIRECTED', 'aggregation': 'DEFAULT', 'type': 'SUPPLIES_VALID', 'properties': {}}}, 'graphName': 'myGraph2', 'nodeCount': 195925, 'relationshipCount': 168578, 'projectMillis': 123}]
[{'nodeProjection': {'Company': {'label': 'Company', 'properties': {}}}, 'relationshipProjection': {'SUPPLIES_TEST': {'orientation': 'UNDIRECTED', 'aggregation': 'DEFAULT', 'type': 'SUPPLIES_TEST', 'properties': {}}}, 'graphName': 'myGraph3', 'nodeCount': 195925, 'relationshipCount': 43728, 'projectMillis': 56}]


In [25]:
query1 = """ 
CALL gds.triangleCount.write('myGraph1', {
  writeProperty: 'trianglesTrain'
})
"""

query2 = """ 
CALL gds.triangleCount.write('myGraph2', {
  writeProperty: 'trianglesValid'
})
"""

query3 = """ 
CALL gds.triangleCount.write('myGraph3', {
  writeProperty: 'trianglesTest'
})
"""



graph.query_run(query1,{})
graph.query_run(query2,{})
graph.query_run(query3,{})


[{'writeMillis': 298, 'nodePropertiesWritten': 195925, 'globalTriangleCount': 30845, 'nodeCount': 195925, 'postProcessingMillis': 0, 'preProcessingMillis': 2, 'computeMillis': 279, 'configuration': {'writeConcurrency': 4, 'writeProperty': 'trianglesTrain', 'maxDegree': 9223372036854775807, 'nodeLabels': ['*'], 'sudo': False, 'relationshipTypes': ['*'], 'username': None, 'concurrency': 4}}]
[{'writeMillis': 65, 'nodePropertiesWritten': 195925, 'globalTriangleCount': 6342, 'nodeCount': 195925, 'postProcessingMillis': 0, 'preProcessingMillis': 0, 'computeMillis': 133, 'configuration': {'writeConcurrency': 4, 'writeProperty': 'trianglesValid', 'maxDegree': 9223372036854775807, 'nodeLabels': ['*'], 'sudo': False, 'relationshipTypes': ['*'], 'username': None, 'concurrency': 4}}]
[{'writeMillis': 49, 'nodePropertiesWritten': 195925, 'globalTriangleCount': 961, 'nodeCount': 195925, 'postProcessingMillis': 0, 'preProcessingMillis': 0, 'computeMillis': 23, 'configuration': {'writeConcurrency': 4

In [26]:
query1 = """
CALL gds.localClusteringCoefficient.write('myGraph1', {
    writeProperty: 'coefficientTrain'
});
"""

query2 = """
CALL gds.localClusteringCoefficient.write('myGraph2', {
    writeProperty: 'coefficientValid'
});
"""

query3 = """
CALL gds.localClusteringCoefficient.write('myGraph3', {
    writeProperty: 'coefficientTest'
});
"""

graph.query_run(query1,{})
graph.query_run(query2,{})
graph.query_run(query3,{})


[{'writeMillis': 105, 'nodePropertiesWritten': 195925, 'averageClusteringCoefficient': 0.004668704457283166, 'nodeCount': 195925, 'postProcessingMillis': 0, 'preProcessingMillis': 1, 'computeMillis': 110, 'configuration': {'writeConcurrency': 4, 'triangleCountProperty': None, 'writeProperty': 'coefficientTrain', 'nodeLabels': ['*'], 'sudo': False, 'relationshipTypes': ['*'], 'username': None, 'concurrency': 4}}]
[{'writeMillis': 32, 'nodePropertiesWritten': 195925, 'averageClusteringCoefficient': 0.0025783691169887863, 'nodeCount': 195925, 'postProcessingMillis': 0, 'preProcessingMillis': 0, 'computeMillis': 47, 'configuration': {'writeConcurrency': 4, 'triangleCountProperty': None, 'writeProperty': 'coefficientValid', 'nodeLabels': ['*'], 'sudo': False, 'relationshipTypes': ['*'], 'username': None, 'concurrency': 4}}]
[{'writeMillis': 31, 'nodePropertiesWritten': 195925, 'averageClusteringCoefficient': 0.0006227545168626605, 'nodeCount': 195925, 'postProcessingMillis': 0, 'preProcessi

In [27]:
def apply_triangles_features(data,triangles_prop,coefficient_prop):
    
    query = """
    UNWIND $pairs AS pair
    MATCH (p1:Company) WHERE p1.id = pair.node1
    MATCH (p2:Company) WHERE p2.id = pair.node2
    RETURN pair.node1 AS node1, 
    pair.node2 AS node2,
    apoc.coll.min([p1[$triangles], p2[$triangles]]) AS minTriangles,
    apoc.coll.max([p1[$triangles], p2[$triangles]]) AS maxTriangles,
    apoc.coll.min([p1[$coefficient], p2[$coefficient]]) AS minCoeff,
    apoc.coll.max([p1[$coefficient], p2[$coefficient]]) AS maxCoeff
    """
    

    pairs = [{"node1": str(pair[0]), "node2": str(pair[1])}  
          for pair in data[["node1", "node2"]].values.tolist()]
        
    params = {
        "pairs": pairs,
        "triangles": triangles_prop,
        "coefficient": coefficient_prop
        }
    
    features = graph.query_run_df(query,params)

    features['node1'] = features['node1'].astype(int)
    features['node2'] = features['node2'].astype(int)

    return pd.merge(data, features, on = ["node1", "node2"])

In [28]:
train1 = apply_triangles_features(train, "trianglesTrain", "coefficientTrain")
test1 = apply_triangles_features(test, "trianglesTest", "coefficientTest")
val1 = apply_triangles_features(val, "trianglesValid", "coefficientValid")

In [29]:
train1

Unnamed: 0,node1,node2,label,cn,pa,tn,minTriangles,maxTriangles,minCoeff,maxCoeff
0,194545,31763,1,0.0,14.0,15.0,0,0,0.0,0.000000
1,194552,316262,1,0.0,29.0,30.0,0,0,0.0,0.000000
2,194704,13550,1,0.0,54.0,55.0,0,11,0.0,0.007687
3,368240,24973,1,0.0,31.0,32.0,0,1,0.0,0.002151
4,194904,48662,1,0.0,15.0,16.0,0,0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...
39995,285311,354698,0,0.0,4.0,5.0,0,2,0.0,0.333333
39996,353587,329165,0,0.0,7.0,8.0,0,1,0.0,0.047619
39997,21382,217249,0,1.0,1.0,1.0,0,0,0.0,0.000000
39998,206061,26892,0,0.0,3.0,4.0,0,0,0.0,0.000000


In [30]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import PredefinedSplit

n_estimators = [10,20,30,40,50,60,70,80,90,100]
max_depth = [2,3,4,5,6,7,8,9,10]
min_samples_split = [2,3,4,5,6,7,8,9,10]
min_samples_leaf = [1,2,3,4,5,6,7,8,9,10]
max_features = ['auto', 'sqrt', 'log2']
bootstrap = [True, False]
criterion = ['gini', 'entropy']

param_grid = {'n_estimators': n_estimators,
                'max_depth': max_depth,
                'min_samples_split': min_samples_split,
                'min_samples_leaf': min_samples_leaf,
                'max_features': max_features,
                'bootstrap': bootstrap,
                'criterion': criterion}

rf = RandomForestClassifier()
split_index = [-1]*len(train1) + [0]*len(val1)
X = np.concatenate((train1.loc[:, train1.columns != "label"], val1.loc[:, val1.columns != "label"]), axis=0)
y = np.concatenate((train1["label"], val1["label"]), axis=0)
pds = PredefinedSplit(test_fold = split_index)
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = param_grid, n_iter = 100, cv = pds, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(X, y)

Fitting 1 folds for each of 100 candidates, totalling 100 fits


In [31]:
print(rf_random.best_params_)
print(rf_random.best_score_)
print(rf_random.best_estimator_)
print(rf_random.best_estimator_.feature_importances_)

{'n_estimators': 90, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 6, 'criterion': 'gini', 'bootstrap': False}
0.801625
RandomForestClassifier(bootstrap=False, max_depth=6, max_features='log2',
                       min_samples_split=4, n_estimators=90)
[0.03291112 0.01109949 0.4651862  0.11101059 0.15727612 0.0564671
 0.02181105 0.09320323 0.0510351 ]


In [32]:
classifier2 = rf_random.best_estimator_

In [33]:
columns = ["cn", "pa", "tn","minTriangles", "maxTriangles", "minCoeff", "maxCoeff"]
X = train1[columns]
y = train1["label"]
classifier2.fit(X, y)

In [34]:
predictions = classifier2.predict(test1[columns])
y_test = test1["label"]
evaluate_model(predictions, y_test)

Unnamed: 0,metric,value
0,accuracy,0.758275
1,precision,0.681087
2,recall,0.9714
