In [1]:
from py2neo import Graph
import pandas as pd
graph = Graph("bolt://localhost:7687", auth=("neo4j", "neo4jneo4j"))

# Building testing and training set

## Training dataset

In [2]:
# Find positive examples
train_existing_links = graph.run("""
MATCH (n:Person)-[r:INTERACTS_1|INTERACTS_2|INTERACTS_3|INTERACTS_4]-(p:Person)
RETURN n.id AS node1, p.id AS node2, 1 AS label, r.season AS season
""").to_data_frame()

In [3]:
train_existing_links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4494 entries, 0 to 4493
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   node1   4494 non-null   object
 1   node2   4494 non-null   object
 2   label   4494 non-null   int64 
 3   season  4494 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 140.6+ KB


In [4]:
# Find negative examples
train_missing_links = graph.run("""
MATCH (n:Person)
WHERE (n:Person)-[:INTERACTS_1|INTERACTS_2|INTERACTS_3|INTERACTS_4]-()
MATCH (n:Person)-[r:INTERACTS_1|INTERACTS_2|INTERACTS_3|INTERACTS_4*1..2]-(p:Person)
WHERE not((n:Person)-[:INTERACTS_1|INTERACTS_2|INTERACTS_3|INTERACTS_4]-(p:Person))
RETURN n.id AS node1, p.id AS node2, 0 AS label
""").to_data_frame()

In [5]:
train_missing_links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117858 entries, 0 to 117857
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   node1   117858 non-null  object
 1   node2   117858 non-null  object
 2   label   117858 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 2.7+ MB


In [7]:
import random
randomlist = []
for i in range(0,117858):
    n = random.randint(1,4)
    randomlist.append(n)
#print(randomlist)
train_missing_links['season']=randomlist
train_missing_links.head(5)

Unnamed: 0,node1,node2,label,season
0,ADDAM_MARBRAND,TYRION,0,3
1,ADDAM_MARBRAND,JAIME,0,2
2,ADDAM_MARBRAND,NED,0,2
3,ADDAM_MARBRAND,ROBERT,0,4
4,ADDAM_MARBRAND,SHAGGA,0,4


In [8]:
# Remove duplicates
train_missing_links = train_missing_links.drop_duplicates()

In [9]:
train_missing_links.info()

<class 'pandas.core.frame.DataFrame'>
Index: 53119 entries, 0 to 117857
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   node1   53119 non-null  object
 1   node2   53119 non-null  object
 2   label   53119 non-null  int64 
 3   season  53119 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 2.0+ MB


In [10]:
# Down sample negative examples
train_missing_links = train_missing_links.sample(
    n=len(train_existing_links))

In [11]:
train_missing_links.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4494 entries, 44606 to 84131
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   node1   4494 non-null   object
 1   node2   4494 non-null   object
 2   label   4494 non-null   int64 
 3   season  4494 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 175.5+ KB


In [12]:
# Create DataFrame from positive and negative examples
training_df = pd.concat([train_missing_links,train_existing_links], ignore_index=True)
training_df['label'] = training_df['label'].astype('category')

In [13]:
training_df

Unnamed: 0,node1,node2,label,season
0,MYCAH,THOROS,0,3
1,EDDISON_TOLLETT,BERT,0,4
2,NED,LOCKE,0,2
3,JAIME,VISERYS,0,2
4,BALON_DWARF,JON,0,1
...,...,...,...,...
8983,YOHN_ROYCE,ANYA_WAYNWOOD,1,4
8984,YOHN_ROYCE,LYSA,1,4
8985,YOHN_ROYCE,VANCE_CORBRAY,1,4
8986,YOHN_ROYCE,NED,1,4


In [14]:
training_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8988 entries, 0 to 8987
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   node1   8988 non-null   object  
 1   node2   8988 non-null   object  
 2   label   8988 non-null   category
 3   season  8988 non-null   int64   
dtypes: category(1), int64(1), object(2)
memory usage: 219.7+ KB


In [15]:
#Checking if there are indeed labels with the positive category
training_df[training_df['label'] == 1]

Unnamed: 0,node1,node2,label,season
4494,ADDAM_MARBRAND,TYWIN,1,1
4495,ADDAM_MARBRAND,LEO_LEFFORD,1,1
4496,ADDAM_MARBRAND,KEVAN,1,1
4497,AEGON,SHIREEN,1,3
4498,AEGON,DAVOS,1,3
...,...,...,...,...
8983,YOHN_ROYCE,ANYA_WAYNWOOD,1,4
8984,YOHN_ROYCE,LYSA,1,4
8985,YOHN_ROYCE,VANCE_CORBRAY,1,4
8986,YOHN_ROYCE,NED,1,4


In [16]:
#Checking if there are indeed labels with the negative category
training_df[training_df['label'] == 0]

Unnamed: 0,node1,node2,label,season
0,MYCAH,THOROS,0,3
1,EDDISON_TOLLETT,BERT,0,4
2,NED,LOCKE,0,2
3,JAIME,VISERYS,0,2
4,BALON_DWARF,JON,0,1
...,...,...,...,...
4489,LANCEL,EDMURE,0,4
4490,THEON,MYCAH,0,1
4491,GILLY,RICKON,0,2
4492,REGINALD,STANNIS,0,2


## Testing dataset

In [17]:
# Find positive examples
test_existing_links = graph.run("""
MATCH (n:Person)-[r:INTERACTS_5|INTERACTS_6|INTERACTS_7|INTERACTS_8]-(p:Person)
RETURN n.id AS node1, p.id AS node2, 1 AS label, r.season AS season
""").to_data_frame()

In [18]:
test_existing_links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4110 entries, 0 to 4109
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   node1   4110 non-null   object
 1   node2   4110 non-null   object
 2   label   4110 non-null   int64 
 3   season  4110 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 128.6+ KB


In [19]:
# Find negative examples
test_missing_links = graph.run("""
MATCH (n:Person)
WHERE (n:Person)-[:INTERACTS_5|INTERACTS_6|INTERACTS_7|INTERACTS_8]-()
MATCH (n:Person)-[r:INTERACTS_5|INTERACTS_6|INTERACTS_7|INTERACTS_8*1..2]-(p:Person)
WHERE not((n:Person)-[:INTERACTS_5|INTERACTS_6|INTERACTS_7|INTERACTS_8]-(p:Person))
RETURN n.id AS node1, p.id AS node2, 0 AS label
""").to_data_frame()

In [20]:
test_missing_links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112504 entries, 0 to 112503
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   node1   112504 non-null  object
 1   node2   112504 non-null  object
 2   label   112504 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 2.6+ MB


In [22]:
randomlist = []
for i in range(0,112504):
    n = random.randint(5,8)
    randomlist.append(n)
#print(randomlist)
test_missing_links['season']=randomlist
test_missing_links.head(5)

Unnamed: 0,node1,node2,label,season
0,ROBETT,KARSI,0,6
1,ROBETT,LORD_OF_BONES,0,8
2,ROBETT,MANCE,0,8
3,ROBETT,ALLISER_THORNE,0,6
4,ROBETT,ALLISER_THORNE,0,7


In [23]:
# Remove duplicates 
test_missing_links = test_missing_links.drop_duplicates()

In [24]:
test_missing_links.info()

<class 'pandas.core.frame.DataFrame'>
Index: 45026 entries, 0 to 112503
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   node1   45026 non-null  object
 1   node2   45026 non-null  object
 2   label   45026 non-null  int64 
 3   season  45026 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 1.7+ MB


In [25]:
# Down sample negative examples
test_missing_links = test_missing_links.sample(n=len(test_existing_links))

In [26]:
test_missing_links.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4110 entries, 23193 to 109578
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   node1   4110 non-null   object
 1   node2   4110 non-null   object
 2   label   4110 non-null   int64 
 3   season  4110 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 160.5+ KB


In [27]:
# Create DataFrame from positive and negative examples
test_df = pd.concat([test_missing_links, test_existing_links], ignore_index=True)
test_df['label'] = test_df['label'].astype('category')

In [28]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8220 entries, 0 to 8219
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   node1   8220 non-null   object  
 1   node2   8220 non-null   object  
 2   label   8220 non-null   category
 3   season  8220 non-null   int64   
dtypes: category(1), int64(1), object(2)
memory usage: 200.9+ KB


In [29]:
#Checking if there are indeed labels with the positive category
test_df[test_df['label'] == 1]

Unnamed: 0,node1,node2,label,season
4110,ROBETT,TORMUND,1,7
4111,ROBETT,PODRICK,1,7
4112,ROBETT,DAVOS,1,7
4113,ROBETT,BRIENNE,1,7
4114,ROBETT,ARYA,1,7
...,...,...,...,...
8215,WILLA,SARRA,1,8
8216,WILLA,SANSA,1,8
8217,WILLA,HOUND,1,8
8218,WILLA,TORMUND,1,8


In [30]:
#Checking if there are indeed labels with the negative category
test_df[test_df['label'] == 0]

Unnamed: 0,node1,node2,label,season
0,JANOS,MAESTER_WOLKAN,0,8
1,STRONG,RED_PRIEST,0,7
2,LADY_CRANE,ROBIN,0,7
3,JON,LORAS,0,8
4,TORMUND,YARA,0,5
...,...,...,...,...
4105,OLLY,MAESTER_MORMONT,0,6
4106,OLYVAR,IZEMBARO,0,6
4107,GREY_WORM,NED,0,5
4108,RAZDAL,SAM,0,8


# Choosing Random Forest Classifier

In [31]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=30, max_depth=10, 
                                    random_state=0)

## Generating link prediction features

In [32]:
def apply_graphy_features(data, rel_type):
    query = """
    UNWIND $pairs AS pair
    MATCH (p1) WHERE p1.id = pair.node1
    MATCH (p2) WHERE p2.id = pair.node2
    RETURN pair.node1 AS node1,
           pair.node2 AS node2,
           gds.alpha.linkprediction.commonNeighbors(
               p1, p2, {relationshipQuery: $relType}) AS cn,
           gds.alpha.linkprediction.preferentialAttachment(
               p1, p2, {relationshipQuery: $relType}) AS pa,
           gds.alpha.linkprediction.totalNeighbors(
               p1, p2, {relationshipQuery: $relType}) AS tn
    """
    pairs = [{"node1": node1, "node2": node2}  for node1,node2 in data[["node1", "node2"]].values.tolist()]
    params = {"pairs": pairs, "relType": rel_type}
    
    features = graph.run(query, params).to_data_frame()
    return pd.merge(data, features, on = ["node1", "node2"])

In [33]:
train_season1 = training_df[training_df['season'] == 1]
train_season2 = training_df[training_df['season'] == 2]
train_season3 = training_df[training_df['season'] == 3]
train_season4 = training_df[training_df['season'] == 4]
# train_season5 = training_df[training_df['season'] == 5]

In [34]:
test_season5 = test_df[test_df['season'] == 5]
test_season6 = test_df[test_df['season'] == 6]
test_season7 = test_df[test_df['season'] == 7]
test_season8 = test_df[test_df['season'] == 8]

In [35]:
train_season1_v = apply_graphy_features(train_season1, "INTERACTS_1")
train_season2_v = apply_graphy_features(train_season2, "INTERACTS_2")
train_season3_v= apply_graphy_features(train_season3, "INTERACTS_3")
train_season4_v= apply_graphy_features(train_season4, "INTERACTS_4")
# train_season5_v= apply_graphy_features(train_season5, "INTERACTS_5")

In [36]:
train_season1_v.sample(5)

Unnamed: 0,node1,node2,label,season,cn,pa,tn
1801,MAESTER_AEMON,SAM,1,1,5.0,108.0,16.0
567,DAVOS,PYP,0,1,0.0,0.0,16.0
1203,LORD_OF_BONES,MORAG,0,1,0.0,0.0,0.0
333,NED,SHAGGA,0,1,2.0,228.0,59.0
1702,JORAH,AERYS,1,1,1.0,195.0,27.0


In [37]:
###NOTE: AFTER APPLYING THE FUNCTION, NUMBER OF ROWS INCREASES! ==> why?
train_season1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2208 entries, 4 to 7663
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   node1   2208 non-null   object  
 1   node2   2208 non-null   object  
 2   label   2208 non-null   category
 3   season  2208 non-null   int64   
dtypes: category(1), int64(1), object(2)
memory usage: 71.3+ KB


In [38]:
train_season1_v.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2377 entries, 0 to 2376
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   node1   2377 non-null   object  
 1   node2   2377 non-null   object  
 2   label   2377 non-null   category
 3   season  2377 non-null   int64   
 4   cn      2377 non-null   float64 
 5   pa      2377 non-null   float64 
 6   tn      2377 non-null   float64 
dtypes: category(1), float64(3), int64(1), object(2)
memory usage: 114.0+ KB


In [39]:
train_season2_v.sample(5)

Unnamed: 0,node1,node2,label,season,cn,pa,tn
1950,ALTON,ROBB,1,2,4.0,196.0,31.0
1202,HODOR,JANOS,0,2,0.0,66.0,17.0
1215,HUGH_OF_THE_VALE,TYRION,0,2,0.0,0.0,33.0
1318,CATELYN,RENLY,1,2,10.0,299.0,26.0
463,BAELOR,WAYMAR_ROYCE,0,2,0.0,0.0,0.0


In [40]:
train_season3_v.sample(5)

Unnamed: 0,node1,node2,label,season,cn,pa,tn
1278,BRAN,OSHA,1,3,9.0,170.0,18.0
1592,MERYN_TRANT,ILYN_PAYNE,1,3,4.0,60.0,12.0
742,STYR,OTHOR,0,3,0.0,0.0,0.0
888,RODRIK,CERSEI,0,3,0.0,0.0,20.0
872,JANOS,DONTOS,0,3,0.0,0.0,0.0


In [41]:
train_season4_v.sample(5)

Unnamed: 0,node1,node2,label,season,cn,pa,tn
908,LYSA,RODRIK,0,4,0.0,0.0,13.0
2659,EDDISON_TOLLETT,JON,1,4,17.0,576.0,33.0
2538,TYWIN,MACE,1,4,15.0,493.0,31.0
2419,SYRIO_FOREL,ARYA,1,4,2.0,75.0,26.0
1431,ALLISER_THORNE,MAESTER_AEMON,1,4,7.0,210.0,22.0


In [42]:
# train_season5_v.sample(5)

In [43]:
test_season5_v = apply_graphy_features(test_season5, "INTERACTS_5")
test_season6_v = apply_graphy_features(test_season6, 'INTERACTS_6')
test_season7_v = apply_graphy_features(test_season7, "INTERACTS_7")
test_season8_v = apply_graphy_features(test_season8, "INTERACTS_8")

In [44]:
test_season6_v.sample(5)

Unnamed: 0,node1,node2,label,season,cn,pa,tn
641,YOHN_ROYCE,YOHN_ROYCE,0,6,0.0,9.0,3.0
2086,RICKON,DAVOS,1,6,6.0,225.0,28.0
2172,SANSA,YOHN_ROYCE,1,6,2.0,123.0,42.0
953,IRONBORN_LORD,ROBETT,0,6,0.0,0.0,6.0
1326,ALLISER_THORNE,MELISANDRE,1,6,7.0,135.0,17.0


In [45]:
test_season7_v.sample(5)

Unnamed: 0,node1,node2,label,season,cn,pa,tn
2059,BRIENNE,HOUND,1,7,16.0,624.0,34.0
2308,TYCHO,CERSEI,1,7,1.0,64.0,33.0
465,HARRY,ALLISER_THORNE,0,7,0.0,0.0,0.0
39,HARRY,MAESTER_WOLKAN,0,7,0.0,0.0,8.0
1072,QYBURN,HIZDAHR,0,7,0.0,0.0,16.0


In [46]:
#Combining all seasons for trainign set and testing set
frames_training = [train_season1_v, train_season2_v, train_season3_v, train_season4_v]
result_training = pd.concat(frames_training)
frames_test = [test_season5_v, test_season6_v, test_season7_v, test_season8_v]
result_test = pd.concat(frames_test)

In [47]:
result_training

Unnamed: 0,node1,node2,label,season,cn,pa,tn
0,BALON_DWARF,JON,0,1,0.0,0.0,26.0
1,ILLYRIO,LOMMY,0,1,0.0,0.0,6.0
2,NED,JACKS,0,1,0.0,0.0,57.0
3,ROBB,MASHA_HEDDLE,0,1,2.0,60.0,30.0
4,JOJEN,STYR,0,1,0.0,0.0,0.0
...,...,...,...,...,...,...,...
3407,YOHN_ROYCE,VANCE_CORBRAY,1,4,3.0,24.0,7.0
3408,YOHN_ROYCE,VANCE_CORBRAY,1,4,0.0,0.0,4.0
3409,YOHN_ROYCE,NED,1,4,3.0,102.0,20.0
3410,YOHN_ROYCE,NED,1,4,0.0,0.0,17.0


In [48]:
result_test

Unnamed: 0,node1,node2,label,season,cn,pa,tn
0,TORMUND,YARA,0,5,0.0,0.0,10.0
1,OLENNA,BARRISTAN,0,5,0.0,42.0,13.0
2,CERSEIS_BABY,HOUND,0,5,0.0,0.0,0.0
3,WILLA,BRONN,0,5,0.0,0.0,12.0
4,CAMELLO,THIN_MAN,0,5,0.0,0.0,7.0
...,...,...,...,...,...,...,...
2877,WILLA,SARRA,1,8,3.0,16.0,5.0
2878,WILLA,SANSA,1,8,3.0,164.0,42.0
2879,WILLA,HOUND,1,8,3.0,100.0,26.0
2880,WILLA,TORMUND,1,8,3.0,120.0,31.0


## Train your model

In [49]:
columns = ["cn", "pa", "tn"]
X = result_training[columns]
y = result_training["label"]
classifier.fit(X, y)

# Evaluation

In [50]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
def evaluate_model(predictions, actual):
    accuracy = accuracy_score(actual, predictions)
    precision = precision_score(actual, predictions)
    recall = recall_score(actual, predictions)
    
    metrics = ["accuracy", "precision", "recall"]
    values = [accuracy, precision, recall]    
    return pd.DataFrame(data={'metric': metrics, 'value': values})
def feature_importance(columns, classifier):        
    features = list(zip(columns, classifier.feature_importances_))
    sorted_features = sorted(features, key = lambda x: x[1]*-1)
    
    keys = [value[0] for value in sorted_features]
    values = [value[1] for value in sorted_features]
    return pd.DataFrame(data={'feature': keys, 'value': values})

In [51]:
predictions = classifier.predict(result_test[columns])
y_test = result_test["label"]
evaluate_model(predictions, y_test)

Unnamed: 0,metric,value
0,accuracy,0.916277
1,precision,0.914499
2,recall,0.944719


In [52]:
feature_importance(columns, classifier)

Unnamed: 0,feature,value
0,cn,0.459883
1,pa,0.382153
2,tn,0.157963


# Introducing more features (Triangles and The Clustering Coefficient)

## Calculating the Triangle count

In [53]:
query1 = """
CALL gds.graph.drop('myGraph1') YIELD graphName;
"""

query2 = """
CALL gds.graph.drop('myGraph2') YIELD graphName;
"""

query3 = """
CALL gds.graph.drop('myGraph3') YIELD graphName;
"""

query4 = """
CALL gds.graph.drop('myGraph4') YIELD graphName;
"""

query5 = """
CALL gds.graph.drop('myGraph5') YIELD graphName;
"""

query6 = """
CALL gds.graph.drop('myGraph6') YIELD graphName;
"""

query7 = """
CALL gds.graph.drop('myGraph7') YIELD graphName;
"""

query8 = """
CALL gds.graph.drop('myGraph8') YIELD graphName;
"""

graph.run(query1)
graph.run(query2)
graph.run(query3)
graph.run(query4)
graph.run(query5)
graph.run(query6)
graph.run(query7)
graph.run(query8)


graphName
myGraph8


In [54]:
# Make the in memory graphs for adding triangle counts and clustering coefficients
query1 = """
CALL gds.graph.project(
  'myGraph1',
  'Person',
  {
    INTERACTS_1: {
      orientation: 'UNDIRECTED'
    }
  }
)
"""

query2 = """
CALL gds.graph.project(
  'myGraph2',
  'Person',
  {
    INTERACTS_2: {
    orientation: 'UNDIRECTED'
}
}
)
"""
query3 = """
CALL gds.graph.project(
  'myGraph3',
  'Person',
  {
    INTERACTS_3: {
    orientation: 'UNDIRECTED'
}
  }
)
"""
query4 = """
CALL gds.graph.project(
  'myGraph4',
  'Person',
  {
    INTERACTS_4: {
    orientation: 'UNDIRECTED'
}
  }
)
"""
query5 = """
CALL gds.graph.project(
  'myGraph5',
  'Person',
  {
    INTERACTS_5: {
    orientation: 'UNDIRECTED'
}
  }
)
"""
graph.run(query1)
graph.run(query2)
graph.run(query3)
graph.run(query4)
graph.run(query5)


nodeProjection,relationshipProjection,graphName,nodeCount,relationshipCount,projectMillis
"{Person: {label: 'Person', properties: {}}}","{INTERACTS_5: {orientation: 'UNDIRECTED', indexInverse: false, aggregation: 'DEFAULT', type: 'INTERACTS_5', properties: {}}}",myGraph5,418,866,16


In [55]:
# Make the in memory graphs for adding triangle counts and clustering coefficients
query6 = """
CALL gds.graph.project(
  'myGraph6',
  'Person',
  {
    INTERACTS_6: {
      orientation: 'UNDIRECTED'
    }
  }
)
"""

query7 = """
CALL gds.graph.project(
  'myGraph7',
  'Person',
  {
    INTERACTS_7: {
      orientation: 'UNDIRECTED'
    }
  }
)
"""

query8 = """
CALL gds.graph.project(
  'myGraph8',
  'Person',
  {
    INTERACTS_8: {
      orientation: 'UNDIRECTED'
    }
  }
)
"""

graph.run(query6)
graph.run(query7)
graph.run(query8)


nodeProjection,relationshipProjection,graphName,nodeCount,relationshipCount,projectMillis
"{Person: {label: 'Person', properties: {}}}","{INTERACTS_8: {orientation: 'UNDIRECTED', indexInverse: false, aggregation: 'DEFAULT', type: 'INTERACTS_8', properties: {}}}",myGraph8,418,1200,13


In [56]:
query1 = """ 
CALL gds.triangleCount.write('myGraph1', {
  writeProperty: 'trianglesTrain1'
})
"""

query2 = """ 
CALL gds.triangleCount.write('myGraph2', {
  writeProperty: 'trianglesTrain2'
})
"""

query3 = """ 
CALL gds.triangleCount.write('myGraph3', {
  writeProperty: 'trianglesTrain3'
})
"""

query4 = """ 
CALL gds.triangleCount.write('myGraph4', {
  writeProperty: 'trianglesTrain4'
})
"""



graph.run(query1)
graph.run(query2)
graph.run(query3)
graph.run(query4)


writeMillis,nodePropertiesWritten,globalTriangleCount,nodeCount,postProcessingMillis,preProcessingMillis,computeMillis,configuration
3,418,1524,418,0,0,4,"{jobId: '82cc89e3-0aa5-4d56-8415-8e972fb10c55', writeConcurrency: 4, writeProperty: 'trianglesTrain4', maxDegree: 9223372036854775807, logProgress: true, nodeLabels: ['*'], sudo: false, relationshipTypes: ['*'], concurrency: 4}"


In [57]:
query5 = """ 
CALL gds.triangleCount.write('myGraph5', {
  writeProperty: 'trianglesTest5'
})
"""

query6 = """ 
CALL gds.triangleCount.write('myGraph6', {
  writeProperty: 'trianglesTest6'
})
"""
query7 = """ 
CALL gds.triangleCount.write('myGraph7', {
  writeProperty: 'trianglesTest7'
})
"""
query8 = """ 
CALL gds.triangleCount.write('myGraph8', {
  writeProperty: 'trianglesTest8'
})
"""
graph.run(query5)
graph.run(query6)
graph.run(query7)
graph.run(query8)

writeMillis,nodePropertiesWritten,globalTriangleCount,nodeCount,postProcessingMillis,preProcessingMillis,computeMillis,configuration
2,418,3351,418,0,0,4,"{jobId: 'ddfad0d4-ea0e-4559-a997-e538b05f5286', writeConcurrency: 4, writeProperty: 'trianglesTest8', maxDegree: 9223372036854775807, logProgress: true, nodeLabels: ['*'], sudo: false, relationshipTypes: ['*'], concurrency: 4}"


In [58]:
query1 = """
CALL gds.localClusteringCoefficient.write('myGraph1', {
    writeProperty: 'coefficientTrain1'
});
"""

query2 = """
CALL gds.localClusteringCoefficient.write('myGraph2', {
    writeProperty: 'coefficientTrain2'
});
"""

query3 = """
CALL gds.localClusteringCoefficient.write('myGraph3', {
    writeProperty: 'coefficientTrain3'
});
"""

query4 = """
CALL gds.localClusteringCoefficient.write('myGraph4', {
    writeProperty: 'coefficientTrain4'
});
"""



graph.run(query1)
graph.run(query2)
graph.run(query3)
graph.run(query4)


writeMillis,nodePropertiesWritten,averageClusteringCoefficient,nodeCount,postProcessingMillis,preProcessingMillis,computeMillis,configuration
5,418,0.2821768603167667,418,0,0,10,"{jobId: '0b6aa49f-259f-4f7b-ae1f-bba5b95d37a1', writeConcurrency: 4, triangleCountProperty: null, writeProperty: 'coefficientTrain4', logProgress: true, nodeLabels: ['*'], sudo: false, relationshipTypes: ['*'], concurrency: 4}"


In [59]:
query5 = """
CALL gds.localClusteringCoefficient.write('myGraph5', {
    writeProperty: 'coefficientTest5'
});
"""

query6 = """
CALL gds.localClusteringCoefficient.write('myGraph6', {
    writeProperty: 'coefficientTest6'
});
"""

query7 = """
CALL gds.localClusteringCoefficient.write('myGraph7', {
    writeProperty: 'coefficientTest7'
});
"""

query8 = """
CALL gds.localClusteringCoefficient.write('myGraph8', {
    writeProperty: 'coefficientTest8'
});
"""

graph.run(query5)
graph.run(query6)
graph.run(query7)
graph.run(query8)

writeMillis,nodePropertiesWritten,averageClusteringCoefficient,nodeCount,postProcessingMillis,preProcessingMillis,computeMillis,configuration
2,418,0.1246066499305612,418,0,0,8,"{jobId: 'ebe8c38f-8679-4569-b165-2d5010049685', writeConcurrency: 4, triangleCountProperty: null, writeProperty: 'coefficientTest8', logProgress: true, nodeLabels: ['*'], sudo: false, relationshipTypes: ['*'], concurrency: 4}"


## Adding the features

In [60]:
def apply_triangles_features(data,triangles_prop,coefficient_prop):
    
    query = """
    UNWIND $pairs AS pair
    MATCH (p1:Person) WHERE p1.id = pair.node1
    MATCH (p2:Person) WHERE p2.id = pair.node2
    RETURN pair.node1 AS node1, 
    pair.node2 AS node2,
    apoc.coll.min([p1[$triangles], p2[$triangles]]) AS minTriangles,
    apoc.coll.max([p1[$triangles], p2[$triangles]]) AS maxTriangles,
    apoc.coll.min([p1[$coefficient], p2[$coefficient]]) AS minCoeff,
    apoc.coll.max([p1[$coefficient], p2[$coefficient]]) AS maxCoeff
    """
    

    pairs = [{"node1": str(pair[0]), "node2": str(pair[1])}  
          for pair in data[["node1", "node2"]].values.tolist()]
        
    params = {
        "pairs": pairs,
        "triangles": triangles_prop,
        "coefficient": coefficient_prop
        }
    
    features = graph.run(query,params).to_data_frame()
    
    return pd.merge(data, features, on = ["node1", "node2"])

In [61]:
train_season1_w = apply_triangles_features(train_season1_v, "trianglesTrain1", "coefficientTrain1")
train_season2_w = apply_triangles_features(train_season2_v, "trianglesTrain2", "coefficientTrain2")
train_season3_w = apply_triangles_features(train_season3_v, "trianglesTrain3", "coefficientTrain3")
train_season4_w = apply_triangles_features(train_season4_v, "trianglesTrain4", "coefficientTrain4")
# train_season5_w = apply_triangles_features(train_season5_v, "trianglesTrain5", "coefficientTrain5")

test_season5_w = apply_triangles_features(test_season5_v, "trianglesTest5", "coefficientTest5")
test_season6_w = apply_triangles_features(test_season6_v, "trianglesTest6", "coefficientTest6")
test_season7_w = apply_triangles_features(test_season7_v, "trianglesTest7", "coefficientTest7")
test_season8_w = apply_triangles_features(test_season8_v, "trianglesTest8", "coefficientTest8")

In [62]:
frames_training_w = [train_season1_w, train_season2_w,
                   train_season3_w, train_season4_w]
result_training_w = pd.concat(frames_training_w)
frames_test_w = [test_season5_w ,test_season6_w, test_season7_w, test_season8_w]
result_test_w = pd.concat(frames_test_w)


# Train Model

In [63]:
classifier2 = RandomForestClassifier(n_estimators=30, max_depth=10,
                                    random_state=0)


In [64]:
columns = ["cn", "pa", "tn","minTriangles", "maxTriangles", "minCoeff", "maxCoeff"]
X = result_training_w[columns]
y = result_training_w["label"]
classifier2.fit(X, y)

In [65]:
predictions = classifier2.predict(result_test_w[columns])
y_test = result_test_w["label"]
evaluate_model(predictions, y_test)


Unnamed: 0,metric,value
0,accuracy,0.93936
1,precision,0.980312
2,recall,0.953481


In [66]:
feature_importance(columns, classifier2)

Unnamed: 0,feature,value
0,maxCoeff,0.258533
1,tn,0.221451
2,maxTriangles,0.219298
3,cn,0.109747
4,pa,0.081636
5,minTriangles,0.063137
6,minCoeff,0.046198
