In [1]:
from py2neo import Graph
import pandas as pd
graph = Graph("bolt://localhost:7687", auth=("superman", "pizzapep"))

# Building testing and training set

## Training dataset

In [207]:
# Find positive examples
train_existing_links = graph.run("""
MATCH (n:Person)-[r:INTERACTS_1|INTERACTS_2|INTERACTS_3|INTERACTS_4|INTERACTS_5]-(p:Person)
RETURN n.id AS node1, p.id AS node2, 1 AS label, r.season AS season
""").to_data_frame()

In [208]:
train_existing_links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5360 entries, 0 to 5359
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   node1   5360 non-null   object
 1   node2   5360 non-null   object
 2   label   5360 non-null   int64 
 3   season  5360 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 167.6+ KB


In [209]:
# Find negative examples
train_missing_links = graph.run("""
MATCH (n:Person)
WHERE (n:Person)-[:INTERACTS_1|INTERACTS_2|INTERACTS_3|INTERACTS_4|INTERACTS_5]-()
MATCH (n:Person)-[r:INTERACTS_1|INTERACTS_2|INTERACTS_3|INTERACTS_4|INTERACTS_5*1..2]-(p:Person)
WHERE not((n:Person)-[:INTERACTS_1|INTERACTS_2|INTERACTS_3|INTERACTS_4|INTERACTS_5]-(p:Person))
RETURN n.id AS node1, p.id AS node2, 0 AS label
""").to_data_frame()

In [210]:
train_missing_links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161350 entries, 0 to 161349
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   node1   161350 non-null  object
 1   node2   161350 non-null  object
 2   label   161350 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 3.7+ MB


In [211]:
import random
randomlist = []
for i in range(0,161350):
    n = random.randint(1,5)
    randomlist.append(n)
#print(randomlist)
train_missing_links['season']=randomlist
train_missing_links.head(5)

Unnamed: 0,node1,node2,label,season
0,ADDAM_MARBRAND,TYRION,0,4
1,ADDAM_MARBRAND,JAIME,0,3
2,ADDAM_MARBRAND,NED,0,1
3,ADDAM_MARBRAND,ROBERT,0,3
4,ADDAM_MARBRAND,SHAGGA,0,3


In [212]:
# Remove duplicates
train_missing_links = train_missing_links.drop_duplicates()

In [213]:
train_missing_links.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 74240 entries, 0 to 161345
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   node1   74240 non-null  object
 1   node2   74240 non-null  object
 2   label   74240 non-null  int64 
 3   season  74240 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 2.8+ MB


In [214]:
# Down sample negative examples
train_missing_links = train_missing_links.sample(
    n=len(train_existing_links))

In [215]:
train_missing_links.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5360 entries, 66329 to 122282
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   node1   5360 non-null   object
 1   node2   5360 non-null   object
 2   label   5360 non-null   int64 
 3   season  5360 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 209.4+ KB


In [216]:
# Create DataFrame from positive and negative examples
training_df = train_missing_links.append(
    train_existing_links, ignore_index=True)
training_df['label'] = training_df['label'].astype('category')

  training_df = train_missing_links.append(


In [217]:
training_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10720 entries, 0 to 10719
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   node1   10720 non-null  object  
 1   node2   10720 non-null  object  
 2   label   10720 non-null  category
 3   season  10720 non-null  int64   
dtypes: category(1), int64(1), object(2)
memory usage: 262.0+ KB


In [218]:
#Checking if there are indeed labels with the positive category
training_df[training_df['label'] == 1]

Unnamed: 0,node1,node2,label,season
5360,ADDAM_MARBRAND,TYWIN,1,1
5361,ADDAM_MARBRAND,LEO_LEFFORD,1,1
5362,ADDAM_MARBRAND,KEVAN,1,1
5363,AEGON,RHAENYRA,1,5
5364,AEGON,MAESTER_AEMON,1,5
...,...,...,...,...
10715,WAIF,THIN_MAN,1,5
10716,WAIF,JAQEN,1,5
10717,WAIF,ARYA,1,5
10718,YOHN_ROYCE,LITTLEFINGER,1,5


In [219]:
#Checking if there are indeed labels with the negative category
training_df[training_df['label'] == 0]

Unnamed: 0,node1,node2,label,season
0,RAST,HOBB,0,1
1,ROBERT,STEELSHANKS_WALTON,0,1
2,ARYA,ARYA,0,5
3,DAISY,ROBB_DWARF,0,2
4,CRASTER,SHIREEN,0,3
...,...,...,...,...
5355,GRENN,STEELSHANKS_WALTON,0,5
5356,JON,MAREI,0,1
5357,CATELYN,OSHA,0,1
5358,POLLIVER,JOFFREY_DWARF,0,3


## Testing dataset

In [295]:
# Find positive examples
test_existing_links = graph.run("""
MATCH (n:Person)-[r:INTERACTS_6|INTERACTS_7]-(p:Person)
RETURN n.id AS node1, p.id AS node2, 1 AS label, r.season AS season
""").to_data_frame()

In [296]:
test_existing_links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2044 entries, 0 to 2043
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   node1   2044 non-null   object
 1   node2   2044 non-null   object
 2   label   2044 non-null   int64 
 3   season  2044 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 64.0+ KB


In [297]:
# Find negative examples
test_missing_links = graph.run("""
MATCH (n:Person)
WHERE (n:Person)-[:INTERACTS_6|INTERACTS_7]-()
MATCH (n:Person)-[r:INTERACTS_6|INTERACTS_7*1..2]-(p:Person)
WHERE not((n:Person)-[:INTERACTS_6|INTERACTS_7]-(p:Person))
RETURN n.id AS node1, p.id AS node2, 0 AS label
""").to_data_frame()

In [298]:
test_missing_links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31608 entries, 0 to 31607
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   node1   31608 non-null  object
 1   node2   31608 non-null  object
 2   label   31608 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 740.9+ KB


In [299]:
randomlist = []
for i in range(0,31608):
    n = random.randint(6,7)
    randomlist.append(n)
#print(randomlist)
test_missing_links['season']=randomlist
test_missing_links.head(5)

Unnamed: 0,node1,node2,label,season
0,AEGON,DAARIO,0,7
1,AEGON,BELICHO,0,7
2,AEGON,TYRION,0,6
3,AEGON,VALA,0,7
4,AEGON,RED_PRIEST,0,6


In [300]:
# Remove duplicates 
test_missing_links = test_missing_links.drop_duplicates()

In [301]:
test_missing_links.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13813 entries, 0 to 31601
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   node1   13813 non-null  object
 1   node2   13813 non-null  object
 2   label   13813 non-null  int64 
 3   season  13813 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 539.6+ KB


In [302]:
# Down sample negative examples
test_missing_links = test_missing_links.sample(n=len(test_existing_links))

In [303]:
test_missing_links.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2044 entries, 24090 to 23062
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   node1   2044 non-null   object
 1   node2   2044 non-null   object
 2   label   2044 non-null   int64 
 3   season  2044 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 79.8+ KB


In [304]:
# Create DataFrame from positive and negative examples
test_df = test_missing_links.append(
    test_existing_links, ignore_index=True)
test_df['label'] = test_df['label'].astype('category')

  test_df = test_missing_links.append(


In [305]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4088 entries, 0 to 4087
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   node1   4088 non-null   object  
 1   node2   4088 non-null   object  
 2   label   4088 non-null   category
 3   season  4088 non-null   int64   
dtypes: category(1), int64(1), object(2)
memory usage: 100.1+ KB


In [306]:
#Checking if there are indeed labels with the positive category
test_df[test_df['label'] == 1]

Unnamed: 0,node1,node2,label,season
2044,AEGON,DAENERYS,1,7
2045,AERYS,SANSA,1,7
2046,AERYS,OLENNA,1,7
2047,AERYS,NED,1,7
2048,AERYS,VARYS,1,7
...,...,...,...,...
4083,NED_UMBER,ALYS,1,7
4084,QHONO,TYRION,1,7
4085,QHONO,PODRICK,1,7
4086,QHONO,HOUND,1,7


In [308]:
#Checking if there are indeed labels with the negative category
test_df[test_df['label'] == 0]

Unnamed: 0,node1,node2,label,season
0,LYANNA_MORMONT,WAIF,0,6
1,AERYS,MYRCELLA,0,6
2,WAIF,JON,0,6
3,WUN_WUN,BRAN,0,7
4,NIGHT_KING,RED_PRIEST,0,6
...,...,...,...,...
2039,TYCHO,QYBURN,0,6
2040,LITTLEFINGER,HIGH_SPARROW,0,6
2041,LYANNA,ARTHUR,0,7
2042,MELISANDRE,YEZZAN,0,6


# Choosing Random Forest Classifier

In [234]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=30, max_depth=10, 
                                    random_state=0)

## Generating link prediction features

In [279]:
def apply_graphy_features(data, rel_type):
    query = """
    UNWIND $pairs AS pair
    MATCH (p1) WHERE p1.id = pair.node1
    MATCH (p2) WHERE p2.id = pair.node2
    RETURN pair.node1 AS node1,
           pair.node2 AS node2,
           gds.alpha.linkprediction.commonNeighbors(
               p1, p2, {relationshipQuery: $relType}) AS cn,
           gds.alpha.linkprediction.preferentialAttachment(
               p1, p2, {relationshipQuery: $relType}) AS pa,
           gds.alpha.linkprediction.totalNeighbors(
               p1, p2, {relationshipQuery: $relType}) AS tn
    """
    pairs = [{"node1": node1, "node2": node2}  for node1,node2 in data[["node1", "node2"]].values.tolist()]
    params = {"pairs": pairs, "relType": rel_type}
    
    features = graph.run(query, params).to_data_frame()
    return pd.merge(data, features, on = ["node1", "node2"])

In [237]:
train_season1 = training_df[training_df['season'] == 1]
train_season2 = training_df[training_df['season'] == 2]
train_season3 = training_df[training_df['season'] == 3]
train_season4 = training_df[training_df['season'] == 4]
train_season5 = training_df[training_df['season'] == 5]

In [309]:
test_season6 = test_df[test_df['season'] == 6]
test_season7 = test_df[test_df['season'] == 7]

In [311]:
train_season1_v = apply_graphy_features(train_season1, "INTERACTS_1")
train_season2_v = apply_graphy_features(train_season2, "INTERACTS_2")
train_season3_v= apply_graphy_features(train_season3, "INTERACTS_3")
train_season4_v= apply_graphy_features(train_season4, "INTERACTS_4")
train_season5_v= apply_graphy_features(train_season5, "INTERACTS_5")

In [312]:
train_season1_v.sample(5)

Unnamed: 0,node1,node2,label,season,cn,pa,tn
58,YARA,LITTLEFINGER,0,1,0.0,0.0,26.0
2298,TYSHA,JAIME,1,1,1.0,48.0,25.0
1387,CATELYN,AERYS,1,1,4.0,468.0,45.0
644,ALLISER_THORNE,GRENN,0,1,0.0,0.0,8.0
340,INNKEEPER,RORGE,0,1,0.0,0.0,0.0


In [323]:
###NOTE: AFTER APPLYING THE FUNCTION, NUMBER OF ROWS INCREASES! ==> why?
train_season1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2177 entries, 0 to 8999
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   node1   2177 non-null   object  
 1   node2   2177 non-null   object  
 2   label   2177 non-null   category
 3   season  2177 non-null   int64   
dtypes: category(1), int64(1), object(2)
memory usage: 70.3+ KB


In [324]:
train_season1_v.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2392 entries, 0 to 2391
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   node1   2392 non-null   object  
 1   node2   2392 non-null   object  
 2   label   2392 non-null   category
 3   season  2392 non-null   int64   
 4   cn      2392 non-null   float64 
 5   pa      2392 non-null   float64 
 6   tn      2392 non-null   float64 
dtypes: category(1), float64(3), int64(1), object(2)
memory usage: 133.3+ KB


In [313]:
train_season2_v.sample(5)

Unnamed: 0,node1,node2,label,season,cn,pa,tn
1705,SAM,QHORIN,1,2,0.0,0.0,9.0
729,SANSA,OBARA,0,2,0.0,0.0,19.0
1967,FARLEN,RICKON,1,2,3.0,40.0,11.0
358,NYMERIA,PODRICK,0,2,0.0,0.0,11.0
1308,DROGO,DAENERYS,1,2,1.0,26.0,14.0


In [314]:
train_season3_v.sample(5)

Unnamed: 0,node1,node2,label,season,cn,pa,tn
1553,PYCELLE,JOFFREY,1,3,10.0,273.0,24.0
326,BRIENNE,DORNISH_RIDER,0,3,0.0,0.0,5.0
1100,DYING_MAN,LOMMY,0,3,0.0,0.0,0.0
736,HODOR,DAARIO,0,3,0.0,84.0,19.0
98,BRIENNE,RICKARD_STARK,0,3,0.0,0.0,5.0


In [315]:
train_season4_v.sample(5)

Unnamed: 0,node1,node2,label,season,cn,pa,tn
2187,MOUNTAIN,TYWIN,1,4,11.0,435.0,33.0
659,JANOS,RODRIK,0,4,0.0,0.0,13.0
202,MATTHOS,POLLIVER,0,4,0.0,0.0,6.0
1514,ARYA,SALLY,1,4,2.0,75.0,26.0
355,WAYMAR_ROYCE,LANCEL,0,4,0.0,0.0,1.0


In [316]:
train_season5_v.sample(5)

Unnamed: 0,node1,node2,label,season,cn,pa,tn
1833,MACE,JAQEN,1,5,4.0,70.0,13.0
604,QHORIN,OTHOR,0,5,0.0,0.0,0.0
1217,ALLISER_THORNE,DENYS,1,5,0.0,33.0,14.0
874,ILLYRIO,MOUNTAIN,0,5,0.0,0.0,2.0
437,FENNESZ,TYRION,0,5,0.0,0.0,19.0


In [317]:
test_season6_v = apply_graphy_features(test_season6, 'INTERACTS_6')
test_season7_v = apply_graphy_features(test_season7, "INTERACTS_7")

In [318]:
test_season6_v.sample(5)

Unnamed: 0,node1,node2,label,season,cn,pa,tn
2334,PODRICK,JAIME,1,6,4.0,250.0,31.0
833,LITTLEFINGER,WHITE_WALKER,0,6,0.0,0.0,11.0
1901,NED,RODRIK,1,6,2.0,72.0,25.0
2656,LYANNA_MORMONT,HARALD,1,6,6.0,70.0,11.0
2199,DAVOS,ALLISER_THORNE,1,6,8.0,225.0,26.0


In [319]:
test_season7_v.sample(5)

Unnamed: 0,node1,node2,label,season,cn,pa,tn
1477,JORAH,VARYS,1,7,11.0,418.0,30.0
1205,BRONN,HOUND,1,7,11.0,384.0,29.0
2229,YOHN_ROYCE,BRIENNE,1,7,10.0,312.0,28.0
1482,JORAH,THOROS,1,7,5.0,154.0,24.0
1617,MOUNTAIN,RANDYLL,1,7,5.0,160.0,21.0


In [327]:
#Combining all seasons for trainign set and testing set
frames_training = [train_season1_v, train_season2_v, train_season3_v, train_season4_v, train_season5_v]
result_training = pd.concat(frames_training)
frames_test = [test_season6_v, test_season7_v]
result_test = pd.concat(frames_test)

In [328]:
result_training

Unnamed: 0,node1,node2,label,season,cn,pa,tn
0,RAST,HOBB,0,1,0.0,0.0,7.0
1,ROBERT,STEELSHANKS_WALTON,0,1,0.0,0.0,36.0
2,NIGHT_KING,MANCE,0,1,0.0,0.0,0.0
3,JON,TOBHO_MOTT,0,1,1.0,52.0,27.0
4,YOREN,BRIENNE,0,1,0.0,0.0,16.0
...,...,...,...,...,...,...,...
2790,TYENE,NYMERIA,1,5,5.0,42.0,8.0
2791,TYENE,BRONN,1,5,5.0,84.0,14.0
2792,WAIF,THIN_MAN,1,5,2.0,21.0,8.0
2793,WAIF,JAQEN,1,5,2.0,21.0,8.0


In [329]:
result_test

Unnamed: 0,node1,node2,label,season,cn,pa,tn
0,LYANNA_MORMONT,WAIF,0,6,0.0,30.0,13.0
1,AERYS,MYRCELLA,0,6,0.0,3.0,4.0
2,WAIF,JON,0,6,0.0,99.0,36.0
3,NIGHT_KING,RED_PRIEST,0,6,0.0,15.0,8.0
4,BOWEN_MARSH,YOHN_ROYCE,0,6,0.0,30.0,13.0
...,...,...,...,...,...,...,...
2402,NED_UMBER,ALYS,1,7,1.0,4.0,3.0
2403,QHONO,TYRION,1,7,3.0,140.0,36.0
2404,QHONO,PODRICK,1,7,3.0,76.0,20.0
2405,QHONO,HOUND,1,7,3.0,96.0,25.0


## Train your model

In [330]:
columns = ["cn", "pa", "tn"]
X = result_training[columns]
y = result_training["label"]
classifier.fit(X, y)

RandomForestClassifier(max_depth=10, n_estimators=30, random_state=0)

# Evaluation

In [331]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
def evaluate_model(predictions, actual):
    accuracy = accuracy_score(actual, predictions)
    precision = precision_score(actual, predictions)
    recall = recall_score(actual, predictions)
    
    metrics = ["accuracy", "precision", "recall"]
    values = [accuracy, precision, recall]    
    return pd.DataFrame(data={'metric': metrics, 'value': values})
def feature_importance(columns, classifier):        
    features = list(zip(columns, classifier.feature_importances_))
    sorted_features = sorted(features, key = lambda x: x[1]*-1)
    
    keys = [value[0] for value in sorted_features]
    values = [value[1] for value in sorted_features]
    return pd.DataFrame(data={'feature': keys, 'value': values})

In [332]:
predictions = classifier.predict(result_test[columns])
y_test = result_test["label"]
evaluate_model(predictions, y_test)

Unnamed: 0,metric,value
0,accuracy,0.863119
1,precision,0.826268
2,recall,0.965674


In [333]:
feature_importance(columns, classifier)

Unnamed: 0,feature,value
0,cn,0.463123
1,pa,0.381622
2,tn,0.155255


# Introducing more features (Triangles and The Clustering Coefficient)

## Calculating the Triangle count

In [None]:
query = """
CALL algo.triangleCount('Author', 'CO_AUTHOR_EARLY', { 
  write:true,
  writeProperty:'trianglesTrain', 
  clusteringCoefficientProperty:'coefficientTrain'});
"""
graph.run(query)

## Adding the features

In [334]:
def apply_triangles_features(data,triangles_prop,coefficient_prop):
    query = """
    UNWIND $pairs AS pair
    MATCH (p1) WHERE id(p1) = pair.node1
    MATCH (p2) WHERE id(p2) = pair.node2
    RETURN pair.node1 AS node1,
    pair.node2 AS node2,
    apoc.coll.min([p1[$triangles], p2[$triangles]]) AS minTriangles,
    apoc.coll.max([p1[$triangles], p2[$triangles]]) AS maxTriangles,
    apoc.coll.min([p1[$coefficient], p2[$coefficient]]) AS minCoeff,
    apoc.coll.max([p1[$coefficient], p2[$coefficient]]) AS maxCoeff
    """
    
    pairs = [{"node1": pair[0], "node2": pair[1]}  
          for pair in data[["node1", "node2"]].values.tolist()]
    params = {"pairs": pairs,
              "triangles": triangles_prop,
              "coefficient": coefficient_prop}
    
    features = graph.run(query, params).to_data_frame()
    return pd.merge(data, features, on = ["node1", "node2"])

In [None]:
training_df = apply_triangles_features(training_df, 
  "trianglesTrain", "coefficientTrain")
test_df = apply_triangles_features(test_df, 
  "trianglesTest", "coefficientTest")