In [2]:
from py2neo import Graph
import pandas as pd
graph = Graph("bolt://localhost:7687", auth=("neo4j", "neo4jneo4j"))

# Building testing and training set

## Training dataset

In [3]:
# Find positive examples
train_existing_links = graph.run("""
MATCH (n:Person)-[r:INTERACTS_1|INTERACTS_2|INTERACTS_3|INTERACTS_4]-(p:Person)
RETURN n.id AS node1, p.id AS node2, 1 AS label, r.season AS season
""").to_data_frame()

In [4]:
train_existing_links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4494 entries, 0 to 4493
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   node1   4494 non-null   object
 1   node2   4494 non-null   object
 2   label   4494 non-null   int64 
 3   season  4494 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 140.6+ KB


In [5]:
# Find negative examples
train_missing_links = graph.run("""
MATCH (n:Person)
WHERE (n:Person)-[:INTERACTS_1|INTERACTS_2|INTERACTS_3|INTERACTS_4]-()
MATCH (n:Person)-[r:INTERACTS_1|INTERACTS_2|INTERACTS_3|INTERACTS_4*2..3]-(p:Person)
WHERE not((n:Person)-[:INTERACTS_1|INTERACTS_2|INTERACTS_3|INTERACTS_4]-(p:Person))
RETURN n.id AS node1, p.id AS node2, 0 AS label
""").to_data_frame()

In [6]:
train_missing_links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6408824 entries, 0 to 6408823
Data columns (total 3 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   node1   object
 1   node2   object
 2   label   int64 
dtypes: int64(1), object(2)
memory usage: 146.7+ MB


In [7]:
import random
randomlist = []
for i in range(0,6408824):
    n = random.randint(1,4)
    randomlist.append(n)
#print(randomlist)
train_missing_links['season']=randomlist
train_missing_links.head(5)

Unnamed: 0,node1,node2,label,season
0,ADDAM_MARBRAND,TYRION,0,3
1,ADDAM_MARBRAND,BRONN,0,3
2,ADDAM_MARBRAND,SHAE,0,2
3,ADDAM_MARBRAND,JON,0,2
4,ADDAM_MARBRAND,CATELYN,0,1


In [8]:
# Remove duplicates
train_missing_links = train_missing_links.drop_duplicates()

In [9]:
train_missing_links.info()

<class 'pandas.core.frame.DataFrame'>
Index: 233823 entries, 0 to 6408822
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   node1   233823 non-null  object
 1   node2   233823 non-null  object
 2   label   233823 non-null  int64 
 3   season  233823 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 8.9+ MB


In [10]:
# Down sample negative examples
train_missing_links = train_missing_links.sample(
    n=len(train_existing_links))

In [11]:
train_missing_links.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4494 entries, 2099338 to 4961829
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   node1   4494 non-null   object
 1   node2   4494 non-null   object
 2   label   4494 non-null   int64 
 3   season  4494 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 175.5+ KB


In [12]:
# Create DataFrame from positive and negative examples
training_df = pd.concat([train_missing_links,train_existing_links], ignore_index=True)
training_df['label'] = training_df['label'].astype('category')

In [13]:
training_df

Unnamed: 0,node1,node2,label,season
0,LYSA,MIRRI_MAZ_DUUR,0,2
1,INNKEEPERS_DAUGHTER,BARRISTAN,0,4
2,JON_ARRYN,DAVOS,0,3
3,AXELL_FLORENT,STEELSHANKS_WALTON,0,1
4,TORRHEN,MARILLION,0,3
...,...,...,...,...
8983,YOHN_ROYCE,ANYA_WAYNWOOD,1,4
8984,YOHN_ROYCE,LYSA,1,4
8985,YOHN_ROYCE,VANCE_CORBRAY,1,4
8986,YOHN_ROYCE,NED,1,4


In [14]:
training_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8988 entries, 0 to 8987
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   node1   8988 non-null   object  
 1   node2   8988 non-null   object  
 2   label   8988 non-null   category
 3   season  8988 non-null   int64   
dtypes: category(1), int64(1), object(2)
memory usage: 219.7+ KB


In [15]:
#Checking if there are indeed labels with the positive category
training_df[training_df['label'] == 1]

Unnamed: 0,node1,node2,label,season
4494,ADDAM_MARBRAND,TYWIN,1,1
4495,ADDAM_MARBRAND,LEO_LEFFORD,1,1
4496,ADDAM_MARBRAND,KEVAN,1,1
4497,AEGON,SHIREEN,1,3
4498,AEGON,DAVOS,1,3
...,...,...,...,...
8983,YOHN_ROYCE,ANYA_WAYNWOOD,1,4
8984,YOHN_ROYCE,LYSA,1,4
8985,YOHN_ROYCE,VANCE_CORBRAY,1,4
8986,YOHN_ROYCE,NED,1,4


In [16]:
#Checking if there are indeed labels with the negative category
training_df[training_df['label'] == 0]

Unnamed: 0,node1,node2,label,season
0,LYSA,MIRRI_MAZ_DUUR,0,2
1,INNKEEPERS_DAUGHTER,BARRISTAN,0,4
2,JON_ARRYN,DAVOS,0,3
3,AXELL_FLORENT,STEELSHANKS_WALTON,0,1
4,TORRHEN,MARILLION,0,3
...,...,...,...,...
4489,GRENN,WHITE_WALKER,0,2
4490,DROWNED_PRIEST,ADRACK_HUMBLE,0,3
4491,DAENERYS,JOYEUSE,0,1
4492,JACKS,VISERYS,0,4


## Testing dataset

In [17]:
# Find positive examples
test_existing_links = graph.run("""
MATCH (n:Person)-[r:INTERACTS_5|INTERACTS_6|INTERACTS_7|INTERACTS_8]-(p:Person)
RETURN n.id AS node1, p.id AS node2, 1 AS label, r.season AS season
""").to_data_frame()

In [18]:
test_existing_links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4110 entries, 0 to 4109
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   node1   4110 non-null   object
 1   node2   4110 non-null   object
 2   label   4110 non-null   int64 
 3   season  4110 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 128.6+ KB


In [19]:
# Find negative examples
test_missing_links = graph.run("""
MATCH (n:Person)
WHERE (n:Person)-[:INTERACTS_5|INTERACTS_6|INTERACTS_7|INTERACTS_8]-()
MATCH (n:Person)-[r:INTERACTS_5|INTERACTS_6|INTERACTS_7|INTERACTS_8*2..3]-(p:Person)
WHERE not((n:Person)-[:INTERACTS_5|INTERACTS_6|INTERACTS_7|INTERACTS_8]-(p:Person))
RETURN n.id AS node1, p.id AS node2, 0 AS label
""").to_data_frame()

In [20]:
test_missing_links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6393380 entries, 0 to 6393379
Data columns (total 3 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   node1   object
 1   node2   object
 2   label   int64 
dtypes: int64(1), object(2)
memory usage: 146.3+ MB


In [23]:
randomlist = []
for i in range(0,6393380):
    n = random.randint(5,8)
    randomlist.append(n)
#print(randomlist)
test_missing_links['season']=randomlist
test_missing_links.head(5)

Unnamed: 0,node1,node2,label,season
0,ROBETT,BRIAN,0,5
1,ROBETT,JEOR,0,5
2,ROBETT,GILLY,0,7
3,ROBETT,EDDISON_TOLLETT,0,5
4,ROBETT,JANOS,0,5


In [24]:
# Remove duplicates 
test_missing_links = test_missing_links.drop_duplicates()

In [25]:
test_missing_links.info()

<class 'pandas.core.frame.DataFrame'>
Index: 147182 entries, 0 to 6393378
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   node1   147182 non-null  object
 1   node2   147182 non-null  object
 2   label   147182 non-null  int64 
 3   season  147182 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 5.6+ MB


In [26]:
# Down sample negative examples
test_missing_links = test_missing_links.sample(n=len(test_existing_links))

In [27]:
test_missing_links.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4110 entries, 2235854 to 5311648
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   node1   4110 non-null   object
 1   node2   4110 non-null   object
 2   label   4110 non-null   int64 
 3   season  4110 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 160.5+ KB


In [28]:
# Create DataFrame from positive and negative examples
test_df = pd.concat([test_missing_links, test_existing_links], ignore_index=True)
test_df['label'] = test_df['label'].astype('category')

In [29]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8220 entries, 0 to 8219
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   node1   8220 non-null   object  
 1   node2   8220 non-null   object  
 2   label   8220 non-null   category
 3   season  8220 non-null   int64   
dtypes: category(1), int64(1), object(2)
memory usage: 200.9+ KB


In [30]:
#Checking if there are indeed labels with the positive category
test_df[test_df['label'] == 1]

Unnamed: 0,node1,node2,label,season
4110,ROBETT,TORMUND,1,7
4111,ROBETT,PODRICK,1,7
4112,ROBETT,DAVOS,1,7
4113,ROBETT,BRIENNE,1,7
4114,ROBETT,ARYA,1,7
...,...,...,...,...
8215,WILLA,SARRA,1,8
8216,WILLA,SANSA,1,8
8217,WILLA,HOUND,1,8
8218,WILLA,TORMUND,1,8


In [31]:
#Checking if there are indeed labels with the negative category
test_df[test_df['label'] == 0]

Unnamed: 0,node1,node2,label,season
0,RICKON,RAY,0,5
1,MERYN_TRANT,RODRIK,0,6
2,MOSSADOR,SON_OF_EDMURE,0,7
3,GUARD_CAPTAIN,RIVERLANDS_LORD,0,5
4,BOBONO,BALERION,0,7
...,...,...,...,...
4105,SAM,MYRCELLA,0,7
4106,RICKON,QHONO,0,5
4107,SHAE,BRAN,0,5
4108,VICKY,THIN_MAN,0,8


# Choosing Random Forest Classifier

In [32]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=30, max_depth=10,  
                                    random_state=0)

## Generating link prediction features

In [33]:
def apply_graphy_features(data, rel_type):
    query = """
    UNWIND $pairs AS pair
    MATCH (p1) WHERE p1.id = pair.node1
    MATCH (p2) WHERE p2.id = pair.node2
    RETURN pair.node1 AS node1,
           pair.node2 AS node2,
           gds.alpha.linkprediction.commonNeighbors(
               p1, p2, {relationshipQuery: $relType}) AS cn,
           gds.alpha.linkprediction.preferentialAttachment(
               p1, p2, {relationshipQuery: $relType}) AS pa,
           gds.alpha.linkprediction.totalNeighbors(
               p1, p2, {relationshipQuery: $relType}) AS tn
    """
    pairs = [{"node1": node1, "node2": node2}  for node1,node2 in data[["node1", "node2"]].values.tolist()]
    params = {"pairs": pairs, "relType": rel_type}
    
    features = graph.run(query, params).to_data_frame()
    return pd.merge(data, features, on = ["node1", "node2"])

In [34]:
train_season1 = training_df[training_df['season'] == 1]
train_season2 = training_df[training_df['season'] == 2]
train_season3 = training_df[training_df['season'] == 3]
train_season4 = training_df[training_df['season'] == 4]
# train_season5 = training_df[training_df['season'] == 5]

In [35]:
test_season5 = test_df[test_df['season'] == 5]
test_season6 = test_df[test_df['season'] == 6]
test_season7 = test_df[test_df['season'] == 7]
test_season8 = test_df[test_df['season'] == 8]

In [36]:
train_season1_v = apply_graphy_features(train_season1, "INTERACTS_1")
train_season2_v = apply_graphy_features(train_season2, "INTERACTS_2")
train_season3_v= apply_graphy_features(train_season3, "INTERACTS_3")
train_season4_v= apply_graphy_features(train_season4, "INTERACTS_4")
# train_season5_v= apply_graphy_features(train_season5, "INTERACTS_5")

In [37]:
train_season1_v.sample(5)

Unnamed: 0,node1,node2,label,season,cn,pa,tn
2110,SANSA,CATELYN,1,1,14.0,936.0,48.0
211,AXELL_FLORENT,EDDISON_TOLLETT,0,1,0.0,0.0,0.0
806,RENLY_DWARF,PODRICK,0,1,0.0,0.0,0.0
120,STEELSHANKS_WALTON,VANCE_CORBRAY,0,1,0.0,0.0,0.0
204,ALTON,RHAEGAR,0,1,0.0,0.0,2.0


In [38]:
###NOTE: AFTER APPLYING THE FUNCTION, NUMBER OF ROWS INCREASES! ==> why?
train_season1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2168 entries, 3 to 7663
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   node1   2168 non-null   object  
 1   node2   2168 non-null   object  
 2   label   2168 non-null   category
 3   season  2168 non-null   int64   
dtypes: category(1), int64(1), object(2)
memory usage: 70.0+ KB


In [39]:
train_season1_v.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2324 entries, 0 to 2323
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   node1   2324 non-null   object  
 1   node2   2324 non-null   object  
 2   label   2324 non-null   category
 3   season  2324 non-null   int64   
 4   cn      2324 non-null   float64 
 5   pa      2324 non-null   float64 
 6   tn      2324 non-null   float64 
dtypes: category(1), float64(3), int64(1), object(2)
memory usage: 111.5+ KB


In [40]:
train_season2_v.sample(5)

Unnamed: 0,node1,node2,label,season,cn,pa,tn
660,YOHN_ROYCE,YOHN_ROYCE,0,2,0.0,0.0,0.0
308,SHAE,RYLENE,0,2,0.0,0.0,12.0
1907,TYWIN,CERSEI,1,2,14.0,775.0,42.0
172,BILLY,BLACK_LORREN,0,2,0.0,5.0,6.0
210,ELIA,DYING_MAN,0,2,0.0,0.0,0.0


In [41]:
train_season3_v.sample(5)

Unnamed: 0,node1,node2,label,season,cn,pa,tn
1606,NED,HOUND,1,3,5.0,224.0,25.0
2141,LOCKE,BRIENNE,1,3,2.0,30.0,9.0
1136,MOUNTAIN,BOROS,0,3,0.0,5.0,6.0
2019,YARA,RAMSAY,1,3,2.0,21.0,8.0
1850,TYRION,VARYS,1,3,12.0,350.0,27.0


In [42]:
train_season4_v.sample(5)

Unnamed: 0,node1,node2,label,season,cn,pa,tn
3322,OBERYN,MACE,1,4,14.0,442.0,29.0
3409,STYR,GILLY,1,4,3.0,81.0,15.0
2075,MACE,MARGAERY,1,4,12.0,306.0,23.0
2235,NED,CATELYN,1,4,9.0,221.0,21.0
340,STANNIS,RICKARD_KARSTARK,0,4,0.0,0.0,17.0


In [43]:
# train_season5_v.sample(5)

In [44]:
test_season5_v = apply_graphy_features(test_season5, "INTERACTS_5")
test_season6_v = apply_graphy_features(test_season6, 'INTERACTS_6')
test_season7_v = apply_graphy_features(test_season7, "INTERACTS_7")
test_season8_v = apply_graphy_features(test_season8, "INTERACTS_8")

In [45]:
test_season6_v.sample(5)

Unnamed: 0,node1,node2,label,season,cn,pa,tn
2007,PYCELLE,MACE,1,6,4.0,72.0,13.0
2001,PYCELLE,MACE,1,6,4.0,72.0,13.0
438,YOHN_ROYCE,DORAN,0,6,0.0,15.0,8.0
2677,YOHN_ROYCE,SANSA,1,6,2.0,123.0,42.0
43,BELICHO,TALLA,0,6,0.0,42.0,13.0


In [46]:
test_season7_v.sample(5)

Unnamed: 0,node1,node2,label,season,cn,pa,tn
800,GREY_WORM,PYP,0,7,0.0,0.0,14.0
2252,YOHN_ROYCE,ROBETT,1,7,9.0,132.0,14.0
345,KEVAN,LITTLE_SAM,0,7,0.0,0.0,2.0
142,HOSTER,EURON,0,7,0.0,0.0,17.0
260,MARWYN,BELICHO,0,7,0.0,0.0,0.0


In [47]:
#Combining all seasons for trainign set and testing set
frames_training = [train_season1_v, train_season2_v, train_season3_v, train_season4_v]
result_training = pd.concat(frames_training)
frames_test = [test_season5_v, test_season6_v, test_season7_v, test_season8_v]
result_test = pd.concat(frames_test)

In [48]:
result_training

Unnamed: 0,node1,node2,label,season,cn,pa,tn
0,AXELL_FLORENT,STEELSHANKS_WALTON,0,1,0.0,0.0,0.0
1,SYRIO_FOREL,OBERYN,0,1,0.0,0.0,3.0
2,SYRIO_FOREL,OBERYN,0,1,0.0,0.0,0.0
3,ORSON,OLLY,0,1,0.0,0.0,0.0
4,TORTURER,OLENNA,0,1,0.0,0.0,0.0
...,...,...,...,...,...,...,...
3453,YOHN_ROYCE,VANCE_CORBRAY,1,4,3.0,24.0,7.0
3454,YOHN_ROYCE,VANCE_CORBRAY,1,4,0.0,0.0,4.0
3455,YOHN_ROYCE,NED,1,4,3.0,102.0,20.0
3456,YOHN_ROYCE,NED,1,4,0.0,0.0,17.0


In [49]:
result_test

Unnamed: 0,node1,node2,label,season,cn,pa,tn
0,RICKON,RAY,0,5,0.0,0.0,3.0
1,GUARD_CAPTAIN,RIVERLANDS_LORD,0,5,0.0,0.0,0.0
2,OTHELL_YARWYCK,HIGH_SPARROW,0,5,0.0,11.0,12.0
3,STRONG,RICKON,0,5,0.0,15.0,8.0
4,SANSA,JANOS,0,5,1.0,207.0,31.0
...,...,...,...,...,...,...,...
2844,WILLA,SARRA,1,8,3.0,16.0,5.0
2845,WILLA,SANSA,1,8,3.0,164.0,42.0
2846,WILLA,HOUND,1,8,3.0,100.0,26.0
2847,WILLA,TORMUND,1,8,3.0,120.0,31.0


## Train your model

In [50]:
columns = ["cn", "pa", "tn"]
X = result_training[columns]
y = result_training["label"]
classifier.fit(X, y)

# Evaluation

In [51]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
def evaluate_model(predictions, actual):
    accuracy = accuracy_score(actual, predictions)
    precision = precision_score(actual, predictions)
    recall = recall_score(actual, predictions)
    
    metrics = ["accuracy", "precision", "recall"]
    values = [accuracy, precision, recall]    
    return pd.DataFrame(data={'metric': metrics, 'value': values})
def feature_importance(columns, classifier):        
    features = list(zip(columns, classifier.feature_importances_))
    sorted_features = sorted(features, key = lambda x: x[1]*-1)
    
    keys = [value[0] for value in sorted_features]
    values = [value[1] for value in sorted_features]
    return pd.DataFrame(data={'feature': keys, 'value': values})

In [52]:
predictions = classifier.predict(result_test[columns])
y_test = result_test["label"]
evaluate_model(predictions, y_test)

Unnamed: 0,metric,value
0,accuracy,0.946412
1,precision,0.953084
2,recall,0.956468


In [53]:
feature_importance(columns, classifier)

Unnamed: 0,feature,value
0,cn,0.47539
1,pa,0.341132
2,tn,0.183479


# Introducing more features (Triangles and The Clustering Coefficient)

## Calculating the Triangle count

In [54]:
query1 = """
CALL gds.graph.drop('myGraph1') YIELD graphName;
"""

query2 = """
CALL gds.graph.drop('myGraph2') YIELD graphName;
"""

query3 = """
CALL gds.graph.drop('myGraph3') YIELD graphName;
"""

query4 = """
CALL gds.graph.drop('myGraph4') YIELD graphName;
"""

query5 = """
CALL gds.graph.drop('myGraph5') YIELD graphName;
"""

query6 = """
CALL gds.graph.drop('myGraph6') YIELD graphName;
"""

query7 = """
CALL gds.graph.drop('myGraph7') YIELD graphName;
"""

query8 = """
CALL gds.graph.drop('myGraph8') YIELD graphName;
"""

graph.run(query1)
graph.run(query2)
graph.run(query3)
graph.run(query4)
graph.run(query5)
graph.run(query6)
graph.run(query7)
graph.run(query8)


ClientError: [Procedure.ProcedureCallFailed] Failed to invoke procedure `gds.graph.drop`: Caused by: java.util.NoSuchElementException: Graph with name `myGraph1` does not exist on database `neo4j`. It might exist on another database.

In [55]:
# Make the in memory graphs for adding triangle counts and clustering coefficients
query1 = """
CALL gds.graph.project(
  'myGraph1',
  'Person',
  {
    INTERACTS_1: {
      orientation: 'UNDIRECTED'
    }
  }
)
"""

query2 = """
CALL gds.graph.project(
  'myGraph2',
  'Person',
  {
    INTERACTS_2: {
    orientation: 'UNDIRECTED'
}
}
)
"""
query3 = """
CALL gds.graph.project(
  'myGraph3',
  'Person',
  {
    INTERACTS_3: {
    orientation: 'UNDIRECTED'
}
  }
)
"""
query4 = """
CALL gds.graph.project(
  'myGraph4',
  'Person',
  {
    INTERACTS_4: {
    orientation: 'UNDIRECTED'
}
  }
)
"""
query5 = """
CALL gds.graph.project(
  'myGraph5',
  'Person',
  {
    INTERACTS_5: {
    orientation: 'UNDIRECTED'
}
  }
)
"""
graph.run(query1)
graph.run(query2)
graph.run(query3)
graph.run(query4)
graph.run(query5)


nodeProjection,relationshipProjection,graphName,nodeCount,relationshipCount,projectMillis
"{Person: {label: 'Person', properties: {}}}","{INTERACTS_5: {orientation: 'UNDIRECTED', indexInverse: false, aggregation: 'DEFAULT', type: 'INTERACTS_5', properties: {}}}",myGraph5,418,866,46


In [56]:
# Make the in memory graphs for adding triangle counts and clustering coefficients
query6 = """
CALL gds.graph.project(
  'myGraph6',
  'Person',
  {
    INTERACTS_6: {
      orientation: 'UNDIRECTED'
    }
  }
)
"""

query7 = """
CALL gds.graph.project(
  'myGraph7',
  'Person',
  {
    INTERACTS_7: {
      orientation: 'UNDIRECTED'
    }
  }
)
"""

query8 = """
CALL gds.graph.project(
  'myGraph8',
  'Person',
  {
    INTERACTS_8: {
      orientation: 'UNDIRECTED'
    }
  }
)
"""

graph.run(query6)
graph.run(query7)
graph.run(query8)


nodeProjection,relationshipProjection,graphName,nodeCount,relationshipCount,projectMillis
"{Person: {label: 'Person', properties: {}}}","{INTERACTS_8: {orientation: 'UNDIRECTED', indexInverse: false, aggregation: 'DEFAULT', type: 'INTERACTS_8', properties: {}}}",myGraph8,418,1200,39


In [57]:
query1 = """ 
CALL gds.triangleCount.write('myGraph1', {
  writeProperty: 'trianglesTrain1'
})
"""

query2 = """ 
CALL gds.triangleCount.write('myGraph2', {
  writeProperty: 'trianglesTrain2'
})
"""

query3 = """ 
CALL gds.triangleCount.write('myGraph3', {
  writeProperty: 'trianglesTrain3'
})
"""

query4 = """ 
CALL gds.triangleCount.write('myGraph4', {
  writeProperty: 'trianglesTrain4'
})
"""



graph.run(query1)
graph.run(query2)
graph.run(query3)
graph.run(query4)


writeMillis,nodePropertiesWritten,globalTriangleCount,nodeCount,postProcessingMillis,preProcessingMillis,computeMillis,configuration
14,418,1524,418,0,0,14,"{jobId: 'a386e335-e2ee-41ca-b75d-35cfb989f9f7', writeConcurrency: 4, writeProperty: 'trianglesTrain4', maxDegree: 9223372036854775807, logProgress: true, nodeLabels: ['*'], sudo: false, relationshipTypes: ['*'], concurrency: 4}"


In [58]:
query5 = """ 
CALL gds.triangleCount.write('myGraph5', {
  writeProperty: 'trianglesTest5'
})
"""

query6 = """ 
CALL gds.triangleCount.write('myGraph6', {
  writeProperty: 'trianglesTest6'
})
"""
query7 = """ 
CALL gds.triangleCount.write('myGraph7', {
  writeProperty: 'trianglesTest7'
})
"""
query8 = """ 
CALL gds.triangleCount.write('myGraph8', {
  writeProperty: 'trianglesTest8'
})
"""
graph.run(query5)
graph.run(query6)
graph.run(query7)
graph.run(query8)

writeMillis,nodePropertiesWritten,globalTriangleCount,nodeCount,postProcessingMillis,preProcessingMillis,computeMillis,configuration
15,418,3351,418,0,0,11,"{jobId: '4e5ed31c-7128-4799-90d1-cee889528bb6', writeConcurrency: 4, writeProperty: 'trianglesTest8', maxDegree: 9223372036854775807, logProgress: true, nodeLabels: ['*'], sudo: false, relationshipTypes: ['*'], concurrency: 4}"


In [59]:
query1 = """
CALL gds.localClusteringCoefficient.write('myGraph1', {
    writeProperty: 'coefficientTrain1'
});
"""

query2 = """
CALL gds.localClusteringCoefficient.write('myGraph2', {
    writeProperty: 'coefficientTrain2'
});
"""

query3 = """
CALL gds.localClusteringCoefficient.write('myGraph3', {
    writeProperty: 'coefficientTrain3'
});
"""

query4 = """
CALL gds.localClusteringCoefficient.write('myGraph4', {
    writeProperty: 'coefficientTrain4'
});
"""



graph.run(query1)
graph.run(query2)
graph.run(query3)
graph.run(query4)


writeMillis,nodePropertiesWritten,averageClusteringCoefficient,nodeCount,postProcessingMillis,preProcessingMillis,computeMillis,configuration
13,418,0.2821768603167667,418,0,0,23,"{jobId: '2d6655cd-095c-418e-82a6-f10549d98160', writeConcurrency: 4, triangleCountProperty: null, writeProperty: 'coefficientTrain4', logProgress: true, nodeLabels: ['*'], sudo: false, relationshipTypes: ['*'], concurrency: 4}"


In [60]:
query5 = """
CALL gds.localClusteringCoefficient.write('myGraph5', {
    writeProperty: 'coefficientTest5'
});
"""

query6 = """
CALL gds.localClusteringCoefficient.write('myGraph6', {
    writeProperty: 'coefficientTest6'
});
"""

query7 = """
CALL gds.localClusteringCoefficient.write('myGraph7', {
    writeProperty: 'coefficientTest7'
});
"""

query8 = """
CALL gds.localClusteringCoefficient.write('myGraph8', {
    writeProperty: 'coefficientTest8'
});
"""

graph.run(query5)
graph.run(query6)
graph.run(query7)
graph.run(query8)

writeMillis,nodePropertiesWritten,averageClusteringCoefficient,nodeCount,postProcessingMillis,preProcessingMillis,computeMillis,configuration
12,418,0.1246066499305612,418,0,0,24,"{jobId: '0b8fc18e-02e0-460a-914e-521829123a82', writeConcurrency: 4, triangleCountProperty: null, writeProperty: 'coefficientTest8', logProgress: true, nodeLabels: ['*'], sudo: false, relationshipTypes: ['*'], concurrency: 4}"


## Adding the features

In [61]:
def apply_triangles_features(data,triangles_prop,coefficient_prop):
    
    query = """
    UNWIND $pairs AS pair
    MATCH (p1:Person) WHERE p1.id = pair.node1
    MATCH (p2:Person) WHERE p2.id = pair.node2
    RETURN pair.node1 AS node1, 
    pair.node2 AS node2,
    apoc.coll.min([p1[$triangles], p2[$triangles]]) AS minTriangles,
    apoc.coll.max([p1[$triangles], p2[$triangles]]) AS maxTriangles,
    apoc.coll.min([p1[$coefficient], p2[$coefficient]]) AS minCoeff,
    apoc.coll.max([p1[$coefficient], p2[$coefficient]]) AS maxCoeff
    """
    

    pairs = [{"node1": str(pair[0]), "node2": str(pair[1])}  
          for pair in data[["node1", "node2"]].values.tolist()]
        
    params = {
        "pairs": pairs,
        "triangles": triangles_prop,
        "coefficient": coefficient_prop
        }
    
    features = graph.run(query,params).to_data_frame()
    
    return pd.merge(data, features, on = ["node1", "node2"])

In [62]:
train_season1_w = apply_triangles_features(train_season1_v, "trianglesTrain1", "coefficientTrain1")
train_season2_w = apply_triangles_features(train_season2_v, "trianglesTrain2", "coefficientTrain2")
train_season3_w = apply_triangles_features(train_season3_v, "trianglesTrain3", "coefficientTrain3")
train_season4_w = apply_triangles_features(train_season4_v, "trianglesTrain4", "coefficientTrain4")
# train_season5_w = apply_triangles_features(train_season5_v, "trianglesTrain5", "coefficientTrain5")

test_season5_w = apply_triangles_features(test_season5_v, "trianglesTest5", "coefficientTest5")
test_season6_w = apply_triangles_features(test_season6_v, "trianglesTest6", "coefficientTest6")
test_season7_w = apply_triangles_features(test_season7_v, "trianglesTest7", "coefficientTest7")
test_season8_w = apply_triangles_features(test_season8_v, "trianglesTest8", "coefficientTest8")

In [63]:
frames_training_w = [train_season1_w, train_season2_w,
                   train_season3_w, train_season4_w]
result_training_w = pd.concat(frames_training_w)
frames_test_w = [test_season5_w ,test_season6_w, test_season7_w, test_season8_w]
result_test_w = pd.concat(frames_test_w)


# Train Model

In [64]:
classifier2 = RandomForestClassifier(n_estimators=30, max_depth=10,
                                    random_state=0)

In [65]:
columns = ["cn", "pa", "tn","minTriangles", "maxTriangles", "minCoeff", "maxCoeff"]
X = result_training_w[columns]
y = result_training_w["label"]
classifier2.fit(X, y)

In [66]:
predictions = classifier2.predict(result_test_w[columns])
y_test = result_test_w["label"]
evaluate_model(predictions, y_test)


Unnamed: 0,metric,value
0,accuracy,0.950413
1,precision,0.990842
2,recall,0.956203


In [67]:
feature_importance(columns, classifier2)

Unnamed: 0,feature,value
0,tn,0.302258
1,maxCoeff,0.250045
2,maxTriangles,0.232329
3,cn,0.073742
4,pa,0.063268
5,minTriangles,0.042925
6,minCoeff,0.035433
