In [1]:
from py2neo import Graph
import pandas as pd
from sklearn.model_selection import train_test_split


graph = Graph("bolt://localhost:7687", auth=("neo4j", "neo4jneo4j"))

# Building testing and training set

## Training dataset

In [2]:
# Find positive examples
train_existing_links = graph.run("""
MATCH (n:Person)-[r:INTERACTS_1|INTERACTS_2|INTERACTS_3|INTERACTS_4]-(p:Person)
RETURN n.id AS node1, p.id AS node2, 1 AS label, r.season AS season
""").to_data_frame()

In [3]:
train_existing_links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4494 entries, 0 to 4493
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   node1   4494 non-null   object
 1   node2   4494 non-null   object
 2   label   4494 non-null   int64 
 3   season  4494 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 140.6+ KB


In [4]:
# Find negative examples
train_missing_links = graph.run("""
MATCH (n:Person)
WHERE (n:Person)-[:INTERACTS_1|INTERACTS_2|INTERACTS_3|INTERACTS_4]-()
MATCH (n:Person)-[r:INTERACTS_1|INTERACTS_2|INTERACTS_3|INTERACTS_4*2..3]-(p:Person)
WHERE not((n:Person)-[:INTERACTS_1|INTERACTS_2|INTERACTS_3|INTERACTS_4]-(p:Person))
RETURN n.id AS node1, p.id AS node2, 0 AS label
""").to_data_frame()

In [5]:
train_missing_links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6408824 entries, 0 to 6408823
Data columns (total 3 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   node1   object
 1   node2   object
 2   label   int64 
dtypes: int64(1), object(2)
memory usage: 146.7+ MB


In [6]:
import random
randomlist = []
for i in range(0,6408824):
    n = random.randint(1,4)
    randomlist.append(n)
#print(randomlist)
train_missing_links['season']=randomlist
train_missing_links.head(5)

Unnamed: 0,node1,node2,label,season
0,ADDAM_MARBRAND,TYRION,0,1
1,ADDAM_MARBRAND,BRONN,0,3
2,ADDAM_MARBRAND,SHAE,0,2
3,ADDAM_MARBRAND,JON,0,1
4,ADDAM_MARBRAND,CATELYN,0,3


In [7]:
# Remove duplicates
train_missing_links = train_missing_links.drop_duplicates()

In [8]:
train_missing_links.info()

<class 'pandas.core.frame.DataFrame'>
Index: 233694 entries, 0 to 6408822
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   node1   233694 non-null  object
 1   node2   233694 non-null  object
 2   label   233694 non-null  int64 
 3   season  233694 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 8.9+ MB


In [9]:
# Down sample negative examples
train_missing_links = train_missing_links.sample(
    n=len(train_existing_links))

In [10]:
train_missing_links.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4494 entries, 5120244 to 5025416
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   node1   4494 non-null   object
 1   node2   4494 non-null   object
 2   label   4494 non-null   int64 
 3   season  4494 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 175.5+ KB


In [11]:
# Create DataFrame from positive and negative examples
training_df = pd.concat([train_missing_links,train_existing_links], ignore_index=True)
training_df['label'] = training_df['label'].astype('category')

In [12]:
training_df

Unnamed: 0,node1,node2,label,season
0,POLLIVER,FARMER,0,2
1,LANCEL,CATELYN,0,2
2,MORGAN,LORD_OF_BONES,0,4
3,GARED,YGRITTE,0,1
4,LORD_OF_BONES,ROYCE,0,3
...,...,...,...,...
8983,YOHN_ROYCE,ANYA_WAYNWOOD,1,4
8984,YOHN_ROYCE,LYSA,1,4
8985,YOHN_ROYCE,VANCE_CORBRAY,1,4
8986,YOHN_ROYCE,NED,1,4


In [13]:
training_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8988 entries, 0 to 8987
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   node1   8988 non-null   object  
 1   node2   8988 non-null   object  
 2   label   8988 non-null   category
 3   season  8988 non-null   int64   
dtypes: category(1), int64(1), object(2)
memory usage: 219.7+ KB


In [14]:
#Checking if there are indeed labels with the positive category
training_df[training_df['label'] == 1]

Unnamed: 0,node1,node2,label,season
4494,ADDAM_MARBRAND,TYWIN,1,1
4495,ADDAM_MARBRAND,LEO_LEFFORD,1,1
4496,ADDAM_MARBRAND,KEVAN,1,1
4497,AEGON,SHIREEN,1,3
4498,AEGON,DAVOS,1,3
...,...,...,...,...
8983,YOHN_ROYCE,ANYA_WAYNWOOD,1,4
8984,YOHN_ROYCE,LYSA,1,4
8985,YOHN_ROYCE,VANCE_CORBRAY,1,4
8986,YOHN_ROYCE,NED,1,4


In [15]:
#Checking if there are indeed labels with the negative category
training_df[training_df['label'] == 0]

Unnamed: 0,node1,node2,label,season
0,POLLIVER,FARMER,0,2
1,LANCEL,CATELYN,0,2
2,MORGAN,LORD_OF_BONES,0,4
3,GARED,YGRITTE,0,1
4,LORD_OF_BONES,ROYCE,0,3
...,...,...,...,...
4489,TOMMY,DAISY,0,2
4490,BENJEN,WILLEM_LANNISTER,0,1
4491,BLACK_WALDER,RHAEGAR,0,2
4492,SORCERER,LANCEL,0,1


## Validation dataset

In [16]:
# Find positive examples
validation_existing_links = graph.run("""
MATCH (n:Person)-[r:INTERACTS_5|INTERACTS_6]-(p:Person)
RETURN n.id AS node1, p.id AS node2, 1 AS label, r.season AS season
""").to_data_frame()


In [17]:
validation_existing_links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2020 entries, 0 to 2019
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   node1   2020 non-null   object
 1   node2   2020 non-null   object
 2   label   2020 non-null   int64 
 3   season  2020 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 63.3+ KB


In [18]:
# Find negative examples
validation_missing_links = graph.run("""
MATCH (n:Person)
WHERE (n:Person)-[:INTERACTS_5|INTERACTS_6]-()
MATCH (n:Person)-[r:INTERACTS_5|INTERACTS_6*2..3]-(p:Person)
WHERE not((n:Person)-[:INTERACTS_5|INTERACTS_6]-(p:Person))
RETURN n.id AS node1, p.id AS node2, 0 AS label
""").to_data_frame()


In [19]:
validation_missing_links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 578560 entries, 0 to 578559
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   node1   578560 non-null  object
 1   node2   578560 non-null  object
 2   label   578560 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 13.2+ MB


In [20]:
randomlist = []
for i in range(0, 578560):
    n = random.randint(5, 6)
    randomlist.append(n)
# print(randomlist)
validation_missing_links['season'] = randomlist
validation_missing_links.head(5)

Unnamed: 0,node1,node2,label,season
0,AEGON,SAM,0,5
1,AEGON,JON,0,6
2,AEGON,RANDYLL,0,5
3,AEGON,MANCE,0,6
4,AEGON,MAGNAR,0,5


In [21]:
validation_missing_links = validation_missing_links.drop_duplicates()

In [22]:
validation_missing_links.info()

<class 'pandas.core.frame.DataFrame'>
Index: 38098 entries, 0 to 578449
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   node1   38098 non-null  object
 1   node2   38098 non-null  object
 2   label   38098 non-null  int64 
 3   season  38098 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 1.5+ MB


In [23]:
# Down sample negative examples
validation_missing_links = validation_missing_links.sample(n=len(validation_existing_links))

In [24]:
validation_missing_links.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2020 entries, 546091 to 351224
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   node1   2020 non-null   object
 1   node2   2020 non-null   object
 2   label   2020 non-null   int64 
 3   season  2020 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 78.9+ KB


In [25]:
# Create DataFrame from positive and negative examples
validation_df = pd.concat(
    [validation_missing_links, validation_existing_links], ignore_index=True)
validation_df['label'] = validation_df['label'].astype('category')

In [26]:
validation_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4040 entries, 0 to 4039
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   node1   4040 non-null   object  
 1   node2   4040 non-null   object  
 2   label   4040 non-null   category
 3   season  4040 non-null   int64   
dtypes: category(1), int64(1), object(2)
memory usage: 98.9+ KB


## Testing dataset

In [27]:
# Find positive examples
test_existing_links = graph.run("""
MATCH (n:Person)-[r:INTERACTS_7|INTERACTS_8]-(p:Person)
RETURN n.id AS node1, p.id AS node2, 1 AS label, r.season AS season
""").to_data_frame()

In [28]:
test_existing_links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2090 entries, 0 to 2089
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   node1   2090 non-null   object
 1   node2   2090 non-null   object
 2   label   2090 non-null   int64 
 3   season  2090 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 65.4+ KB


In [29]:
# Find negative examples
test_missing_links = graph.run("""
MATCH (n:Person)
WHERE (n:Person)-[:INTERACTS_7|INTERACTS_8]-()
MATCH (n:Person)-[r:INTERACTS_7|INTERACTS_8*2..3]-(p:Person)
WHERE not((n:Person)-[:INTERACTS_7|INTERACTS_8]-(p:Person))
RETURN n.id AS node1, p.id AS node2, 0 AS label
""").to_data_frame()

In [30]:
test_missing_links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500008 entries, 0 to 1500007
Data columns (total 3 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   node1   1500008 non-null  object
 1   node2   1500008 non-null  object
 2   label   1500008 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 34.3+ MB


In [31]:
randomlist = []
for i in range(0, 1500008):
    n = random.randint(7,8)
    randomlist.append(n)
#print(randomlist)
test_missing_links['season']=randomlist
test_missing_links.head(5)

Unnamed: 0,node1,node2,label,season
0,AEGON,CERSEI,0,8
1,AEGON,AERYS,0,8
2,AEGON,BALERION,0,7
3,AEGON,BRIENNE,0,7
4,AEGON,BRONN,0,8


In [32]:
# Remove duplicates 
test_missing_links = test_missing_links.drop_duplicates()

In [33]:
test_missing_links.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16452 entries, 0 to 1500004
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   node1   16452 non-null  object
 1   node2   16452 non-null  object
 2   label   16452 non-null  int64 
 3   season  16452 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 642.7+ KB


In [34]:
# Down sample negative examples
test_missing_links = test_missing_links.sample(n=len(test_existing_links))

In [35]:
test_missing_links.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2090 entries, 709243 to 892793
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   node1   2090 non-null   object
 1   node2   2090 non-null   object
 2   label   2090 non-null   int64 
 3   season  2090 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 81.6+ KB


In [36]:
# Create DataFrame from positive and negative examples
test_df = pd.concat([test_missing_links, test_existing_links], ignore_index=True)
test_df['label'] = test_df['label'].astype('category')

In [37]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4180 entries, 0 to 4179
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   node1   4180 non-null   object  
 1   node2   4180 non-null   object  
 2   label   4180 non-null   category
 3   season  4180 non-null   int64   
dtypes: category(1), int64(1), object(2)
memory usage: 102.3+ KB


In [38]:
#Checking if there are indeed labels with the positive category
test_df[test_df['label'] == 1]

Unnamed: 0,node1,node2,label,season
2090,AEGON,DAENERYS,1,8
2091,AEGON,DAENERYS,1,7
2092,AERYS,JON,1,8
2093,AERYS,DAENERYS,1,8
2094,AERYS,SANSA,1,7
...,...,...,...,...
4175,WILLA,SARRA,1,8
4176,WILLA,SANSA,1,8
4177,WILLA,HOUND,1,8
4178,WILLA,TORMUND,1,8


In [39]:
#Checking if there are indeed labels with the negative category
test_df[test_df['label'] == 0]

Unnamed: 0,node1,node2,label,season
0,EDDISON_TOLLETT,VALE_LORD,0,7
1,HOT_PIE,NED,0,7
2,UNSULLIED_CAPTAIN,BRAN,0,7
3,JORAH,GILLY,0,7
4,ALANNA,BERIC,0,8
...,...,...,...,...
2085,STANNIS,MELISANDRE,0,8
2086,WALDER,TOMMEN,0,8
2087,YOHN_ROYCE,NORA,0,7
2088,LYSA,HOUND,0,7


## Generating link prediction features

In [40]:
def apply_graphy_features(data, rel_type):
    query = """
    UNWIND $pairs AS pair
    MATCH (p1) WHERE p1.id = pair.node1
    MATCH (p2) WHERE p2.id = pair.node2
    RETURN pair.node1 AS node1,
           pair.node2 AS node2,
           gds.alpha.linkprediction.commonNeighbors(
               p1, p2, {relationshipQuery: $relType}) AS cn,
           gds.alpha.linkprediction.preferentialAttachment(
               p1, p2, {relationshipQuery: $relType}) AS pa,
           gds.alpha.linkprediction.totalNeighbors(
               p1, p2, {relationshipQuery: $relType}) AS tn
    """
    pairs = [{"node1": node1, "node2": node2}  for node1,node2 in data[["node1", "node2"]].values.tolist()]
    params = {"pairs": pairs, "relType": rel_type}
    
    features = graph.run(query, params).to_data_frame()
    return pd.merge(data, features, on = ["node1", "node2"])

In [41]:
train_season1 = training_df[training_df['season'] == 1]
train_season2 = training_df[training_df['season'] == 2]
train_season3 = training_df[training_df['season'] == 3]
train_season4 = training_df[training_df['season'] == 4]

In [42]:
validation_season5 = validation_df[validation_df['season'] == 5]
validation_season6 = validation_df[validation_df['season'] == 6]

test_season7 = test_df[test_df['season'] == 7]
test_season8 = test_df[test_df['season'] == 8]

In [43]:
train_season1_v = apply_graphy_features(train_season1, "INTERACTS_1")
train_season2_v = apply_graphy_features(train_season2, "INTERACTS_2")
train_season3_v= apply_graphy_features(train_season3, "INTERACTS_3")
train_season4_v= apply_graphy_features(train_season4, "INTERACTS_4")
# train_season5_v= apply_graphy_features(train_season5, "INTERACTS_5")

In [44]:
train_season1_v.sample(5)

Unnamed: 0,node1,node2,label,season,cn,pa,tn
88,QHORIN,WINTERFELL_SHEPHERD,0,1,0.0,0.0,0.0
1192,JOYEUSE,GRENN,0,1,0.0,8.0,9.0
454,PROTESTER,QHORIN,0,1,0.0,0.0,0.0
2060,ROBERT,NED,1,1,31.0,2052.0,62.0
1785,MAESTER_AEMON,AERYS,1,1,1.0,117.0,21.0


In [45]:
###NOTE: AFTER APPLYING THE FUNCTION, NUMBER OF ROWS INCREASES! ==> why?
train_season1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2215 entries, 3 to 7663
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   node1   2215 non-null   object  
 1   node2   2215 non-null   object  
 2   label   2215 non-null   category
 3   season  2215 non-null   int64   
dtypes: category(1), int64(1), object(2)
memory usage: 71.5+ KB


In [46]:
train_season1_v.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2366 entries, 0 to 2365
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   node1   2366 non-null   object  
 1   node2   2366 non-null   object  
 2   label   2366 non-null   category
 3   season  2366 non-null   int64   
 4   cn      2366 non-null   float64 
 5   pa      2366 non-null   float64 
 6   tn      2366 non-null   float64 
dtypes: category(1), float64(3), int64(1), object(2)
memory usage: 113.5+ KB


In [47]:
train_season2_v.sample(5)

Unnamed: 0,node1,node2,label,season,cn,pa,tn
492,ILLYRIO,LYSA,0,2,0.0,0.0,1.0
1349,GENDRY,ARYA,1,2,9.0,270.0,28.0
72,LORAS,DOREAH,0,2,0.0,80.0,21.0
1062,ROS,TYSHA,0,2,0.0,0.0,6.0
942,FALYSE,MACE,0,2,0.0,0.0,0.0


In [48]:
train_season3_v.sample(5)

Unnamed: 0,node1,node2,label,season,cn,pa,tn
251,DAGMER,ALLISER_THORNE,0,3,0.0,0.0,0.0
287,QHORIN,RODRIK,0,3,0.0,0.0,0.0
1772,STANNIS,ROBERT,1,3,4.0,168.0,22.0
2134,MERO,GREY_WORM,1,3,4.0,36.0,8.0
1967,LORD_OF_BONES,YGRITTE,1,3,3.0,52.0,14.0


In [49]:
train_season4_v.sample(5)

Unnamed: 0,node1,node2,label,season,cn,pa,tn
394,VARDIS_EGEN,BRAN,0,4,0.0,0.0,15.0
1459,ALLISER_THORNE,MAESTER_AEMON,1,4,7.0,210.0,22.0
958,MOUNTAIN,LUTHOR,0,4,0.0,0.0,2.0
2760,PODRICK,HOT_PIE,1,4,2.0,51.0,18.0
2457,SYRIO_FOREL,HOUND,1,4,2.0,63.0,22.0


In [50]:
# train_season5_v.sample(5)

In [51]:
validation_season5_v = apply_graphy_features(validation_season5, "INTERACTS_5")
validation_season6_v = apply_graphy_features(validation_season6, 'INTERACTS_6')
test_season7_v = apply_graphy_features(test_season7, "INTERACTS_7")
test_season8_v = apply_graphy_features(test_season8, "INTERACTS_8")

In [52]:
validation_season6_v.sample(5)

Unnamed: 0,node1,node2,label,season,cn,pa,tn
1216,ALLISER_THORNE,MELISANDRE,1,6,7.0,135.0,17.0
278,LORAS,KARSI,0,6,0.0,0.0,10.0
1208,ALLISER_THORNE,MELISANDRE,1,6,7.0,135.0,17.0
241,YOHN_ROYCE,MAESTER_WOLKAN,0,6,0.0,12.0,7.0
730,WUN_WUN,ROBERT,0,6,0.0,78.0,19.0


In [53]:
test_season7_v.sample(5)

Unnamed: 0,node1,node2,label,season,cn,pa,tn
1066,IRONBORN_LORD,PODRICK,0,7,0.0,0.0,19.0
1482,JON,MOUNTAIN,1,7,10.0,736.0,52.0
412,LITTLE_SAM,JOANNA,0,7,0.0,2.0,3.0
493,OBERYN,RAMSAY,0,7,0.0,5.0,6.0
155,YOHN_ROYCE,LYSA,0,7,1.0,24.0,13.0


In [54]:
#Combining all seasons for trainign set and testing set
frames_training = [train_season1_v, train_season2_v, train_season3_v, train_season4_v]
result_training = pd.concat(frames_training)
result_training = result_training.sample(frac=1)
frames_validation = [validation_season5_v, validation_season6_v]
result_validation = pd.concat(frames_validation)
result_validation = result_validation.sample(frac=1)
frames_test = [test_season7_v, test_season8_v]
result_test = pd.concat(frames_test)
result_test = result_test.sample(frac=1)

In [55]:
result_training

Unnamed: 0,node1,node2,label,season,cn,pa,tn
632,TOMARD,MATTHOS,0,1,0.0,0.0,3.0
752,LEO_LEFFORD,ENDREW,0,4,0.0,0.0,2.0
1874,TYWIN,TYRION,1,2,13.0,825.0,45.0
910,FREY_DAUGHTER,BLACK_WALDER,0,2,0.0,0.0,1.0
489,TYRION,MISSANDEI,0,4,0.0,312.0,47.0
...,...,...,...,...,...,...,...
1224,ALLISER_THORNE,MAESTER_AEMON,1,1,0.0,0.0,9.0
2114,MACE,OBERYN,1,4,14.0,442.0,29.0
1835,TYWIN,MARGAERY,1,3,10.0,377.0,32.0
2094,PODRICK,BRONN,1,2,9.0,165.0,17.0


In [56]:
result_validation

Unnamed: 0,node1,node2,label,season,cn,pa,tn
785,STANNIS,YARA,0,6,1.0,63.0,15.0
2517,TYCHO,MACE,1,5,4.0,60.0,12.0
1758,LYANNA,BRAN,1,6,4.0,90.0,19.0
2892,TALLA,MELESSA,1,6,5.0,36.0,7.0
761,MAESTER_WOLKAN,MANDERLY,0,6,0.0,16.0,8.0
...,...,...,...,...,...,...,...
1512,JAIME,LOLLYS,1,5,1.0,51.0,19.0
753,BOBONO,MAESTER_WOLKAN,0,6,0.0,52.0,17.0
1769,LYANNA,BENJEN,1,6,4.0,40.0,9.0
744,MELARA,IZEMBARO,0,6,0.0,0.0,13.0


In [57]:
result_test

Unnamed: 0,node1,node2,label,season,cn,pa,tn
1823,SANSA,DAVOS,1,7,13.0,725.0,41.0
1146,ARYA,MOUNTAIN,1,8,7.0,410.0,44.0
64,CERSEIS_BABY,ROBERT,0,8,1.0,24.0,10.0
1809,SAM,STANNIS,1,7,0.0,12.0,13.0
1738,NED,JOFFREY,1,7,6.0,176.0,21.0
...,...,...,...,...,...,...,...
1989,VARYS,SAM,1,8,27.0,1530.0,52.0
916,DROGO,JEOR,0,7,2.0,12.0,5.0
1223,BERIC,NIGHT_KING,1,8,5.0,174.0,30.0
2353,OBARA,THEON,1,7,5.0,150.0,26.0


# Choosing Random Forest Classifier

In [58]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=30, max_depth=10,  
                                    random_state=0)

## Train your model

In [59]:
columns = ["cn", "pa", "tn"]
X = result_training[columns]
y = result_training["label"]
classifier.fit(X, y)

# Evaluation

In [60]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
def evaluate_model(predictions, actual):
    accuracy = accuracy_score(actual, predictions)
    precision = precision_score(actual, predictions)
    recall = recall_score(actual, predictions)
    
    metrics = ["accuracy", "precision", "recall"]
    values = [accuracy, precision, recall]    
    return pd.DataFrame(data={'metric': metrics, 'value': values})
def feature_importance(columns, classifier):        
    features = list(zip(columns, classifier.feature_importances_))
    sorted_features = sorted(features, key = lambda x: x[1]*-1)
    
    keys = [value[0] for value in sorted_features]
    values = [value[1] for value in sorted_features]
    return pd.DataFrame(data={'feature': keys, 'value': values})

In [61]:
predictions = classifier.predict(result_test[columns])
y_test = result_test["label"]
evaluate_model(predictions, y_test)

Unnamed: 0,metric,value
0,accuracy,0.869005
1,precision,0.823014
2,recall,0.981699


In [62]:
feature_importance(columns, classifier)

Unnamed: 0,feature,value
0,cn,0.48699
1,pa,0.329295
2,tn,0.183715


# Introducing more features (Triangles and The Clustering Coefficient)

## Calculating the Triangle count

In [63]:
query1 = """
CALL gds.graph.drop('myGraph1') YIELD graphName;
"""

query2 = """
CALL gds.graph.drop('myGraph2') YIELD graphName;
"""

query3 = """
CALL gds.graph.drop('myGraph3') YIELD graphName;
"""

query4 = """
CALL gds.graph.drop('myGraph4') YIELD graphName;
"""

query5 = """
CALL gds.graph.drop('myGraph5') YIELD graphName;
"""

query6 = """
CALL gds.graph.drop('myGraph6') YIELD graphName;
"""

query7 = """
CALL gds.graph.drop('myGraph7') YIELD graphName;
"""

query8 = """
CALL gds.graph.drop('myGraph8') YIELD graphName;
"""

graph.run(query1)
graph.run(query2)
graph.run(query3)
graph.run(query4)
graph.run(query5)
graph.run(query6)
graph.run(query7)
graph.run(query8)


graphName
myGraph8


In [64]:
# Make the in memory graphs for adding triangle counts and clustering coefficients
query1 = """
CALL gds.graph.project(
  'myGraph1',
  'Person',
  {
    INTERACTS_1: {
      orientation: 'UNDIRECTED'
    }
  }
)
"""

query2 = """
CALL gds.graph.project(
  'myGraph2',
  'Person',
  {
    INTERACTS_2: {
    orientation: 'UNDIRECTED'
}
}
)
"""
query3 = """
CALL gds.graph.project(
  'myGraph3',
  'Person',
  {
    INTERACTS_3: {
    orientation: 'UNDIRECTED'
}
  }
)
"""
query4 = """
CALL gds.graph.project(
  'myGraph4',
  'Person',
  {
    INTERACTS_4: {
    orientation: 'UNDIRECTED'
}
  }
)
"""
query5 = """
CALL gds.graph.project(
  'myGraph5',
  'Person',
  {
    INTERACTS_5: {
    orientation: 'UNDIRECTED'
}
  }
)
"""
graph.run(query1)
graph.run(query2)
graph.run(query3)
graph.run(query4)
graph.run(query5)


nodeProjection,relationshipProjection,graphName,nodeCount,relationshipCount,projectMillis
"{Person: {label: 'Person', properties: {}}}","{INTERACTS_5: {orientation: 'UNDIRECTED', indexInverse: false, aggregation: 'DEFAULT', type: 'INTERACTS_5', properties: {}}}",myGraph5,418,866,13


In [65]:
# Make the in memory graphs for adding triangle counts and clustering coefficients
query6 = """
CALL gds.graph.project(
  'myGraph6',
  'Person',
  {
    INTERACTS_6: {
      orientation: 'UNDIRECTED'
    }
  }
)
"""

query7 = """
CALL gds.graph.project(
  'myGraph7',
  'Person',
  {
    INTERACTS_7: {
      orientation: 'UNDIRECTED'
    }
  }
)
"""

query8 = """
CALL gds.graph.project(
  'myGraph8',
  'Person',
  {
    INTERACTS_8: {
      orientation: 'UNDIRECTED'
    }
  }
)
"""

graph.run(query6)
graph.run(query7)
graph.run(query8)


nodeProjection,relationshipProjection,graphName,nodeCount,relationshipCount,projectMillis
"{Person: {label: 'Person', properties: {}}}","{INTERACTS_8: {orientation: 'UNDIRECTED', indexInverse: false, aggregation: 'DEFAULT', type: 'INTERACTS_8', properties: {}}}",myGraph8,418,1200,15


In [66]:
query1 = """ 
CALL gds.triangleCount.write('myGraph1', {
  writeProperty: 'trianglesTrain1'
})
"""

query2 = """ 
CALL gds.triangleCount.write('myGraph2', {
  writeProperty: 'trianglesTrain2'
})
"""

query3 = """ 
CALL gds.triangleCount.write('myGraph3', {
  writeProperty: 'trianglesTrain3'
})
"""

query4 = """ 
CALL gds.triangleCount.write('myGraph4', {
  writeProperty: 'trianglesTrain4'
})
"""



graph.run(query1)
graph.run(query2)
graph.run(query3)
graph.run(query4)


writeMillis,nodePropertiesWritten,globalTriangleCount,nodeCount,postProcessingMillis,preProcessingMillis,computeMillis,configuration
4,418,1524,418,0,0,3,"{jobId: '9aee1723-7945-4cad-b3ca-ac26281539a0', writeConcurrency: 4, writeProperty: 'trianglesTrain4', maxDegree: 9223372036854775807, logProgress: true, nodeLabels: ['*'], sudo: false, relationshipTypes: ['*'], concurrency: 4}"


In [67]:
query5 = """ 
CALL gds.triangleCount.write('myGraph5', {
  writeProperty: 'trianglesTest5'
})
"""

query6 = """ 
CALL gds.triangleCount.write('myGraph6', {
  writeProperty: 'trianglesTest6'
})
"""
query7 = """ 
CALL gds.triangleCount.write('myGraph7', {
  writeProperty: 'trianglesTest7'
})
"""
query8 = """ 
CALL gds.triangleCount.write('myGraph8', {
  writeProperty: 'trianglesTest8'
})
"""
graph.run(query5)
graph.run(query6)
graph.run(query7)
graph.run(query8)

writeMillis,nodePropertiesWritten,globalTriangleCount,nodeCount,postProcessingMillis,preProcessingMillis,computeMillis,configuration
4,418,3351,418,0,0,4,"{jobId: '79039b8c-c30b-445b-ae94-da2ba5a0e35f', writeConcurrency: 4, writeProperty: 'trianglesTest8', maxDegree: 9223372036854775807, logProgress: true, nodeLabels: ['*'], sudo: false, relationshipTypes: ['*'], concurrency: 4}"


In [68]:
query1 = """
CALL gds.localClusteringCoefficient.write('myGraph1', {
    writeProperty: 'coefficientTrain1'
});
"""

query2 = """
CALL gds.localClusteringCoefficient.write('myGraph2', {
    writeProperty: 'coefficientTrain2'
});
"""

query3 = """
CALL gds.localClusteringCoefficient.write('myGraph3', {
    writeProperty: 'coefficientTrain3'
});
"""

query4 = """
CALL gds.localClusteringCoefficient.write('myGraph4', {
    writeProperty: 'coefficientTrain4'
});
"""



graph.run(query1)
graph.run(query2)
graph.run(query3)
graph.run(query4)


writeMillis,nodePropertiesWritten,averageClusteringCoefficient,nodeCount,postProcessingMillis,preProcessingMillis,computeMillis,configuration
5,418,0.2821768603167667,418,0,0,10,"{jobId: '76dc8d0b-8778-44d4-8b5d-ddbf7680e539', writeConcurrency: 4, triangleCountProperty: null, writeProperty: 'coefficientTrain4', logProgress: true, nodeLabels: ['*'], sudo: false, relationshipTypes: ['*'], concurrency: 4}"


In [69]:
query5 = """
CALL gds.localClusteringCoefficient.write('myGraph5', {
    writeProperty: 'coefficientTest5'
});
"""

query6 = """
CALL gds.localClusteringCoefficient.write('myGraph6', {
    writeProperty: 'coefficientTest6'
});
"""

query7 = """
CALL gds.localClusteringCoefficient.write('myGraph7', {
    writeProperty: 'coefficientTest7'
});
"""

query8 = """
CALL gds.localClusteringCoefficient.write('myGraph8', {
    writeProperty: 'coefficientTest8'
});
"""

graph.run(query5)
graph.run(query6)
graph.run(query7)
graph.run(query8)

writeMillis,nodePropertiesWritten,averageClusteringCoefficient,nodeCount,postProcessingMillis,preProcessingMillis,computeMillis,configuration
3,418,0.1246066499305612,418,0,0,9,"{jobId: 'da3db338-ae9f-43a9-8648-c6da0a9164dc', writeConcurrency: 4, triangleCountProperty: null, writeProperty: 'coefficientTest8', logProgress: true, nodeLabels: ['*'], sudo: false, relationshipTypes: ['*'], concurrency: 4}"


## Adding the features

In [70]:
def apply_triangles_features(data,triangles_prop,coefficient_prop):
    
    query = """
    UNWIND $pairs AS pair
    MATCH (p1:Person) WHERE p1.id = pair.node1
    MATCH (p2:Person) WHERE p2.id = pair.node2
    RETURN pair.node1 AS node1, 
    pair.node2 AS node2,
    apoc.coll.min([p1[$triangles], p2[$triangles]]) AS minTriangles,
    apoc.coll.max([p1[$triangles], p2[$triangles]]) AS maxTriangles,
    apoc.coll.min([p1[$coefficient], p2[$coefficient]]) AS minCoeff,
    apoc.coll.max([p1[$coefficient], p2[$coefficient]]) AS maxCoeff
    """
    

    pairs = [{"node1": str(pair[0]), "node2": str(pair[1])}  
          for pair in data[["node1", "node2"]].values.tolist()]
        
    params = {
        "pairs": pairs,
        "triangles": triangles_prop,
        "coefficient": coefficient_prop
        }
    
    features = graph.run(query,params).to_data_frame()
    
    return pd.merge(data, features, on = ["node1", "node2"])

In [71]:
train_season1_w = apply_triangles_features(train_season1_v, "trianglesTrain1", "coefficientTrain1")
train_season2_w = apply_triangles_features(train_season2_v, "trianglesTrain2", "coefficientTrain2")
train_season3_w = apply_triangles_features(train_season3_v, "trianglesTrain3", "coefficientTrain3")
train_season4_w = apply_triangles_features(train_season4_v, "trianglesTrain4", "coefficientTrain4")
# train_season5_w = apply_triangles_features(train_season5_v, "trianglesTrain5", "coefficientTrain5")

validation_season5_w = apply_triangles_features(validation_season5_v, "trianglesTest5", "coefficientTest5")
validation_season6_w = apply_triangles_features(validation_season6_v, "trianglesTest6", "coefficientTest6")
test_season7_w = apply_triangles_features(test_season7_v, "trianglesTest7", "coefficientTest7")
test_season8_w = apply_triangles_features(test_season8_v, "trianglesTest8", "coefficientTest8")

In [72]:
frames_training_w = [train_season1_w, train_season2_w,
                   train_season3_w, train_season4_w]
result_training_w = pd.concat(frames_training_w)
result_training_w = result_training_w.sample(frac=1).reset_index(drop=True)
frames_validation_w = [validation_season5_w, validation_season6_w]
result_validation_w = pd.concat(frames_validation_w)
result_validation_w = result_validation_w.sample(frac=1).reset_index(drop=True)
frames_test_w = [test_season7_w, test_season8_w]
result_test_w = pd.concat(frames_test_w)
result_test_w = result_test_w.sample(frac=1).reset_index(drop=True)

# Train Model

In [79]:
from sklearn.model_selection import RandomizedSearchCV

n_estimators = [10,20,30,40,50,60,70,80,90,100]
max_depth = [2,3,4,5,6,7,8,9,10]
min_samples_split = [2,3,4,5,6,7,8,9,10]
min_samples_leaf = [1,2,3,4,5,6,7,8,9,10]
max_features = ['auto', 'sqrt', 'log2']
bootstrap = [True, False]
criterion = ['gini', 'entropy']

param_grid = {'n_estimators': n_estimators,
                'max_depth': max_depth,
                'min_samples_split': min_samples_split,
                'min_samples_leaf': min_samples_leaf,
                'max_features': max_features,
                'bootstrap': bootstrap,
                'criterion': criterion}

rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = param_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(result_training_w[["cn", "pa", "tn", "minTriangles",
              "maxTriangles", "minCoeff", "maxCoeff"]], result_training_w['label'])

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [82]:
print(rf_random.best_params_)
print(rf_random.best_score_)
print(rf_random.best_estimator_)
print(rf_random.best_estimator_.feature_importances_)


{'n_estimators': 20, 'min_samples_split': 4, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 10, 'criterion': 'gini', 'bootstrap': False}
0.9917799523721672
RandomForestClassifier(bootstrap=False, max_depth=10, min_samples_leaf=2,
                       min_samples_split=4, n_estimators=20)
[0.0891082  0.06680963 0.27007868 0.05533929 0.28572711 0.03056476
 0.20237233]
1
[0 1]
2


In [73]:
classifier2 = RandomForestClassifier(n_estimators=30, max_depth=10,
                                    random_state=0)

In [74]:
columns = ["cn", "pa", "tn","minTriangles", "maxTriangles", "minCoeff", "maxCoeff"]
X = result_training_w[columns]
y = result_training_w["label"]
classifier2.fit(X, y)

In [75]:
predictions = classifier2.predict(result_test_w[columns])
y_test = result_test_w["label"]
evaluate_model(predictions, y_test)


Unnamed: 0,metric,value
0,accuracy,0.941644
1,precision,0.948067
2,recall,0.985573


In [76]:
feature_importance(columns, classifier2)

Unnamed: 0,feature,value
0,tn,0.281437
1,maxCoeff,0.266709
2,maxTriangles,0.254681
3,cn,0.072159
4,pa,0.062295
5,minTriangles,0.040329
6,minCoeff,0.022391
