In [1]:
!pip install neo4j node2vec networkx

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
from neo4j import GraphDatabase

import os

# Connection settings for the Neo4j instance. Each value can be overridden via
# an environment variable so credentials do not have to be edited (or
# committed) in the notebook; the fallbacks are the original hardcoded values.
# NOTE(review): the fallback password looks like a throwaway sandbox
# credential -- confirm, and drop the default if it is not.
url = os.environ.get('NEO4J_URI', 'bolt://44.193.1.247:7687')
username = os.environ.get('NEO4J_USERNAME', 'neo4j')
password = os.environ.get('NEO4J_PASSWORD', 'fund-circulation-morale')

# Connect to Neo4j
driver = GraphDatabase.driver(url, auth=(username, password))

In [3]:
import pandas as pd

def run_query(query, params=None):
    """Run a Cypher query and return the result as a pandas DataFrame.

    Args:
        query: Cypher statement to execute.
        params: Optional dict of query parameters (referenced as ``$name``
            in the Cypher text). Defaults to ``None`` (no parameters), which
            keeps existing single-argument calls working unchanged.

    Returns:
        A DataFrame with one row per returned record and one column per
        result key. Statements that return nothing yield an empty frame.
    """
    with driver.session() as session:
        result = session.run(query, params)
        # Drain the records while the session is still open; the result
        # cannot be read once the session has been closed.
        rows = [record.values() for record in result]
        columns = result.keys()
    return pd.DataFrame(rows, columns=columns)

In [4]:
# Ask the server for the installed Graph Data Science (GDS) library version.
run_query("""
RETURN gds.version() 
""")

Unnamed: 0,gds.version()
0,2.0.1


In [5]:
# Set the server's dbms.transaction.timeout setting to '0'.
# NOTE(review): presumably '0' disables the timeout so the long-running
# imports below are not aborted -- confirm against the Neo4j configuration docs.
run_query("""
call dbms.setConfigValue('dbms.transaction.timeout','0')
""")

In [6]:
# Create a uniqueness constraint on Stream.id (no-op if it already exists).
# NOTE(review): presumably this also speeds up the MERGE/MATCH-by-id lookups
# in the import cells below -- confirm.
run_query("""
CREATE CONSTRAINT IF NOT EXISTS ON (s:Stream) ASSERT s.id IS UNIQUE;
""")

In [7]:
# Import Stream nodes from the remote CSV: MERGE creates one node per distinct
# id, and the row's language is stored as a node property.
run_query("""
LOAD CSV WITH HEADERS FROM "https://bit.ly/3JjgKgZ" AS row
MERGE (s:Stream {id: row.id})
SET s.language = row.language
""")

In [8]:
# Import SHARED_AUDIENCE relationships between already-loaded Stream nodes,
# committing every 10000 rows. Rows whose source or target id matches no
# Stream node produce no relationship. The weight is stored as an integer.
run_query("""
USING PERIODIC COMMIT 10000
LOAD CSV WITH HEADERS FROM "https://bit.ly/3S9Uyd8" AS row
MATCH (s:Stream {id:row.source})
MATCH (t:Stream {id:row.target})
MERGE (s)-[r:SHARED_AUDIENCE]->(t)
SET r.weight = toInteger(row.weight)
""")

In [9]:
# Remove Stream nodes that have no SHARED_AUDIENCE relationship in either
# direction, so every remaining node is connected to at least one other.
run_query("""
MATCH (s:Stream)
WHERE NOT (s)-[:SHARED_AUDIENCE]-()
DETACH DELETE s
""")

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [11]:
# Project an in-memory GDS graph named "twitch": Stream nodes plus
# SHARED_AUDIENCE relationships, treated as undirected and carrying the
# weight property.
run_query("""
CALL gds.graph.project("twitch", "Stream", 
  {SHARED_AUDIENCE: {orientation: "UNDIRECTED", properties:["weight"]}})
""")

Unnamed: 0,nodeProjection,relationshipProjection,graphName,nodeCount,relationshipCount,projectMillis
0,"{'Stream': {'label': 'Stream', 'properties': {}}}",{'SHARED_AUDIENCE': {'orientation': 'UNDIRECTE...,twitch,3721,262854,679


In [12]:
from sklearn.metrics import classification_report

# Evaluate GDS node2vec embeddings of several sizes. For each dimension:
# stream the embeddings from the "twitch" projection, train a random forest to
# predict the stream language from the embedding, and print the weighted
# average classification metrics on a 20% hold-out split.
for embeddingDimension in [8,16,32,64,128,256]:
  data = run_query(f"""
  CALL gds.beta.node2vec.stream('twitch', 
    {{embeddingDimension:{embeddingDimension}, relationshipWeightProperty:'weight',
    inOutFactor:2, returnFactor:1}})
  YIELD nodeId, embedding
  WITH gds.util.asNode(nodeId) AS node, embedding
  RETURN node.id AS streamId, node.language AS language, embedding
  """)
  # Encode the language strings as integer class labels.
  data['output'] = pd.factorize(data['language'])[0]
  X = data['embedding'].to_list()
  y = data['output'].to_list()

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)

  # Seed the classifier as well as the split so that differences between runs
  # come from the embeddings, not from the random forest.
  rfc = RandomForestClassifier(random_state=0)
  rfc.fit(X_train, y_train)

  y_pred = rfc.predict(X_test)
  r = classification_report(y_test,y_pred, output_dict=True)['weighted avg']
  print(f"embedding dimension {embeddingDimension} has {r} weighted avg")

embedding dimension 8 has {'precision': 0.9018994255561046, 'recall': 0.9006711409395973, 'f1-score': 0.9005426180923415, 'support': 745} weighted avg
embedding dimension 16 has {'precision': 0.8888086789098827, 'recall': 0.8859060402684564, 'f1-score': 0.8853744481384653, 'support': 745} weighted avg
embedding dimension 32 has {'precision': 0.856411034595741, 'recall': 0.8469798657718121, 'f1-score': 0.845814552890966, 'support': 745} weighted avg
embedding dimension 64 has {'precision': 0.8281362773812437, 'recall': 0.7932885906040269, 'f1-score': 0.7852580524128767, 'support': 745} weighted avg
embedding dimension 128 has {'precision': 0.7924177877059633, 'recall': 0.7100671140939597, 'f1-score': 0.684787735858872, 'support': 745} weighted avg
embedding dimension 256 has {'precision': 0.7442489869651336, 'recall': 0.6067114093959731, 'f1-score': 0.529712561742786, 'support': 745} weighted avg


In [13]:
# Drop the in-memory "twitch" projection from the GDS graph catalog.
run_query("""
CALL gds.graph.drop('twitch')
""")

Unnamed: 0,graphName,database,memoryUsage,sizeInBytes,nodeCount,relationshipCount,configuration,density,creationTime,modificationTime,schema
0,twitch,neo4j,,-1,3721,262854,{'relationshipProjection': {'SHARED_AUDIENCE':...,0.018989,2022-08-08T16:25:31.018810000+00:00,2022-08-08T16:25:31.695509000+00:00,{'relationships': {'SHARED_AUDIENCE': {'weight...


In [14]:
from node2vec import Node2Vec
import networkx as nx

In [15]:
# NetworkX graph

In [16]:
# Construct a NetworkX graph from the SHARED_AUDIENCE relationships in Neo4j.
# Each relationship is rendered in Cypher as the text
# "<source id> <target id> {'weight':<weight>}" and all of them are collected
# into one list, so the query returns a single row with a single column.
edge_list = run_query("""
MATCH (s:Stream)-[r:SHARED_AUDIENCE]->(t:Stream)
WITH toString(s.id) + " " + toString(t.id) + " {'weight':" + toString(r.weight)  + "}" as edge
WITH collect(edge) as result
RETURN result
""")

# Unwrap the single result cell into a plain Python list of edge strings.
edge_list = edge_list['result'].to_list()[0]
# Build an undirected graph (the GDS projection was UNDIRECTED as well); node
# ids are converted to int and the trailing dict text becomes the edge data.
G = nx.parse_edgelist(edge_list, create_using=nx.Graph(), nodetype=int)

In [17]:
# Node count of the NetworkX graph, for comparison with the nodeCount reported
# by the GDS projection.
G.number_of_nodes()

3721

In [18]:
# Fetch the id and language of every Stream node; merged onto the node2vec
# embeddings below to provide the classification target.
labels = run_query("""
MATCH (s:Stream)
RETURN s.id AS id, s.language AS language
""")

In [19]:
# Evaluate embeddings from the Python node2vec package on the NetworkX graph.
# For each dimension: generate walks, train the embedding model, join the
# embeddings with the language labels, train a random forest, and print the
# weighted average classification metrics on a 20% hold-out split.
for embeddingDimension in [8,16,32,64,128,256]:
  # NOTE(review): with workers=4 the result may still vary between runs
  # despite seed=1 -- confirm if exact reproducibility matters.
  node2vec = Node2Vec(G, dimensions=embeddingDimension, walk_length=80, num_walks=10, workers=4, p=2, q= 1, weight_key= 'weight', seed=1)
  model = node2vec.fit(window=10, min_count=1, batch_words=1000, sg=1, negative=5, ns_exponent=0.75, alpha=0.01, min_alpha=0.0001)  # Any keywords acceptable by gensim.Word2Vec can be passed, `dimensions` and `workers` are automatically passed (from the Node2Vec constructor)
  # gensim >= 4 removed `wv.vocab` in favour of `wv.index_to_key`; use the new
  # attribute when present and fall back to the old one otherwise.
  node_keys = getattr(model.wv, 'index_to_key', None)
  if node_keys is None:
    node_keys = list(model.wv.vocab)
  d = [{'id': key, 'embedding': list(model.wv[key])} for key in node_keys]
  # Attach the language label to each embedding via the shared `id` column.
  df = pd.DataFrame(d).merge(labels, on='id')
  # Encode the language strings as integer class labels.
  df['output'] = pd.factorize(df['language'])[0]
  X = df['embedding'].to_list()
  y = df['output'].to_list()

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)

  # Seed the classifier as well as the split so that differences between runs
  # come from the embeddings, not from the random forest.
  rfc = RandomForestClassifier(random_state=0)
  rfc.fit(X_train, y_train)

  y_pred = rfc.predict(X_test)
  r = classification_report(y_test,y_pred, output_dict=True)['weighted avg']
  print(f"embedding dimension {embeddingDimension} has {r} weighted avg")

Computing transition probabilities:   0%|          | 0/3721 [00:00<?, ?it/s]

embedding dimension 8 has {'precision': 0.906685832119338, 'recall': 0.9020134228187919, 'f1-score': 0.9028292784084888, 'support': 745} weighted avg


Computing transition probabilities:   0%|          | 0/3721 [00:00<?, ?it/s]

embedding dimension 16 has {'precision': 0.9080715182627932, 'recall': 0.9046979865771813, 'f1-score': 0.9048657745971206, 'support': 745} weighted avg


Computing transition probabilities:   0%|          | 0/3721 [00:00<?, ?it/s]

embedding dimension 32 has {'precision': 0.9144739536975945, 'recall': 0.912751677852349, 'f1-score': 0.9130174861485133, 'support': 745} weighted avg


Computing transition probabilities:   0%|          | 0/3721 [00:00<?, ?it/s]

embedding dimension 64 has {'precision': 0.9201808084688147, 'recall': 0.9167785234899329, 'f1-score': 0.9172581017714226, 'support': 745} weighted avg


Computing transition probabilities:   0%|          | 0/3721 [00:00<?, ?it/s]

embedding dimension 128 has {'precision': 0.9045999283664127, 'recall': 0.9033557046979865, 'f1-score': 0.9034578077765745, 'support': 745} weighted avg


Computing transition probabilities:   0%|          | 0/3721 [00:00<?, ?it/s]

embedding dimension 256 has {'precision': 0.9135628110074038, 'recall': 0.9114093959731544, 'f1-score': 0.9116655152460384, 'support': 745} weighted avg
