In [None]:
import neo4j
import json
import time
import pandas as pd

In [None]:
!ls

'Project Code.ipynb'	    Untitled.ipynb	       facebook_features.json
'Project Code_copy.ipynb'   facebook_edges.csv	       facebook_target.csv
 README.md		    facebook_edges_small.csv


In [None]:
driver = neo4j.GraphDatabase.driver(uri="neo4j://neo4j:7687", auth=("neo4j","ucb_mids_w205"))
session = driver.session(database="neo4j")

In [None]:
def my_neo4j_wipe_out_database():
    "wipe out database by deleting all nodes and relationships"
    query = "MATCH (node)-[relationship]->() DELETE node, relationship"
    session.run(query)
    query = "MATCH (node) DELETE node"
    session.run(query)

In [None]:
def my_neo4j_run_query_pandas(query, **kwargs):
    "run a query and return the results in a pandas dataframe"
    result = session.run(query, **kwargs)
    df = pd.DataFrame([r.values() for r in result], columns=result.keys())
    return df

In [None]:
def my_neo4j_nodes_relationships():
    "print all the nodes and relationships"
    print("-------------------------")
    print("  Nodes:")
    print("-------------------------")
    query = """
        MATCH (n) 
        RETURN n.name AS node_name, labels(n) AS labels, n.id AS node_id
        ORDER BY n.name
    """
    df_nodes = my_neo4j_run_query_pandas(query)
    print(df_nodes)

    print("-------------------------")
    print("  Relationships:")
    print("-------------------------")
    query = """
        MATCH (n1)-[r]->(n2) 
        RETURN n1.id AS node_id_1, labels(n1) AS node_1_labels,
               type(r) AS relationship_type,
               n2.id AS node_id_2, labels(n2) AS node_2_labels
        ORDER BY node_id_1, node_id_2
    """
    df_rels = my_neo4j_run_query_pandas(query)
    print(df_rels)
    print("-------------------------")

In [None]:
# 1. Wipe out any existing data
my_neo4j_wipe_out_database()

# 2. Read CSVs into Pandas
edges_df   = pd.read_csv("facebook_edges.csv")   # columns: id_1, id_2
#edges_df = edges_df[:100]

targets_df = pd.read_csv("facebook_target.csv")  # columns: id, facebook_id, page_name, page_type
#targets_df = targets_df[:100]
# 3. Optionally read the JSON features (if present)
try:
    with open("facebook_features.json", "r") as f:
        features_data = json.load(f)  # dict: { "nodeID": [featureIndices,...], ... }
except FileNotFoundError:
    features_data = {}
    print("No facebook_features.json found; skipping feature import.")

In [None]:
for i, row in targets_df.iterrows():
    node_id       = row["id"]
    facebook_id   = row["facebook_id"]
    page_name     = row["page_name"]
    page_type     = row["page_type"]
    
    node_id_str   = str(node_id)
    node_features = features_data.get(node_id_str, [])  # list or empty

    query = """
    CREATE (p:Page {
        id: $node_id,
        facebook_id: $facebook_id,
        page_name: $page_name,
        page_type: $page_type,
        features: $features
    })
    """
    session.run(query, {
        "node_id"    : node_id,
        "facebook_id": facebook_id,
        "page_name"  : page_name,
        "page_type"  : page_type,
        "features"   : node_features
    })

    # (Option A) Print every 1000 nodes
    if i % 1000 == 0 and i != 0:
        print(f"  Created {i} nodes so far...")
print("Done")

In [None]:
start_time = time.time()
last_print_time = start_time

for i, row in edges_df.iterrows():
    source_id = row["id_1"]
    target_id = row["id_2"]
    
    query = """
    MATCH (a:Page {id: $source_id})
    MATCH (b:Page {id: $target_id})
    CREATE (a)-[:LIKES]->(b)
    """
    session.run(query, {"source_id": source_id, "target_id": target_id})

    # (Option B) Print every 5 seconds
    current_time = time.time()
    if current_time - last_print_time > 5:  # e.g. 5s
        elapsed = current_time - start_time
        print(f"  Processed {i} edges in {elapsed:.1f} seconds so far...")
        last_print_time = current_time
print("Done")

  Processed 261 edges in 5.0 seconds so far...
  Processed 523 edges in 10.0 seconds so far...
  Processed 784 edges in 15.0 seconds so far...
  Processed 1046 edges in 20.0 seconds so far...
  Processed 1310 edges in 25.0 seconds so far...
  Processed 1575 edges in 30.0 seconds so far...
  Processed 1835 edges in 35.1 seconds so far...
  Processed 2093 edges in 40.1 seconds so far...
  Processed 2351 edges in 45.1 seconds so far...
  Processed 2607 edges in 50.1 seconds so far...
  Processed 2864 edges in 55.1 seconds so far...
  Processed 3120 edges in 60.1 seconds so far...
  Processed 3379 edges in 65.1 seconds so far...
  Processed 3637 edges in 70.1 seconds so far...
  Processed 3894 edges in 75.1 seconds so far...
  Processed 4153 edges in 80.1 seconds so far...
  Processed 4413 edges in 85.1 seconds so far...
  Processed 4671 edges in 90.1 seconds so far...
  Processed 4928 edges in 95.2 seconds so far...
  Processed 5184 edges in 100.2 seconds so far...
  Processed 5440 edges 

  Processed 41995 edges in 816.5 seconds so far...
  Processed 42255 edges in 821.5 seconds so far...
  Processed 42514 edges in 826.5 seconds so far...
  Processed 42774 edges in 831.5 seconds so far...
  Processed 43034 edges in 836.6 seconds so far...
  Processed 43294 edges in 841.6 seconds so far...
  Processed 43555 edges in 846.6 seconds so far...
  Processed 43814 edges in 851.6 seconds so far...
  Processed 44074 edges in 856.6 seconds so far...
  Processed 44332 edges in 861.6 seconds so far...
  Processed 44590 edges in 866.6 seconds so far...
  Processed 44848 edges in 871.6 seconds so far...
  Processed 45107 edges in 876.6 seconds so far...
  Processed 45367 edges in 881.7 seconds so far...
  Processed 45626 edges in 886.7 seconds so far...
  Processed 45884 edges in 891.7 seconds so far...
  Processed 46139 edges in 896.7 seconds so far...
  Processed 46398 edges in 901.7 seconds so far...
  Processed 46657 edges in 906.7 seconds so far...
  Processed 46916 edges in 911.

  Processed 83179 edges in 1613.1 seconds so far...
  Processed 83439 edges in 1618.1 seconds so far...
  Processed 83698 edges in 1623.1 seconds so far...
  Processed 83958 edges in 1628.1 seconds so far...
  Processed 84220 edges in 1633.2 seconds so far...
  Processed 84481 edges in 1638.2 seconds so far...
  Processed 84742 edges in 1643.2 seconds so far...
  Processed 85003 edges in 1648.2 seconds so far...
  Processed 85263 edges in 1653.2 seconds so far...
  Processed 85522 edges in 1658.2 seconds so far...
  Processed 85782 edges in 1663.2 seconds so far...
  Processed 86041 edges in 1668.2 seconds so far...
  Processed 86301 edges in 1673.2 seconds so far...
  Processed 86562 edges in 1678.3 seconds so far...
  Processed 86822 edges in 1683.3 seconds so far...
  Processed 87082 edges in 1688.3 seconds so far...
  Processed 87344 edges in 1693.3 seconds so far...
  Processed 87605 edges in 1698.3 seconds so far...
  Processed 87866 edges in 1703.3 seconds so far...
  Processed 

  Processed 123833 edges in 2394.6 seconds so far...
  Processed 124095 edges in 2399.6 seconds so far...
  Processed 124358 edges in 2404.6 seconds so far...
  Processed 124620 edges in 2409.7 seconds so far...
  Processed 124883 edges in 2414.7 seconds so far...
  Processed 125145 edges in 2419.7 seconds so far...
  Processed 125407 edges in 2424.7 seconds so far...
  Processed 125670 edges in 2429.7 seconds so far...
  Processed 125934 edges in 2434.7 seconds so far...
  Processed 126196 edges in 2439.7 seconds so far...
  Processed 126460 edges in 2444.7 seconds so far...
  Processed 126723 edges in 2449.7 seconds so far...
  Processed 126982 edges in 2454.7 seconds so far...
  Processed 127243 edges in 2459.7 seconds so far...
  Processed 127505 edges in 2464.8 seconds so far...
  Processed 127767 edges in 2469.8 seconds so far...
  Processed 128030 edges in 2474.8 seconds so far...
  Processed 128292 edges in 2479.8 seconds so far...
  Processed 128554 edges in 2484.8 seconds so 

  Processed 164701 edges in 3171.1 seconds so far...
  Processed 164966 edges in 3176.1 seconds so far...
  Processed 165234 edges in 3181.1 seconds so far...
  Processed 165500 edges in 3186.1 seconds so far...
  Processed 165766 edges in 3191.1 seconds so far...
  Processed 166031 edges in 3196.1 seconds so far...
  Processed 166298 edges in 3201.1 seconds so far...
  Processed 166564 edges in 3206.1 seconds so far...
  Processed 166831 edges in 3211.1 seconds so far...
  Processed 167097 edges in 3216.1 seconds so far...
  Processed 167365 edges in 3221.2 seconds so far...
  Processed 167631 edges in 3226.2 seconds so far...
  Processed 167899 edges in 3231.2 seconds so far...
  Processed 168163 edges in 3236.2 seconds so far...
  Processed 168428 edges in 3241.2 seconds so far...
  Processed 168695 edges in 3246.2 seconds so far...
  Processed 168962 edges in 3251.2 seconds so far...
  Processed 169230 edges in 3256.3 seconds so far...
  Processed 169495 edges in 3261.3 seconds so 

In [None]:
my_neo4j_nodes_relationships()

-------------------------
  Nodes:
-------------------------
      node_name  labels  node_id
0          None  [Page]      107
1          None  [Page]      108
2          None  [Page]      109
3          None  [Page]      110
4          None  [Page]      111
...         ...     ...      ...
22465      None  [Page]    22171
22466      None  [Page]    22172
22467      None  [Page]    22173
22468      None  [Page]    22174
22469      None  [Page]    22175

[22470 rows x 3 columns]
-------------------------
  Relationships:
-------------------------
        node_id_1 node_1_labels relationship_type  node_id_2 node_2_labels
0               0        [Page]             LIKES      18427        [Page]
1               1        [Page]             LIKES       2812        [Page]
2               1        [Page]             LIKES       4987        [Page]
3               1        [Page]             LIKES       5228        [Page]
4               1        [Page]             LIKES       5307        [Page

In [None]:
# Create a projection of the graph
query = '''

MATCH (a)-[r:LIKES]->(b)
WITH DISTINCT a, b, r

WITH gds.graph.project('connected-facebook-graph', 
    a,  
    b,  
    {
        sourceNodeLabels: labels(a),
        targetNodeLabels: labels(b),
        sourceNodeProperties: a { facebook_id: a.facebook_id}, 
        targetNodeProperties: b { facebook_id: b.facebook_id},
        relationshipType: type(r)
    }
)
as g
RETURN g.graphName AS graph, g.nodeCount AS nodes, g.relationshipCount AS rels
'''
df = my_neo4j_run_query_pandas(query)
df

ClientError: {code: Neo.ClientError.Procedure.ProcedureCallFailed} {message: Failed to invoke function `gds.graph.project`: Caused by: java.lang.IllegalArgumentException: Graph connected-facebook-graph already exists}

In [12]:
# PageRank implementation
query = """

CALL gds.pageRank.stream('connected-facebook-graph',
                         { maxIterations: $max_iterations,
                           dampingFactor: $damping_factor}
                         )
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).page_name AS name, score as page_rank
ORDER BY page_rank DESC, name ASC

"""

max_iterations = 20
damping_factor = 0.05

my_neo4j_run_query_pandas(query, max_iterations=max_iterations, damping_factor=damping_factor)


Unnamed: 0,name,page_rank
0,Joachim Herrmann,4.323256
1,The Obama White House,3.983636
2,Sir Peter Bottomley MP,3.937708
3,The White House,2.969697
4,U.S. Army Chaplain Corps,2.898097
...,...,...
22465,식샤를합시다,0.950000
22466,인사이트,0.950000
22467,컴투스 (Com2uS),0.950000
22468,한국안의 이스라엘,0.950000


In [None]:
#Shortest Path (Dijkstra's algorithm)
query = """

MATCH (source:Page {name: $source}), (target:Page {name: $target})
CALL gds.shortestPath.dijkstra.stream(
    'connected-facebook-graph', 
    { sourceNode: $source, 
      targetNode: $target, 
      relationshipWeightProperty: 'LIKES'
    }
)
YIELD index, sourceNode, targetNode, nodeIds, path
RETURN
    gds.util.asNode(sourceNode).name AS from,
    gds.util.asNode(targetNode).name AS to,
    [nodeId IN nodeIds | gds.util.asNode(nodeId).name] AS nodes
ORDER BY index


"""

source = "20892"
target = "127"

my_neo4j_run_query_pandas(query, source=source, target=target)

In [None]:
#Shortest Path (Dijkstra's algorithm)
query = """
MATCH (source:Page {id: $source}), (target:Page {id: $target})
CALL gds.shortestPath.dijkstra.stream(
    'connected-facebook-graph', 
    { sourceNode: $source, 
      targetNode:  $target})
YIELD index, sourceNode, targetNode, nodeIds, path
RETURN index, sourceNode, targetNode, nodeIds, path
ORDER BY index

"""

source= "20892"
target= "127"

my_neo4j_run_query_pandas(query, source=source, target=target)

In [None]:
# Community Detection
query = """
CALL gds.louvain.stream('connected-facebook-graph')
YIELD nodeId, communityId
RETURN gds.util.asNode(nodeId).id AS pageId, communityId
ORDER BY communityId, pageId
"""
my_neo4j_run_query_pandas(query)

In [None]:
# Add communityId to page
query = """
CALL gds.louvain.write('connected-facebook-graph', {
  writeProperty: 'communityId'
})
"""
my_neo4j_run_query_pandas(query)

In [None]:
# Create a Community node for each unique communityId
query = """
MATCH (p:Page)
WITH DISTINCT p.communityId AS communityId
MERGE (c:Community {communityId: communityId})
"""
my_neo4j_run_query_pandas(query)

In [None]:
# Create a Community node for each unique communityId
query = """
MATCH (a:Page)-[:LIKES]->(b:Page)
WHERE a.communityId <> b.communityId
WITH a.communityId AS source, b.communityId AS target, COUNT(*) AS interLinks

// Get community sizes
MATCH (c1:Community), (c2:Community)
WHERE c1.communityId = source AND c2.communityId = target
WITH c1, c2, interLinks,
     (c1.size + c2.size) AS combinedSize

MERGE (c1)-[r:INTER_COMMUNITY_LINK]->(c2)
SET r.weight = interLinks * combinedSize
"""
my_neo4j_run_query_pandas(query)

In [None]:
# Set size to the community nodes
query = """
MATCH (p:Page)
WITH p.communityId AS communityId, COUNT(*) AS size
MATCH (c:Community {communityId: communityId})
SET c.size = size
"""
my_neo4j_run_query_pandas(query)

In [None]:
# Community Detection
query = """
CALL gds.louvain.stream('connected-facebook-graph')
YIELD nodeId, communityId
WITH gds.util.asNode(nodeId) AS node, communityId
RETURN node.page_name AS page_name, communityId
ORDER BY communityId, page_name
"""
df = my_neo4j_run_query_pandas(query)

# Save to CSV
df.to_csv('community_detection_results.csv', index=False)
unique_communities = df["communityId"].nunique()
print(f"Number of unique communities: {unique_communities}")

In [None]:
# Load the CSV
df = pd.read_csv('community_detection_results.csv')

# Group by communityId and aggregate page names
grouped = df.groupby('communityId')['page_name'].apply(list).reset_index()

# Function to generate a community name (simple example: use most common word)
def summarize_names(page_names):
    from collections import Counter
    words = []
    for name in page_names:
        words.extend(name.split())
    most_common = Counter(words).most_common(3)
    return ' '.join([w for w, _ in most_common]) + ' Community'

grouped['community_name'] = grouped['page_name'].apply(summarize_names)

# Select the community columns
result = grouped[['communityId', 'community_name']]

# Save to CSV
result.to_csv('community_names_full.csv', index=False)

In [None]:
# Update community with basic community name
import time
import pandas as pd

community_df = pd.read_csv('community_names_full.csv')
start_time = time.time()
last_print_time = start_time

for i, row in community_df.iterrows():
    community_id = int(row["communityId"])
    community_name = row["community_name"]

    query = """
    MATCH (c:Community {communityId: $community_id})
    SET c.communityName = $community_name
    """
    session.run(query, {"community_id": community_id, "community_name": community_name})

    # Print progress every 5 seconds
    current_time = time.time()
    if current_time - last_print_time > 5:
        elapsed = current_time - start_time
        print(f"  Processed {i} community labels in {elapsed:.1f} seconds so far...")
        last_print_time = current_time

print("Done")


In [None]:
# Aggregate pages per community 
query = """
MATCH (c:Community)
WITH c
MATCH (p:Page)
WHERE p.communityId = c.communityId
WITH c, collect(p.page_name) AS pageNames
SET c.pages = pageNames
"""
my_neo4j_run_query_pandas(query)

In [None]:
# Community Detection -- Run this command in browser
query = """
MATCH (c1:Community)-[r:INTER_COMMUNITY_LINK]->(c2:Community)
WHERE c1.communityName IS NOT NULL AND c2.communityName IS NOT NULL
RETURN c1, r, c2
LIMIT 1000
"""
my_neo4j_run_query_pandas(query)

In [None]:
#Shortest Path (Dijkstra's algorithm)
query = """

MATCH (source:Page) 
WHERE id(source) = $source
MATCH (target:Page) 
WHERE id(target) = $target
WITH id(source) AS sourceId, id(target) AS targetId
CALL gds.shortestPath.dijkstra.stream(
  'connected-facebook-graph',
  {
    sourceNode: sourceId,
    targetNode: targetId
  }
)
YIELD index, sourceNode, targetNode, nodeIds, path
RETURN
  index,
  gds.util.asNode(sourceNode).id AS fromId,
  gds.util.asNode(targetNode).id AS toId,
  [nodeId IN nodeIds | gds.util.asNode(nodeId).id] AS pathNodes
ORDER BY index

"""

source=	7
target=294

my_neo4j_run_query_pandas(query, source=source, target=target)

In [14]:
#Source node 1160 for Air Canada & Canadian Embassy
query = """

MATCH (source:Page) 
WHERE id(source) = $source
MATCH (target:Page) 
WHERE id(target) = $target
WITH id(source) AS sourceId, id(target) AS targetId
CALL gds.shortestPath.dijkstra.stream(
  'connected-facebook-graph',
  {
    sourceNode: sourceId,
    targetNode: targetId
  }
)
YIELD index, sourceNode, targetNode, nodeIds, path
RETURN
  index,
  gds.util.asNode(sourceNode).id AS fromId,
  gds.util.asNode(targetNode).id AS toId,
  [nodeId IN nodeIds | gds.util.asNode(nodeId).id] AS pathNodes  
ORDER BY index

"""
source_id=1160
target_id=21006

my_neo4j_run_query_pandas(query, source=source_id, target=target_id)



Unnamed: 0,index,fromId,toId,pathNodes


In [15]:
#Shortest Path (Dijkstra's algorithm) Air Canada to General Consulate of Switzerland
query = """

MATCH (source:Page {id: 1244}), (target:Page {id: 11319})
WITH id(source) AS sourceNodeId, id(target) AS targetNodeId

CALL gds.shortestPath.dijkstra.stream('connected-facebook-graph', {
  sourceNode: sourceNodeId,
  targetNode: targetNodeId
})
YIELD nodeIds
WITH [nodeId IN nodeIds | gds.util.asNode(nodeId).page_name] AS pageNames
RETURN pageNames"""


source_id=1244
target_id= 11319



my_neo4j_run_query_pandas(query, source=source_id, target=target_id)



Unnamed: 0,pageNames


In [16]:
#Shortest Path (Dijkstra's algorithm) Air Canada to General Consulate of Switzerland
query = """
MATCH (source:Page {id: 1244}), (target:Page {id: 11319})
WITH id(source) AS sourceNodeId, id(target) AS targetNodeId

CALL gds.shortestPath.dijkstra.stream('connected-facebook-graph', {
  sourceNode: sourceNodeId,
  targetNode: targetNodeId
})
YIELD nodeIds
WITH nodeIds
UNWIND range(0, size(nodeIds)-2) AS i
MATCH (n1) WHERE id(n1) = nodeIds[i]
MATCH (n2) WHERE id(n2) = nodeIds[i + 1]
MATCH (n1)-[r:LIKES]-(n2)
RETURN n1, r, n2

"""


source_id=1244
target_id= 11319



my_neo4j_run_query_pandas(query, source=source_id, target=target_id)



Unnamed: 0,n1,r,n2
