This notebook assumes that you have a running Neo4j Aura instance. You can create a free one at https://neo4j.com/.

Once you have your instance, store its ID and its password as Colab secrets named `neo4j_instance_id` and `neo4j_password` respectively, and grant this notebook access to both secrets.

In [1]:
!pip install neo4j

Collecting neo4j
  Downloading neo4j-5.28.1-py3-none-any.whl.metadata (5.9 kB)
Downloading neo4j-5.28.1-py3-none-any.whl (312 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m312.3/312.3 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: neo4j
Successfully installed neo4j-5.28.1


In [2]:
from neo4j import GraphDatabase
from google.colab import userdata
# Read the Aura instance id and password from Colab secrets so that no
# credentials are hardcoded in the notebook.
neo4j_instance_id = userdata.get('neo4j_instance_id')
neo4j_password = userdata.get('neo4j_password')

# Connection URI of the Aura instance (neo4j+s scheme, port 7687).
url = f"neo4j+s://{neo4j_instance_id}.databases.neo4j.io:7687"
username = "neo4j"

driver = GraphDatabase.driver(url, auth=(username, neo4j_password))
# Constructing the driver does not validate the connection; check it now so a
# wrong instance id or password fails in this cell rather than in a later one.
driver.verify_connectivity()

In [3]:
def insert_data(tx):
    query = """
    // Create account holders
    CREATE (accountHolder1:AccountHolder {
            FirstName: "John",
            LastName: "Doe",
            UniqueId: "JohnDoe" })

    CREATE (accountHolder2:AccountHolder {
            FirstName: "Jane",
            LastName: "Appleseed",
            UniqueId: "JaneAppleseed" })

    CREATE (accountHolder3:AccountHolder {
            FirstName: "Matt",
            LastName: "Smith",
            UniqueId: "MattSmith" })

    // Create Address
    CREATE (address1:Address {
            Street: "123 NW 1st Street",
            City: "San Francisco",
            State: "California",
            ZipCode: "94101" })

    // Connect 3 account holders to 1 address
    CREATE (accountHolder1)-[:HAS_ADDRESS]->(address1),
           (accountHolder2)-[:HAS_ADDRESS]->(address1),
           (accountHolder3)-[:HAS_ADDRESS]->(address1)

    // Create Phone Number
    CREATE (phoneNumber1:PhoneNumber { PhoneNumber: "555-555-5555" })

    // Connect 2 account holders to 1 phone number
    CREATE (accountHolder1)-[:HAS_PHONENUMBER]->(phoneNumber1),
           (accountHolder2)-[:HAS_PHONENUMBER]->(phoneNumber1)

    // Create SSN
    CREATE (ssn1:SSN { SSN: "241-23-1234" })

    // Connect 2 account holders to 1 SSN
    CREATE (accountHolder2)-[:HAS_SSN]->(ssn1),
           (accountHolder3)-[:HAS_SSN]->(ssn1)

    // Create SSN and connect 1 account holder
    CREATE (ssn2:SSN { SSN: "241-23-4567" })<-[:HAS_SSN]-(accountHolder1)

    // Create Credit Card and connect 1 account holder
    CREATE (creditCard1:CreditCard {
            AccountNumber: "1234567890123456",
            Limit: 5000, Balance: 1442.23,
            ExpirationDate: "01-20",
            SecurityCode: "123" })<-[:HAS_CREDITCARD]-(accountHolder1)

    // Create Bank Account and connect 1 account holder
    CREATE (bankAccount1:BankAccount {
            AccountNumber: "2345678901234567",
            Balance: 7054.43 })<-[:HAS_BANKACCOUNT]-(accountHolder1)

    // Create Credit Card and connect 1 account holder
    CREATE (creditCard2:CreditCard {
            AccountNumber: "1234567890123456",
            Limit: 4000, Balance: 2345.56,
            ExpirationDate: "02-20",
            SecurityCode: "456" })<-[:HAS_CREDITCARD]-(accountHolder2)

    // Create Bank Account and connect 1 account holder
    CREATE (bankAccount2:BankAccount {
            AccountNumber: "3456789012345678",
            Balance: 4231.12 })<-[:HAS_BANKACCOUNT]-(accountHolder2)

    // Create Unsecured Loan and connect 1 account holder
    CREATE (unsecuredLoan2:UnsecuredLoan {
            AccountNumber: "4567890123456789-0",
            Balance: 9045.53,
            APR: .0541,
            LoanAmount: 12000.00 })<-[:HAS_UNSECUREDLOAN]-(accountHolder2)

    // Create Bank Account and connect 1 account holder
    CREATE (bankAccount3:BankAccount {
            AccountNumber: "4567890123456789",
            Balance: 12345.45 })<-[:HAS_BANKACCOUNT]-(accountHolder3)

    // Create Unsecured Loan and connect 1 account holder
    CREATE (unsecuredLoan3:UnsecuredLoan {
            AccountNumber: "5678901234567890-0",
            Balance: 16341.95, APR: .0341,
            LoanAmount: 22000.00 })<-[:HAS_UNSECUREDLOAN]-(accountHolder3)

    // Create Phone Number and connect 1 account holder
    CREATE (phoneNumber2:PhoneNumber {
            PhoneNumber: "555-555-1234" })<-[:HAS_PHONENUMBER]-(accountHolder3)

    RETURN *
    """
    tx.run(query)

# Run insert_data through session.execute_write; the `with` block closes the
# session when it exits.
with driver.session() as session:
    session.execute_write(insert_data)

In [4]:
from pyspark.sql import SparkSession

# Maven coordinates handed to spark.jars.packages: the Neo4j Spark connector
# and the GraphFrames JAR.
spark_packages = ",".join(
    [
        "org.neo4j:neo4j-connector-apache-spark_2.12:5.3.3_for_spark_3",
        "graphframes:graphframes:0.8.4-spark3.5-s_2.12",
    ]
)

# The neo4j.* settings are set once on the session, so the reads below do not
# repeat the URL and credentials.
spark = (
    SparkSession.builder
    .config("neo4j.url", url)
    .config("spark.jars.packages", spark_packages)
    .config("neo4j.authentication.basic.username", username)
    .config("neo4j.authentication.basic.password", neo4j_password)
    .getOrCreate()
)

Let's reimplement our fraud ring example, this time reading the data from Neo4j into Spark DataFrames.

In [5]:
# Read every (:AccountHolder)-[:HAS_ADDRESS]->(:Address) relationship; each row
# carries the relationship plus the source and target node properties.
has_address_reader = (
    spark.read.format("org.neo4j.spark.DataSource")
    .option("relationship", "HAS_ADDRESS")
    .option("relationship.source.labels", ":AccountHolder")
    .option("relationship.target.labels", ":Address")
)
has_address_df = has_address_reader.load()

has_address_df.show()

+-------------------+-----------+-----------+---------------+---------------+---------------+----------------+-----------+---------------+--------------+-----------------+------------+-------------+
|           <rel.id>| <rel.type>|<source.id>|<source.labels>|source.UniqueId|source.LastName|source.FirstName|<target.id>|<target.labels>|target.ZipCode|    target.Street|target.State|  target.City|
+-------------------+-----------+-----------+---------------+---------------+---------------+----------------+-----------+---------------+--------------+-----------------+------------+-------------+
|1152925902653358080|HAS_ADDRESS|          0|[AccountHolder]|        JohnDoe|            Doe|            John|          3|      [Address]|         94101|123 NW 1st Street|  California|San Francisco|
|1152925902653358081|HAS_ADDRESS|          1|[AccountHolder]|  JaneAppleseed|      Appleseed|            Jane|          3|      [Address]|         94101|123 NW 1st Street|  California|San Francisco|
|1152

In [6]:
from pyspark.sql import functions as F

# Connector column name -> friendlier alias, listed in output column order.
# The connector's column names contain dots, so they are backtick-quoted below.
address_column_aliases = {
    "source.UniqueId": "account_id",
    "source.FirstName": "account_firstname",
    "source.LastName": "account_lastname",
    "target.ZipCode": "address_zip",
    "target.Street": "address_street",
    "target.City": "address_city",
    "target.State": "address_state",
}

account_address_df = has_address_df.select(
    [
        F.col(f"`{column}`").alias(alias)
        for column, alias in address_column_aliases.items()
    ]
)

account_address_df.show()

+-------------+-----------------+----------------+-----------+-----------------+-------------+-------------+
|   account_id|account_firstname|account_lastname|address_zip|   address_street| address_city|address_state|
+-------------+-----------------+----------------+-----------+-----------------+-------------+-------------+
|      JohnDoe|             John|             Doe|      94101|123 NW 1st Street|San Francisco|   California|
|JaneAppleseed|             Jane|       Appleseed|      94101|123 NW 1st Street|San Francisco|   California|
|    MattSmith|             Matt|           Smith|      94101|123 NW 1st Street|San Francisco|   California|
+-------------+-----------------+----------------+-----------+-----------------+-------------+-------------+



In [7]:
# An address is identified by all four of its components.
address_keys = ["address_zip", "address_street", "address_city", "address_state"]

fraud_rings_df = (
    account_address_df
    .groupBy(*address_keys)
    .agg(
        F.countDistinct("account_id").alias("ring_size"),
        F.collect_list("account_id").alias("account_ids"),
        F.collect_list("account_firstname").alias("first_names"),
        F.collect_list("account_lastname").alias("last_names"),
    )
    # A fraud ring is an address shared by more than one AccountHolder.
    .where(F.col("ring_size") > 1)
)

fraud_rings_df.show()

+-----------+-----------------+-------------+-------------+---------+--------------------+------------------+--------------------+
|address_zip|   address_street| address_city|address_state|ring_size|         account_ids|       first_names|          last_names|
+-----------+-----------------+-------------+-------------+---------+--------------------+------------------+--------------------+
|      94101|123 NW 1st Street|San Francisco|   California|        3|[JaneAppleseed, M...|[Jane, Matt, John]|[Appleseed, Smith...|
+-----------+-----------------+-------------+-------------+---------+--------------------+------------------+--------------------+



Now let's use GraphFrames to solve the same problem.

In [8]:
pip install graphframes

Collecting graphframes
  Downloading graphframes-0.6-py2.py3-none-any.whl.metadata (934 bytes)
Collecting nose (from graphframes)
  Downloading nose-1.3.7-py3-none-any.whl.metadata (1.7 kB)
Downloading graphframes-0.6-py2.py3-none-any.whl (18 kB)
Downloading nose-1.3.7-py3-none-any.whl (154 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.7/154.7 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nose, graphframes
Successfully installed graphframes-0.6 nose-1.3.7


In [9]:
from graphframes import GraphFrame

def _read_vertices(label):
    """Load all Neo4j nodes carrying `label` as GraphFrame vertices.

    Only the connector's `<id>` and `<labels>` columns are kept, renamed to
    `id` and `labels`.
    """
    nodes_df = (
        spark.read.format('org.neo4j.spark.DataSource')
        .option('labels', label)
        .load()
    )
    return nodes_df.select(
        F.col("<id>").alias("id"),
        F.col("<labels>").alias("labels"),
    )


# Edges: the HAS_ADDRESS relationships between account holders and addresses.
has_address_edges_df = (
    spark.read.format('org.neo4j.spark.DataSource')
    .option('relationship', 'HAS_ADDRESS')
    .option('relationship.source.labels', ':AccountHolder')
    .option('relationship.target.labels', ':Address')
    .load()
)

# Vertices: account holders followed by addresses. Both frames have the same
# (id, labels) column order, which the positional union relies on.
account_df = _read_vertices(':AccountHolder')
address_df = _read_vertices(':Address')
vertices_df = account_df.union(address_df)

# GraphFrame expects edge endpoints in columns named `src` and `dst`.
edge_column_aliases = {
    "<source.id>": "src",
    "<target.id>": "dst",
    "<rel.type>": "relationship",
}
edges_df = has_address_edges_df.select(
    [
        F.col(f"`{column}`").alias(alias)
        for column, alias in edge_column_aliases.items()
    ]
)

g = GraphFrame(vertices_df, edges_df)

g.vertices.show()
g.edges.show()

+---+---------------+
| id|         labels|
+---+---------------+
|  0|[AccountHolder]|
|  1|[AccountHolder]|
|  2|[AccountHolder]|
|  3|      [Address]|
+---+---------------+

+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  0|  3| HAS_ADDRESS|
|  1|  3| HAS_ADDRESS|
|  2|  3| HAS_ADDRESS|
+---+---+------------+



In [10]:
# Motif for a single directed edge: A is the source vertex, r the edge and
# C the destination vertex.
edge_motif_df = g.find("(A)-[r]->(C)")
df_AC = edge_motif_df.select("A", "C", "r")
df_AC.show(truncate=False)

# Per destination vertex: how many distinct sources point at it, and via
# which edges.
ring_size = F.countDistinct("A").alias("RingSize")
ring_edges = F.collect_list("r").alias("relationships")

df_grouped = (
    df_AC
    .groupBy("C")
    .agg(ring_size, ring_edges)
    # Keep only destinations shared by more than one source vertex.
    .where(F.col("RingSize") > 1)
)

df_grouped.show(truncate=False)


+--------------------+--------------+-------------------+
|A                   |C             |r                  |
+--------------------+--------------+-------------------+
|{2, [AccountHolder]}|{3, [Address]}|{2, 3, HAS_ADDRESS}|
|{1, [AccountHolder]}|{3, [Address]}|{1, 3, HAS_ADDRESS}|
|{0, [AccountHolder]}|{3, [Address]}|{0, 3, HAS_ADDRESS}|
+--------------------+--------------+-------------------+

+--------------+--------+---------------------------------------------------------------+
|C             |RingSize|relationships                                                  |
+--------------+--------+---------------------------------------------------------------+
|{3, [Address]}|3       |[{0, 3, HAS_ADDRESS}, {2, 3, HAS_ADDRESS}, {1, 3, HAS_ADDRESS}]|
+--------------+--------+---------------------------------------------------------------+



You can also use a Cypher query to read data from Neo4j into a DataFrame. This example reads the entire graph, which in most cases is probably overkill.

In [11]:
# Cypher that matches every relationship together with both of its endpoints.
full_graph_query = '''
              MATCH (n)-[r]->(m)
              RETURN n, r, m
              '''

full_graph_df = (
    spark.read.format('org.neo4j.spark.DataSource')
    .option('query', full_graph_query)
    .load()
)

full_graph_df.show()

+--------------------+--------------------+--------------------+
|                   n|                   r|                   m|
+--------------------+--------------------+--------------------+
|{0, [AccountHolde...|{1152925902653358...|{3, [Address], 12...|
|{0, [AccountHolde...|{1152927002164985...|{4, [PhoneNumber]...|
|{0, [AccountHolde...|{1152928101676613...|{6, [SSN], NULL, ...|
|{0, [AccountHolde...|{1152929201188241...|{7, [CreditCard],...|
|{0, [AccountHolde...|{1152930300699869...|{8, [BankAccount]...|
|{1, [AccountHolde...|{1152925902653358...|{3, [Address], 12...|
|{1, [AccountHolde...|{1152927002164985...|{4, [PhoneNumber]...|
|{1, [AccountHolde...|{1152928101676613...|{5, [SSN], NULL, ...|
|{1, [AccountHolde...|{1152929201188241...|{9, [CreditCard],...|
|{1, [AccountHolde...|{1152930300699869...|{10, [BankAccount...|
|{1, [AccountHolde...|{1152931400211496...|{11, [UnsecuredLo...|
|{2, [AccountHolde...|{1152925902653358...|{3, [Address], 12...|
|{2, [AccountHolde...|{11

In [12]:
# Show the schema of the n / r / m columns returned by the Cypher query.
full_graph_df.printSchema()

root
 |-- n: struct (nullable = true)
 |    |-- <id>: long (nullable = false)
 |    |-- <labels>: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- UniqueId: string (nullable = true)
 |    |-- LastName: string (nullable = true)
 |    |-- FirstName: string (nullable = true)
 |-- r: struct (nullable = true)
 |    |-- <rel.id>: long (nullable = false)
 |    |-- <rel.type>: string (nullable = false)
 |    |-- <source.id>: long (nullable = false)
 |    |-- <target.id>: long (nullable = false)
 |-- m: struct (nullable = true)
 |    |-- <id>: long (nullable = false)
 |    |-- <labels>: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- Street: string (nullable = true)
 |    |-- City: string (nullable = true)
 |    |-- State: string (nullable = true)
 |    |-- ZipCode: string (nullable = true)

