In [None]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, lit

In [None]:
# Build (or reuse) the Spark session. The Neo4j connector package is requested
# through "spark.jars.packages" so the "org.neo4j.spark.DataSource" format used
# by the write cells is available.
session_builder = SparkSession.builder.appName("bk-imp")
session_builder = session_builder.config(
    "spark.jars.packages",
    "org.neo4j:neo4j-connector-apache-spark_2.12:5.0.1_for_spark_3",
)
spark = session_builder.getOrCreate()

In [None]:
# Load the product metadata, keeping only the product id ("asin") and the two
# related-product columns that the edge-building cell explodes.
metadata_path = "../data/meta_Digital_Music.json"
df = spark.read.json(metadata_path).select("also_buy", "also_view", "asin")

In [None]:
# Explode also_buy: one edge row per (product, also-bought product).
also_buy_df = df.select(
    col("asin").alias("src_product_id"),
    explode("also_buy").alias("dst_product_id"),
    lit("same_buyer").alias("relationship"),
)

# Explode also_view: one edge row per (product, also-viewed product).
also_view_df = df.select(
    col("asin").alias("src_product_id"),
    explode("also_view").alias("dst_product_id"),
    lit("same_viewer").alias("relationship"),
)

# Union the two dataframes and drop exact duplicate edges.
# The relationship type is part of the de-duplication key: a pair that appears
# in both lists must keep one row per type. De-duplicating on (src, dst) alone
# keeps an arbitrary one of the two rows, so the later
# `relationship == "same_buyer"` filter would lose edges nondeterministically.
result_df = also_buy_df.union(also_view_df).dropDuplicates(
    ["src_product_id", "dst_product_id", "relationship"]
)

In [None]:
# Sample the dataset
# A fixed seed makes the sample reproducible across kernel restarts.
# The sample is cached because several later actions (count, show, node write,
# relationship writes) each re-evaluate the lineage; caching keeps the node set
# and the edge set derived from the same rows.
# NOTE: the name `result_df` is reused because later cells read it, so
# re-running this cell samples the already-sampled frame again.
SAMPLE_FRACTION = 0.01
SAMPLE_SEED = 42
result_df = result_df.sample(fraction=SAMPLE_FRACTION, seed=SAMPLE_SEED).cache()
# count() is the first action on the cached frame, so it also fills the cache.
result_df.count()

In [None]:
# Preview a few sampled edge rows.
result_df.show(n=3)

In [None]:
# Collect every product id that occurs on either end of an edge, once each,
# in a single column named "id".
source_ids = result_df.select(col("src_product_id").alias("id"))
target_ids = result_df.select(col("dst_product_id").alias("id"))
nodes_df = source_ids.union(target_ids).distinct()

In [None]:
# Write nodes to Neo4j
# Connection settings are read from the environment so credentials do not have
# to live in the notebook. The fallbacks are the original local-development
# values, so behaviour is unchanged when the variables are unset.
neo4j_url = os.environ.get("NEO4J_URL", "bolt://localhost:7687")
neo4j_username = os.environ.get("NEO4J_USERNAME", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD", "bitnami1")

# One :Product node per row of nodes_df, keyed on "id".
# Per the connector documentation, "overwrite" mode together with "node.keys"
# merges on the key instead of creating duplicate nodes.
(
    nodes_df.write.format("org.neo4j.spark.DataSource")
    .option("url", neo4j_url)
    .option("authentication.type", "basic")
    .option("authentication.basic.username", neo4j_username)
    .option("authentication.basic.password", neo4j_password)
    .option("node.keys", "id")
    .option("labels", ":Product")
    .mode("overwrite")
    .save()
)

In [None]:
# Write relationships to Neo4j
# Connection settings are read from the environment so credentials do not have
# to live in the notebook. The fallbacks are the original local-development
# values, so behaviour is unchanged when the variables are unset.
neo4j_url = os.environ.get("NEO4J_URL", "bolt://localhost:7687")
neo4j_username = os.environ.get("NEO4J_USERNAME", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD", "bitnami1")


def write_product_relationships(edges_df, relationship_type, source_column, target_column):
    """Write one relationship per row of ``edges_df`` between :Product nodes.

    Args:
        edges_df: Spark DataFrame holding ``source_column`` and ``target_column``.
        relationship_type: Neo4j relationship type to write.
        source_column: Column whose value is matched against the ``id``
            property of the source :Product node.
        target_column: Column whose value is matched against the ``id``
            property of the target :Product node.
    """
    (
        # The connector documentation recommends a single partition when
        # writing relationships, to avoid deadlocks between concurrent writers.
        edges_df.coalesce(1)
        .write.format("org.neo4j.spark.DataSource")
        .option("url", neo4j_url)
        .option("authentication.type", "basic")
        .option("authentication.basic.username", neo4j_username)
        .option("authentication.basic.password", neo4j_password)
        .option("relationship.save.strategy", "keys")
        .option("relationship", relationship_type)
        .option("relationship.source.labels", ":Product")
        .option("relationship.source.node.keys", f"{source_column}:id")
        .option("relationship.target.labels", ":Product")
        .option("relationship.target.node.keys", f"{target_column}:id")
        .mode("overwrite")
        .save()
    )


same_buyer_df = result_df.filter(result_df.relationship == "same_buyer")

# The "same_buyer" edge is written in both directions: src -> dst, then dst -> src.
write_product_relationships(same_buyer_df, "same_buyer", "src_product_id", "dst_product_id")
write_product_relationships(same_buyer_df, "same_buyer", "dst_product_id", "src_product_id")

# NOTE(review): rows with relationship == "same_viewer" are present in
# result_df (and contribute nodes) but are not written as relationships here.
# Confirm whether that is intentional.