In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, explode, lit, size
from pyspark.sql.types import (FloatType, IntegerType, StringType, StructField,
                               StructType)

In [2]:
# Start (or reuse) the Spark session for this notebook. The Neo4j connector is
# requested via spark.jars.packages, so it is resolved when the session starts.
spark = SparkSession.builder.config(
    "spark.jars.packages",
    "org.neo4j:neo4j-connector-apache-spark_2.12:5.0.1_for_spark_3",
).appName("bk-imp").getOrCreate()

23/05/06 14:14:57 WARN Utils: Your hostname, workspace resolves to a loopback address: 127.0.1.1; using 11.11.1.73 instead (on interface eth0)
23/05/06 14:14:57 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/terrabot/bk-imp/.venv/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/terrabot/.ivy2/cache
The jars for the packages stored in: /home/terrabot/.ivy2/jars
org.neo4j#neo4j-connector-apache-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-86702a5b-1b91-4d3e-8ee0-52ae56b85888;1.0
	confs: [default]
	found org.neo4j#neo4j-connector-apache-spark_2.12;5.0.1_for_spark_3 in central
	found org.neo4j#neo4j-connector-apache-spark_2.12_common;5.0.1 in central
	found org.neo4j.driver#neo4j-java-driver;4.4.11 in central
	found org.reactivestreams#reactive-streams;1.0.4 in central
	found org.apache.xbean#xbean-asm6-shaded;4.10 in central
	found org.neo4j#neo4j-cypher-dsl;2020.1.4 in central
	found org.apiguardian#apiguardian-api;1.1.0 in central
:: resolution report :: resolve 233ms :: artifacts dl 12ms
	:: modules in use:
	org.apache.xbean#xbean-asm6-shaded;4.10 from central in [default]
	org.apiguardian#apiguardian-api;1.1.0 from central in [default]
	org.neo4j#neo4j-connector-apache-spar

## Sampling Data

In [6]:
# Reviews: only the three fields used downstream are parsed from the raw JSON,
# then renamed to the snake_case names used in the rest of the notebook.
review_schema = StructType(
    [
        StructField("asin", StringType(), True),
        StructField("reviewerID", StringType(), True),
        StructField("overall", FloatType(), True),
    ]
)
review_df = (
    spark.read.schema(review_schema)
    .json("../data/Automotive.json")
    .select(
        col("asin").alias("product_id"),
        col("reviewerID").alias("reviewer_id"),
        col("overall").alias("rating"),
    )
    .dropDuplicates()
)

# Product metadata: keep only products that list at least three "also bought"
# and at least three "also viewed" products.
metadata_df = (
    spark.read.json("../data/meta_Automotive.json")
    .select(
        [
            "also_buy",
            "also_view",
            col("asin").alias("product_id"),
            "category",
            "description",
        ]
    )
    .filter((size(col("also_buy")) >= 3) & (size(col("also_view")) >= 3))
    .dropDuplicates()
)

# Without a seed every run draws a different 40% sample, so the row count below
# and everything later written to Neo4j changes on each Restart & Run All.
SAMPLE_SEED = 42

# The sample is referenced twice below (once to count reviews per reviewer and
# once as the join target). It is cached so both sides read the same sampled
# rows rather than each re-evaluating the sample.
# NOTE(review): a seed makes the draw repeatable only as far as Spark presents
# rows in the same order within each partition - confirm if exact
# reproducibility across clusters matters.
sampled_df = (
    review_df.join(metadata_df, ["product_id"])
    .sample(fraction=0.40, seed=SAMPLE_SEED)
    .cache()
)

# Reviewers with at least three reviews inside the sample.
active_reviewers_df = (
    sampled_df.groupBy("reviewer_id")
    .agg(
        count("*").alias("review_count"),
    )
    .filter("review_count >= 3")
)

# Keep only the sampled rows written by those reviewers; review_count is
# dropped by the explicit column list.
merged_df = active_reviewers_df.join(sampled_df, ["reviewer_id"]).select(
    "product_id",
    "reviewer_id",
    "rating",
    "also_buy",
    "also_view",
    "category",
    "description",
)
merged_df.cache().count()

                                                                                

1952

In [7]:
# Preview the first three rows of the sampled, merged dataset.
merged_df.show(n=3)

+----------+-------------+------+--------------------+--------------------+--------------------+--------------------+
|product_id|  reviewer_id|rating|            also_buy|           also_view|            category|         description|
+----------+-------------+------+--------------------+--------------------+--------------------+--------------------+
|B001CD54XM|A10FTACKN3EY8|   5.0|[B01MTGDI4Y, B010...|[B00FEPF5YW, B00D...|[Automotive, Moto...|[6-ply-rated extr...|
|B0013LH6NO|A10FTACKN3EY8|   4.0|[B005DJFXY6, B072...|[B07B7T8GZS, B005...|[Automotive, Moto...|[Nelson Rigg SR-6...|
|B000BYELQY|A10FTACKN3EY8|   1.0|[B002G43C6M, B001...|[B06XD4RD4N, B019...|[Automotive, Repl...|[Bosch bracketles...|
+----------+-------------+------+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows



In [8]:
# Persist the sample as Parquet, replacing any output from a previous run.
merged_df.write.mode("overwrite").parquet("../data/sampled_data")

                                                                                

## Review

In [9]:
# Review edges only need the two endpoint ids and the rating.
df = merged_df.select("product_id", "reviewer_id", "rating")

In [10]:
# Write nodes to Neo4j
#
# Connection settings shared by both node writes in this cell. Previously only
# the :User write passed credentials while the :Product write did not; both now
# use the same settings. Values are read from the environment so real
# credentials do not have to live in the notebook.
# NOTE(review): the fallbacks are the values this notebook used before, kept so
# the cell still runs unchanged on the existing local setup - set
# NEO4J_PASSWORD instead of relying on the default.
neo4j_options = {
    "url": os.environ.get("NEO4J_URL", "bolt://localhost:7687"),
    "authentication.basic.username": os.environ.get("NEO4J_USERNAME", "neo4j"),
    "authentication.basic.password": os.environ.get("NEO4J_PASSWORD", "bitnami1"),
}


def _write_nodes(ids_df, label):
    """Write the distinct values of ``ids_df``'s ``id`` column as nodes.

    Args:
        ids_df: DataFrame with a single ``id`` column.
        label: Neo4j label string passed to the connector, e.g. ``":Product"``.
    """
    (
        ids_df.dropDuplicates()
        .write.format("org.neo4j.spark.DataSource")
        .options(**neo4j_options)
        .option("node.keys", "id")
        .option("labels", label)
        .mode("overwrite")
        .save()
    )


_write_nodes(df.select(col("product_id").alias("id")), ":Product")
_write_nodes(df.select(col("reviewer_id").alias("id")), ":User")

                                                                                

In [11]:
# Write relationships to Neo4j
# One (:User)-[reviews {rating}]->(:Product) edge per row of df; both endpoints
# are looked up by their `id` property ("keys" save strategy).
# NOTE(review): no authentication options are passed on this write - confirm
# the target server accepts unauthenticated connections.
(
    df.write.format("org.neo4j.spark.DataSource")
    .mode("overwrite")
    .option("url", "bolt://localhost:7687")
    .option("relationship", "reviews")
    .option("relationship.save.strategy", "keys")
    .option("relationship.properties", "rating")
    .option("relationship.source.labels", ":User")
    .option("relationship.source.node.keys", "reviewer_id:id")
    .option("relationship.target.labels", ":Product")
    .option("relationship.target.node.keys", "product_id:id")
    .save()
)

                                                                                

## Metadata

In [13]:
# One row per distinct (also_buy, also_view, product_id) combination.
df = merged_df.select("also_buy", "also_view", "product_id").dropDuplicates()

In [14]:
# Turn each product's also_buy / also_view arrays into edge rows of the form
# (src_product_id, dst_product_id, relationship), one frame per source list.
also_buy_df, also_view_df = (
    df.select(
        col("product_id").alias("src_product_id"),
        explode(related_col).alias("dst_product_id"),
        lit(edge_name).alias("relationship"),
    )
    for related_col, edge_name in (
        ("also_buy", "same_buyer"),
        ("also_view", "same_viewer"),
    )
)

# Stack both edge lists and keep a single row per (src, dst) pair.
# NOTE(review): because `relationship` is not part of the de-duplication key, a
# pair present in both lists keeps only one of the two relationship values and
# which one is not specified here - confirm this is intended.
result_df = also_buy_df.union(also_view_df).drop_duplicates(
    subset=["src_product_id", "dst_product_id"]
)

In [15]:
# Every product id that appears on either end of an edge, listed once.
nodes_df = result_df.selectExpr("src_product_id AS id").union(
    result_df.selectExpr("dst_product_id AS id")
).dropDuplicates()

In [16]:
# Write nodes to Neo4j
# Each id in nodes_df becomes a :Product node keyed on `id`; the frame is split
# into 8 partitions before the write.
# NOTE(review): no authentication options are passed on this write - confirm
# the target server accepts unauthenticated connections.
(
    nodes_df.repartition(8)
    .write.format("org.neo4j.spark.DataSource")
    .mode("overwrite")
    .options(
        **{
            "url": "bolt://localhost:7687",
            "node.keys": "id",
            "labels": ":Product",
        }
    )
    .save()
)

                                                                                

In [17]:
# Write relationships to Neo4j
def _write_product_edges(edges_df, rel_type, source_col, target_col):
    """Write one :Product -> :Product relationship per row of ``edges_df``.

    Both endpoints are looked up by their ``id`` property ("keys" save
    strategy).

    Args:
        edges_df: DataFrame containing ``source_col`` and ``target_col``.
        rel_type: Relationship type to create.
        source_col: Column whose value identifies the source node's ``id``.
        target_col: Column whose value identifies the target node's ``id``.
    """
    # NOTE(review): no authentication options are passed on this write -
    # confirm the target server accepts unauthenticated connections.
    (
        edges_df.repartition(8)
        .write.format("org.neo4j.spark.DataSource")
        .option("url", "bolt://localhost:7687")
        .option("relationship.save.strategy", "keys")
        .option("relationship", rel_type)
        .option("relationship.source.labels", ":Product")
        .option("relationship.source.node.keys", f"{source_col}:id")
        .option("relationship.target.labels", ":Product")
        .option("relationship.target.node.keys", f"{target_col}:id")
        .mode("overwrite")
        .save()
    )


for relationship in ["same_buyer", "same_viewer"]:
    relationships_df = result_df.filter(result_df.relationship == relationship)
    # Each pair is written in both directions (src -> dst, then dst -> src).
    _write_product_edges(
        relationships_df, relationship, "src_product_id", "dst_product_id"
    )
    _write_product_edges(
        relationships_df, relationship, "dst_product_id", "src_product_id"
    )

                                                                                