In [None]:
import os

from neo4j import GraphDatabase
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, desc, explode, lit, size
from pyspark.sql.types import (FloatType, IntegerType, StringType, StructField,
                               StructType)

In [None]:
# Build (or reuse) the Spark session. The Neo4j connector is requested through
# spark.jars.packages; later cells read and write with the
# "org.neo4j.spark.DataSource" format.
neo4j_connector_package = (
    "org.neo4j:neo4j-connector-apache-spark_2.12:5.0.1_for_spark_3"
)
session_builder = SparkSession.builder.appName("bk-imp")
session_builder = session_builder.config(
    "spark.jars.packages", neo4j_connector_package
)
spark = session_builder.getOrCreate()

## Sampling Data

In [None]:
# Explicit schema restricted to the three review fields used downstream; every
# field is declared nullable.
review_fields = [
    ("asin", StringType()),
    ("reviewerID", StringType()),
    ("overall", FloatType()),
]
review_schema = StructType(
    [StructField(name, dtype, True) for name, dtype in review_fields]
)

# Load the raw reviews, rename the columns to the names used in the rest of the
# notebook, drop exact duplicate rows and spread the result over 8 partitions.
raw_reviews = spark.read.schema(review_schema).json("../data/Automotive.json")
review_df = (
    raw_reviews.select(
        raw_reviews["asin"].alias("product_id"),
        raw_reviews["reviewerID"].alias("reviewer_id"),
        raw_reviews["overall"].alias("rating"),
    )
    .dropDuplicates()
    .repartition(8)
)

# Product metadata: the schema is inferred from the JSON file, then only the
# product key (renamed from "asin") plus rank, category and description are kept.
raw_metadata = spark.read.json("../data/meta_Automotive.json")
metadata_df = (
    raw_metadata.select(
        col("asin").alias("product_id"),
        col("rank"),
        col("category"),
        col("description"),
    )
    .dropDuplicates()
    .repartition(8)
)

# Attach product metadata to every review. The inner join keeps only reviews
# whose product_id also appears in metadata_df.
reviews_with_meta_df = review_df.join(metadata_df, ["product_id"])

sample_columns = [
    "product_id",
    "reviewer_id",
    "rating",
    "rank",
    "category",
    "description",
]

# Up to 1000 products that have at least 5 joined review rows.
# limit() without an ordering may return a different subset every time the plan
# is evaluated, and merged_df is evaluated by several later actions (count,
# show, parquet write, toPandas, Neo4j writes). Ordering by the unique
# product_id first pins the sample so all of those see the same products.
sampled_products_df = (
    reviews_with_meta_df.groupBy("product_id")
    .agg(count("*").alias("reviewer_count"))
    .filter("reviewer_count >= 5")
    .orderBy("product_id")
    .limit(1000)
)

# Reviewers with at least 2 review rows inside the product sample.
active_reviewers_df = (
    sampled_products_df.join(reviews_with_meta_df, ["product_id"])
    .groupBy("reviewer_id")
    .agg(count("*").alias("review_count"))
    .filter("review_count >= 2")
)

# Every joined review row written by those reviewers.
# NOTE(review): this joins back to the full review set, so the result can
# contain products outside the 1000-product sample -- confirm this is intended.
# cache() avoids recomputing the whole pipeline for each downstream action; the
# count() below materialises it.
merged_df = (
    active_reviewers_df.join(reviews_with_meta_df, ["reviewer_id"])
    .select(*sample_columns)
    .cache()
)

merged_df.count()

In [None]:
# Print the first three rows of the sample as a quick sanity check.
merged_df.show(n=3)

In [None]:
# Persist the sample with Spark's parquet writer, replacing any output left by
# a previous run.
merged_df.write.mode("overwrite").parquet("../data/sampled_data")

## Export for Sharing

In [None]:
# Convert the sample to a pandas DataFrame and write it to one parquet file
# that can be shared and read without Spark.
sampled_pdf = merged_df.toPandas()
sampled_pdf.to_parquet("../data/sampled_data.parquet")

## Review

In [None]:
# Keep only the columns that describe a review edge: who rated which product,
# and the rating given.
df = merged_df.select("product_id", "reviewer_id", "rating")

In [None]:
# Write nodes to Neo4j
#
# Both node sets go through the same writer so they share one connection and
# authentication setup. Credentials are read from the environment instead of
# being hard-coded in the notebook:
#   NEO4J_USERNAME (defaults to "neo4j") and NEO4J_PASSWORD.
# When NEO4J_PASSWORD is not set, no authentication options are sent, which
# matches the other Neo4j reads/writes in this notebook.
neo4j_username = os.environ.get("NEO4J_USERNAME", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD")

for id_column, node_label in (
    ("product_id", ":Product"),
    ("reviewer_id", ":User"),
):
    node_writer = (
        df.select(col(id_column).alias("id"))
        .dropDuplicates()
        .repartition(8)
        .write.format("org.neo4j.spark.DataSource")
        .option("url", "bolt://localhost:7687")
        .option("node.keys", "id")
        .option("labels", node_label)
        .mode("overwrite")
    )
    if neo4j_password is not None:
        node_writer = node_writer.option(
            "authentication.basic.username", neo4j_username
        ).option("authentication.basic.password", neo4j_password)
    node_writer.save()

In [None]:
# Write relationships to Neo4j
#
# One (:User)-[:reviews {rating}]->(:Product) relationship per row of df. With
# the "keys" strategy the endpoints are identified by mapping the DataFrame
# columns reviewer_id / product_id onto the node property "id".
review_edges_writer = (
    df.repartition(8)
    .write.format("org.neo4j.spark.DataSource")
    .option("url", "bolt://localhost:7687")
    .option("relationship.save.strategy", "keys")
    .option("relationship", "reviews")
    .option("relationship.properties", "rating")
    .option("relationship.source.labels", ":User")
    .option("relationship.source.node.keys", "reviewer_id:id")
    .option("relationship.target.labels", ":Product")
    .option("relationship.target.node.keys", "product_id:id")
    .mode("overwrite")
)
review_edges_writer.save()

## Metadata

In [None]:
# merged_df only carries product_id, reviewer_id, rating, rank, category and
# description, so selecting also_buy / also_view from it raises an
# AnalysisException. The related-product lists are read from the raw metadata
# file instead and restricted to the products present in the sample.
# NOTE(review): assumes meta_Automotive.json exposes `also_buy` and `also_view`
# as array columns (the explode() calls that follow require that) -- confirm.
df = (
    spark.read.json("../data/meta_Automotive.json")
    .select("also_buy", "also_view", col("asin").alias("product_id"))
    # left_semi keeps only metadata rows whose product_id occurs in merged_df,
    # without adding columns or multiplying rows.
    .join(merged_df.select("product_id"), ["product_id"], "left_semi")
    .dropDuplicates()
)

In [None]:
def _related_products(related_column, relationship_label):
    """Return one row per (product, related product) pair.

    `related_column` names an array column of `df`; each of its elements
    becomes `dst_product_id`, paired with the row's `product_id` as
    `src_product_id` and tagged with the constant `relationship_label`.
    """
    return df.select(
        col("product_id").alias("src_product_id"),
        explode(related_column).alias("dst_product_id"),
        lit(relationship_label).alias("relationship"),
    )


# Explode also_buy
also_buy_df = _related_products("also_buy", "same_buyer")

# Explode also_view
also_view_df = _related_products("also_view", "same_viewer")

# Union the two dataframes, keeping a single row per (src, dst) pair.
# NOTE(review): a pair present in both lists keeps only one of its two labels.
product_pair_key = ["src_product_id", "dst_product_id"]
result_df = also_buy_df.union(also_view_df).dropDuplicates(product_pair_key)

In [None]:
# Create a dataframe with distinct product IDs: every product that appears on
# either end of a related-product pair, under a single "id" column.
source_ids_df = result_df.select(col("src_product_id").alias("id"))
target_ids_df = result_df.select(col("dst_product_id").alias("id"))
nodes_df = source_ids_df.union(target_ids_df).distinct()

In [None]:
# Write nodes to Neo4j: one :Product node per id in nodes_df, keyed on "id".
product_nodes_writer = (
    nodes_df.repartition(8)
    .write.format("org.neo4j.spark.DataSource")
    .option("url", "bolt://localhost:7687")
    .option("node.keys", "id")
    .option("labels", ":Product")
    .mode("overwrite")
)
product_nodes_writer.save()

In [None]:
# Write relationships to Neo4j
#
# Each relationship type is written twice, once per direction (src -> dst and
# then dst -> src), so every pair ends up connected both ways.
for relationship in ["same_buyer", "same_viewer"]:
    relationships_df = result_df.filter(col("relationship") == relationship)
    for source_column, target_column in (
        ("src_product_id", "dst_product_id"),
        ("dst_product_id", "src_product_id"),
    ):
        (
            relationships_df.repartition(8)
            .write.format("org.neo4j.spark.DataSource")
            .option("url", "bolt://localhost:7687")
            .option("relationship.save.strategy", "keys")
            .option("relationship", relationship)
            .option("relationship.source.labels", ":Product")
            .option("relationship.source.node.keys", f"{source_column}:id")
            .option("relationship.target.labels", ":Product")
            .option("relationship.target.node.keys", f"{target_column}:id")
            .mode("overwrite")
            .save()
        )

## Bipartite Graph Projection

In [None]:
# Cypher for the two one-mode projections of the User-[:reviews]->Product
# bipartite graph.
#
# MERGE ... SET is used instead of CREATE so that re-running the cell updates
# the existing `connected` relationship for a pair rather than adding another
# copy of it on every execution.

# Users are connected when they reviewed at least one common product; the
# weight is the number of products they share.
user_bipartite_query = """
MATCH (u1:User)-[:reviews]->(product)<-[:reviews]-(u2:User)
WITH u1, u2, count(product) AS weight
MERGE (u1)-[r:connected]->(u2)
SET r.common_products_reviewed = weight
"""

# Products are connected when at least one reviewer reviewed both; the weight
# is the number of reviewers they share.
product_bipartite_query = """
MATCH (p1:Product)<-[:reviews]-(reviewer)-[:reviews]->(p2:Product)
WITH p1, p2, count(reviewer) AS weight
MERGE (p1)-[r:connected]->(p2)
SET r.common_reviewers = weight
"""

In [None]:
# Run both projection queries against Neo4j.
# The driver is used as a context manager so its connections are released once
# the queries have run, and each result is consumed explicitly so that a
# failing query raises here instead of being discarded when the session closes.
with GraphDatabase.driver("bolt://localhost:7687", auth=None) as driver:
    with driver.session() as session:
        for projection_query in (user_bipartite_query, product_bipartite_query):
            session.run(projection_query).consume()

In [None]:
# Cypher that returns every User -> User `connected` relationship as a flat
# (user1_id, user2_id, weight) row, with weight taken from the relationship's
# common_products_reviewed property.
user_relationship_query = """
MATCH (u1:User)-[r:connected]->(u2:User)
RETURN u1.id as user1_id, u2.id as user2_id, r.common_products_reviewed as weight
"""

# Cypher that returns every Product -> Product `connected` relationship as a
# flat (product1_id, product2_id, weight) row, with weight taken from the
# relationship's common_reviewers property.
product_relationship_query = """
MATCH (p1:Product)-[r:connected]->(p2:Product)
RETURN p1.id as product1_id, p2.id as product2_id, r.common_reviewers as weight
"""

In [None]:
def _read_cypher(cypher_query):
    """Load the rows returned by `cypher_query` from Neo4j as a Spark DataFrame."""
    reader = spark.read.format("org.neo4j.spark.DataSource")
    reader = reader.option("url", "bolt://localhost:7687")
    reader = reader.option("query", cypher_query)
    return reader.load()


# User projection: read the weighted user-user edges and export them via pandas.
user_relationship_df = _read_cypher(user_relationship_query)
user_relationship_pdf = user_relationship_df.toPandas()
user_relationship_pdf.to_parquet("../data/user_relationship_df.parquet")

# Product projection: same steps for the weighted product-product edges.
product_relationship_df = _read_cypher(product_relationship_query)
product_relationship_pdf = product_relationship_df.toPandas()
product_relationship_pdf.to_parquet("../data/product_relationship_df.parquet")