In [1]:
from neo4j import GraphDatabase
import pandas as pd
import numpy as np

# Read the Amazon CSV and expose the price column under the shorter name
# `actual_price` (the raw header carries a " (USD)" suffix).
df = pd.read_csv('amazon.csv').rename(
    columns={'actual_price (USD)': 'actual_price'}
)

# Bolt endpoint and credentials of the local Neo4j server, followed by the
# driver object that the sessions below are opened from.
uri, username, password = "bolt://localhost:7687", "neo4j", "password"
driver = GraphDatabase.driver(uri, auth=(username, password))

# Create one :Product node per dataframe row.
# The rows are collected into plain dicts and sent in chunks that the server
# expands with UNWIND, so the load costs one round trip per chunk instead of
# one auto-commit CREATE per row.
product_rows = [
    {
        'category': row['category'],
        'actual_price': row['actual_price'],
        'rating_weighted_avg': row['rating_weighted_avg'],
    }
    for index, row in df.iterrows()
]
chunk_size = 1000  # rows per query; keeps the payload of each query bounded

with driver.session() as session:
    # An empty dataframe yields an empty range, so no query is sent at all.
    for start in range(0, len(product_rows), chunk_size):
        session.run(
            "UNWIND $products AS product "
            "CREATE (:Product {category: product.category, actual_price: product.actual_price, rating_weighted_avg: product.rating_weighted_avg})",
            products=product_rows[start:start + chunk_size],
        ).consume()  # drain the result so a server-side error surfaces here


In [16]:
import pandas as pd
from neo4j import GraphDatabase

# Read the Amazon CSV, shorten the price column name and keep only the rows
# that have a value in every column (dropna(how='any') discards a row as soon
# as any single column is missing).
df = (
    pd.read_csv('amazon.csv')
    .rename(columns={'actual_price (USD)': 'actual_price'})
    .dropna(how='any')
)

# Open the driver for the local Neo4j server over Bolt.
uri = "bolt://localhost:7687"
driver = GraphDatabase.driver(uri, auth=("neo4j", "password"))

# Define the node and edge creation queries.

# Expects a $products list of maps and creates one :Product node per entry.
# NOTE(review): the dataframe loaded in this cell is only renamed for
# `actual_price`; confirm the CSV really provides `id` and `name` columns,
# otherwise those two properties stay unset on the created nodes.
create_product_nodes_query = """
    UNWIND $products AS product
    CREATE (:Product {id: product.id, name: product.name, category: product.category, actual_price: product.actual_price, rating_weighted_avg: product.rating_weighted_avg})
"""

# Scores every unordered pair of Product nodes (id(p1) < id(p2)) as the sum of
#   * 1 / (|price difference| + 1)
#   * 1 when both products share a category, else 0
#   * (5 - |rating difference|) / 5, or 0 when a rating is missing
# The rating term uses float arithmetic: with integer operands Cypher's `/`
# is an integer division, which reduced this term to whole numbers, and
# toInteger() additionally truncated the averaged ratings.
# The relationship is MERGEd on its endpoints only and the score is SET
# afterwards, so re-running with changed data updates the existing edge
# instead of adding a second SIMILARITY relationship with a different score.
# NOTE(review): nothing in the visible part of the notebook references this
# query; create_similarity_edges builds its own batched variant.
create_similarity_edges_query = """
    MATCH (p1:Product), (p2:Product) WHERE id(p1) < id(p2)
    WITH p1, p2,
        (1.0 / (ABS(toFloat(p1.actual_price) - toFloat(p2.actual_price)) + 1.0)) +
        (CASE WHEN p1.category = p2.category THEN 1 ELSE 0 END) +
        ((5.0 - COALESCE(ABS(toFloat(p1.rating_weighted_avg) - toFloat(p2.rating_weighted_avg)), 5.0)) / 5.0) AS similarity_score
    WHERE similarity_score IS NOT NULL AND NOT(isnan(similarity_score))
    MERGE (p1)-[s:SIMILARITY]->(p2)
    SET s.score = similarity_score
"""

# Transaction function that bulk-creates the Product nodes
def create_product_nodes(tx, products):
    """Run ``create_product_nodes_query`` on ``tx``.

    Args:
        tx: Transaction handle; only its ``run`` method is used.
        products: Value bound to the query's ``$products`` parameter
            (the caller in this cell passes ``df.to_dict('records')``).

    Returns:
        None. The result object returned by ``tx.run`` is not inspected.
    """
    bound_parameters = {"products": products}
    tx.run(create_product_nodes_query, **bound_parameters)

# Define a batch size for processing the data
BATCH_SIZE = 1000

def create_similarity_edges(tx, batch_size=BATCH_SIZE):
    """Create or refresh a SIMILARITY relationship for every pair of Product nodes.

    The score of a pair (p1, p2) with ``id(p1) < id(p2)`` is the sum of three
    terms evaluated in Cypher:

    * ``1 / (|price difference| + 1)``
    * ``1`` when both products have the same category, otherwise ``0``
    * ``(5 - |rating difference|) / 5``, or ``0`` when a rating is missing

    Pairs whose score is null or NaN get no relationship.

    Args:
        tx: Transaction handle. ``tx.run(query, parameters)`` must return an
            iterable of records that also offers ``consume()``.
        batch_size: Number of ``p1`` nodes processed per query. Defaults to
            ``BATCH_SIZE``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    Notes:
        * Batches are cut from the node ids the database actually returns.
          Ids are assigned by the server and need not form ``0..count-1``
          (for example when other nodes were created or deleted earlier), so
          slicing that numeric range could skip Product nodes entirely.
        * The rating term uses float arithmetic; with integer operands
          Cypher's ``/`` is an integer division.
        * The relationship is MERGEd on its endpoints and the score is SET
          afterwards, so a re-run updates the edge instead of duplicating it.
        * All batches still run inside the single transaction ``tx``.
          NOTE(review): if the server's transaction memory limit is reached,
          each batch probably needs its own transaction - confirm.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    id_records = tx.run(
        "MATCH (p:Product) RETURN id(p) AS node_id ORDER BY node_id"
    )
    node_ids = [record["node_id"] for record in id_records]

    query = """
    MATCH (p1:Product) WHERE id(p1) IN $batch_ids
    MATCH (p2:Product) WHERE id(p1) < id(p2)
    WITH p1, p2,
    (1.0 / (ABS(toFloat(p1.actual_price) - toFloat(p2.actual_price)) + 1.0)) +
    (CASE WHEN p1.category = p2.category THEN 1 ELSE 0 END) +
    ((5.0 - COALESCE(ABS(toFloat(p1.rating_weighted_avg) - toFloat(p2.rating_weighted_avg)), 5.0)) / 5.0) AS similarity_score
    WHERE similarity_score IS NOT NULL AND NOT(isnan(similarity_score))
    MERGE (p1)-[s:SIMILARITY]->(p2)
    SET s.score = similarity_score
    """

    for start in range(0, len(node_ids), batch_size):
        batch_ids = node_ids[start:start + batch_size]
        # Drain each result before the next batch so errors surface per batch.
        tx.run(query, {"batch_ids": batch_ids}).consume()


# NOTE: the similarity edges are created further down in this cell, right
# after the Product nodes have been inserted. A call to
# create_similarity_edges at this point ran before any of this cell's nodes
# existed, so it could only act on nodes left over from earlier runs, and it
# duplicated the later call.


# Transaction function that reads a sample of SIMILARITY edges back from Neo4j
def get_similarity_scores(tx):
    """Return up to ten SIMILARITY relationships as a pandas DataFrame.

    Args:
        tx: Transaction handle; ``tx.run`` must return an iterable result
            whose records offer ``values()`` and which itself offers ``keys()``.

    Returns:
        pandas.DataFrame with one row per returned record. The column labels
        are taken from ``result.keys()`` (the query aliases them ``product1``,
        ``product2`` and ``similarity_score``).
    """
    cypher = """
        MATCH (p1:Product)-[s:SIMILARITY]->(p2:Product)
        RETURN p1.id AS product1, p2.id AS product2, s.score AS similarity_score
        LIMIT 10
    """
    result = tx.run(cypher)

    fetched_rows = []
    for record in result:
        fetched_rows.append(record.values())

    return pd.DataFrame(fetched_rows, columns=result.keys())

# Run the pipeline: insert the products, build the similarity edges, then read
# a sample of the scores back. The driver is closed in `finally` so that a
# failing transaction (for example the server's transaction memory limit being
# exceeded) does not leave the connection open.
# NOTE(review): the captured output of this cell looks like a deprecation
# warning for write_transaction; if the installed driver offers
# execute_write / execute_read, consider switching - confirm the version.
try:
    # Create the nodes for each product in the Amazon CSV data as Neo4j nodes
    with driver.session() as session:
        session.write_transaction(create_product_nodes, df.to_dict('records'))

    # Create the edges between the products based on their similarity scores
    with driver.session() as session:
        session.write_transaction(create_similarity_edges)

    # Load the similarity scores and product information into a pandas dataframe
    with driver.session() as session:
        similarity_df = session.read_transaction(get_similarity_scores)
finally:
    # Close the Neo4j database connection, whether or not the steps succeeded
    driver.close()

# Print the resulting similarity dataframe
print(similarity_df)


  session.write_transaction(create_similarity_edges)
Transaction failed and will be retried in 1.1636987850669442s (The allocation of an extra 48.0 MiB would use more than the limit 716.8 MiB. Currently using 675.0 MiB. dbms.memory.transaction.total.max threshold reached)
Transaction failed and will be retried in 1.6658867455340527s (The allocation of an extra 48.0 MiB would use more than the limit 716.8 MiB. Currently using 675.0 MiB. dbms.memory.transaction.total.max threshold reached)


TransientError: {code: Neo.TransientError.General.MemoryPoolOutOfMemoryError} {message: The allocation of an extra 48.0 MiB would use more than the limit 716.8 MiB. Currently using 675.0 MiB. dbms.memory.transaction.total.max threshold reached}

In [44]:
import pandas as pd
from neo4j import GraphDatabase

# Read the Amazon CSV, map the raw headers onto the short names used below and
# keep only rows without any missing value (dropna(how='any') drops a row as
# soon as one column is empty).
column_aliases = {
    'actual_price (USD)': 'actual_price',
    'product_name': 'name',
    'rating': 'ratings',
}
df = pd.read_csv('amazon.csv').rename(columns=column_aliases).dropna(how='any')

# Bolt endpoint and credentials of the local Neo4j server
uri = 'bolt://localhost:7687'
username, password = 'neo4j', 'password'
driver = GraphDatabase.driver(uri, auth=(username, password))

# Number of dataframe rows taken per batch
batch_size = 1000

# Define the Cypher query to calculate similarity scores.
# Each entry of $params describes one ordered pair of products. Both endpoints
# are matched by category AND name, so the score computed from a pair's prices
# and ratings is written to exactly that pair; matching p2 by category alone
# attached every pair's score to all products of the category.
# NOTE(review): gds.alpha.similarity.euclideanDistance needs the Graph Data
# Science plugin; confirm it is installed and that the alpha namespace still
# exists in the deployed version.
query = '''
UNWIND $params AS row
MATCH (p1:Product {category: row.category1, name: row.name1})
MATCH (p2:Product {category: row.category2, name: row.name2})
WITH p1, p2, gds.alpha.similarity.euclideanDistance([row.price1, row.ratings1], [row.price2, row.ratings2]) AS similarity
MERGE (p1)-[s:SIMILARITY]->(p2)
SET s.score = similarity
'''

# Loop through each batch of rows and execute the query.
# Pairs are only formed between rows of the same batch, and one query carries
# up to batch_size * (batch_size - 1) entries.
with driver.session() as session:
    for i in range(0, len(df), batch_size):
        batch = df[i:i+batch_size]
        # Materialise the rows once instead of re-running iterrows() for
        # every outer row.
        batch_rows = [row for _, row in batch.iterrows()]
        params_list = []
        for pos1, row in enumerate(batch_rows):
            params = {
                'category1': row['category'],
                'name1': row['name'],
                'price1': row['actual_price'],
                'ratings1': row['ratings'],
            }
            for pos2, row2 in enumerate(batch_rows):
                if pos1 == pos2:
                    # A row paired with itself would only produce a self-loop,
                    # which the retrieval query filters out anyway.
                    continue
                params_list.append({
                    **params,
                    'category2': row2['category'],
                    'name2': row2['name'],
                    'price2': row2['actual_price'],
                    'ratings2': row2['ratings'],
                })
        if params_list:
            # The list must be bound to the $params placeholder by keyword;
            # the driver's second positional argument is the parameter
            # mapping itself, not a value.
            session.run(query, params=params_list).consume()

# Cypher query that reads every SIMILARITY edge between two distinct nodes
query2 = '''
MATCH (p1:Product)-[s:SIMILARITY]->(p2:Product)
WHERE id(p1) <> id(p2)
RETURN id(p1) AS id1, id(p2) AS id2, s.score AS similarity
'''

# Run the query and collect the records into a new Pandas dataframe whose
# columns are named after the query's result keys
with driver.session() as session:
    results = session.run(query2)
    fetched_rows = []
    for record in results:
        fetched_rows.append(record.values())
    similarity_df = pd.DataFrame(fetched_rows, columns=results.keys())

similarity_df.head()


Unnamed: 0,id1,id2,similarity


In [47]:
import pandas as pd
from neo4j import GraphDatabase

# Read the Amazon CSV, then rename the raw headers to the short names used in
# this cell and drop every row that has a missing value in any column.
df = (
    pd.read_csv('amazon.csv')
    .rename(columns={'actual_price (USD)': 'actual_price', 'product_name': 'name', 'rating': 'ratings'})
    .dropna(how='any')
)

# Connection settings for the local Neo4j server and the driver built from them
uri = 'bolt://localhost:7687'
username = 'neo4j'
password = 'password'
credentials = (username, password)
driver = GraphDatabase.driver(uri, auth=credentials)

# Define the batch size.
# Pairs are only formed between rows of the same batch, so a batch has to hold
# at least two rows. With a batch size of 1 every row was compared only with
# itself, the name check rejected it, and no similarity query was ever sent.
batch_size = 100

# Define the Cypher query to create missing nodes.
# MERGE makes this idempotent: a (category, name) combination that already
# exists is matched instead of being created again.
create_nodes_query = '''
UNWIND $params AS row
MERGE (p:Product {category: row.category, name: row.name})
'''

# Execute the query to create missing nodes
with driver.session() as session:
    params_list = [
        {'category': row['category'], 'name': row['name']}
        for _, row in df.iterrows()
    ]
    session.run(create_nodes_query, params=params_list)

# Define the Cypher query to calculate similarity scores.
# Both endpoints are matched by category AND name so that the score computed
# from one pair's prices and ratings is stored on exactly that pair; matching
# p2 by category alone wrote each pair's score to every product of the
# category, with later pairs overwriting earlier ones.
# NOTE(review): gds.alpha.similarity.euclideanDistance needs the Graph Data
# Science plugin; confirm it is installed and that the alpha namespace still
# exists in the deployed version.
query = '''
UNWIND $params AS row
MATCH (p1:Product {category: row.category1, name: row.name1})
MATCH (p2:Product {category: row.category2, name: row.name2})
WITH p1, p2, gds.alpha.similarity.euclideanDistance([row.price1, row.ratings1], [row.price2, row.ratings2]) AS similarity
MERGE (p1)-[s:SIMILARITY]->(p2)
SET s.score = similarity
'''

# Loop through each batch of rows and execute the query
with driver.session() as session:
    for i in range(0, len(df), batch_size):
        batch = df[i:i+batch_size]
        # Materialise the rows once instead of re-running iterrows() for
        # every outer row.
        batch_rows = [row for _, row in batch.iterrows()]
        params_list = []
        for row in batch_rows:
            params1 = {
                'category1': row['category'],
                'name1': row['name'],
                'price1': row['actual_price'],
                'ratings1': row['ratings']
            }
            for row2 in batch_rows:
                if row['name'] != row2['name']:
                    params2 = {
                        'category2': row2['category'],
                        'name2': row2['name'],
                        'price2': row2['actual_price'],
                        'ratings2': row2['ratings']
                    }
                    params_list.append({**params1, **params2})
        # Send the batch once, after all of its pairs are collected. Running
        # the query inside the pair loop re-sent the growing list for every
        # single pair.
        if params_list:
            session.run(query, params=params_list).consume()

# Cypher query returning the names and score of every SIMILARITY edge whose
# two endpoints carry different names
query2 = '''
MATCH (p1:Product)-[s:SIMILARITY]->(p2:Product)
WHERE p1.name <> p2.name
RETURN p1.name AS product1, p2.name AS product2, s.score AS similarity
'''

# Execute the query, turn each record into a row of values and label the
# dataframe columns with the query's result keys
with driver.session() as session:
    results = session.run(query2)
    record_values = list(map(lambda record: record.values(), results))
    similarity_df = pd.DataFrame(record_values, columns=results.keys())

similarity_df.head()


Unnamed: 0,product1,product2,similarity


In [49]:
import pandas as pd
from neo4j import GraphDatabase

# Load the Amazon CSV into a dataframe
raw_products = pd.read_csv('amazon.csv')

# Short column names used by the rest of this cell
renamed_products = raw_products.rename(
    columns={'actual_price (USD)': 'actual_price', 'product_name': 'name', 'rating': 'ratings'}
)

# Keep only rows that have a value in every column
df = renamed_products.dropna(how='any')

# Local Neo4j server reached over Bolt with the credentials below
uri = 'bolt://localhost:7687'
username = 'neo4j'
password = 'password'
driver = GraphDatabase.driver(uri, auth=(username, password))

# Rows per batch when building the similarity pairs
batch_size = 100

# Cypher query that MERGEs one Product node per (category, name) entry of the
# $params list, so combinations that already exist are matched, not duplicated
create_nodes_query = '''
UNWIND $params AS row
MERGE (p:Product {category: row.category, name: row.name})
'''

# Build the parameter list from the dataframe and run the query once
with driver.session() as session:
    params_list = [
        {'category': row['category'], 'name': row['name']}
        for _, row in df.iterrows()
    ]
    session.run(create_nodes_query, params=params_list)

# Define the Cypher query to calculate similarity scores.
# For every entry of $params the score is the Euclidean distance between the
# (price, ratings) vectors of the two products, so a smaller value means the
# products are closer. Both endpoints are matched by category AND name; when
# p2 was matched by category alone, each pair's distance was written to every
# product of that category and later pairs overwrote earlier ones.
query = '''
UNWIND $params AS row
MATCH (p1:Product {category: row.category1, name: row.name1})
MATCH (p2:Product {category: row.category2, name: row.name2})
WITH p1, p2, [(row.price1 - row.price2)^2, (row.ratings1 - row.ratings2)^2] AS distances
WITH p1, p2, SQRT(REDUCE(s = 0, d IN distances | s + d)) AS similarity
MERGE (p1)-[s:SIMILARITY]->(p2)
SET s.score = similarity
'''

# Loop through each batch of rows and execute the query.
# Pairs are only formed between rows of the same batch, and one query carries
# up to batch_size * (batch_size - 1) entries.
with driver.session() as session:
    for i in range(0, len(df), batch_size):
        batch = df[i:i+batch_size]
        # Materialise the rows once instead of re-running iterrows() for
        # every outer row.
        batch_rows = [row for _, row in batch.iterrows()]
        params_list = []
        for row in batch_rows:
            params1 = {
                'category1': row['category'],
                'name1': row['name'],
                'price1': row['actual_price'],
                'ratings1': row['ratings']
            }
            for row2 in batch_rows:
                if row['name'] != row2['name']:
                    params2 = {
                        'category2': row2['category'],
                        'name2': row2['name'],
                        'price2': row2['actual_price'],
                        'ratings2': row2['ratings']
                    }
                    params_list.append({**params1, **params2})
        # A batch with a single row (or only identically named rows) has no
        # pairs; skip the round trip in that case.
        if params_list:
            session.run(query, params=params_list).consume()

# Query for all SIMILARITY edges between products with different names
query2 = '''
MATCH (p1:Product)-[s:SIMILARITY]->(p2:Product)
WHERE p1.name <> p2.name
RETURN p1.name AS product1, p2.name AS product2, s.score AS similarity
'''

# Fetch the records and store them in a new Pandas dataframe; the column
# labels come from the keys of the query result
with driver.session() as session:
    results = session.run(query2)
    similarity_rows = []
    for similarity_record in results:
        similarity_rows.append(similarity_record.values())
    result_columns = results.keys()
    similarity_df = pd.DataFrame(similarity_rows, columns=result_columns)

similarity_df.head()


Unnamed: 0,product1,product2,similarity
