##  Consumer Oriented Amazon Product Recommendation Engine

In [1]:
import csv
import os

import numpy as np
import pandas as pd
from neo4j import GraphDatabase

In [2]:
# Load the Amazon product dataset into a DataFrame.
# NOTE(review): a later cell writes the modified frame back to this same path ('amazon.csv'),
# so re-running the notebook reads the previously saved output, not pristine raw data.
df = pd.read_csv('amazon.csv')

### Data Cleaning
- Limit the dataset to 1000 entries for faster processing
- Convert necessary string column types like price, discount_price to float
- Drop unneeded columns like product link and image link
- Assign categorical value to Category column of dataframe
- Convert non-dollar currencies to dollar values
- Limit the number of words in each Amazon product name to 10

In [3]:
# Truncate each product name to its first 10 whitespace-separated words
# (split on whitespace, keep the first 10 tokens, re-join with single spaces)
df['product_name'] = df['product_name'].str.split().str[:10].str.join(' ')
df['product_name']

0      Reffair AX30 [MAX] Portable Air Purifier for C...
1           rts [2 Pack] Mini USB C Type C Adapter Plug,
2            Kanget [2 Pack] Type C Female to USB A Male
3        Hp Wired On Ear Headphones With Mic With 3.5 Mm
4      JBL Commercial CSLM20B Auxiliary Omnidirection...
                             ...                        
994    Havells Ambrose 1200mm Ceiling Fan (Gold Mist ...
995              Bajaj Frore 1200 mm Ceiling Fan (Brown)
996    Crompton Sea Sapphira 1200 mm Ultra High Speed...
997    Havells Glaze 74W Pearl Ivory Gold Ceiling Fan...
998    Crompton Hill Briz Deco 1200mm (48 inch) High ...
Name: product_name, Length: 999, dtype: object

In [22]:
# # Split the values in the 'category' column by '|'
# df['new_category'] = df['category'].str.split('|')

# # Extract the first two parts and join them back with '|'
# df['new_category'] = df['new_category'].apply(lambda x: ' | '.join(x[:2]))
# set(df['new_category'])

In [5]:
# Add a coarser category column: the first '|'-separated segment of `category`
category_parts = df['category'].str.split('|', n=2)
df['general_cat'] = category_parts.str[0]
set(df['general_cat'])

{'Car&Motorbike',
 'Computers&Accessories',
 'Electronics',
 'Health&PersonalCare',
 'Home&Kitchen'}

In [6]:
# Encode the general category as an integer label in a new column
from sklearn.preprocessing import LabelEncoder

# `le` stays fitted so the user's category choice can be encoded the same way later on
le = LabelEncoder()

# learn the category -> integer mapping and apply it in one step
encoded_categories = le.fit_transform(df['general_cat'])
df['category_numerical'] = encoded_categories

In [7]:
# Save the updated frame back to the CSV.
# index=False: without it pandas writes the row index as an unnamed first column, and because
# this is the same file the notebook loads at the top, every run would add another
# "Unnamed: 0"-style column to the data.
df.to_csv('amazon.csv', index=False)


In [8]:
# Establish a connection to the Neo4j database.
# Connection settings are read from the environment (NEO4J_URI, NEO4J_USERNAME,
# NEO4J_PASSWORD) so real credentials never have to be committed with the notebook.
# The previous hard-coded values remain as defaults for a local development instance.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "password")
driver = GraphDatabase.driver(uri, auth=(username, password))

### Product Node Creation


In [9]:
# Create a product node in Neo4j
def create_product(tx, product):
    """Create one :Product node from a product mapping.

    Args:
        tx: Object exposing ``run(query, **parameters)``; in this notebook it is
            the transaction handed over by ``session.execute_write``.
        product: Mapping that must contain every key listed in ``property_keys``
            below; a missing key raises ``KeyError``.

    Returns:
        None. The only effect is the ``tx.run`` call.
    """
    cypher = """
    CREATE (p:Product {product_name: $product_name, product_id : $product_id, price: $price, rating: $rating, 
    category: $category, discount_percentage: $discount_percentage, 
    review_title: $review_title, discounted_price: $discounted_price, 
    general_cat : $general_cat, img_link : $img_link,
    rating_count : $rating_count,
    category_numerical : $category_numerical})
    """
    # Each key doubles as the Cypher parameter name used in the statement above.
    property_keys = (
        'product_name',
        'product_id',
        'price',
        'rating',
        'rating_count',
        'category',
        'discount_percentage',
        'general_cat',
        'review_title',
        'discounted_price',
        'img_link',
        'category_numerical',
    )
    parameters = {key: product[key] for key in property_keys}
    tx.run(cypher, **parameters)


In [10]:
# read the Amazon product data from CSV file
def _parse_rating_count(text):
    """Convert a rating-count cell such as '24,269' or '310.0' to an int (blank -> 0)."""
    cleaned = (text or '').replace(',', '').strip()
    return int(float(cleaned)) if cleaned else 0


def read_product_data_from_csv(file_path):
    """Load product rows from a CSV file as a list of dicts with typed values.

    ``csv.DictReader`` yields every cell as a string, so the numeric columns are
    converted here. ``rating_count`` and ``category_numerical`` must be numbers
    too: the recommendation query compares ``category_numerical`` with an
    integer parameter and feeds ``rating_count`` into a numeric distance, and a
    string value such as ``'0'`` never equals the integer ``0`` in Cypher.

    Args:
        file_path: Path of the CSV file. It must provide the columns
            product_name, price, rating, category, discount_percentage,
            review_title, discounted_price, general_cat, product_id, img_link,
            rating_count and category_numerical.

    Returns:
        list[dict]: One dict per CSV row, in file order.

    Raises:
        KeyError: If a required column is missing.
        ValueError: If a numeric column holds a value that cannot be parsed.
    """
    products = []
    # utf-8 is what pandas' to_csv writes by default; relying on the platform
    # default encoding can fail on non-ASCII product names.
    with open(file_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            product = {
                'product_name': row['product_name'],
                'price': float(row['price']),
                'rating': float(row['rating']),
                'category': row['category'],
                'discount_percentage': float(row['discount_percentage']),
                'review_title': row['review_title'],
                'discounted_price': float(row['discounted_price']),
                'general_cat': row['general_cat'],
                'product_id': row['product_id'],
                'img_link': row['img_link'],
                # a blank count is treated as "no ratings yet"
                'rating_count': _parse_rating_count(row['rating_count']),
                'category_numerical': int(row['category_numerical'])
            }
            products.append(product)
    return products

In [11]:
# Amazon CSV file path
csv_file = 'amazon.csv'

# Read every product row from the CSV, then create one Product node per row.
# `products` is kept at notebook level because the edge-creation cell below iterates over it.
with driver.session() as session:
    products = read_product_data_from_csv(csv_file)
    for product in products:
        # each execute_write call runs create_product in its own write transaction
        session.execute_write(create_product, product)

In [12]:
# Create an edge between two products in Neo4j
def create_edge(tx, product1, product2):
    """Link two existing Product nodes with a directed SIMILAR relationship.

    The nodes are looked up by ``product_id``. Matching on ``product_name`` is
    unsafe because names are truncated to 10 words earlier in the notebook and
    are therefore not unique: a name match links every pair of nodes that share
    the two names. ``MERGE`` is used instead of ``CREATE`` so that re-running
    the edge-creation cell does not pile up duplicate SIMILAR relationships.

    Args:
        tx: Object exposing ``run(query, **parameters)``; in this notebook it is
            the transaction handed over by ``session.execute_write``.
        product1: Mapping with a ``'product_id'`` key; start of the relationship.
        product2: Mapping with a ``'product_id'`` key; end of the relationship.

    Returns:
        None. The only effect is the ``tx.run`` call.

    Raises:
        KeyError: If either mapping has no ``'product_id'`` key.
    """
    query = """
    MATCH (p1:Product {product_id: $product_id_1})
    MATCH (p2:Product {product_id: $product_id_2})
    MERGE (p1)-[:SIMILAR]->(p2)
    """
    tx.run(query, product_id_1=product1['product_id'], product_id_2=product2['product_id'])

### Edge creation: 
An edge is created between two nodes when they satisfy all of the following criteria on their properties: same product category, similar product price, same product rating, and similar product discount percentage.

In [13]:
with driver.session() as session:
    # Visit every unordered pair of products exactly once (j always greater than i)
    product_total = len(products)
    for i in range(product_total):
        for j in range(i + 1, product_total):
            product1 = products[i]
            product2 = products[j]

            # Drop the pair as soon as one criterion fails. The checks run in the
            # same order as a chained `and`, and are written as `not (... <= ...)`
            # so that NaN values still reject the pair.
            if not (product1['category'] == product2['category']):
                continue
            if not (abs(product1['price'] - product2['price']) <= 10):
                continue
            if not (product1['rating'] == product2['rating']):
                continue
            discount_gap = abs(product1['discount_percentage'] - product2['discount_percentage'])
            if not (discount_gap <= 0.1):
                continue

            # All criteria met: write the edge in its own transaction
            session.execute_write(create_edge, product1, product2)


In [14]:
#delete some edges in neo4j

# MATCH ()-[r]->()
# WITH r LIMIT 10000
# DELETE r


### Utilizing User Provided Input for Recommendations 
User can type in preferences and the engine will provide 5 recommendations based on the criteria specified. 
Categories of products available: 
1. 'Car & Motorbike',
2. 'Computers & Accessories',
3. 'Electronics',
4. 'Health & PersonalCare',
5. 'Home & Kitchen'

In [15]:
# Menu number (1-5) shown to the user -> general category label as stored in the data
cat_num = dict(enumerate(
    [
        'Car&Motorbike',
        'Computers&Accessories',
        'Electronics',
        'Health&PersonalCare',
        'Home&Kitchen',
    ],
    start=1,
))

In [17]:
# Collect the user's preferences; every answer is parsed with int(), so a
# non-integer answer raises ValueError.
print("Welcome to the Amazon Product Recommender! ")
print("For the questions, please enter only integers")
input_category_num = int(input("From the list above of Product Categories, What type of product are you interested in? "))

#convert the category num to the specified category
input_cat = cat_num.get(input_category_num)
if input_cat is None:
    # Fail here with a clear message; otherwise None would reach le.transform
    # below and surface as an unrelated encoder error.
    raise ValueError(
        f"Unknown category number {input_category_num}; expected one of {sorted(cat_num)}"
    )

#print to user, their selection
print("You selected", input_cat)

# Get the encoded value of the user-specified category
category = input_cat
# le.transform returns a NumPy array; int() turns the element into a plain Python
# int so it can be passed to Neo4j as an ordinary integer query parameter.
input_category_encoded = int(le.transform([category])[0])

input_max_price = int(input("What is your maximum budget? "))
input_rating = int(input("What is the minimum number of stars you want for the product: "))
input_rating_count = int(input("What is the minumum number of ratings you want for the product: "))


Welcome to the Amazon Product Recommender! 
For the questions, please enter only integers
From the list above of Product Categories, What type of product are you interested in? 1
You selected Car&Motorbike
What is your maximum budget? 200
What is the minimum number of stars you want for the product: 3
What is the minumum number of ratings you want for the product: 20


In [18]:
# Cypher query that pairs up products satisfying the user's filters and ranks
# the pairs by how close they are to each other.
#
# Parameters: $inputPrice (maximum price), $inputRating (minimum stars),
# $inputRatingCount (minimum number of ratings), $inputCategory (integer label
# of the chosen general category).
#
# - The category is compared through its numeric label (category_numerical).
#   toInteger()/toFloat() are applied to category_numerical and rating_count
#   because those properties may have been loaded from the CSV as strings, and
#   a string never equals an integer in Cypher.
#   NOTE(review): assumes string values are plain numerals (no thousands
#   separators) - confirm against the loaded data.
# - $inputRatingCount is applied as a filter; the user is asked for it and it
#   is passed to the query, so it should not be ignored.
# - The second distance vector uses similarProduct.rating_count (it previously
#   repeated inputProduct.rating_count).
# - Results are ordered ascending: Euclidean distance is smallest for the most
#   similar pair, so the closest pairs must come first.
# - category_numerical is kept in the vectors for parity with the original
#   query; it is equal on both sides and so contributes nothing to the distance.
# NOTE(review): each pair can appear twice, as (A, B) and as (B, A).
query = """
MATCH (inputProduct:Product)
WHERE inputProduct.price <= $inputPrice
AND inputProduct.rating >= $inputRating
AND toFloat(inputProduct.rating_count) >= $inputRatingCount
AND toInteger(inputProduct.category_numerical) = $inputCategory
WITH inputProduct
MATCH (similarProduct:Product)
WHERE similarProduct.price <= $inputPrice
AND similarProduct.rating >= $inputRating
AND toFloat(similarProduct.rating_count) >= $inputRatingCount
AND toInteger(similarProduct.category_numerical) = $inputCategory
AND inputProduct <> similarProduct
WITH inputProduct, similarProduct
RETURN inputProduct, similarProduct,
gds.similarity.euclideanDistance(
[inputProduct.price, inputProduct.rating, toFloat(inputProduct.category_numerical), toFloat(inputProduct.rating_count)],
[similarProduct.price, similarProduct.rating, toFloat(similarProduct.category_numerical), toFloat(similarProduct.rating_count)])
AS similarity
ORDER BY similarity ASC
LIMIT 5
"""


In [19]:
# Run the recommendation query and collect the results in a DataFrame.
# One record per returned pair is gathered in a single list instead of several
# parallel lists that have to be kept in step.
RESULT_COLUMNS = ["Input Product", "Similar Product", "Similarity"]
recommendation_rows = []

# Execute the Cypher query with parameters
with driver.session() as session:
    result = session.run(
        query,
        inputPrice=input_max_price,
        inputRating=input_rating,
        inputRatingCount=input_rating_count,
        # plain int: the label comes from a NumPy array, and older Neo4j driver
        # releases reject NumPy scalars as query parameters
        inputCategory=int(input_category_encoded),
    )

    # Process the query result while the session is still open
    for record in result:
        input_product = record["inputProduct"]["product_name"]
        similar_product = record["similarProduct"]["product_name"]
        similarity = record["similarity"]

        print(f"Input Product: {input_product}, Similar Product: {similar_product}, Similarity: {similarity}")
        recommendation_rows.append({
            "Input Product": input_product,
            "Similar Product": similar_product,
            "Similarity": similarity,
        })

# columns= keeps the three headers even when the query returned no rows.
# NOTE(review): this rebinds `df`, which held the product data earlier in the
# notebook; the name is kept because the following cell displays `df`.
df = pd.DataFrame(recommendation_rows, columns=RESULT_COLUMNS)

In [20]:
# Render the recommendations table built in the previous cell
display(df)

Unnamed: 0,Input Product,Similar Product,Similarity


In [None]:
# Close the Neo4j driver connection so its connections are released
# once the notebook has finished running.
driver.close()


# TODO: 
- recommendations DataFrame is not returning anything
- evaluation of recommendations 
- visualize the graph
- test on a user (?)
- make interactive way of letting user enter data 