##  Consumer Oriented Amazon Product Recommendation Engine

In [1]:
import pandas as pd
import numpy as np
from neo4j import GraphDatabase
import csv

In [2]:
df = pd.read_csv('amazon.csv')

### Data Cleaning
- Limit the dataset to 1000 entries for faster processing
- Convert necessary string column types like price, discount_price to float
- Drop unneeded columns like product link and image link
- Assign categorical value to Category column of dataframe
- Convert non-dollar currencies to dollar values
- Limit the number of words in each Amazon product name to 10

In [3]:
df = df[df['rating_count'] != '1+H9280443']

In [4]:
#strip the product name to three words 
df['product_name'] = df['product_name'].str.split().str[:10].str.join(' ')
df['product_name']

0      Reffair AX30 [MAX] Portable Air Purifier for C...
1           rts [2 Pack] Mini USB C Type C Adapter Plug,
2            Kanget [2 Pack] Type C Female to USB A Male
3        Hp Wired On Ear Headphones With Mic With 3.5 Mm
4      JBL Commercial CSLM20B Auxiliary Omnidirection...
                             ...                        
992    Havells Ambrose 1200mm Ceiling Fan (Gold Mist ...
993              Bajaj Frore 1200 mm Ceiling Fan (Brown)
994    Crompton Sea Sapphira 1200 mm Ultra High Speed...
995    Havells Glaze 74W Pearl Ivory Gold Ceiling Fan...
996    Crompton Hill Briz Deco 1200mm (48 inch) High ...
Name: product_name, Length: 997, dtype: object

In [5]:
#add a more general category 
df['general_cat'] = df['category'].str.split('|', n=2).str[1]
set(df['general_cat'])

{'Accessories',
 'Accessories&Peripherals',
 'Cameras&Photography',
 'CarAccessories',
 'Components',
 'CraftMaterials',
 'ExternalDevices&DataStorage',
 'GeneralPurposeBatteries&BatteryChargers',
 'Headphones,Earbuds&Accessories',
 'Heating,Cooling&AirQuality',
 'HomeAudio',
 'HomeMedicalSupplies&Equipment',
 'HomeTheater,TV&Video',
 'Laptops',
 'Mobiles&Accessories',
 'Monitors',
 'NetworkingDevices',
 'PowerAccessories',
 'Printers,Inks&Accessories',
 'Tablets',
 'WearableTechnology'}

In [6]:
#add new column to assign numbers to categorical data using encoding
from sklearn.preprocessing import LabelEncoder 

# instance of LabelEncoder
le = LabelEncoder()

# fit and transform the categorical column to numerical values
df['category_numerical'] = le.fit_transform(df['general_cat'])

In [7]:
#save the updates to the new csv
df.to_csv('amazon.csv')


In [8]:
# Establish a connection to Neo4j database
uri = "bolt://localhost:7687"
username = "neo4j"
password = "password"
driver = GraphDatabase.driver(uri, auth=(username, password)) 

### Product Node Creation


In [9]:
# Create a product node in Neo4j
def create_product(tx, product):
    query = """
    CREATE (p:Product {product_name: $product_name, product_id : $product_id, price: $price, rating: $rating, 
    category: $category, discount_percentage: $discount_percentage, 
    review_title: $review_title, discounted_price: $discounted_price, 
    general_cat : $general_cat, img_link : $img_link,
    rating_count : $rating_count,
    category_numerical : $category_numerical})
    """
    tx.run(query, product_name=product['product_name'], 
           product_id = product['product_id'], 
           price=product['price'], 
           rating=product['rating'], 
           rating_count = product['rating_count'],
           category=product['category'], 
           discount_percentage=product['discount_percentage'],
           general_cat = product['general_cat'],
           review_title=product['review_title'], 
           discounted_price=product['discounted_price'], 
           img_link = product['img_link'],
           category_numerical = product['category_numerical']
          )


In [10]:
# read the Amazon product data from CSV file
def read_product_data_from_csv(file_path):
    products = []
    with open(file_path, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            product = {
                'product_name': row['product_name'],
                'price': float(row['price']),
                'rating': float(row['rating']),
                'category': row['category'],
                'discount_percentage': float(row['discount_percentage']),
                'review_title': row['review_title'],
                'discounted_price': float(row['discounted_price']),
                'general_cat': row['general_cat'],
                'product_id': row['product_id'],
                'img_link': row['img_link'],
                'rating_count': int(row['rating_count']),
                'category_numerical' : int(row['category_numerical'])
            }
            products.append(product)
    return products

In [11]:
# Amazon CSV file path
csv_file = 'amazon.csv'

# call create product nodes in Neo4j to make the nodes
with driver.session() as session:
    products = read_product_data_from_csv(csv_file)
    for product in products:
        session.execute_write(create_product, product)

In [12]:
# # Convert DataFrame to numpy array for vectorized operations
# df_array = df[['category_numerical', 'price', 'discount_percentage', 'rating', 'rating_count', 'discounted_price']].values

# # Connect similar products via edges
# for index1 in range(len(df_array)):
#     for index2 in range(index1+1, len(df_array)):
#         distance = np.linalg.norm(df_array[index1] - df_array[index2])
#         if distance < 2:
#             query = "MATCH (p1:Product {product_name: $product_name1}), (p2:Product {product_name: $product_name2}) CREATE (p1)-[:SIMILAR_TO {distance: $distance}]->(p2)"
#             session.run(query, product_name1=df.iloc[index1]['product_name'], product_name2=df.iloc[index2]['product_name'], distance=distance)


In [13]:
# Create an edge between two products in Neo4j
def create_edge(tx, product1, product2):
    query = """   
    MATCH (p1:Product {product_name: $product_name_1})
    MATCH (p2:Product {product_name: $product_name_2})
    WHERE p1 <> p2
    WITH p1, p2,
    gds.similarity.euclideanDistance([p1.price, p1.product_rating, p1.categorical_numerical, 
    p1.discount_percentage, p1.num_rating], 
    [p2.price, p2.product_rating, p2.categorical_numerical, p2.discount_percentage, p2.num_rating]) 
    AS euclidean_distance
    CREATE (p1)-[:SIMILAR {distance: euclidean_distance}]->(p2)
    
    """
    tx.run(query, product_name_1=product1['product_name'], product_name_2=product2['product_name'])

### Edge creation: 
Egdes between two Nodes are determined on certain criteria made using certain Node properties; product category, product price, product rating, and product discount percentage 

In [14]:
with driver.session() as session:        
    # Create edges between products based on criteria
    for i in range(len(products)):
        for j in range(i + 1, len(products)):
            product1 = products[i]
            product2 = products[j]

            # Check criteria for creating edge
            if (product1['category'] == product2['category'] and
                    (product1['category_numerical'] == product2['category_numerical']) and
                    abs(product1['price'] - product2['price']) <= 10 and
                    product1['rating'] == product2['rating'] and
                    abs(product1['discount_percentage'] - product2['discount_percentage']) <= 0.1):
                session.execute_write(create_edge, product1, product2)


In [28]:
# Recommend similar products to a user based on a product name 

# EUCLIDIAN DISTANCE
def recommend_products(tx, product_name):
    query = """
    MATCH (p1:Product {product_name: $product_name})
    WITH p1
    MATCH (p2:Product)
    WHERE p1 <> p2
    WITH p1, p2,
    gds.similarity.euclideanDistance([p1.price, p1.product_rating, p1.categorical_numerical, 
    p1.discount_percentage, p1.num_rating], 
    [p2.price, p2.product_rating, p2.categorical_numerical, p2.discount_percentage, p2.num_rating]) 
    AS euclidean
    ORDER BY euclidean DESC
    RETURN DISTINCT p2.product_name, euclidean AS similarity
    """
    result = tx.run(query, product_name=product_name)
    return [{"product_name": record["p2.product_name"], "similarity": record["similarity"]} for record in result]


In [29]:
with driver.session() as session:
    recommended_products = session.execute_write(recommend_products, product_name="Hp Wired On Ear Headphones With Mic With 3.5 Mm")
    df = pd.DataFrame(recommended_products)
    df.drop_duplicates(subset=['product_name'], keep='first', inplace=True)
df.head()

Unnamed: 0,product_name,similarity
0,VU 164 cm (65 inches) The GloLED Series 4K Smart,1008.010061
2,LG 139 cm (55 inches) 4K Ultra HD Smart LED,947.890084
6,LG 1.5 Ton 5 Star AI DUAL Inverter Split AC,899.890103
10,"Samsung Galaxy S20 FE 5G (Cloud Navy, 8GB RAM,...",888.000135
14,Samsung 138 cm (55 inches) Crystal 4K Neo Seri...,838.810073


In [24]:
# Recommend similar products to a user based on a product name

# COSINE SIMILARITY
def recommend_products(tx, product_name):
    query = """
    MATCH (p1:Product {product_name: $product_name})
    WITH p1
    MATCH (p2:Product)
    WHERE p1 <> p2
    WITH p1, p2,
    gds.similarity.cosine([p1.price, p1.product_rating, p1.categorical_numerical, 
    p1.discount_percentage, p1.num_rating], 
    [p2.price, p2.product_rating, p2.categorical_numerical, p2.discount_percentage, p2.num_rating]) 
    AS cosine_similarity
    ORDER BY cosine_similarity ASC
    RETURN DISTINCT p2.product_name, cosine_similarity AS similarity
    """
    result = tx.run(query, product_name=product_name)
    return [{"product_name": record["p2.product_name"], "similarity": record["similarity"]} for record in result]


In [25]:
with driver.session() as session:
    recommended_products = session.execute_write(recommend_products, product_name="Hp Wired On Ear Headphones With Mic With 3.5 Mm")
    df = pd.DataFrame(recommended_products)
    df.drop_duplicates(subset=['product_name'], keep='first', inplace=True)
df.head()

Unnamed: 0,product_name,similarity
0,FLiX (Beetel Flow USB to Micro USB PVC Data Sync,0.958596
1,Flix Micro Usb Cable For Smartphone (Black),0.959685
2,FLiX (Beetel USB to Micro USB PVC Data Sync &,0.959685
6,GIZGA essentials Universal Silicone Keyboard P...,0.971869
7,"Gizga Essentials Webcam Cover, Privacy Protect...",0.977763


In [32]:
# JACCARD SIMILARITY 

def recommend_products(tx, product_name):
    query = """
    MATCH (p1:Product {product_name: $product_name})
    WITH p1
    MATCH (p2:Product)
    WHERE p1 <> p2 AND p2.product_name <> $product_name
    WITH p1, p2,
    gds.similarity.jaccard([p1.price, p1.product_rating, p1.categorical_numerical, 
    p1.discount_percentage, p1.num_rating], 
    [p2.price, p2.product_rating, p2.categorical_numerical, p2.discount_percentage, p2.num_rating]) 
    AS jaccard_similarity
    ORDER BY jaccard_similarity DESC
    RETURN DISTINCT p2.product_name, jaccard_similarity AS similarity
    """
    result = tx.run(query, product_name=product_name)
    return [{"product_name": record["p2.product_name"], "similarity": record["similarity"]} for record in result]


In [33]:
with driver.session() as session:
    recommended_products = session.execute_write(recommend_products, product_name="Hp Wired On Ear Headphones With Mic With 3.5 Mm")
    df = pd.DataFrame(recommended_products)
    df.drop_duplicates(subset=['product_name'], keep='first', inplace=True)
df.head()

Unnamed: 0,product_name,similarity
0,MI Xiaomi 22.5W Fast USB Type C Charger Combo for,1.0
1,Sounce Spiral Charger Cable Protector Data Cab...,1.0
2,LAPSTER Spiral Charger Spiral Charger Cable Pr...,1.0
3,Lapster USB 3.0 sata Cable for 2.5 inch SSD and,1.0
4,Sounce Fast Phone Charging Cable & Data Sync U...,1.0


### Utilizing User Provided Input for Recommendations 
User can type in preferences and the engine will provide 5 recommendations based on the criteria specified. 
Categories of products available: 
 - 1: 'Accessories',
 - 2: 'Accessories&Peripherals',
 - 3: 'Cameras&Photography',
 - 4: 'CarAccessories',
 - 5: 'Components',
 - 6: 'CraftMaterials',
 - 7: 'ExternalDevices&DataStorage',
 - 8: 'GeneralPurposeBatteries&BatteryChargers',
 - 9: 'Headphones,Earbuds&Accessories',
 - 10: 'Heating,Cooling&AirQuality',
 - 11: 'HomeAudio',
 - 12: 'HomeMedicalSupplies&Equipment',
 - 13: 'HomeTheater,TV&Video',
 - 14: 'Laptops',
 - 15: 'Mobiles&Accessories',
 - 16: 'Monitors',
 - 17: 'NetworkingDevices',
 - 18: 'PowerAccessories',
 - 19: 'Printers,Inks&Accessories',
 - 20: 'Tablets',
 - 21: 'WearableTechnology'

In [None]:
cat_num = {1: 'Accessories',
 2: 'Accessories&Peripherals',
 3: 'Cameras&Photography',
 4: 'CarAccessories',
 5: 'Components',
 6: 'CraftMaterials',
 7: 'ExternalDevices&DataStorage',
 8: 'GeneralPurposeBatteries&BatteryChargers',
 9: 'Headphones,Earbuds&Accessories',
 10: 'Heating,Cooling&AirQuality',
 11: 'HomeAudio',
 12: 'HomeMedicalSupplies&Equipment',
 13: 'HomeTheater,TV&Video',
 14: 'Laptops',
 15: 'Mobiles&Accessories',
 16: 'Monitors',
 17: 'NetworkingDevices',
 18: 'PowerAccessories',
 19: 'Printers,Inks&Accessories',
 20: 'Tablets',
 21: 'WearableTechnology'}

In [None]:
print("Welcome to the Amazon Product Recommender! ")
print("For the questions, please enter only integers")
input_category_num = int(input("From the list above of Product Categories, What type of product are you interested in? "))

#convert the category num to the specified category
input_cat = cat_num.get(input_category_num)

#print to user, their selection
print("You selected", input_cat)

# Get the encoded value of the user-specified category
category = input_cat
input_category_encoded = le.transform([category])[0]

input_max_price = int(input("What is your maximum budget? "))
input_rating = int(input("What is the minimum number of stars you want for the product: "))
input_rating_count = int(input("What is the minumum number of ratings you want for the product: "))


In [None]:
from neo4j import GraphDatabase

# define the Cypher query with parameters for user input
# uses categorical_numerical becuase Euclidean distance can only accept ints as parameters
query = """
MATCH (inputProduct:Product)
WHERE inputProduct.price <= $inputPrice 
AND inputProduct.rating >= $inputRating 
AND inputProduct.category_numerical = $inputCategory
WITH inputProduct
MATCH (similarProduct:Product)
WHERE similarProduct.price <= $inputPrice 
AND similarProduct.rating >= $inputRating 
AND similarProduct.category_numerical = $inputCategory
AND inputProduct <> similarProduct
WITH inputProduct, similarProduct
RETURN inputProduct, similarProduct, 
gds.similarity.euclideanDistance([inputProduct.price, inputProduct.rating, inputProduct.category_numerical,inputProduct.rating_count], 
[similarProduct.price, similarProduct.rating, similarProduct.category_numerical, inputProduct.rating_count]) 
AS similarity
ORDER BY similarity DESC
LIMIT 5
"""


In [None]:
#store results in a df 
# empty lists to store the data
input_product_list = []
input_product_price_list = []
similar_product_list = []
similar_product_price_list = []
similarity_list = []


# Execute the Cypher query with parameters
with driver.session() as session:
    result = session.run(query, inputPrice=input_max_price,
                         inputRating=input_rating, 
                         inputRatingCount = input_rating_count, 
                         inputCategory = input_category_encoded)

    # Process the query result
    for record in result:
        input_product = record["inputProduct"]["product_name"]
        input_product_price = record["inputProduct"]["price"]
        similar_product = record["similarProduct"]["product_name"]
        similar_product_price = record["similarProduct"]["price"]
        similarity = record["similarity"]
        
        #remove duplicate input products
        #Check if input_product already exists in the list
        if (input_product not in input_product_list) and (input_product not in similar_product_list):
            # Append the data to the respective lists
            input_product_list.append(input_product)
            similar_product_list.append(similar_product)
            input_product_price_list.append(input_product_price)
            similar_product_price_list.append(similar_product_price)

        
        # Process the results as needed, e.g., display or store recommendations
        print(f"Input Product: {input_product}, Similar Product: {similar_product}, Similarity: {similarity}")

        
# Check if all lists have the same length
if len(input_product_list) == len(similar_product_list) == len(input_product_price_list) == len(similar_product_price_list) == len(similarity_list):
    # Create a DataFrame from the lists
    df = pd.DataFrame({
        "Input Product": input_product_list,
        "Input Product Price" : input_product_price_list, 
        "Similar Product": similar_product_list,
        "Sample Product Price" : similar_product_price_list,
        "Similarity": similarity_list
    })
    print(df)
else:
    print("Error: All lists must have the same length")

# # Create a DataFrame from the lists
# df = pd.DataFrame({
#     "Input Product": input_product_list,
#     "Input Product Price" : input_product_price_list, 
#     "Similar Product": similar_product_list,
#     "Sample Product Price" : similar_product_price_list,
#     "Similarity": similarity_list
# })

In [None]:
display(df)

In [None]:
# Close the Neo4j driver connection
# driver.close()


# TODO: 
- recommendtions DF not returning anything
- evaluation of recommendations 
- visualze the graph
- test on a user (?)
- make interactive way of letting user enter data 