##  Consumer Oriented Amazon Product Recommendation Engine

In [1]:
import pandas as pd
import numpy as np
from neo4j import GraphDatabase
import csv

In [2]:
#Read from the csv stored in same directory
df = pd.read_csv('amazon.csv')

### Data Cleaning
- Limit the dataset to 1000 entries for faster processing
- Convert necessary string column types like price, discount_price to float
- Convert rating_count column to only include integers 
- Drop unneeded columns
- Assign categorical value to general_cat column of dataframe
- Convert non-dollar currencies to dollar values
- Limit the number of words in each Amazon product name to 20
- Intorduce a more general 'general_cat' that can be used by user to enter their preferences

In [3]:
#Remove non-integer values from the rating count
df = df[df['rating_count'] != '1+H9280443']

In [4]:
#strip the product name to first 20 characters 
df['product_name'] = df['product_name'].str.split().str[:20].str.join(' ')

In [5]:
#add a more general category 
df['general_cat'] = df['category'].str.split('|', n=2).str[1]

In [6]:
#add new column to assign numbers to categorical data using encoding
from sklearn.preprocessing import LabelEncoder 

# instance of LabelEncoder
le = LabelEncoder()

# fit and transform the categorical column to numerical values
df['category_numerical'] = le.fit_transform(df['general_cat'])

In [7]:
#save the updates to the new csv
df.to_csv('amazon.csv')

In [8]:
# Establish a connection to Neo4j database
uri = "bolt://localhost:7687"
username = "neo4j"
password = "password"
driver = GraphDatabase.driver(uri, auth=(username, password)) 

### Product Node Creation
This is used to create the nodes used in neo4j, the node properties used are: product_name, product_id, price, rating, category, discount_percentage, review_title, general_cat, image_link, rating_count, and category_number

- product_id - Product ID
- product_name - Name of the Product
- category - Category of the Product
- general_cat - More general category grouping from one of the twenty buckets.
- discounted_price - Discounted Price of the Product
- price - Dollar value of the Price of the Product
- discount_percentage - Percentage of Discount for the Product
- rating - Rating of the Product from (1 to 5)
- rating_count - Number of people who voted for the Amazon rating
- review_title - Short review
- img_link - Image Link of the Product

In [9]:
# Create a product node in Neo4j
def create_product(tx, product):
    #Cypher query
    query = """
    CREATE (p:Product {product_name: $product_name, product_id : $product_id, price: $price, rating: $rating, 
    category: $category, discount_percentage: $discount_percentage, 
    review_title: $review_title, discounted_price: $discounted_price, 
    general_cat : $general_cat, img_link : $img_link,
    rating_count : $rating_count,
    category_numerical : $category_numerical})
    """
    tx.run(query, product_name=product['product_name'], 
           product_id = product['product_id'], 
           price=product['price'], 
           rating=product['rating'], 
           rating_count = product['rating_count'],
           category=product['category'], 
           discount_percentage=product['discount_percentage'],
           general_cat = product['general_cat'],
           review_title=product['review_title'], 
           discounted_price=product['discounted_price'], 
           img_link = product['img_link'],
           category_numerical = product['category_numerical']
          )


In [10]:
# read the Amazon product data from CSV file
def read_product_data_from_csv(file_path):
    products = []
    with open(file_path, newline='') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            #Intialize a Product referenceing the columns
            product = {
                'product_name': row['product_name'],
                'price': float(row['price']),
                'rating': float(row['rating']),
                'category': row['category'],
                'discount_percentage': float(row['discount_percentage']),
                'review_title': row['review_title'],
                'discounted_price': float(row['discounted_price']),
                'general_cat': row['general_cat'],
                'product_id': row['product_id'],
                'img_link': row['img_link'],
                'rating_count': int(row['rating_count']),
                'category_numerical' : int(row['category_numerical'])
            }
            products.append(product)
    return products

In [11]:
# Amazon CSV file path
csv_file = 'amazon.csv'

# Call create product nodes in Neo4j to make the nodes
with driver.session() as session:
    products = read_product_data_from_csv(csv_file)
    for product in products:
        session.execute_write(create_product, product)

### Edge creation using Jaccard Distance
Egdes between two Nodes are determined on certain criteria made using certain Node properties; product category, product price, product rating, and product discount percentage 

In [12]:
# Create an edge between two products in Neo4j
def create_edge(tx, product1, product2):
    query = """   
    MATCH (p1:Product {product_name: $product_name_1})
    MATCH (p2:Product {product_name: $product_name_2})
    WHERE p1 <> p2
    WITH p1, p2,
    gds.similarity.jaccard([p1.price, p1.product_rating, p1.categorical_numerical, 
    p1.discount_percentage, p1.num_rating], 
    [p2.price, p2.product_rating, p2.categorical_numerical, p2.discount_percentage, p2.num_rating]) 
    AS jaccard
    CREATE (p1)-[:SIMILAR {distance: jaccard}]->(p2)
    
    """
    tx.run(query, product_name_1=product1['product_name'], product_name_2=product2['product_name'])

In [13]:
with driver.session() as session:        
    # Create edges between products based on criteria
    for i in range(len(products)):
        for j in range(i + 1, len(products)):
            product1 = products[i]
            product2 = products[j]

            # Check criteria for creating edge
            if (product1['category'] == product2['category'] and
                    (product1['category_numerical'] == product2['category_numerical']) and
                    abs(product1['price'] - product2['price']) <= 10 and
                    product1['rating'] == product2['rating'] and
                    abs(product1['discount_percentage'] - product2['discount_percentage']) <= 0.1):
                session.execute_write(create_edge, product1, product2)


### Utilizing User's Favorite Products for Recommendations 
Our engine works by providing recommendation based on products the User has liked in the past. The User can select their favorite products and also indicate their preferences for price. The engine will take these factors into account and then provide 5 product recommendations based with prices and similarity scores.

### Product Categories: 

- 0: Accessories
- 1: Accessories&Peripherals
- 2: Cameras&Photography
- 3: Components
- 4: CraftMaterials
- 5: ExternalDevices&DataStorage
- 6: GeneralPurposeBatteries&BatteryChargers
- 7: Headphones,Earbuds&Accessories
- 8: Heating,Cooling&AirQuality
- 9: HomeAudio
- 10: HomeTheater,TV&Video
- 11: Laptops
- 12: Mobiles&Accessories
- 13: Monitors
- 14: NetworkingDevices
- 15: PowerAccessories
- 16: Printers,Inks&Accessories
- 17: Tablets
- 18: WearableTechnology

In [14]:
cat_num ={
    0: 'Accessories',
    1: 'Accessories&Peripherals',
    2: 'Cameras&Photography',
    3: 'Components',
    4: 'CraftMaterials',
    5: 'ExternalDevices&DataStorage',
    6: 'GeneralPurposeBatteries&BatteryChargers',
    7: 'Headphones,Earbuds&Accessories',
    8: 'Heating,Cooling&AirQuality',
    9: 'HomeAudio',
    10: 'HomeTheater,TV&Video',
    11: 'Laptops',
    12: 'Mobiles&Accessories',
    13: 'Monitors',
    14: 'NetworkingDevices',
    15: 'PowerAccessories',
    16: 'Printers,Inks&Accessories',
    17: 'Tablets',
    18: 'WearableTechnology'
}

# Welcome to the Amazon Product Recommender

##### Enter your Preferences and select Favorite Products to feed to Recommendation Algorithm 

In [15]:
print("Welcome to the Amazon Product Recommender! ")
print("For the questions, please enter only integers")

#Validations for category number 
while True:
    input_category = input("From the list above of Product Categories, What type of product are you interested in? ")
    try:
        input_category_num = int(input_category)
        if input_category_num < 0 or input_category_num > 18:
            print("Product category must be between 0 to 18 inclusive. Please try again.")
        else:
            break
    except ValueError:
        print("Product category must be a positive integer. Please try again.")

#convert the category num to the specified category
input_cat = cat_num.get(input_category_num)

#print to user, their selection
print("Good choice. You selected", input_cat, "!")

# Get the encoded value of the user-specified category
category = input_cat
input_category_encoded = le.transform([category])[0]

import random
import pandas as pd
import random

def select_favorite_products(product_category):
    
    df = pd.read_csv('amazon.csv') # Read the csv
    
    # Filter the DataFrame to only include products with the specified category
    category_df = df[df['general_cat'] == product_category]
    
    # Instructions 
    print("In order for the engine to understand your needs better... \n In", product_category, "category of products, what is your maximum price?")
    
    # Ask the user for the maximum price they are willing to pay
    while True:
        max_price_str = input("What is the maximum price you are willing to pay? ")
        try:
            max_price = int(max_price_str)
            if max_price <= 0:
                print("Maximum price must be a positive integer. Please try again.")
            else:
                break
        except ValueError:
            print("Maximum price must be a positive integer. Please try again.")
    
    # Filter the DataFrame to only include products that are less than or equal to the max price
    category_df = category_df[category_df['price'] <= max_price]
    
    # Select 5 random products from the filtered DataFrame
    random_products = random.sample(list(category_df[['product_name', 'product_link']].values.tolist()), 5)
    
    # Print out the list of random products for the user to choose from
    print("\n Here are 5 random products in the", product_category, "category: \n")
    for i, (product_name, product_link) in enumerate(random_products):
        print(f"{i+1}: {product_name} - {product_link}")
    
    # Ask the user to pick their 2 favorite products by entering integers
    favorite_indices = []
    while len(favorite_indices) < 2:
        try: 
            favorite_index = int(input("\n Enter the number of your favorite product (1-5): "))
            if favorite_index < 1 or favorite_index > 5:
                print("Invalid input. Please enter a number between 1 and 5.")
            elif favorite_index in favorite_indices:
                print("You've already selected that product. Please choose a different one.")
            else:
                favorite_indices.append(favorite_index)
        except ValueError:
            print("Invalid input. Please enter a number between 1 and 5.")
            
    
    # Get the names and links of the user's favorite products
    favorite_products = [(random_products[i-1][0], random_products[i-1][1]) for i in favorite_indices]
    
    # Return the names and links of the user's favorite products as a list of tuples
    return favorite_products

# find users favorite products
favorite_products = select_favorite_products(input_cat)
print("\n Thanks for letting me know. Your favorite products are: \n", favorite_products[0][0], "and \n", favorite_products[1][0])
print("\n This will be factored in my recommendations! ")

Welcome to the Amazon Product Recommender! 
For the questions, please enter only integers
From the list above of Product Categories, What type of product are you interested in? 2
Good choice. You selected Cameras&Photography !
In order for the engine to understand your needs better... 
 In Cameras&Photography category of products, what is your maximum price?
What is the maximum price you are willing to pay? 20

 Here are 5 random products in the Cameras&Photography category: 

1: Gizga Essentials Professional 3-in-1 Cleaning Kit for Camera, Lens, Binocular, - https://www.amazon.in/Essentials-Gz-Ck-101-Professional-Micro-Fiber-Antibacterial/dp/B01IBRHE3E/ref=sr_1_144?qid=1672903001&s=computers&sr=1-144
2: Fujifilm Instax Mini Single Pack 10 Sheets Instant Film for - https://www.amazon.in/Fujifilm-Instax-Instant-Fuji-Cameras/dp/B00R1P3B4O/ref=sr_1_129?qid=1672903001&s=computers&sr=1-129
3: DIGITEK® (DTR-200MT) (18 CM) Portable & Flexible Mini Tripod with - https://www.amazon.in/DIGITEK-P

In [44]:
product_name = []
for name, url in favorite_products:
    product_name.append(name)
print(product_name)

['Gizga Essentials Professional 3-in-1 Cleaning Kit for Camera, Lens, Binocular,', 'DIGITEK® (DTR-200MT) (18 CM) Portable & Flexible Mini Tripod with']


### Recommendation Algorithm using Jaccard Similarity

In [55]:
# JACCARD SIMILARITY 

def recommend_products(tx, product_name):
    query = """
    MATCH (p1:Product)
    WHERE p1.product_name IN $product_name
    WITH p1
    MATCH (p2:Product)
    WHERE p1 <> p2 AND NOT p2.product_name IN $product_name
    WITH p1, p2,
    gds.similarity.jaccard([p1.price, p1.product_rating, p1.categorical_numerical, 
    p1.discount_percentage, p1.num_rating], 
    [p2.price, p2.product_rating, p2.categorical_numerical, p2.discount_percentage, p2.num_rating]) 
    AS jaccard_similarity, p2.price AS price
    ORDER BY jaccard_similarity DESC
    RETURN DISTINCT p1.product_name, p2.product_name, p2.price AS price, jaccard_similarity AS similarity

    """
    result = tx.run(query, product_name=product_name)
    return [{"product_name": record["p2.product_name"], "similarity": record["similarity"], "price": record["price"]} for record in result]


In [57]:
with driver.session() as session:
    recommended_products = session.execute_write(recommend_products, product_name=product_name)
    df = pd.DataFrame(recommended_products)
    df.drop_duplicates(subset=['product_name'], keep='first', inplace=True)
    df = df[['product_name','price', 'similarity']] # add this line to select only the columns you want
    df['price'] = pd.to_numeric(df['price']) # add this line to convert the price column to numeric values

print("Based on your favorite two products, these are the recommendations we have for you!")
df.head()



Based on your favorite two products, these are the recommendations we have for you!


Unnamed: 0,product_name,price,similarity
0,AirCase Rugged Hard Drive Case for 2.5-inch We...,5.99,1.0
1,HUMBLE Dynamic Lapel Collar Mic Voice Recordin...,5.99,0.333333
2,"FEDUS Cat6 Ethernet Cable, 10 Meter High Speed...",5.99,0.333333
3,"boAt Micro USB 55 Tangle-free, Sturdy Micro US...",5.99,0.333333
4,"boAt Type C A325 Tangle-free, Sturdy Type C Ca...",5.99,0.333333


In [None]:
#Close the Neo4j driver connection
driver.close()