## **Generating Product Image from Customer Reviews**
### 94844 Gen AI Lab Final Project
#### April 2024

In [39]:
# Import Libraries
import os, io, json, transformers, pinecone, fitz, pypdf, faiss, sqlite3, langchain_community, langchain, openai, math, time 
import requests
from transformers import pipeline
import pandas as pd
import numpy as np
from io import StringIO
from dotenv import load_dotenv
from operator import itemgetter
import logging

from langchain import document_loaders, embeddings
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
#from llama_index.core.node_parser import SentenceSplitter
from langchain_experimental.text_splitter import SemanticChunker
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

from pinecone import Pinecone, ServerlessSpec, Pinecone         # vector store

from llama_index.core import (
    VectorStoreIndex,
    StorageContext,
    SimpleDirectoryReader
)

from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core.schema import TextNode
from llama_index.core.extractors import (
    QuestionsAnsweredExtractor,
    TitleExtractor,
)
from llama_index.core.ingestion import IngestionPipeline
from llama_index.llms.openai import OpenAI

# sentence transformers
from sentence_transformers import SentenceTransformer
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

In [40]:
load_dotenv()

access_endpoint_api_key = os.getenv('access_endpoint_api_key')
openai_key = os.getenv('OPENAI_API_KEY')
pinecone_api_key = os.getenv('PINECONE_API_KEY')
environment = os.getenv('PINECONE_ENV')

# configure Pinecone client
pinecone_api_key = '1b7fc13c-0ea1-4edc-a174-244bce3e0b47'
pc = Pinecone(api_key=pinecone_api_key)

In [41]:
# LOGGER
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s',
                    filename='log/test.log')

## **Part 1: Product Selection and Customer Review Data Collection**
- Select 3 different products from different categories on a digital marketplace (e.g., Amazon).
- Please take into consideration different factors (e.g., product categories, popularity levels) when making your selection.
- Explain the rationale of your choices.
- Collect the corresponding product descriptions (textual content) and customer reviews (textual content) for each product

In [42]:
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
from urllib.error import HTTPError, URLError
import time

def get_page(url, headers):
    """
    Get webpage results into a Beautiful Soup object.
    """
    try:
        req = Request(url, headers=headers)
        page = urlopen(req)
        soup = BeautifulSoup(page, "html.parser")
        return soup
    except HTTPError as e:
        print(f"HTTP error: {e.code} {e.reason}")
    except URLError as e:
        print(f"URL error: {e.reason}")
    except Exception as e:
        print(f"General error: {e}")
    return None

def scrape_product_details(base_url, product_id, headers):
    """
    Scrape product description and up to 200 most recent reviews.
    """
    url = f"{base_url}/dp/{product_id}/"
    print("page url:", url)
    soup = get_page(url, headers)
    if not soup:
        return None, []

    # Find product description
    try:
        #description = soup.find_all('div', id='productDescription', class_="a-section a-spacing-small").get_text(strip=True)
        description = soup.find('div', id='feature-bullets').get_text(strip=True)
    except AttributeError:
        description = "No description available"

    reviews = []
    # Adjust the URL for 'Most Recent' reviews and iterate through pages
    review_url = f"{base_url}/product-reviews/{product_id}/"
    print("review url:", review_url)
    for i in range(1, 21):  # Assume there are 10 reviews per page, fetch 20 pages
        url = f"{review_url}?pageNumber={i}&sortBy=helpful"
        soup = get_page(url, headers)
        if not soup:
            continue
        review_divs = soup.find_all('div', {'data-hook': 'review'})
        for div in review_divs:
            try:
                #title = div.find('a', {'data-hook': 'review-title'}).get_text(strip=True)
                body = div.find('span', {'data-hook': 'review-body'}).get_text(strip=True)
                reviews.append(body)
                if len(reviews) >= 200:
                    break
            except AttributeError:
                continue
        if len(reviews) >= 200:
            break
        time.sleep(1)  # Sleep to prevent too frequent requests

    return description, reviews

In [43]:
# Usage example
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"}
urls = {"https://www.amazon.com/Samsonite-Omni-Hardside-Spinner-Teal": "B013WFNZRE",
        "https://www.amazon.com/Hurtle-3-Wheeled-Scooter-Kids-Graffiti": "B08BDGS58Q",
        "https://www.amazon.com/DeLonghi-ECAM29084SB-Magnifica-LatteCrema-Espresso": "B0B38KRTV6"
        }

descriptions = []
reviews = []
for base_url, product_id in urls.items():
    description, review = scrape_product_details(base_url, product_id, headers)
    descriptions.append(description)
    reviews.append(review)

products = ["Product A", "Product B", "Product C"]

# Create dictionary
product_data = {}
for i in range(len(products)):
    product_data[products[i]] = {
        "description": descriptions[i],
        "reviews": reviews[i]
    }
print(product_data)

page url: https://www.amazon.com/Samsonite-Omni-Hardside-Spinner-Teal/dp/B013WFNZRE/
review url: https://www.amazon.com/Samsonite-Omni-Hardside-Spinner-Teal/product-reviews/B013WFNZRE/
page url: https://www.amazon.com/Hurtle-3-Wheeled-Scooter-Kids-Graffiti/dp/B08BDGS58Q/
review url: https://www.amazon.com/Hurtle-3-Wheeled-Scooter-Kids-Graffiti/product-reviews/B08BDGS58Q/
page url: https://www.amazon.com/DeLonghi-ECAM29084SB-Magnifica-LatteCrema-Espresso/dp/B0B38KRTV6/
review url: https://www.amazon.com/DeLonghi-ECAM29084SB-Magnifica-LatteCrema-Espresso/product-reviews/B0B38KRTV6/


In [44]:
product_data

{'Product A': {'description': '24" SPINNER LUGGAGE maximizes your packing power and is the ideal checked bag for longer tripsPACKING Dimensions: 24” x 17.5” x 11.5”, Overall Dimensions: 26.5” x 17.75” x 11.75”, Weight: 8.34 lbs.10 YEAR LIMITED WARRANTY: Samsonite products are rigorously tested to ensure our products meet stringent standards. This bag comes with a 10-year warranty against defects in materials and workmanship.MICRO-DIAMOND POLYCARBONATE texture is extremely scratch-resistant, keeping cases beautiful trip after tripSIDE-MOUNTED TSA LOCKS act to deter theft, ensuring that only you or a TSA agent have easy access to your belongings when travelingFOUR, MULTI-DIRECTIONAL OVERSIZED SPINNER WHEELS for effortless mobility, re-engineered lightweight',
  'reviews': ['Perfect weight and very maneuverable.  Zippers work easily and has a convenient TSA approved lock.  Looks great and will be perfect size for longer trips.  Happy with the purchase / great value for the money.',
   'Wh

In [45]:
# Method 2: pull data from pdf
doc = fitz.open("coffeemaker_product.pdf")

## **Part 2: Analysis of Customer Reviews with LLM**
- Use OpenAI’s GPT 3.5 turbo API to conduct text analysis to extract valuable information from the textual data collected above and build a more wholistic understanding about the product.
- Some analyses that are relevant to consider include, but not limited to, for example, text summarization, extraction of particular product features (e.g., visual information), or sentiment analysis.
- You can do the analyses using prompt engineering (with different prompt strategies), RAG, or a combination of both.
- When doing the analysis, you may need to consider different documentation chunking strategies given the input token limit OpenAI API has.
- You could consider using a vector database to store your text embedding if necessary, but not mandatory.
- You also need to think about what is an effective output from this step, given that your goal next step is to send this output into the diffusion model for meaningful product image generation.

In [46]:
# Chunk text from pdf
text_splitter = SemanticChunker(OpenAIEmbeddings())
text_chunks = []
doc_idxs = []

for doc_idx, page in enumerate(doc):
    page_text = page.get_text("text")
    cur_text_chunks = text_splitter.split_text(page_text)
    text_chunks.extend(cur_text_chunks)
    doc_idxs.extend([doc_idx] * len(cur_text_chunks))

In [47]:
def create_textnode(usepdf=False):
    nodes = []
    # Use scraped chunks
    if usepdf == False:
        # Create a TextNode for the description
        desc_node = TextNode(text=product_data[i]['description'])
        nodes.append(desc_node)

        # Create a TextNode for each review
        for review in product_data[i]['reviews']:
            review_node = TextNode(text=review)
            nodes.append(review_node)
    # Use text from pdf, chunked
    else:
        for idx, text_chunk in enumerate(text_chunks):
            node = TextNode(text=text_chunk)
            src_doc_idx = doc_idxs[idx]
            src_page = doc[src_doc_idx]
            nodes.append(node)
    
    return nodes

In [48]:
### CHANGE HERE ###
# Select which product to analyze (either: Product B and usepdf=False or usepdf=True)

i = "Product A"

nodes = create_textnode(usepdf=False)

In [49]:
gpt_3_5_turbo = "gpt-3.5-turbo"
llm = OpenAI(model=gpt_3_5_turbo)

extractors = [
    TitleExtractor(nodes=5, llm=llm),
    QuestionsAnsweredExtractor(questions=3, llm=llm),
]

pipeline = IngestionPipeline(
    transformations=extractors,
)

nodes = await pipeline.arun(nodes=nodes, in_place=False)

100%|██████████| 5/5 [00:01<00:00,  3.30it/s]
100%|██████████| 101/101 [00:59<00:00,  1.70it/s]


In [50]:
embed_model_3_small = "text-embedding-3-small"
embed_model = OpenAIEmbedding(model=embed_model_3_small, openai_api_key=openai_key)

# create embeddings for the text dataset
for node in nodes:
    node_embedding = embed_model.get_text_embedding(
        node.get_content(metadata_mode="all")
    )
    node.embedding = node_embedding

In [51]:
use_serverless = os.environ.get("USE_SERVERLESS", "False").lower() == "true"

if use_serverless:
    spec = pinecone.ServerlessSpec(cloud='aws', region='us-west-2')
else:
    spec = pinecone.PodSpec(environment=environment)

# Name our Pinecone Index
index_name = "hw03"

# If a Pinecone index of the same name already exists, delete it:
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

In [52]:
# Name our Pinecone Index
index_name = "hw03"

dimensions = 1536 #768              # the dimensions of the index need to align with the LLM we are using for the RAG system. For example, if using openAI then dimenion = 1536. If using Llama2, then dimension = 384.

pc.create_index(
    name=index_name, 
    dimension=dimensions, 
    metric="cosine",          # we can use different distance metrics to measure the similarity between vector embeddings and user queries. this is where we define what similarity metric we are going to use for the vector store.
    spec=spec
)

# wait for index to be ready before connecting
while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

for index in pc.list_indexes():
    print(index['name'])

pc.describe_index("hw03")

pc_index = pc.Index(index_name)  # create an index to use in the vector store

vector_store = PineconeVectorStore(pinecone_index=pc_index)    # this function creates a vector store where we will add and store embeddings

hw03


In [53]:
# nothing so far - empty index
pc_index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [54]:
vector_store.add(nodes)

Upserted vectors: 100%|██████████| 101/101 [00:03<00:00, 25.34it/s]


['ffe64bcb-a112-42bb-9f0c-b81fe40b5103',
 '5cb64606-3e12-49d2-a0d8-3975aed61a56',
 '2d8a4558-c9b0-4f8a-8994-0afe39bdba14',
 'eed2f3ae-c6ec-43b6-a2c9-a1592cd012f2',
 'f483a21c-108a-4fe0-b155-b4131faa498b',
 '1c59abfe-b515-484d-93c8-af41c72aa167',
 '6b20ef5e-2ef0-4f18-abf2-1ecfe951f5a3',
 '72a22c55-02af-468d-948c-2423bf2e493d',
 '792544d7-74b5-4d4f-a061-6b79cc685699',
 'b3e0f4b8-db7d-44bc-8c41-b8049066e3cc',
 '6ce5d3a2-e73c-4183-8e49-5676cb25b22f',
 '91680edb-a99d-4df8-a9fc-420dba02e9d8',
 '53dffe80-3ab5-4913-9edb-89f0d73c680f',
 'cdd7101c-71c6-42ba-8836-f34d1fb2b7f8',
 '845110e6-b888-44e8-b8bf-062f2f92e5b5',
 '4ebc9398-de54-4f27-8da8-bb32cb844e8c',
 'dd709d7d-b051-4830-b56f-909c0c6126ad',
 '445915dc-39fb-40e4-b82e-7971293da6d6',
 '99bf60b1-3310-4020-bd13-597052a66c04',
 '35515a6f-3ded-44ee-b64d-26c3af45fdf2',
 '1dd4fc0e-f54e-46c6-ad71-bbc68ffe36c3',
 '830fe513-8fbc-4822-a34a-5660e87f99ff',
 '50cb71be-bf9e-4009-899e-8c077d1d96a7',
 'fa406558-e3f5-4a6b-942a-3718e06ca438',
 '3b75bdb7-8d7b-

In [55]:
pc_index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [56]:
print(nodes[0].metadata)
print(nodes[0])

{'document_title': '"Samsonite Spinner Luggage: Lightweight, Stylish, and Durable Options with TSA Locks and Micro-Diamond Polycarbonate Texture"', 'questions_this_excerpt_can_answer': '1. What are the dimensions and weight of the 24" spinner luggage from Samsonite?\n2. What type of warranty does Samsonite offer for their products, specifically for this spinner luggage?\n3. How does the micro-diamond polycarbonate texture of the luggage help maintain its appearance during travel?'}
Node ID: ffe64bcb-a112-42bb-9f0c-b81fe40b5103
Text: 24" SPINNER LUGGAGE maximizes your packing power and is the
ideal checked bag for longer tripsPACKING Dimensions: 24” x 17.5” x
11.5”, Overall Dimensions: 26.5” x 17.75” x 11.75”, Weight: 8.34
lbs.10 YEAR LIMITED WARRANTY: Samsonite products are rigorously tested
to ensure our products meet stringent standards. This bag comes with a
10-year warr...


In [57]:
from openai import OpenAI
client = OpenAI()

def get_sys_message(q, k):
    # get embedding
    result = client.embeddings.create(
        input=[q],
        model=embed_model_3_small
    )

    # retrieve from Pinecone vector store
    xq = result.data[0].embedding

    # search
    res2 = pc_index.query(vector=xq, top_k=k, include_metadata=True)
    
    context = ""
    for match in res2['matches']:
        dict_obj = json.loads(match['metadata']['_node_content'])
        context += dict_obj['text'] + " "
    context = context.replace("\n", "")

    return context

In [58]:
queries = ["Describe the product in detail. What are some core visual features (i.e. color, size dimension, material, shape, pattern, logo, etc.) of this product?",
           "What is the main purpose of this product? What do people use it for?",
           "What is most loved and hated about this product? Summarize the positive and negative comments."
          ]   

In [67]:
# try a query
get_sys_message(q=queries[0], k=20)

"Great product and love the look. I haven't used this yet but looking it over I'm very happy with my purchase. Very good product Haven’t used yet, but I’m happy so far with purchase.  Like darker color without being black.  Good size with expandable sides. My mother loved it! Nice color as well My son , and his girlfriend told me about this suit case. They love it when they go on their trips Nothing Love the color, got it due to easy spot. Combo  locks great for my upcoming trips, durable no issues or problems. I love the unique color. It helps at airports to have luggage that stands out. The only problem is durability. I am not saying it is not as durable as possible. The luggage in airports are severely abused so it seems no luggage is safe from cracks and dents. I intended to wait to write the review till after I had used this case a few times, but it’s such a great little case I couldn’t wait. It is very spacious inside, it’s lightweight, and the wheels spin perfectly. Plus, I rece

In [68]:
def rag_openAI_gpt(model, q, k, prompt):
    f=get_sys_message(q, k)
    
    response = client.chat.completions.create(
    model=model,
    messages=[
        {"role": "system", "content": "Instruction: use the information in" + f + "to answer the user's question."},
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": f},
        {"role": "user", "content": "What is the answer?"}
    ],
    max_tokens=1000
    )
    return response.choices[0].message.content

In [75]:
# Run
analyses = {}
for i in range(len(queries)):
    response = rag_openAI_gpt(model=gpt_3_5_turbo, q=queries[i], k=15, prompt="Product Description")
    print(f"<Query: {queries[i]}>")
    print(response)
    analyses[i+1] = response
    print()
analyses

<Query: Describe the product in detail. What are some core visual features (i.e. color, size dimension, material, shape, pattern, logo, etc.) of this product?>
Based on the provided information, the product is described as a great, good quality suitcase that is stylish and has expandable sides. It comes in a darker color that is not black, making it easy to spot at airports. The suitcase has combo locks, making it secure for upcoming trips. While durability may be a concern due to airport handling, the user finds the suitcase spacious, lightweight, and with perfectly spinning wheels. They appreciate the considerate delivery notification and suggest adding a zipper separation for one side and an accessory pocket. Overall, the user is very happy with the purchase and considers it the best carry-on suitcase they have used.

<Query: What is the main purpose of this product? What do people use it for?>
The answer does not contain a specific question, but it seems to be a collection of revie

{1: 'Based on the provided information, the product is described as a great, good quality suitcase that is stylish and has expandable sides. It comes in a darker color that is not black, making it easy to spot at airports. The suitcase has combo locks, making it secure for upcoming trips. While durability may be a concern due to airport handling, the user finds the suitcase spacious, lightweight, and with perfectly spinning wheels. They appreciate the considerate delivery notification and suggest adding a zipper separation for one side and an accessory pocket. Overall, the user is very happy with the purchase and considers it the best carry-on suitcase they have used.',
 2: 'The answer does not contain a specific question, but it seems to be a collection of reviews and feedback about a suitcase. Customers have shared positive comments about the product, including its durability, ease of use, good design, and color options. Some have mentioned the need for additional features like a zip

In [76]:
# Save responses to the log file
logging.info(analyses)

## **Part 3: Image Generation**
- For each product, based on the information extracted from the product description and customer reviews, craft prompts to guide the image generation process effectively.
- Use the OpenAI’s DALLE 2 to generate 3~5 images for each product based on your crafted prompts. Experiment with different prompts and settings to best visualize what you believe is a good illustration of the product based on product description and customer reviews.
- If necessary, iterate on your prompts based on initial results to refine the illustrations.
- Compare AI-generated product images with the actual product images posted in real world. Are they similar or different? In what dimensions? Do you think AI is able to illustrate the products well? Why or why not?
- Provide analyses and explanations of your findings.

In [77]:
# Change keys to dictionary
analyses = {
    "Product Description": analyses[1],
   #"Product Features": analyses[2],
    "Product Usage": analyses[2],
    "Product Sentiment": analyses[3]
    #"Product Sentiment Bad": analyses[5]
}

In [78]:
analyses

{'Product Description': 'Based on the provided information, the product is described as a great, good quality suitcase that is stylish and has expandable sides. It comes in a darker color that is not black, making it easy to spot at airports. The suitcase has combo locks, making it secure for upcoming trips. While durability may be a concern due to airport handling, the user finds the suitcase spacious, lightweight, and with perfectly spinning wheels. They appreciate the considerate delivery notification and suggest adding a zipper separation for one side and an accessory pocket. Overall, the user is very happy with the purchase and considers it the best carry-on suitcase they have used.',
 'Product Usage': 'The answer does not contain a specific question, but it seems to be a collection of reviews and feedback about a suitcase. Customers have shared positive comments about the product, including its durability, ease of use, good design, and color options. Some have mentioned the need 

In [79]:
# Craft prompt
prompt = f"""   
        Generate an image of a product that closely matches the following description:
        Description: {analyses["Product Description"]}. 
        Usage: {analyses["Product Usage"]}. 
        Sentiment: {analyses["Product Sentiment"]}.
        Do not include text in the image.
        Create an image that illustrates a human being using the product in a real-world setting 
        (for example a person ready to travel), showcasing the usage of the product.
        Photograph, sharp focus, cinematic, Sigma 85mm f/1.4 and ISO 100.
        """

#Create an image focusing on the outer features of the product. Include size dimensions of the product.
#Create an image that shows the inside of the product.

In [80]:
# Generate images using DALL-E 3
response = openai.images.generate(
    model="dall-e-3",
    prompt=prompt,
    size="1024x1024",
    quality="standard",
    n=1,
)

import webbrowser
webbrowser.open(response.data[0].url)

True